diff --git a/src/cublas/mod.rs b/src/cublas/mod.rs
index 6673676..5296742 100644
--- a/src/cublas/mod.rs
+++ b/src/cublas/mod.rs
@@ -1,5 +1,15 @@
-//! Wrappers around the [cublas API](https://docs.nvidia.com/cuda/cublas/index.html),
-//! in three levels. See crate documentation for description of each.
+//! [CudaBlas] wraps around the [cublas API](https://docs.nvidia.com/cuda/cublas/index.html).
+//!
+//! To use:
+//!
+//! 1. Instantiate a [CudaBlas] handle with [CudaBlas::new()]
+//! 2. Choose your operation: [Gemm], [Gemv], and [Asum] traits, which [CudaBlas] implements.
+//! 3. f16/bf16/f32/f64 are all supported at the trait level.
+//! 4. Instantiate your corresponding config: [GemmConfig], [StridedBatchedConfig], [GemvConfig], [AsumConfig]
+//! 5. Call using [CudaBlas::gemm()], [CudaBlas::gemv()], or [CudaBlas::asum()]
+//!
+//! Note that all above apis work with [crate::driver::DevicePtr]/[crate::driver::DevicePtrMut], so they
+//! accept [crate::driver::CudaSlice], [crate::driver::CudaView], and [crate::driver::CudaViewMut].
 
 pub mod result;
 pub mod safe;
diff --git a/src/cublaslt/mod.rs b/src/cublaslt/mod.rs
index 8ee3ba0..f78e712 100644
--- a/src/cublaslt/mod.rs
+++ b/src/cublaslt/mod.rs
@@ -1,3 +1,11 @@
+//! [CudaBlasLT] wraps around [cuBLASLt](https://docs.nvidia.com/cuda/cublas/index.html#using-the-cublaslt-api) via:
+//!
+//! 1. Instantiate a [CudaBlasLT] handle with [CudaBlasLT::new()]
+//! 2. Execute a gemm using [CudaBlasLT::matmul()]
+//!
+//! Note that all above apis work with [crate::driver::DevicePtr]/[crate::driver::DevicePtrMut], so they
+//! accept [crate::driver::CudaSlice], [crate::driver::CudaView], and [crate::driver::CudaViewMut].
+
 pub mod result;
 pub mod safe;
 #[allow(warnings)]
diff --git a/src/cufile/mod.rs b/src/cufile/mod.rs
index 8ee3ba0..c10dde3 100644
--- a/src/cufile/mod.rs
+++ b/src/cufile/mod.rs
@@ -1,3 +1,12 @@
+//! [Cufile] wraps around [cuFILE](https://docs.nvidia.com/gpudirect-storage/api-reference-guide/index.html) via:
+//!
+//! 1. Instantiate a new handle to the api with [Cufile::new()]
+//! 2. Register a file with [Cufile::register()], this accepts a [std::fs::File].
+//! 3. Read/write from filesystem using [FileHandle::sync_read], [FileHandle::sync_write], [crate::driver::CudaStream::memcpy_dtof()], [crate::driver::CudaStream::memcpy_ftod()].
+//!
+//! Note that all safe apis work with [crate::driver::DevicePtr] and [crate::driver::DevicePtrMut], meaning they accept both
+//! [crate::driver::CudaSlice] and [crate::driver::CudaView]/[crate::driver::CudaViewMut].
+
 pub mod result;
 pub mod safe;
 #[allow(warnings)]
diff --git a/src/curand/mod.rs b/src/curand/mod.rs
index 29bb9cf..5d954d0 100644
--- a/src/curand/mod.rs
+++ b/src/curand/mod.rs
@@ -1,5 +1,10 @@
-//! Wrappers around the [cuRAND API](https://docs.nvidia.com/cuda/curand/index.html)
-//! in three levels. See crate documentation for description of each.
+//! [CudaRng] safe bindings around [cuRAND](https://docs.nvidia.com/cuda/curand/index.html).
+//!
+//! Instantiate with [CudaRng::new()], and then fill existing [crate::driver::CudaSlice]/[crate::driver::CudaViewMut]
+//! with three different distributions:
+//! 1. Uniform - [CudaRng::fill_with_uniform()]
+//! 2. Normal - [CudaRng::fill_with_normal()]
+//! 3. LogNormal - [CudaRng::fill_with_log_normal()]
 
 pub mod result;
 pub mod safe;
diff --git a/src/nccl/mod.rs b/src/nccl/mod.rs
index 9135e1f..7563ff1 100644
--- a/src/nccl/mod.rs
+++ b/src/nccl/mod.rs
@@ -1,5 +1,13 @@
-//! Wrappers around the [NCCL API](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/index.html)
-//! in three levels. See crate documentation for description of each.
+//! [Comm] wraps around the [NCCL API](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/index.html), via:
+//!
+//! 1. Instantiate with [Comm::from_devices()] or [Comm::from_rank()]
+//! 2. Peer to peer with [Comm::send()]/[Comm::recv()]
+//! 3. Broadcast [Comm::broadcast()]/[Comm::broadcast_in_place()]
+//! 4. Reduce: [Comm::reduce()]/[Comm::reduce_in_place()]
+//! 5. Gather & Reduce [Comm::all_gather()]/[Comm::all_reduce()]/[Comm::all_reduce_in_place()]
+//!
+//! Note that all above apis work with [crate::driver::DevicePtr]/[crate::driver::DevicePtrMut], so they
+//! accept [crate::driver::CudaSlice], [crate::driver::CudaView], and [crate::driver::CudaViewMut].
 
 pub mod result;
 pub mod safe;
diff --git a/src/nccl/safe.rs b/src/nccl/safe.rs
index 2953689..6a4e84a 100644
--- a/src/nccl/safe.rs
+++ b/src/nccl/safe.rs
@@ -210,6 +210,7 @@ impl Comm {
 }
 
 impl Comm {
+    /// Send data to one peer, see [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/p2p.html#ncclsend)
     pub fn send<S: DevicePtr<T>, T: NcclType>(
         &self,
         data: &S,
@@ -229,6 +230,7 @@ impl Comm {
         Ok(())
     }
 
+    /// Receive data from one peer, see [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/p2p.html#ncclrecv)
     pub fn recv<R: DevicePtrMut<T>, T: NcclType>(
         &self,
         buff: &mut R,