coreylowman · coreylowman · Aug 6, 2025 · Aug 6, 2025 · Aug 6, 2025 · Aug 6, 2025
diff --git a/src/cublas/mod.rs b/src/cublas/mod.rs
@@ -1,5 +1,15 @@
-//! Wrappers around the [cublas API](https://docs.nvidia.com/cuda/cublas/index.html),
-//! in three levels. See crate documentation for description of each.
+//! [CudaBlas] wraps around the [cublas API](https://docs.nvidia.com/cuda/cublas/index.html).
+//! 
+//! To use:
+//! 
+//! 1. Instantiate a [CudaBlas] handle with [CudaBlas::new()]
+//! 2. Choose your operation: [Gemm], [Gemv], and [Asum] traits, which [CudaBlas] implements.
+//! 3. f16/bf16/f32/f64 are all supported at the trait level.
+//! 4. Instantiate your corresponding config: [GemmConfig], [StridedBatchedConfig], [GemvConfig], [AsumConfig]
+//! 5. Call using [CudaBlas::gemm()], [CudaBlas::gemv()], or [CudaBlas::asum()]
+//! 
+//! Note that all of the above APIs work with [crate::driver::DevicePtr]/[crate::driver::DevicePtrMut], so they
+//! accept [crate::driver::CudaSlice], [crate::driver::CudaView], and [crate::driver::CudaViewMut].
 
 pub mod result;
 pub mod safe;

diff --git a/src/cublaslt/mod.rs b/src/cublaslt/mod.rs
@@ -1,3 +1,11 @@
+//! [CudaBlasLT] wraps around [cuBLASLt](https://docs.nvidia.com/cuda/cublas/index.html#using-the-cublaslt-api) via:
+//! 
+//! 1. Instantiate a [CudaBlasLT] handle with [CudaBlasLT::new()]
+//! 2. Execute a gemm using [CudaBlasLT::matmul()]
+//! 
+//! Note that all of the above APIs work with [crate::driver::DevicePtr]/[crate::driver::DevicePtrMut], so they
+//! accept [crate::driver::CudaSlice], [crate::driver::CudaView], and [crate::driver::CudaViewMut].
+
 pub mod result;
 pub mod safe;
 #[allow(warnings)]

diff --git a/src/cufile/mod.rs b/src/cufile/mod.rs
@@ -1,3 +1,12 @@
+//! [Cufile] wraps around [cuFILE](https://docs.nvidia.com/gpudirect-storage/api-reference-guide/index.html) via:
+//! 
+//! 1. Instantiate a new handle to the api with [Cufile::new()]
+//! 2. Register a file with [Cufile::register()], which accepts a [std::fs::File].
+//! 3. Read/write from filesystem using [FileHandle::sync_read], [FileHandle::sync_write], [crate::driver::CudaStream::memcpy_dtof()], [crate::driver::CudaStream::memcpy_ftod()].
+//! 
+//! Note that all safe APIs work with [crate::driver::DevicePtr] and [crate::driver::DevicePtrMut], meaning they accept both
+//! [crate::driver::CudaSlice] and [crate::driver::CudaView]/[crate::driver::CudaViewMut].
+
 pub mod result;
 pub mod safe;
 #[allow(warnings)]

diff --git a/src/curand/mod.rs b/src/curand/mod.rs
@@ -1,5 +1,10 @@
-//! Wrappers around the [cuRAND API](https://docs.nvidia.com/cuda/curand/index.html)
-//! in three levels. See crate documentation for description of each.
+//! [CudaRng] safe bindings around [cuRAND](https://docs.nvidia.com/cuda/curand/index.html).
+//! 
+//! Instantiate with [CudaRng::new()], and then fill an existing [crate::driver::CudaSlice]/[crate::driver::CudaViewMut]
+//! with samples from one of three distributions:
+//! 1. Uniform - [CudaRng::fill_with_uniform()]
+//! 2. Normal - [CudaRng::fill_with_normal()]
+//! 3. LogNormal - [CudaRng::fill_with_log_normal()] 
 
 pub mod result;
 pub mod safe;

diff --git a/src/nccl/mod.rs b/src/nccl/mod.rs
@@ -1,5 +1,13 @@
-//! Wrappers around the [NCCL API](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/index.html)
-//! in three levels. See crate documentation for description of each.
+//! [Comm] wraps around the [NCCL API](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/index.html), via:
+//! 
+//! 1. Instantiate with [Comm::from_devices()] or [Comm::from_rank()]
+//! 2. Peer to peer with [Comm::send()]/[Comm::recv()]
+//! 3. Broadcast: [Comm::broadcast()]/[Comm::broadcast_in_place()]
+//! 4. Reduce: [Comm::reduce()]/[Comm::reduce_in_place()]
+//! 5. All-gather & all-reduce: [Comm::all_gather()]/[Comm::all_reduce()]/[Comm::all_reduce_in_place()]
+//! 
+//! Note that all of the above APIs work with [crate::driver::DevicePtr]/[crate::driver::DevicePtrMut], so they
+//! accept [crate::driver::CudaSlice], [crate::driver::CudaView], and [crate::driver::CudaViewMut].
 
 pub mod result;
 pub mod safe;

diff --git a/src/nccl/safe.rs b/src/nccl/safe.rs
@@ -210,6 +210,7 @@ impl Comm {
 }
 
 impl Comm {
+    /// Send data to one peer. See the [NCCL docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/p2p.html#ncclsend).
     pub fn send<S: DevicePtr<T>, T: NcclType>(
         &self,
         data: &S,
@@ -229,6 +230,7 @@ impl Comm {
         Ok(())
     }
 
+    /// Receive data from one peer. See the [NCCL docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/p2p.html#ncclrecv).
     pub fn recv<R: DevicePtrMut<T>, T: NcclType>(
         &self,
         buff: &mut R,