16 files changed, 95 insertions, 30 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d6394549..a52429cf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,7 @@
 # Changelog
 This documents the main changes to the `candle` crate.
 
-## Unreleased
+## v0.2.1 - Unreleased
 
 ### Added
 
diff --git a/Cargo.toml b/Cargo.toml
index dc48ccd8..ce41876a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -18,7 +18,7 @@ exclude = [
 resolver = "2"
 
 [workspace.package]
-version = "0.2.0"
+version = "0.2.1"
 edition = "2021"
 description = "Minimalist ML framework."
 repository = "https://github.com/huggingface/candle"
diff --git a/candle-book/Cargo.toml b/candle-book/Cargo.toml
index 6cd0a487..320fb887 100644
--- a/candle-book/Cargo.toml
+++ b/candle-book/Cargo.toml
@@ -11,11 +11,11 @@ readme = "README.md"
 
 [dependencies]
 accelerate-src = { workspace = true, optional = true }
-candle = { path = "../candle-core", version = "0.2.0", package = "candle-core" }
-candle-datasets = { path = "../candle-datasets", version = "0.2.0" }
-candle-nn = { path = "../candle-nn", version = "0.2.0" }
-candle-transformers = { path = "../candle-transformers", version = "0.2.0" }
-candle-flash-attn = { path = "../candle-flash-attn", version = "0.2.0", optional = true }
+candle = { path = "../candle-core", version = "0.2.1", package = "candle-core" }
+candle-datasets = { path = "../candle-datasets", version = "0.2.1" }
+candle-nn = { path = "../candle-nn", version = "0.2.1" }
+candle-transformers = { path = "../candle-transformers", version = "0.2.1" }
+candle-flash-attn = { path = "../candle-flash-attn", version = "0.2.1", optional = true }
 safetensors = { workspace = true }
 serde = { workspace = true }
 serde_json = { workspace = true }
diff --git a/candle-core/Cargo.toml b/candle-core/Cargo.toml
index 698cdae0..e7213919 100644
--- a/candle-core/Cargo.toml
+++ b/candle-core/Cargo.toml
@@ -12,7 +12,7 @@ readme = "README.md"
 [dependencies]
 accelerate-src = { workspace = true, optional = true }
 byteorder = { workspace = true }
-candle-kernels = { path = "../candle-kernels", version = "0.2.0", optional = true }
+candle-kernels = { path = "../candle-kernels", version = "0.2.1", optional = true }
 cudarc = { workspace = true, optional = true }
 gemm = { workspace = true }
 half = { workspace = true }
diff --git a/candle-core/src/tensor.rs b/candle-core/src/tensor.rs
index 12e98029..ec89af12 100644
--- a/candle-core/src/tensor.rs
+++ b/candle-core/src/tensor.rs
@@ -730,18 +730,24 @@ impl Tensor {
         self.sum_impl(mean_dims, false)? * scale
     }
 
+    /// Gathers the maximum value across the selected dimension. The resulting shape has the same
+    /// number of dimensions as the original tensor and the selected dimension has a single element.
     pub fn max_keepdim<D: Dim>(&self, dim: D) -> Result<Self> {
         self.reduce_impl(dim, true, ReduceOp::Max)
     }
 
+    /// Similar to `max_keepdim` but the target dimension is squeezed.
     pub fn max<D: Dim>(&self, dim: D) -> Result<Self> {
         self.reduce_impl(dim, false, ReduceOp::Max)
     }
 
+    /// Gathers the minimum value across the selected dimension. The resulting shape has the same
+    /// number of dimensions as the original tensor and the selected dimension has a single element.
     pub fn min_keepdim<D: Dim>(&self, dim: D) -> Result<Self> {
         self.reduce_impl(dim, true, ReduceOp::Min)
     }
 
+    /// Similar to `min_keepdim` but the target dimension is squeezed.
     pub fn min<D: Dim>(&self, dim: D) -> Result<Self> {
         self.reduce_impl(dim, false, ReduceOp::Min)
     }
@@ -750,6 +756,7 @@ impl Tensor {
         self.reduce_impl(dim, true, ReduceOp::ArgMax)
     }
 
+    /// Similar to `argmax_keepdim` but the target dimension is squeezed.
     pub fn argmax<D: Dim>(&self, dim: D) -> Result<Self> {
         self.reduce_impl(dim, false, ReduceOp::ArgMax)
     }
@@ -758,10 +765,15 @@ impl Tensor {
         self.reduce_impl(dim, true, ReduceOp::ArgMin)
     }
 
+    /// Similar to `argmin_keepdim` but the target dimension is squeezed.
     pub fn argmin<D: Dim>(&self, dim: D) -> Result<Self> {
         self.reduce_impl(dim, false, ReduceOp::ArgMin)
     }
 
+    /// Element-wise comparison between two tensors, e.g. equality, greater than, ... The actual
+    /// comparison operation is specified by the `op` argument.
+    ///
+    /// The returned tensor has the same shape as the original tensors and uses `u8` elements.
     pub fn cmp(&self, rhs: &Self, op: CmpOp) -> Result<Self> {
         let shape = self.same_shape_binary_op(rhs, "cmp")?;
         let storage = self
@@ -771,30 +783,45 @@ impl Tensor {
         Ok(from_storage(storage, shape.dims(), op, false))
     }
 
+    /// Element-wise equality.
     pub fn eq(&self, rhs: &Self) -> Result<Self> {
         self.cmp(rhs, CmpOp::Eq)
     }
 
+    /// Element-wise non-equality.
     pub fn ne(&self, rhs: &Self) -> Result<Self> {
         self.cmp(rhs, CmpOp::Ne)
     }
 
+    /// Element-wise comparison with lower-than, the returned tensor uses value 1 where `self <
+    /// rhs` and 0 otherwise.
     pub fn lt(&self, rhs: &Self) -> Result<Self> {
         self.cmp(rhs, CmpOp::Lt)
     }
 
+    /// Element-wise comparison with greater-than, the returned tensor uses value 1 where `self >
+    /// rhs` and 0 otherwise.
     pub fn gt(&self, rhs: &Self) -> Result<Self> {
         self.cmp(rhs, CmpOp::Gt)
     }
 
+    /// Element-wise comparison with greater-equal, the returned tensor uses value 1 where `self >=
+    /// rhs` and 0 otherwise.
     pub fn ge(&self, rhs: &Self) -> Result<Self> {
         self.cmp(rhs, CmpOp::Ge)
     }
 
+    /// Element-wise comparison with lower-equal, the returned tensor uses value 1 where `self <=
+    /// rhs` and 0 otherwise.
     pub fn le(&self, rhs: &Self) -> Result<Self> {
         self.cmp(rhs, CmpOp::Le)
     }
 
+    /// Upsample the input tensor to the `(target_h, target_w)` size, taking the value of the
+    /// nearest element.
+    ///
+    /// The input tensor should have four dimensions, `(batch, channels, h, w)`, the returned
+    /// tensor also has four dimensions, `(batch, channels, target_h, target_w)`.
     pub fn upsample_nearest2d(&self, target_h: usize, target_w: usize) -> Result<Self> {
         let (n, c, _h, _w) = self.dims4()?;
         let op = BackpropOp::new1(self, Op::UpsampleNearest2D);
@@ -804,11 +831,19 @@ impl Tensor {
         Ok(from_storage(storage, (n, c, target_h, target_w), op, false))
     }
 
+    /// 2D average pooling over an input tensor with multiple channels.
+    ///
+    /// The input tensor should have four dimensions, `(batch, channels, h, w)`, the returned
+    /// tensor also has four dimensions, `(batch, channels, h', w')`. The pooling is performed on
+    /// the two last dimensions using a kernel of size `sz`. The returned element is the average
+    /// value over the kernel window.
     pub fn avg_pool2d<T: crate::ToUsize2>(&self, sz: T) -> Result<Self> {
         let sz = sz.to_usize2();
         self.avg_pool2d_with_stride(sz, sz)
     }
 
+    /// Same as `avg_pool2d` but with a `stride` that can be set to a value different from the
+    /// kernel size.
     pub fn avg_pool2d_with_stride<T: crate::ToUsize2>(
         &self,
         kernel_size: T,
@@ -831,11 +866,19 @@ impl Tensor {
         Ok(from_storage(storage, (n, c, h_out, w_out), op, false))
     }
 
+    /// 2D max pooling over an input tensor with multiple channels.
+    ///
+    /// The input tensor should have four dimensions, `(batch, channels, h, w)`, the returned
+    /// tensor also has four dimensions, `(batch, channels, h', w')`. The pooling is performed on
+    /// the two last dimensions using a kernel of size `sz`, the returned element is the maximum
+    /// value over the kernel window.
     pub fn max_pool2d<T: crate::ToUsize2>(&self, sz: T) -> Result<Self> {
         let sz = sz.to_usize2();
         self.max_pool2d_with_stride(sz, sz)
     }
 
+    /// Same as `max_pool2d` but with a `stride` that can be set to a value different from the
+    /// kernel size.
     pub fn max_pool2d_with_stride<T: crate::ToUsize2>(
         &self,
         kernel_size: T,
@@ -1022,6 +1065,7 @@ impl Tensor {
         Ok(from_storage(storage, self.shape(), op, false))
     }
 
+    /// Accumulate elements from `source` at indexes `indexes` and add them to `self`.
     pub fn index_add<D: Dim>(&self, indexes: &Self, source: &Self, dim: D) -> Result<Self> {
         let dim = dim.to_index(self.shape(), "index-add")?;
         let source_dims = source.dims();
@@ -1070,6 +1114,17 @@ impl Tensor {
         Ok(from_storage(storage, self.shape(), op, false))
     }
 
+    /// Gather values across the target dimension.
+    ///
+    /// # Arguments
+    ///
+    /// * `self` - The input tensor.
+    /// * `indexes` - The indices of elements to gather, this should have the same shape as `self`
+    ///   but can have a different number of elements on the target dimension.
+    /// * `dim` - the target dimension.
+    ///
+    /// The resulting tensor has the same shape as `indexes` and uses values from `self` indexed on
+    /// dimension `dim` by the values in `indexes`.
     pub fn gather<D: Dim>(&self, indexes: &Self, dim: D) -> Result<Self> {
         let dim = dim.to_index(self.shape(), "gather")?;
         let self_dims = self.dims();
@@ -1100,6 +1155,13 @@ impl Tensor {
         Ok(from_storage(storage, indexes.shape(), op, false))
     }
 
+    /// Select values for the input tensor at the target indexes across the specified dimension.
+    ///
+    /// The `indexes` argument is an int tensor with a single dimension.
+    /// The output has the same number of dimensions as the `self` input. The target dimension of
+    /// the output has the same length as `indexes` and the values are taken from `self` using
+    /// the index from `indexes`. Other dimensions have the same number of elements as the input
+    /// tensor.
     pub fn index_select<D: Dim>(&self, indexes: &Self, dim: D) -> Result<Self> {
         let dim = dim.to_index(self.shape(), "index-select")?;
         let indexes_len = match indexes.dims() {
@@ -1858,6 +1920,8 @@ impl Tensor {
         Ok(from_storage(storage, shape, op, false))
     }
 
+    /// Pad the input tensor using 0s along dimension `dim`. This adds `left` elements before the
+    /// input tensor values and `right` elements after.
     pub fn pad_with_zeros<D: Dim>(&self, dim: D, left: usize, right: usize) -> Result<Self> {
         if left == 0 && right == 0 {
             Ok(self.clone())
@@ -1884,6 +1948,7 @@ impl Tensor {
         }
     }
 
+    /// Run the `forward` method of `m` on `self`.
     pub fn apply<M: crate::Module>(&self, m: &M) -> Result<Self> {
         m.forward(self)
     }
diff --git a/candle-datasets/Cargo.toml b/candle-datasets/Cargo.toml
index f4472a08..d69318e1 100644
--- a/candle-datasets/Cargo.toml
+++ b/candle-datasets/Cargo.toml
@@ -11,8 +11,8 @@ readme = "README.md"
 
 [dependencies]
 byteorder = { workspace = true }
-candle = { path = "../candle-core", version = "0.2.0", package = "candle-core" }
-candle-nn = { path = "../candle-nn", version = "0.2.0" }
+candle = { path = "../candle-core", version = "0.2.1", package = "candle-core" }
+candle-nn = { path = "../candle-nn", version = "0.2.1" }
 hf-hub = { workspace = true}
 intel-mkl-src = { workspace = true, optional = true }
 memmap2 = { workspace = true }
diff --git a/candle-examples/Cargo.toml b/candle-examples/Cargo.toml
index 39d2bc72..9035eae0 100644
--- a/candle-examples/Cargo.toml
+++ b/candle-examples/Cargo.toml
@@ -11,11 +11,11 @@ readme = "README.md"
 
 [dependencies]
 accelerate-src = { workspace = true, optional = true }
-candle = { path = "../candle-core", version = "0.2.0", package = "candle-core" }
-candle-datasets = { path = "../candle-datasets", version = "0.2.0" }
-candle-nn = { path = "../candle-nn", version = "0.2.0" }
-candle-transformers = { path = "../candle-transformers", version = "0.2.0" }
-candle-flash-attn = { path = "../candle-flash-attn", version = "0.2.0", optional = true }
+candle = { path = "../candle-core", version = "0.2.1", package = "candle-core" }
+candle-datasets = { path = "../candle-datasets", version = "0.2.1" }
+candle-nn = { path = "../candle-nn", version = "0.2.1" }
+candle-transformers = { path = "../candle-transformers", version = "0.2.1" }
+candle-flash-attn = { path = "../candle-flash-attn", version = "0.2.1", optional = true }
 safetensors = { workspace = true }
 serde = { workspace = true }
 serde_json = { workspace = true }
diff --git a/candle-examples/examples/mnist-training/main.rs b/candle-examples/examples/mnist-training/main.rs
index a90904c4..16c4bb60 100644
--- a/candle-examples/examples/mnist-training/main.rs
+++ b/candle-examples/examples/mnist-training/main.rs
@@ -242,7 +242,7 @@ struct Args {
     #[arg(long)]
     load: Option<String>,
 
-    /// The file where to load the trained weights from, in safetensors format.
+    /// The directory where to load the dataset from, in ubyte format.
     #[arg(long)]
     local_mnist: Option<String>,
 }
diff --git a/candle-flash-attn/Cargo.toml b/candle-flash-attn/Cargo.toml
index f51c277d..0d130519 100644
--- a/candle-flash-attn/Cargo.toml
+++ b/candle-flash-attn/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "candle-flash-attn"
-version = "0.2.0"
+version = "0.2.1"
 edition = "2021"
 
 description = "Flash attention layer for the candle ML framework."
@@ -11,7 +11,7 @@ license = "MIT OR Apache-2.0"
 readme = "README.md"
 
 [dependencies]
-candle = { path = "../candle-core", features = ["cuda"], version = "0.2.0", package = "candle-core" }
+candle = { path = "../candle-core", features = ["cuda"], version = "0.2.1", package = "candle-core" }
 half = { version = "2.3.1", features = ["num-traits"] }
 
 [build-dependencies]
@@ -21,4 +21,4 @@ rayon = "1.7.0"
 
 [dev-dependencies]
 anyhow = { version = "1", features = ["backtrace"] }
-candle-nn = { path = "../candle-nn", version = "0.2.0", features = ["cuda"] }
+candle-nn = { path = "../candle-nn", version = "0.2.1", features = ["cuda"] }
diff --git a/candle-kernels/Cargo.toml b/candle-kernels/Cargo.toml
index 5f5c356e..576c52ea 100644
--- a/candle-kernels/Cargo.toml
+++ b/candle-kernels/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "candle-kernels"
-version = "0.2.0"
+version = "0.2.1"
 edition = "2021"
 
 description = "CUDA kernels for Candle"
diff --git a/candle-nn/Cargo.toml b/candle-nn/Cargo.toml
index 7ee779f8..aa055583 100644
--- a/candle-nn/Cargo.toml
+++ b/candle-nn/Cargo.toml
@@ -11,7 +11,7 @@ readme = "README.md"
 
 [dependencies]
 accelerate-src = { workspace = true, optional = true }
-candle = { path = "../candle-core", version = "0.2.0", package = "candle-core" }
+candle = { path = "../candle-core", version = "0.2.1", package = "candle-core" }
 thiserror = { workspace = true }
 intel-mkl-src = { workspace = true, optional = true }
 safetensors = { workspace = true }
diff --git a/candle-pyo3/Cargo.toml b/candle-pyo3/Cargo.toml
index 98c5f936..60272c9b 100644
--- a/candle-pyo3/Cargo.toml
+++ b/candle-pyo3/Cargo.toml
@@ -15,7 +15,7 @@ crate-type = ["cdylib"]
 doc = false
 
 [dependencies]
-candle = { path = "../candle-core", version = "0.2.0", package = "candle-core" }
+candle = { path = "../candle-core", version = "0.2.1", package = "candle-core" }
 half = { workspace = true }
 pyo3 = { version = "0.19.0", features = ["extension-module"] }
 
diff --git a/candle-transformers/Cargo.toml b/candle-transformers/Cargo.toml
index 7549dfa5..a05b9bb7 100644
--- a/candle-transformers/Cargo.toml
+++ b/candle-transformers/Cargo.toml
@@ -11,8 +11,8 @@ readme = "README.md"
 
 [dependencies]
 accelerate-src = { workspace = true, optional = true }
-candle = { path = "../candle-core", version = "0.2.0", package = "candle-core" }
-candle-nn = { path = "../candle-nn", version = "0.2.0" }
+candle = { path = "../candle-core", version = "0.2.1", package = "candle-core" }
+candle-nn = { path = "../candle-nn", version = "0.2.1" }
 intel-mkl-src = { workspace = true, optional = true }
 rand = { workspace = true }
 wav = { workspace = true }
diff --git a/candle-wasm-examples/llama2-c/Cargo.toml b/candle-wasm-examples/llama2-c/Cargo.toml
index 95309e1f..d862a833 100644
--- a/candle-wasm-examples/llama2-c/Cargo.toml
+++ b/candle-wasm-examples/llama2-c/Cargo.toml
@@ -9,8 +9,8 @@ categories.workspace = true
 license.workspace = true
 
 [dependencies]
-candle = { path = "../../candle-core", version = "0.2.0", package = "candle-core" }
-candle-nn = { path = "../../candle-nn", version = "0.2.0" }
+candle = { path = "../../candle-core", version = "0.2.1", package = "candle-core" }
+candle-nn = { path = "../../candle-nn", version = "0.2.1" }
 num-traits = { workspace = true }
 tokenizers = { workspace = true, features = ["unstable_wasm"] }
 
diff --git a/candle-wasm-examples/whisper/Cargo.toml b/candle-wasm-examples/whisper/Cargo.toml
index 9578d66e..47e7e094 100644
--- a/candle-wasm-examples/whisper/Cargo.toml
+++ b/candle-wasm-examples/whisper/Cargo.toml
@@ -9,8 +9,8 @@ categories.workspace = true
 license.workspace = true
 
 [dependencies]
-candle = { path = "../../candle-core", version = "0.2.0", package = "candle-core" }
-candle-nn = { path = "../../candle-nn", version = "0.2.0" }
+candle = { path = "../../candle-core", version = "0.2.1", package = "candle-core" }
+candle-nn = { path = "../../candle-nn", version = "0.2.1" }
 num-traits = { workspace = true }
 tokenizers = { workspace = true, features = ["unstable_wasm"] }
 
diff --git a/candle-wasm-examples/yolo/Cargo.toml b/candle-wasm-examples/yolo/Cargo.toml
index cf0fbf8d..b4daf6e6 100644
--- a/candle-wasm-examples/yolo/Cargo.toml
+++ b/candle-wasm-examples/yolo/Cargo.toml
@@ -9,8 +9,8 @@ categories.workspace = true
 license.workspace = true
 
 [dependencies]
-candle = { path = "../../candle-core", version = "0.2.0", package = "candle-core" }
-candle-nn = { path = "../../candle-nn", version = "0.2.0" }
+candle = { path = "../../candle-core", version = "0.2.1", package = "candle-core" }
+candle-nn = { path = "../../candle-nn", version = "0.2.1" }
 num-traits = { workspace = true }
 serde = { workspace = true }
 serde_json = { workspace = true }