Add the quantize command. (#624)

* Add the quantize command.
* Bugfix for writing gguf files.
* And add a comment.
author: Laurent Mazare <laurent.mazare@gmail.com> 2023-08-27 11:35:19 +0100
committer: GitHub <noreply@github.com> 2023-08-27 11:35:19 +0100
commit: 7151f2cf63b312049fa53713ebf6f0f174cf2fc9 (patch)
tree: 474f3d189707726e8b0b21b0856772f1cca80ebf /candle-core/examples
parent: 6e485f2deb65bf21d21c85b4913149e7d2c65c6b (diff)
download: candle-7151f2cf63b312049fa53713ebf6f0f174cf2fc9.tar.gz
candle-7151f2cf63b312049fa53713ebf6f0f174cf2fc9.tar.bz2
candle-7151f2cf63b312049fa53713ebf6f0f174cf2fc9.zip
1 file changed, 75 insertions, 1 deletion
diff --git a/candle-core/examples/tensor-tools.rs b/candle-core/examples/tensor-tools.rs
index 67e6aa1e..f45cbc7e 100644
--- a/candle-core/examples/tensor-tools.rs
+++ b/candle-core/examples/tensor-tools.rs
@@ -1,5 +1,16 @@
-use candle_core::Result;
+use candle_core::{Device, Result};
 use clap::{Parser, Subcommand, ValueEnum};
+use rayon::prelude::*;
+
+#[derive(ValueEnum, Debug, Clone)]
+enum Quantization {
+    Q2k,
+    Q3k,
+    Q4k,
+    Q5k,
+    Q6k,
+    Q8k,
+}
 
 #[derive(ValueEnum, Debug, Clone)]
 enum Format {
@@ -41,6 +52,17 @@ enum Command {
         #[arg(short, long)]
         verbose: bool,
     },
+
+    Quantize {
+        /// The input file, in gguf format.
+        in_file: std::path::PathBuf,
+        /// The output file, in gguf format.
+        out_file: std::path::PathBuf,
+
+        /// The quantization schema to apply.
+        #[arg(long, value_enum)]
+        quantization: Quantization,
+    },
 }
 
 #[derive(Parser, Debug, Clone)]
@@ -144,6 +166,53 @@ fn run_ls(file: &std::path::PathBuf, format: Option<Format>, verbose: bool) -> R
     Ok(())
 }
 
+fn run_quantize(
+    in_file: std::path::PathBuf,
+    out_file: std::path::PathBuf,
+    q: Quantization,
+) -> Result<()> {
+    use candle_core::quantized::{gguf_file, k_quants, QTensor};
+    // Open the out file early so as to fail directly on missing directories etc.
+    let mut out_file = std::fs::File::create(out_file)?;
+    let mut in_ = std::fs::File::open(&in_file)?;
+    let content = gguf_file::Content::read(&mut in_)?;
+    println!("tensors: {}", content.tensor_infos.len());
+
+    let qtensors = content
+        .tensor_infos
+        .par_iter()
+        .map(|(name, _)| {
+            println!("  quantizing {name}");
+            let mut in_file = std::fs::File::open(&in_file)?;
+            let tensor = content.tensor(&mut in_file, name)?;
+            let tensor = tensor.dequantize(&Device::Cpu)?;
+            // TODO: Only quantize the linear weights, and quantize the final layer weights
+            // differently from the rest.
+            let tensor = match q {
+                Quantization::Q2k => QTensor::quantize::<k_quants::BlockQ2K>(&tensor)?,
+                Quantization::Q3k => QTensor::quantize::<k_quants::BlockQ3K>(&tensor)?,
+                Quantization::Q4k => QTensor::quantize::<k_quants::BlockQ4K>(&tensor)?,
+                Quantization::Q5k => QTensor::quantize::<k_quants::BlockQ5K>(&tensor)?,
+                Quantization::Q6k => QTensor::quantize::<k_quants::BlockQ6K>(&tensor)?,
+                Quantization::Q8k => QTensor::quantize::<k_quants::BlockQ8K>(&tensor)?,
+            };
+            Ok((name, tensor))
+        })
+        .collect::<Result<Vec<_>>>()?;
+    let qtensors = qtensors
+        .iter()
+        .map(|(k, v)| (k.as_str(), v))
+        .collect::<Vec<_>>();
+
+    let metadata = content
+        .metadata
+        .iter()
+        .map(|(k, v)| (k.as_str(), v))
+        .collect::<Vec<_>>();
+    gguf_file::write(&mut out_file, metadata.as_slice(), &qtensors)?;
+    Ok(())
+}
+
 fn main() -> anyhow::Result<()> {
     let args = Args::parse();
     match args.command {
@@ -160,6 +229,11 @@ fn main() -> anyhow::Result<()> {
                 run_ls(file, format.clone(), verbose)?
             }
         }
+        Command::Quantize {
+            in_file,
+            out_file,
+            quantization,
+        } => run_quantize(in_file, out_file, quantization)?,
     }
     Ok(())
 }
author	Laurent Mazare <laurent.mazare@gmail.com>	2023-08-27 11:35:19 +0100
committer	GitHub <noreply@github.com>	2023-08-27 11:35:19 +0100
commit	7151f2cf63b312049fa53713ebf6f0f174cf2fc9 (patch)
tree	474f3d189707726e8b0b21b0856772f1cca80ebf /candle-core/examples
parent	6e485f2deb65bf21d21c85b4913149e7d2c65c6b (diff)
download	candle-7151f2cf63b312049fa53713ebf6f0f174cf2fc9.tar.gz candle-7151f2cf63b312049fa53713ebf6f0f174cf2fc9.tar.bz2 candle-7151f2cf63b312049fa53713ebf6f0f174cf2fc9.zip