summaryrefslogtreecommitdiff
path: root/candle-core/examples
diff options
context:
space:
mode:
authorLaurent Mazare <laurent.mazare@gmail.com>2023-08-27 11:35:19 +0100
committerGitHub <noreply@github.com>2023-08-27 11:35:19 +0100
commit7151f2cf63b312049fa53713ebf6f0f174cf2fc9 (patch)
tree474f3d189707726e8b0b21b0856772f1cca80ebf /candle-core/examples
parent6e485f2deb65bf21d21c85b4913149e7d2c65c6b (diff)
downloadcandle-7151f2cf63b312049fa53713ebf6f0f174cf2fc9.tar.gz
candle-7151f2cf63b312049fa53713ebf6f0f174cf2fc9.tar.bz2
candle-7151f2cf63b312049fa53713ebf6f0f174cf2fc9.zip
Add the quantize command. (#624)
* Add the quantize command. * Bugfix for writing gguf files. * And add a comment.
Diffstat (limited to 'candle-core/examples')
-rw-r--r--candle-core/examples/tensor-tools.rs76
1 files changed, 75 insertions, 1 deletions
diff --git a/candle-core/examples/tensor-tools.rs b/candle-core/examples/tensor-tools.rs
index 67e6aa1e..f45cbc7e 100644
--- a/candle-core/examples/tensor-tools.rs
+++ b/candle-core/examples/tensor-tools.rs
@@ -1,5 +1,16 @@
-use candle_core::Result;
+use candle_core::{Device, Result};
use clap::{Parser, Subcommand, ValueEnum};
+use rayon::prelude::*;
+
+#[derive(ValueEnum, Debug, Clone)]
+enum Quantization {
+ Q2k,
+ Q3k,
+ Q4k,
+ Q5k,
+ Q6k,
+ Q8k,
+}
#[derive(ValueEnum, Debug, Clone)]
enum Format {
@@ -41,6 +52,17 @@ enum Command {
#[arg(short, long)]
verbose: bool,
},
+
+ Quantize {
+ /// The input file, in gguf format.
+ in_file: std::path::PathBuf,
+ /// The output file, in gguf format.
+ out_file: std::path::PathBuf,
+
+ /// The quantization schema to apply.
+ #[arg(long, value_enum)]
+ quantization: Quantization,
+ },
}
#[derive(Parser, Debug, Clone)]
@@ -144,6 +166,53 @@ fn run_ls(file: &std::path::PathBuf, format: Option<Format>, verbose: bool) -> R
Ok(())
}
+fn run_quantize(
+ in_file: std::path::PathBuf,
+ out_file: std::path::PathBuf,
+ q: Quantization,
+) -> Result<()> {
+ use candle_core::quantized::{gguf_file, k_quants, QTensor};
+ // Open the out file early so as to fail directly on missing directories etc.
+ let mut out_file = std::fs::File::create(out_file)?;
+ let mut in_ = std::fs::File::open(&in_file)?;
+ let content = gguf_file::Content::read(&mut in_)?;
+ println!("tensors: {}", content.tensor_infos.len());
+
+ let qtensors = content
+ .tensor_infos
+ .par_iter()
+ .map(|(name, _)| {
+ println!(" quantizing {name}");
+ let mut in_file = std::fs::File::open(&in_file)?;
+ let tensor = content.tensor(&mut in_file, name)?;
+ let tensor = tensor.dequantize(&Device::Cpu)?;
+ // TODO: Only quantize the linear weights, and quantize the final layer weights
+ // differently from the rest.
+ let tensor = match q {
+ Quantization::Q2k => QTensor::quantize::<k_quants::BlockQ2K>(&tensor)?,
+ Quantization::Q3k => QTensor::quantize::<k_quants::BlockQ3K>(&tensor)?,
+ Quantization::Q4k => QTensor::quantize::<k_quants::BlockQ4K>(&tensor)?,
+ Quantization::Q5k => QTensor::quantize::<k_quants::BlockQ5K>(&tensor)?,
+ Quantization::Q6k => QTensor::quantize::<k_quants::BlockQ6K>(&tensor)?,
+ Quantization::Q8k => QTensor::quantize::<k_quants::BlockQ8K>(&tensor)?,
+ };
+ Ok((name, tensor))
+ })
+ .collect::<Result<Vec<_>>>()?;
+ let qtensors = qtensors
+ .iter()
+ .map(|(k, v)| (k.as_str(), v))
+ .collect::<Vec<_>>();
+
+ let metadata = content
+ .metadata
+ .iter()
+ .map(|(k, v)| (k.as_str(), v))
+ .collect::<Vec<_>>();
+ gguf_file::write(&mut out_file, metadata.as_slice(), &qtensors)?;
+ Ok(())
+}
+
fn main() -> anyhow::Result<()> {
let args = Args::parse();
match args.command {
@@ -160,6 +229,11 @@ fn main() -> anyhow::Result<()> {
run_ls(file, format.clone(), verbose)?
}
}
+ Command::Quantize {
+ in_file,
+ out_file,
+ quantization,
+ } => run_quantize(in_file, out_file, quantization)?,
}
Ok(())
}