author | Laurent Mazare <laurent.mazare@gmail.com> | 2023-08-27 11:35:19 +0100
---|---|---
committer | GitHub <noreply@github.com> | 2023-08-27 11:35:19 +0100
commit | 7151f2cf63b312049fa53713ebf6f0f174cf2fc9 |
tree | 474f3d189707726e8b0b21b0856772f1cca80ebf | candle-core/examples
parent | 6e485f2deb65bf21d21c85b4913149e7d2c65c6b |
Add the quantize command. (#624)
* Add the quantize command.
* Bugfix for writing gguf files.
* And add a comment.
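
Based on the clap derive in the diff below, the new subcommand takes an input gguf file, an output path, and a `--quantization` value. A rough invocation sketch follows; the file names are placeholders, the kebab-case values (`q2k` … `q8k`) assume clap's default `ValueEnum` renaming, and the exact cargo flags depend on where in the workspace you run it:

```console
$ cargo run -p candle-core --example tensor-tools --release -- \
    quantize llama-f32.gguf llama-q4k.gguf --quantization q4k
```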
Diffstat (limited to 'candle-core/examples')
-rw-r--r-- | candle-core/examples/tensor-tools.rs | 76
1 file changed, 75 insertions(+), 1 deletion(-)
```diff
diff --git a/candle-core/examples/tensor-tools.rs b/candle-core/examples/tensor-tools.rs
index 67e6aa1e..f45cbc7e 100644
--- a/candle-core/examples/tensor-tools.rs
+++ b/candle-core/examples/tensor-tools.rs
@@ -1,5 +1,16 @@
-use candle_core::Result;
+use candle_core::{Device, Result};
 use clap::{Parser, Subcommand, ValueEnum};
+use rayon::prelude::*;
+
+#[derive(ValueEnum, Debug, Clone)]
+enum Quantization {
+    Q2k,
+    Q3k,
+    Q4k,
+    Q5k,
+    Q6k,
+    Q8k,
+}
 
 #[derive(ValueEnum, Debug, Clone)]
 enum Format {
@@ -41,6 +52,17 @@ enum Command {
         #[arg(short, long)]
         verbose: bool,
     },
+
+    Quantize {
+        /// The input file, in gguf format.
+        in_file: std::path::PathBuf,
+        /// The output file, in gguf format.
+        out_file: std::path::PathBuf,
+
+        /// The quantization schema to apply.
+        #[arg(long, value_enum)]
+        quantization: Quantization,
+    },
 }
 
 #[derive(Parser, Debug, Clone)]
@@ -144,6 +166,53 @@ fn run_ls(file: &std::path::PathBuf, format: Option<Format>, verbose: bool) -> R
     Ok(())
 }
 
+fn run_quantize(
+    in_file: std::path::PathBuf,
+    out_file: std::path::PathBuf,
+    q: Quantization,
+) -> Result<()> {
+    use candle_core::quantized::{gguf_file, k_quants, QTensor};
+    // Open the out file early so as to fail directly on missing directories etc.
+    let mut out_file = std::fs::File::create(out_file)?;
+    let mut in_ = std::fs::File::open(&in_file)?;
+    let content = gguf_file::Content::read(&mut in_)?;
+    println!("tensors: {}", content.tensor_infos.len());
+
+    let qtensors = content
+        .tensor_infos
+        .par_iter()
+        .map(|(name, _)| {
+            println!(" quantizing {name}");
+            let mut in_file = std::fs::File::open(&in_file)?;
+            let tensor = content.tensor(&mut in_file, name)?;
+            let tensor = tensor.dequantize(&Device::Cpu)?;
+            // TODO: Only quantize the linear weights, and quantize the final layer weights
+            // differently from the rest.
+            let tensor = match q {
+                Quantization::Q2k => QTensor::quantize::<k_quants::BlockQ2K>(&tensor)?,
+                Quantization::Q3k => QTensor::quantize::<k_quants::BlockQ3K>(&tensor)?,
+                Quantization::Q4k => QTensor::quantize::<k_quants::BlockQ4K>(&tensor)?,
+                Quantization::Q5k => QTensor::quantize::<k_quants::BlockQ5K>(&tensor)?,
+                Quantization::Q6k => QTensor::quantize::<k_quants::BlockQ6K>(&tensor)?,
+                Quantization::Q8k => QTensor::quantize::<k_quants::BlockQ8K>(&tensor)?,
+            };
+            Ok((name, tensor))
+        })
+        .collect::<Result<Vec<_>>>()?;
+    let qtensors = qtensors
+        .iter()
+        .map(|(k, v)| (k.as_str(), v))
+        .collect::<Vec<_>>();
+
+    let metadata = content
+        .metadata
+        .iter()
+        .map(|(k, v)| (k.as_str(), v))
+        .collect::<Vec<_>>();
+    gguf_file::write(&mut out_file, metadata.as_slice(), &qtensors)?;
+    Ok(())
+}
+
 fn main() -> anyhow::Result<()> {
     let args = Args::parse();
     match args.command {
@@ -160,6 +229,11 @@ fn main() -> anyhow::Result<()> {
                 run_ls(file, format.clone(), verbose)?
             }
         }
+        Command::Quantize {
+            in_file,
+            out_file,
+            quantization,
+        } => run_quantize(in_file, out_file, quantization)?,
     }
     Ok(())
 }
```
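
Two design points in `run_quantize` stand out. The output file is created before any tensor is read so that a bad output path fails immediately rather than after a long quantization run, as the comment in the diff notes. Each rayon worker also opens its own handle on the input file instead of sharing `in_`, presumably because reading a tensor seeks within the file and a shared handle would race across threads. The TODO records a known simplification: every tensor is quantized with the same scheme, whereas only the linear-layer weights should eventually be quantized, with the final layer handled differently from the rest.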