diff options
author | Laurent Mazare <laurent.mazare@gmail.com> | 2023-08-16 12:41:07 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-08-16 12:41:07 +0100 |
commit | 3071134788334c972d9e356f53887d2b2ff026b7 (patch) | |
tree | adbb58e3babee5d62fa6150bde4f9bb03770607c /candle-core | |
parent | fec87e86f50da78656a0fb28fc254390435fb3fd (diff) | |
download | candle-3071134788334c972d9e356f53887d2b2ff026b7.tar.gz candle-3071134788334c972d9e356f53887d2b2ff026b7.tar.bz2 candle-3071134788334c972d9e356f53887d2b2ff026b7.zip |
Get the ggml based llama to generate some text. (#464)
* Add more stats to the ggml example.
* Build a quantized model from the file content.
* Move the tensor retrieval in the main crate.
* Start adding the forward pass.
* Add more to the forward pass of the quantized llama.
* Apply the attention layers.
* Add the sampling loop.
* Get the sampling loop to work.
* Minor tweak.
* Add a quantize/dequantize test.
* Bugfix.
* Add a comment + swap the order.
* Bugfixes.
Diffstat (limited to 'candle-core')
-rw-r--r-- | candle-core/src/error.rs | 4 | ||||
-rw-r--r-- | candle-core/src/quantized/ggml_file.rs | 18 | ||||
-rw-r--r-- | candle-core/src/quantized/k_quants.rs | 13 | ||||
-rw-r--r-- | candle-core/src/quantized/mod.rs | 31 | ||||
-rw-r--r-- | candle-core/tests/quantized_tests.rs | 35 |
5 files changed, 75 insertions, 26 deletions
diff --git a/candle-core/src/error.rs b/candle-core/src/error.rs index c18b43c6..1cf20a84 100644 --- a/candle-core/src/error.rs +++ b/candle-core/src/error.rs @@ -210,6 +210,10 @@ impl Error { Self::Wrapped(Box::new(err)) } + pub fn msg(err: impl std::error::Error + Send + Sync + 'static) -> Self { + Self::Msg(err.to_string()) + } + pub fn bt(self) -> Self { let backtrace = std::backtrace::Backtrace::capture(); match backtrace.status() { diff --git a/candle-core/src/quantized/ggml_file.rs b/candle-core/src/quantized/ggml_file.rs index ee23cdde..7afb8670 100644 --- a/candle-core/src/quantized/ggml_file.rs +++ b/candle-core/src/quantized/ggml_file.rs @@ -3,6 +3,7 @@ use super::{k_quants, GgmlDType}; use crate::Result; use byteorder::{LittleEndian, ReadBytesExt}; +use std::collections::HashMap; // https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/llama.h#L37 #[derive(Debug, Clone, Copy, PartialEq, Eq)] @@ -163,6 +164,9 @@ fn read_one_tensor<R: std::io::Seek + std::io::Read>( let ggml_dtype = GgmlDType::from_u32(ggml_dtype)?; let mut dims = vec![0u32; n_dims as usize]; reader.read_u32_into::<LittleEndian>(&mut dims)?; + // The dimensions are stored in reverse order, see for example: + // https://github.com/ggerganov/llama.cpp/blob/b5ffb2849d23afe73647f68eec7b68187af09be6/convert.py#L969 + dims.reverse(); let mut name = vec![0u8; name_len as usize]; reader.read_exact(&mut name)?; let name = String::from_utf8_lossy(&name).into_owned(); @@ -174,7 +178,6 @@ fn read_one_tensor<R: std::io::Seek + std::io::Read>( let dims = dims.iter().map(|&u| u as usize).collect::<Vec<_>>(); let tensor_elems = dims.iter().product::<usize>(); let size_in_bytes = tensor_elems * ggml_dtype.type_size() / ggml_dtype.blck_size(); - println!("{name} {ggml_dtype:?} {dims:?}"); // TODO: Mmap version to avoid copying the data around? let mut raw_data = vec![0u8; size_in_bytes]; reader.read_exact(&mut raw_data)?; @@ -188,7 +191,7 @@ pub struct Content { pub magic: VersionedMagic, pub hparams: HParams, pub vocab: Vocab, - pub tensors: Vec<(String, super::QTensor)>, + pub tensors: HashMap<String, super::QTensor>, } impl Content { @@ -199,11 +202,11 @@ impl Content { let magic = VersionedMagic::read(reader)?; let hparams = HParams::read(reader)?; let vocab = Vocab::read(reader, hparams.n_vocab as usize)?; - let mut tensors = vec![]; + let mut tensors = HashMap::new(); while reader.stream_position()? != last_position { let (name, tensor) = read_one_tensor(reader, magic)?; - tensors.push((name, tensor)) + tensors.insert(name, tensor); } Ok(Self { magic, @@ -212,4 +215,11 @@ impl Content { tensors, }) } + + pub fn remove(&mut self, name: &str) -> Result<super::QTensor> { + match self.tensors.remove(name) { + None => crate::bail!("cannot find tensor with name '{name}'"), + Some(tensor) => Ok(tensor), + } + } } diff --git a/candle-core/src/quantized/k_quants.rs b/candle-core/src/quantized/k_quants.rs index 53f3dc65..f7611897 100644 --- a/candle-core/src/quantized/k_quants.rs +++ b/candle-core/src/quantized/k_quants.rs @@ -531,20 +531,21 @@ impl GgmlType for BlockQ4_0 { // https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/ggml.c#L1525 fn to_float(xs: &[Self], ys: &mut [f32]) -> Result<()> { let k = ys.len(); - if k % QK4_0 != 0 { - crate::bail!("dequantize_row_q4_0: {k} is not divisible by {QK4_0}") + let qk = Self::BLCK_SIZE; + if k % qk != 0 { + crate::bail!("dequantize_row_q4_0: {k} is not divisible by {qk}") } - let nb = k / QK4_0; + let nb = k / qk; for i in 0..nb { let d = xs[i].d.to_f32(); - for j in 0..(QK4_0 / 2) { + for j in 0..(qk / 2) { let x0 = (xs[i].qs[j] & 0x0F) as i16 - 8; let x1 = (xs[i].qs[j] >> 4) as i16 - 8; - ys[i * QK4_0 + j] = (x0 as f32) * d; - ys[i * QK4_0 + j + QK4_0 / 2] = (x1 as f32) * d; + ys[i * qk + j] = (x0 as f32) * d; + ys[i * qk + j + qk / 2] = (x1 as f32) * d; } } Ok(()) diff --git a/candle-core/src/quantized/mod.rs b/candle-core/src/quantized/mod.rs index 842b519b..52dddcf5 100644 --- a/candle-core/src/quantized/mod.rs +++ b/candle-core/src/quantized/mod.rs @@ -50,7 +50,8 @@ impl GgmlDType { Ok(dtype) } - fn type_size(&self) -> usize { + /// The type size for blocks in bytes. + pub fn type_size(&self) -> usize { use k_quants::*; match self { Self::F32 => 4, @@ -71,7 +72,8 @@ impl GgmlDType { } } - fn blck_size(&self) -> usize { + /// The block size, i.e. the number of elements stored in each block. + pub fn blck_size(&self) -> usize { match self { Self::F32 => 1, Self::F16 => 1, @@ -143,16 +145,15 @@ impl QTensor { } } -#[derive(Debug, Clone)] -pub struct QMatMul(std::sync::Arc<QTensor>); +pub struct QMatMul(std::sync::Arc<Box<dyn crate::CustomOp1>>); impl QMatMul { - pub fn new(qtensor: std::sync::Arc<QTensor>) -> Self { - Self(qtensor) + pub fn from_qtensor(qtensor: QTensor) -> Self { + Self(std::sync::Arc::new(Box::new(qtensor))) } } -impl crate::CustomOp1 for QMatMul { +impl crate::CustomOp1 for QTensor { fn name(&self) -> &'static str { "qmatmul" } @@ -166,17 +167,15 @@ impl crate::CustomOp1 for QMatMul { crate::bail!("input tensor is not contiguous {layout:?}") } let src_shape = layout.shape(); - let (k, n) = self.0.shape.dims2()?; + // self is transposed so n is first then k. + let (n, k) = self.shape.dims2()?; if src_shape.rank() < 2 { crate::bail!("input tensor has only one dimension {layout:?}") } let mut dst_shape = src_shape.dims().to_vec(); let last_k = dst_shape.pop().unwrap(); if last_k != k { - crate::bail!( - "input tensor {layout:?} incompatible with {:?}", - self.0.shape - ) + crate::bail!("input tensor {layout:?} incompatible with {:?}", self.shape) } dst_shape.push(n); let dst_shape = Shape::from(dst_shape); @@ -184,7 +183,7 @@ impl crate::CustomOp1 for QMatMul { let storage = &storage[layout.start_offset()..layout.start_offset() + src_shape.elem_count()]; let mut dst_storage = vec![0f32; dst_shape.elem_count()]; - self.0.matmul_t( + self.matmul_t( (dst_shape.elem_count() / n, k, n), storage, &mut dst_storage, @@ -192,3 +191,9 @@ impl crate::CustomOp1 for QMatMul { Ok((crate::CpuStorage::F32(dst_storage), dst_shape)) } } + +impl QMatMul { + pub fn forward(&self, xs: &Tensor) -> Result<Tensor> { + xs.custom_op1_arc(self.0.clone()) + } +} diff --git a/candle-core/tests/quantized_tests.rs b/candle-core/tests/quantized_tests.rs index 2c05abb4..babd71a8 100644 --- a/candle-core/tests/quantized_tests.rs +++ b/candle-core/tests/quantized_tests.rs @@ -30,9 +30,9 @@ fn quantized_matmul() -> Result<()> { ] ); - let qtensor = quantized::QTensor::new(rhs_t, (64, 4)); - let op = quantized::QMatMul::new(std::sync::Arc::new(qtensor)); - let res = tensor_lhs.custom_op1(op)?; + let qtensor = quantized::QTensor::new(rhs_t, (4, 64)); + let matmul = quantized::QMatMul::from_qtensor(qtensor); + let res = matmul.forward(&tensor_lhs)?; assert_eq!( res.to_vec2::<f32>()?, &[ @@ -44,3 +44,32 @@ fn quantized_matmul() -> Result<()> { Ok(()) } + +#[test] +fn quantize_q4_0() -> Result<()> { + use k_quants::BlockQ4_0; + + let src = (0..32 * 4).map(|v| v as f32).collect::<Vec<_>>(); + let mut dst = vec![0f32; 32 * 4]; + let mut quant = vec![BlockQ4_0::zeros(); 4]; + BlockQ4_0::from_float(&src, &mut quant)?; + BlockQ4_0::to_float(&quant, dst.as_mut_slice())?; + assert_eq!( + dst, + &[ + -0.0, -0.0, 3.875, 3.875, 3.875, 3.875, 7.75, 7.75, 7.75, 7.75, 11.625, 11.625, 11.625, + 11.625, 15.5, 15.5, 15.5, 15.5, 19.375, 19.375, 19.375, 19.375, 23.25, 23.25, 23.25, + 23.25, 27.125, 27.125, 27.125, 27.125, 31.0, 31.0, 31.5, 31.5, 31.5, 31.5, 39.375, + 39.375, 39.375, 39.375, 39.375, 39.375, 39.375, 39.375, 47.25, 47.25, 47.25, 47.25, + 47.25, 47.25, 47.25, 47.25, 55.125, 55.125, 55.125, 55.125, 55.125, 55.125, 55.125, + 55.125, 63.0, 63.0, 63.0, 63.0, 59.375, 59.375, 71.25, 71.25, 71.25, 71.25, 71.25, + 71.25, 71.25, 71.25, 71.25, 71.25, 71.25, 71.25, 83.125, 83.125, 83.125, 83.125, + 83.125, 83.125, 83.125, 83.125, 83.125, 83.125, 83.125, 83.125, 95.0, 95.0, 95.0, 95.0, + 95.0, 95.0, 95.25, 95.25, 95.25, 95.25, 95.25, 95.25, 95.25, 95.25, 111.125, 111.125, + 111.125, 111.125, 111.125, 111.125, 111.125, 111.125, 111.125, 111.125, 111.125, + 111.125, 111.125, 111.125, 111.125, 111.125, 127.0, 127.0, 127.0, 127.0, 127.0, 127.0, + 127.0, 127.0 + ] + ); + Ok(()) +} |