use crate::{Device, Result, Shape, Tensor};

#[cfg(target_feature = "avx")]
pub mod avx;
pub mod ggml_file;
pub mod k_quants;

pub use k_quants::GgmlType;

/// A quantized tensor: type-erased storage for the quantized blocks together
/// with the logical shape of the tensor.
pub struct QTensor {
    data: Box<dyn QuantizedType>,
    shape: Shape,
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GgmlDType {
    F32,
    F16,
    Q4_0,
    Q4_1,
    Q5_0,
    Q5_1,
    Q8_0,
    Q8_1,
    Q2K,
    Q3K,
    Q4K,
    Q5K,
    Q6K,
    Q8K,
}

impl GgmlDType {
    pub(crate) fn from_u32(u: u32) -> Result<Self> {
        let dtype = match u {
            0 => Self::F32,
            1 => Self::F16,
            2 => Self::Q4_0,
            3 => Self::Q4_1,
            6 => Self::Q5_0,
            7 => Self::Q5_1,
            8 => Self::Q8_0,
            9 => Self::Q8_1,
            10 => Self::Q2K,
            11 => Self::Q3K,
            12 => Self::Q4K,
            13 => Self::Q5K,
            14 => Self::Q6K,
            15 => Self::Q8K,
            _ => crate::bail!("unknown dtype for tensor {u}"),
        };
        Ok(dtype)
    }

    /// The type size for blocks in bytes.
    pub fn type_size(&self) -> usize {
        use k_quants::*;
        match self {
            Self::F32 => 4,
            Self::F16 => 2,
            Self::Q4_0 => std::mem::size_of::<BlockQ4_0>(),
            Self::Q4_1 => std::mem::size_of::<BlockQ4_1>(),
            Self::Q5_0 => std::mem::size_of::<BlockQ5_0>(),
            Self::Q5_1 => std::mem::size_of::<BlockQ5_1>(),
            // https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/ggml.c#L932
            Self::Q8_0 => std::mem::size_of::<BlockQ8_0>(),
            Self::Q8_1 => std::mem::size_of::<BlockQ8_1>(),
            Self::Q2K => std::mem::size_of::<BlockQ2K>(),
            Self::Q3K => std::mem::size_of::<BlockQ3K>(),
            Self::Q4K => std::mem::size_of::<BlockQ4K>(),
            Self::Q5K => std::mem::size_of::<BlockQ5K>(),
            Self::Q6K => std::mem::size_of::<BlockQ6K>(),
            Self::Q8K => std::mem::size_of::<BlockQ8K>(),
        }
    }

    /// The block size, i.e. the number of elements stored in each block.
    pub fn blck_size(&self) -> usize {
        match self {
            Self::F32 => 1,
            Self::F16 => 1,
            Self::Q4_0 => k_quants::QK4_0,
            Self::Q4_1 => k_quants::QK4_1,
            Self::Q5_0 => k_quants::QK5_0,
            Self::Q5_1 => k_quants::QK5_1,
            Self::Q8_0 => k_quants::QK8_0,
            Self::Q8_1 => k_quants::QK8_1,
            Self::Q2K | Self::Q3K | Self::Q4K | Self::Q5K | Self::Q6K | Self::Q8K => {
                k_quants::QK_K
            }
        }
    }
}

// A version of GgmlType without `vec_dot` so that it can be dyn boxed.
pub trait QuantizedType: Send + Sync {
    fn dtype(&self) -> GgmlDType;
    fn matmul_t(&self, mkn: (usize, usize, usize), lhs: &[f32], dst: &mut [f32]) -> Result<()>;
    fn to_float(&self, ys: &mut [f32]) -> Result<()>;
}

impl<T: k_quants::GgmlType + Send + Sync> QuantizedType for Vec<T> {
    fn matmul_t(&self, mkn: (usize, usize, usize), lhs: &[f32], dst: &mut [f32]) -> Result<()> {
        k_quants::matmul(mkn, lhs, self.as_slice(), dst)
    }

    fn dtype(&self) -> GgmlDType {
        T::DTYPE
    }

    fn to_float(&self, ys: &mut [f32]) -> Result<()> {
        T::to_float(self.as_slice(), ys)
    }
}

impl std::fmt::Debug for QTensor {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(f, "QTensor[{:?}; {:?}]", self.shape, self.dtype())
    }
}

fn check_shape<T: k_quants::GgmlType>(shape: &Shape) -> Result<()> {
    let dims = shape.dims();
    if dims.is_empty() {
        crate::bail!("scalar tensor cannot be quantized {shape:?}")
    }
    if dims[dims.len() - 1] % T::BLCK_SIZE != 0 {
        crate::bail!(
            "quantized tensor must have its last dim divisible by block size {shape:?} {}",
            T::BLCK_SIZE
        )
    }
    Ok(())
}

impl QTensor {
    pub fn new<S: Into<Shape>, T: k_quants::GgmlType + Send + Sync + 'static>(
        data: Vec<T>,
        shape: S,
    ) -> Result<Self> {
        let shape = shape.into();
        check_shape::<T>(&shape)?;
        Ok(Self {
            data: Box::new(data),
            shape,
        })
    }

    /// Quantizes `src` into blocks of type `T`, going through an f32
    /// representation of the data.
    pub fn quantize<T: k_quants::GgmlType + Send + Sync + 'static>(src: &Tensor) -> Result<Self> {
        let shape = src.shape();
        check_shape::<T>(shape)?;
        let src = src
            .to_dtype(crate::DType::F32)?
            .flatten_all()?
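            // The conversion goes through a flat `Vec<f32>` on the host before
            // `T::from_float` packs the values into quantized blocks.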
            .to_vec1::<f32>()?;
        if src.len() % T::BLCK_SIZE != 0 {
            crate::bail!(
                "tensor size ({shape:?}) is not divisible by block size {}",
                T::BLCK_SIZE
            )
        }
        let mut data = vec![T::zeros(); src.len() / T::BLCK_SIZE];
        T::from_float(&src, &mut data)?;
        Ok(Self {
            data: Box::new(data),
            shape: shape.clone(),
        })
    }

    pub fn dtype(&self) -> GgmlDType {
        self.data.dtype()
    }

    pub fn shape(&self) -> &Shape {
        &self.shape
    }

    /// Converts the quantized data back to a f32 tensor on the given device.
    pub fn dequantize(&self, device: &Device) -> Result<Tensor> {
        let mut f32_data = vec![0f32; self.shape.elem_count()];
        self.data.to_float(&mut f32_data)?;
        Tensor::from_vec(f32_data, &self.shape, device)
    }

    pub fn matmul_t(&self, mkn: (usize, usize, usize), lhs: &[f32], dst: &mut [f32]) -> Result<()> {
        self.data.matmul_t(mkn, lhs, dst)
    }
}

/// A matmul layer wrapping a quantized weight tensor, stored transposed.
#[derive(Debug)]
pub struct QMatMul(QTensor);

impl QMatMul {
    pub fn from_qtensor(qtensor: QTensor) -> Self {
        Self(qtensor)
    }
}

impl crate::CustomOp1 for QTensor {
    fn name(&self) -> &'static str {
        "qmatmul"
    }

    fn cpu_fwd(
        &self,
        storage: &crate::CpuStorage,
        layout: &crate::Layout,
    ) -> Result<(crate::CpuStorage, Shape)> {
        if !layout.is_contiguous() {
            crate::bail!("input tensor is not contiguous {layout:?}")
        }
        let src_shape = layout.shape();
        // self is transposed so n is first then k.
        let (n, k) = self.shape.dims2()?;
        if src_shape.rank() < 2 {
            crate::bail!("input tensor has only one dimension {layout:?}")
        }
        let mut dst_shape = src_shape.dims().to_vec();
        let last_k = dst_shape.pop().unwrap();
        if last_k != k {
            crate::bail!("input tensor {layout:?} incompatible with {:?}", self.shape)
        }
        dst_shape.push(n);
        let dst_shape = Shape::from(dst_shape);
        let storage = storage.as_slice::<f32>()?;
        let storage =
            &storage[layout.start_offset()..layout.start_offset() + src_shape.elem_count()];
        let mut dst_storage = vec![0f32; dst_shape.elem_count()];
        self.matmul_t(
            (dst_shape.elem_count() / n, k, n),
            storage,
            &mut dst_storage,
        )?;
        Ok((crate::CpuStorage::F32(dst_storage), dst_shape))
    }
}

impl QMatMul {
    pub fn forward(&self, xs: &Tensor) -> Result<Tensor> {
        xs.apply_op1_no_bwd(&self.0)
    }
}
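
// A minimal usage sketch, not part of the original module: it quantizes an
// f32 tensor into Q4_0 blocks, round-trips it through `dequantize`, and runs
// a `QMatMul` forward pass. The CPU device and the shapes are illustrative
// assumptions; the only hard constraint (enforced by `check_shape`) is that
// the last dim is a multiple of the block size (32 for both Q4_0 and Q8_0).
#[cfg(test)]
mod usage_sketch {
    use super::*;

    #[test]
    fn quantize_and_matmul() -> Result<()> {
        let dev = Device::Cpu;
        // Quantize a (4, 32) f32 tensor; 32 is divisible by QK4_0 == 32.
        let xs = Tensor::zeros((4, 32), crate::DType::F32, &dev)?;
        let q = QTensor::quantize::<k_quants::BlockQ4_0>(&xs)?;
        assert_eq!(q.dtype(), GgmlDType::Q4_0);
        // Dequantizing yields an f32 tensor with the original shape.
        let ys = q.dequantize(&dev)?;
        assert_eq!(ys.shape().dims(), &[4, 32]);

        // QMatMul holds a transposed (n, k) weight, so a (b, k) input maps
        // to a (b, n) output.
        let w = Tensor::zeros((6, 32), crate::DType::F32, &dev)?;
        let mm = QMatMul::from_qtensor(QTensor::quantize::<k_quants::BlockQ8_0>(&w)?);
        let out = mm.forward(&xs)?;
        assert_eq!(out.shape().dims(), &[4, 6]);
        Ok(())
    }
}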