Get the ggml based llama to generate some text. (#464)

* Add more stats to the ggml example.
* Build a quantized model from the file content.
* Move the tensor retrieval in the main crate.
* Start adding the forward pass.
* Add more to the forward pass of the quantized llama.
* Apply the attention layers.
* Add the sampling loop.
* Get the sampling loop to work.
* Minor tweak.
* Add a quantize/dequantize test.
* Bugfix.
* Add a comment + swap the order.
* Bugfixes.
author: Laurent Mazare <laurent.mazare@gmail.com> 2023-08-16 12:41:07 +0100
committer: GitHub <noreply@github.com> 2023-08-16 12:41:07 +0100
commit: 3071134788334c972d9e356f53887d2b2ff026b7 (patch)
tree: adbb58e3babee5d62fa6150bde4f9bb03770607c /candle-core/src/quantized/k_quants.rs
parent: fec87e86f50da78656a0fb28fc254390435fb3fd (diff)
download: candle-3071134788334c972d9e356f53887d2b2ff026b7.tar.gz
candle-3071134788334c972d9e356f53887d2b2ff026b7.tar.bz2
candle-3071134788334c972d9e356f53887d2b2ff026b7.zip
1 files changed, 7 insertions, 6 deletions
diff --git a/candle-core/src/quantized/k_quants.rs b/candle-core/src/quantized/k_quants.rs
index 53f3dc65..f7611897 100644
--- a/candle-core/src/quantized/k_quants.rs
+++ b/candle-core/src/quantized/k_quants.rs
@@ -531,20 +531,21 @@ impl GgmlType for BlockQ4_0 {
     // https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/ggml.c#L1525
     fn to_float(xs: &[Self], ys: &mut [f32]) -> Result<()> {
         let k = ys.len();
-        if k % QK4_0 != 0 {
-            crate::bail!("dequantize_row_q4_0: {k} is not divisible by {QK4_0}")
+        let qk = Self::BLCK_SIZE;
+        if k % qk != 0 {
+            crate::bail!("dequantize_row_q4_0: {k} is not divisible by {qk}")
         }
 
-        let nb = k / QK4_0;
+        let nb = k / qk;
         for i in 0..nb {
             let d = xs[i].d.to_f32();
 
-            for j in 0..(QK4_0 / 2) {
+            for j in 0..(qk / 2) {
                 let x0 = (xs[i].qs[j] & 0x0F) as i16 - 8;
                 let x1 = (xs[i].qs[j] >> 4) as i16 - 8;
 
-                ys[i * QK4_0 + j] = (x0 as f32) * d;
-                ys[i * QK4_0 + j + QK4_0 / 2] = (x1 as f32) * d;
+                ys[i * qk + j] = (x0 as f32) * d;
+                ys[i * qk + j + qk / 2] = (x1 as f32) * d;
             }
         }
         Ok(())
author	Laurent Mazare <laurent.mazare@gmail.com>	2023-08-16 12:41:07 +0100
committer	GitHub <noreply@github.com>	2023-08-16 12:41:07 +0100
commit	3071134788334c972d9e356f53887d2b2ff026b7 (patch)
tree	adbb58e3babee5d62fa6150bde4f9bb03770607c /candle-core/src/quantized/k_quants.rs
parent	fec87e86f50da78656a0fb28fc254390435fb3fd (diff)
download	candle-3071134788334c972d9e356f53887d2b2ff026b7.tar.gz candle-3071134788334c972d9e356f53887d2b2ff026b7.tar.bz2 candle-3071134788334c972d9e356f53887d2b2ff026b7.zip