summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- candle-core/src/quantized/k_quants.rs 8
1 file changed, 5 insertions, 3 deletions
diff --git a/candle-core/src/quantized/k_quants.rs b/candle-core/src/quantized/k_quants.rs
index 65fd6a6e..a0fe455c 100644
--- a/candle-core/src/quantized/k_quants.rs
+++ b/candle-core/src/quantized/k_quants.rs
@@ -85,7 +85,7 @@ const _: () = assert!(std::mem::size_of::<BlockQ8_0>() == 34);
pub struct BlockQ8_1 {
pub(crate) d: f16,
pub(crate) s: f16,
- pub(crate) qs: [u8; QK8_1],
+ pub(crate) qs: [i8; QK8_1],
}
const _: () = assert!(std::mem::size_of::<BlockQ8_1>() == 36);
@@ -278,6 +278,7 @@ impl GgmlType for BlockQ4_1 {
}
sumf += sumi as f32 * f16::to_f32(xs.d) * f16::to_f32(ys.d)
+ + f16::to_f32(xs.m) * f16::to_f32(ys.s)
}
Ok(sumf)
}
@@ -471,6 +472,7 @@ impl GgmlType for BlockQ5_1 {
}
sumf += sumi as f32 * f16::to_f32(xs.d) * f16::to_f32(ys.d)
+ + f16::to_f32(xs.m) * f16::to_f32(ys.s)
}
Ok(sumf)
}
@@ -652,8 +654,8 @@ impl GgmlType for BlockQ8_1 {
for j in 0..Self::BLCK_SIZE / 2 {
let v0 = xs[j] * id;
let v1 = xs[j + Self::BLCK_SIZE / 2] * id;
- ys.qs[j] = f32::round(v0) as u8;
- ys.qs[j + Self::BLCK_SIZE / 2] = f32::round(v1) as u8;
+ ys.qs[j] = f32::round(v0) as i8;
+ ys.qs[j + Self::BLCK_SIZE / 2] = f32::round(v1) as i8;
sum += ys.qs[j] as i32 + ys.qs[j + Self::BLCK_SIZE / 2] as i32;
}
ys.s = f16::from_f32(sum as f32) * ys.d;