author | Laurent Mazare <laurent.mazare@gmail.com> | 2024-01-07 20:21:49 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-01-07 20:21:49 +0100 |
commit | 0eb90ed7831d451e2e420ecd158151b44dc5b2ba (patch) | |
tree | 19da338c2598680addd1e5f65b41d827b03a7ca9 /candle-core/tests/quantized_tests.rs | |
parent | 89b5a068585b73193d2004a7293d5b2fa6c30bfd (diff) | |
Simpler repro for the neon optimization issue + bugfix (#1544)
* Simpler repro for the neon optimization issue.
* Bugfix for q4k.
* Improve the fix, share the dot-prod bit.
* Clippy fixes.
* Fix for q6k.
* Also fix for q2k.
* Use the new shared dotprod.
* Add more testing.
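For context on the "simpler repro" and "add more testing" items above: the diff below feeds the matmul error check a second input pair, a linear ramp of values in [0, 1), which is more likely to trigger the accumulator overflow reported in #1526 than the cosine-based GGML test vector. The sketch below is not part of the commit; the names `ramp` and `main` and the size 4096 (standing in for `GGML_TEST_SIZE`) are illustrative assumptions. It only shows the shape of that input and the plain f32 reference dot product the quantized kernels are compared against.

```rust
// Illustrative sketch only (not code from the commit): `ramp` mirrors the new
// test input `(0..GGML_TEST_SIZE).map(|i| i as f32 / GGML_TEST_SIZE as f32)`,
// and `reference` mirrors what an f32 vec_dot_reference would compute.
fn ramp(n: usize) -> Vec<f32> {
    (0..n).map(|i| i as f32 / n as f32).collect()
}

fn main() {
    let n = 4096; // stand-in for GGML_TEST_SIZE; the exact value is an assumption
    let a = ramp(n);
    let b = ramp(n);
    // All products are non-negative, so partial sums grow monotonically; this is
    // what makes the ramp a better overflow repro than the +/- cosine vector.
    let reference: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
    println!("f32 reference dot product: {reference}");
}
```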
Diffstat (limited to 'candle-core/tests/quantized_tests.rs')
-rw-r--r-- | candle-core/tests/quantized_tests.rs | 57 |
1 file changed, 41 insertions, 16 deletions
```diff
diff --git a/candle-core/tests/quantized_tests.rs b/candle-core/tests/quantized_tests.rs
index 716cca8d..e7a2ea7f 100644
--- a/candle-core/tests/quantized_tests.rs
+++ b/candle-core/tests/quantized_tests.rs
@@ -1,4 +1,5 @@
 use candle_core::{
+    bail,
     quantized::{self, GgmlDType},
     test_utils::to_vec2_round,
     Device, Module, Result, Tensor,
@@ -265,7 +266,8 @@ fn compare_with_error(values: &[f32], expected: &[f32], tolerance: f32) {
     }
 }
 
-/// Creates a vector simillarly to the one used in GGML unit tests: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-quantize-fns.cpp#L26-L30
+/// Creates a vector similar to the ones used in GGML unit tests:
+/// https://github.com/ggerganov/llama.cpp/blob/master/tests/test-quantize-fns.cpp#L26-L30
 fn create_ggml_like_vector(offset: f32) -> Vec<f32> {
     (0..GGML_TEST_SIZE)
         .map(|i| 0.1 + 2.0 * (i as f32 + offset).cos())
@@ -284,14 +286,15 @@ fn calculate_rmse(a: &[f32], b: &[f32]) -> f32 {
     sum / a.len() as f32
 }
 
-/// Mirrores the GGML quanitzation unit test: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-quantize-fns.cpp#L43-L50
+/// Similar to the GGML quantization unit test:
+/// https://github.com/ggerganov/llama.cpp/blob/master/tests/test-quantize-fns.cpp#L43-L50
 fn ggml_quantization_error_test<T: GgmlType>(max_error: f32) -> Result<()> {
     let src = create_ggml_like_vector(0.0);
     let mut dst = vec![0.0; GGML_TEST_SIZE];
     let _quant = quantize_roundtrip::<T>(src.as_slice(), dst.as_mut_slice())?;
     let error = calculate_rmse(src.as_slice(), dst.as_slice());
     if error > max_error {
-        candle_core::bail!(
+        bail!(
             "Quantization error {} exceeds max error {}",
             error,
             max_error
@@ -487,54 +490,66 @@ fn ggml_reference_matmul_error(dtype: GgmlDType) -> Result<f32> {
         GgmlDType::Q5K => 0.000740,
         GgmlDType::Q6K => 0.000952,
         GgmlDType::Q4_0 => 0.001143,
-        GgmlDType::Q4_1 => 0.007784,
+        GgmlDType::Q4_1 => 0.008,
         GgmlDType::Q5_0 => 0.001353,
-        GgmlDType::Q5_1 => 0.001363,
+        GgmlDType::Q5_1 => 0.00149,
         GgmlDType::Q8_0 => 0.000092,
         // Not from the ggml repo.
         GgmlDType::Q8K => 0.00065,
-        _ => candle_core::bail!("No GGML results for quantization type {dtype:?}",),
+        _ => bail!("No GGML results for quantization type {dtype:?}",),
     };
     Ok(err)
 }
 
-/// Mirrores the GGML matmul unit test: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-quantize-fns.cpp#L76-L91
+/// Similar to the GGML matmul unit test:
+/// https://github.com/ggerganov/llama.cpp/blob/master/tests/test-quantize-fns.cpp#L76-L91
 fn ggml_matmul_error_test<T: GgmlType>() -> Result<()> {
     let a = create_ggml_like_vector(0.0);
     let b = create_ggml_like_vector(1.0);
+    ggml_matmul_error_test_::<T>(a.as_slice(), b.as_slice(), 1.0)?;
+    // Another example that is more likely to trigger the overflow reported in #1526
+    let a = (0..GGML_TEST_SIZE)
+        .map(|i| i as f32 / GGML_TEST_SIZE as f32)
+        .collect::<Vec<_>>();
+    let b = (0..GGML_TEST_SIZE)
+        .map(|i| i as f32 / GGML_TEST_SIZE as f32)
+        .collect::<Vec<_>>();
+    ggml_matmul_error_test_::<T>(a.as_slice(), b.as_slice(), 2.0)?;
+    Ok(())
+}
+
+fn ggml_matmul_error_test_<T: GgmlType>(a: &[f32], b: &[f32], err_m: f32) -> Result<()> {
     let length = a.len();
 
     let mut a_quant = vec![T::zeros(); length / T::BLCK_SIZE];
     let mut b_quant = vec![T::VecDotType::zeros(); length / T::VecDotType::BLCK_SIZE];
-    T::from_float(&a, &mut a_quant)?;
-    T::VecDotType::from_float(&b, &mut b_quant)?;
+    T::from_float(a, &mut a_quant)?;
+    T::VecDotType::from_float(b, &mut b_quant)?;
 
     let result = T::vec_dot(length, &a_quant, &b_quant)?;
     let result_unopt = T::vec_dot_unopt(length, &a_quant, &b_quant)?;
-    let reference_result = vec_dot_reference(&a, &b);
+    let reference_result = vec_dot_reference(a, b);
 
     if (result - result_unopt).abs() / length as f32 > 1e-6 {
-        candle_core::bail!(
+        bail!(
             "the opt and unopt vec-dot returned different values, opt {result}, unopt {result_unopt}"
         )
     }
 
     let error = (result - reference_result).abs() / length as f32;
 
-    let ggml_error = ggml_reference_matmul_error(T::DTYPE)?;
+    let ggml_error = ggml_reference_matmul_error(T::DTYPE)? * err_m;
 
     if !error.is_finite() || error > GGML_MAX_DOT_PRODUCT_ERROR {
-        candle_core::bail!(
-            "Dot product error {error} exceeds max error {GGML_MAX_DOT_PRODUCT_ERROR}",
-        );
+        bail!("Dot product error {error} exceeds max error {GGML_MAX_DOT_PRODUCT_ERROR}",);
    }
 
     // We diverge slightly due to different rounding behavior / f16 to f32 conversions in GGML
     // => we use a slightly higher error threshold
     const ERROR_LENIENCY: f32 = 0.00001;
     if error - ERROR_LENIENCY > ggml_error {
-        candle_core::bail!(
+        bail!(
             "Dot product error {} exceeds ggml reference error {}",
             error,
             ggml_error
@@ -543,6 +558,16 @@ fn ggml_matmul_error_test<T: GgmlType>() -> Result<()> {
     Ok(())
 }
 
+#[test]
+fn quantized_mm() -> Result<()> {
+    ggml_matmul_error_test::<k_quants::BlockQ4_0>()?;
+    ggml_matmul_error_test::<k_quants::BlockQ4_1>()?;
+    ggml_matmul_error_test::<k_quants::BlockQ5_0>()?;
+    ggml_matmul_error_test::<k_quants::BlockQ5_1>()?;
+    ggml_matmul_error_test::<k_quants::BlockQ8_0>()?;
+    Ok(())
+}
+
 /// generates random tensors of size `m x k` and `n x k` and calculates their expected matrix multiplication result.
 fn get_random_tensors(
     m: usize,
```
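The "share the dot-prod bit" / "use the new shared dotprod" bullets refer to the NEON kernels in candle-core itself, which this test file only exercises indirectly. As a purely illustrative sketch of the overflow class such kernels have to guard against (reading #1526 as a narrow-accumulator overflow is an assumption here, not something this diff states), the scalar example below shows that the sum of just two i8×i8 products can already exceed `i16::MAX`, so products are better widened to i32 before accumulation.

```rust
// Hedged illustration, not candle's NEON code: demonstrates why 8-bit quantized
// dot products are widened to 32-bit accumulators instead of being summed in i16.
fn main() {
    let (a, b) = (-128i8, -128i8);
    let product = i16::from(a) * i16::from(b); // 16384, still fits in i16
    // Adding just two such products overflows i16 (32768 > i16::MAX = 32767)...
    let narrow = product.checked_add(product); // None
    // ...while widening to i32 before summing is safe for a whole 32-element block.
    let wide: i32 = (0..32).map(|_| i32::from(product)).sum(); // 524288
    println!("narrow sum: {narrow:?}, widened 32-element sum: {wide}");
}
```

Running `cargo test quantized_mm` from the candle-core crate should select just the new test added at the end of the diff, since cargo's test filter matches on the test name.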