summary | refs | log | tree | commit | diff
path: root/candle-core/examples
diff options
context:
space:
mode:
author: Laurent Mazare <laurent.mazare@gmail.com> 2023-08-17 07:03:32 +0100
committer: GitHub <noreply@github.com> 2023-08-17 07:03:32 +0100
commit: 306c8eee7ac96d23d1d6a7a13b4311edc6c4f98a (patch)
tree: 517f84b8b462d7703311edc59b127edeb87f6c56 /candle-core/examples
parent: 098909de40b1478dfd6fba92f9907b8cd88984a6 (diff)
download: candle-306c8eee7ac96d23d1d6a7a13b4311edc6c4f98a.tar.gz
download: candle-306c8eee7ac96d23d1d6a7a13b4311edc6c4f98a.tar.bz2
download: candle-306c8eee7ac96d23d1d6a7a13b4311edc6c4f98a.zip
AVX version of the vecdot for q4_0. (#474)
* AVX version of the vecdot for q4_0.
* Tweak the AVX bits.
* Add a qmatmul benchmark.
* Fix the quantized test.
Diffstat (limited to 'candle-core/examples')
-rw-r--r-- candle-core/examples/cpu_benchmarks.rs 24
1 files changed, 24 insertions, 0 deletions
diff --git a/candle-core/examples/cpu_benchmarks.rs b/candle-core/examples/cpu_benchmarks.rs
index 6c40269f..ef27131e 100644
--- a/candle-core/examples/cpu_benchmarks.rs
+++ b/candle-core/examples/cpu_benchmarks.rs
@@ -5,6 +5,7 @@ extern crate intel_mkl_src;
#[cfg(feature = "accelerate")]
extern crate accelerate_src;
+use candle_core::quantized::GgmlType;
use candle_core::{Device, Result, Tensor, D};
use clap::{Parser, Subcommand};
@@ -81,6 +82,27 @@ impl Benchmark for Matmul {
const ITERS: usize = 100;
}
+// This benchmark is similar to:
+// https://github.com/ggerganov/llama.cpp/blob/master/examples/benchmark/benchmark-matmult.cpp
+struct QMatMul; // Marker type that selects the quantized matmul benchmark.
+impl Benchmark for QMatMul {
+ type PreProcessData = (candle_core::quantized::QMatMul, Tensor); // (quantized matmul op, input tensor)
+ type RunResult = Tensor;
+ fn preprocess() -> Result<Self::PreProcessData> { // Builds the (op, input) pair that `run_one` borrows.
+ let zeros = vec![candle_core::quantized::k_quants::BlockQ4_0::zeros(); 4096 * 11008 / 32]; // All-zero Q4_0 blocks; the `/ 32` assumes 32 values per block (TODO confirm against BlockQ4_0).
+ let mm = candle_core::quantized::QTensor::new(zeros, (4096, 11008)); // Wrap the blocks as a quantized tensor with shape (4096, 11008).
+ let mm = candle_core::quantized::QMatMul::from_qtensor(mm); // Shadow `mm` with the matmul op built from that tensor.
+ let arg = Tensor::randn(0f32, 1., (128, 11008), &Device::Cpu)?; // f32 CPU input of shape (128, 11008); presumably mean 0 / std 1 (confirm `randn` argument order).
+ Ok((mm, arg))
+ }
+
+ fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult> { // One benchmark step on the prepared data.
+ d.0.forward(&d.1) // d.0 is the QMatMul op, d.1 the input tensor.
+ }
+
+ const ITERS: usize = 100; // NOTE(review): presumably the harness's default iteration count; confirm in `Benchmark`/`run`.
+}
+
struct Softmax;
impl Benchmark for Softmax {
type PreProcessData = Tensor;
@@ -116,6 +138,7 @@ enum Task {
Conv1d,
Conv2d,
Matmul,
+ Qmatmul,
Softmax,
}
@@ -137,6 +160,7 @@ fn main() -> Result<()> {
Task::Conv2d => run::<Conv2d>(args.iters)?,
Task::Matmul => run::<Matmul>(args.iters)?,
Task::Softmax => run::<Softmax>(args.iters)?,
+ Task::Qmatmul => run::<QMatMul>(args.iters)?,
}
Ok(())
}