diff options
Diffstat (limited to 'candle-core/examples/cpu_benchmarks.rs')
-rw-r--r-- | candle-core/examples/cpu_benchmarks.rs | 166 |
1 file changed, 0 insertions, 166 deletions
diff --git a/candle-core/examples/cpu_benchmarks.rs b/candle-core/examples/cpu_benchmarks.rs deleted file mode 100644 index 13175ac1..00000000 --- a/candle-core/examples/cpu_benchmarks.rs +++ /dev/null @@ -1,166 +0,0 @@ -/// This example contains some simple benchmarks so that it's easy to run them in perf etc. -#[cfg(feature = "mkl")] -extern crate intel_mkl_src; - -#[cfg(feature = "accelerate")] -extern crate accelerate_src; - -use candle_core::quantized::GgmlType; -use candle_core::{Device, Result, Tensor, D}; -use clap::{Parser, Subcommand}; - -fn softmax<D: candle_core::shape::Dim>(xs: &Tensor, dim: D) -> Result<Tensor> { - let dim = dim.to_index(xs.shape(), "softmax")?; - let max = xs.max_keepdim(dim)?; - let diff = xs.broadcast_sub(&max)?; - let num = diff.exp()?; - let den = num.sum_keepdim(dim)?; - num.broadcast_div(&den) -} - -trait Benchmark { - type PreProcessData; - type RunResult; - - fn preprocess() -> Result<Self::PreProcessData>; - fn run_one(_: &Self::PreProcessData) -> Result<Self::RunResult>; - - const ITERS: usize; -} - -// Conv1d example as used in whisper. -struct Conv1d; -impl Benchmark for Conv1d { - type PreProcessData = (Tensor, Tensor); - type RunResult = Tensor; - fn preprocess() -> Result<Self::PreProcessData> { - let inp = Tensor::randn(0f32, 1., (1, 384, 3000), &Device::Cpu)?; - let w = Tensor::randn(0f32, 1., (384, 384, 3), &Device::Cpu)?; - Ok((inp, w)) - } - - fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult> { - d.0.conv1d(&d.1, 0, 1, 1, 1) - } - - const ITERS: usize = 5; -} - -// Conv2d example as used in stable-diffusion. 
-struct Conv2d; -impl Benchmark for Conv2d { - type PreProcessData = (Tensor, Tensor); - type RunResult = Tensor; - - fn preprocess() -> Result<Self::PreProcessData> { - let inp = Tensor::randn(0f32, 1., (2, 320, 96, 96), &Device::Cpu)?; - let w = Tensor::randn(0f32, 1., (320, 320, 3, 3), &Device::Cpu)?; - Ok((inp, w)) - } - - fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult> { - d.0.conv2d(&d.1, 0, 1, 1, 1) - } - - const ITERS: usize = 1; -} - -struct Matmul; -impl Benchmark for Matmul { - type PreProcessData = (Tensor, Tensor); - type RunResult = Tensor; - fn preprocess() -> Result<Self::PreProcessData> { - let lhs = Tensor::randn(0f32, 1., (1024, 1024), &Device::Cpu)?; - let rhs = Tensor::randn(0f32, 1., (1024, 1024), &Device::Cpu)?; - Ok((lhs, rhs)) - } - - fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult> { - d.0.matmul(&d.1) - } - - const ITERS: usize = 100; -} - -// This benchmark is similar to: -// https://github.com/ggerganov/llama.cpp/blob/master/examples/benchmark/benchmark-matmult.cpp -struct QMatMul; -impl Benchmark for QMatMul { - type PreProcessData = (candle_core::quantized::QMatMul, Tensor); - type RunResult = Tensor; - fn preprocess() -> Result<Self::PreProcessData> { - let zeros = vec![candle_core::quantized::k_quants::BlockQ4_0::zeros(); 4096 * 11008 / 32]; - let mm = candle_core::quantized::QTensor::new(zeros, (4096, 11008))?; - let mm = candle_core::quantized::QMatMul::from_qtensor(mm); - let arg = Tensor::randn(0f32, 1., (128, 11008), &Device::Cpu)?; - Ok((mm, arg)) - } - - fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult> { - d.0.forward(&d.1) - } - - const ITERS: usize = 100; -} - -struct Softmax; -impl Benchmark for Softmax { - type PreProcessData = Tensor; - type RunResult = Tensor; - fn preprocess() -> Result<Self::PreProcessData> { - // Typical whisper tiny size. 
- let x = Tensor::randn(0f32, 1., (1, 6, 200, 1500), &Device::Cpu)?; - Ok(x) - } - - fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult> { - softmax(d, D::Minus1) - } - - const ITERS: usize = 100; -} - -fn run<B: Benchmark>(iters: Option<usize>) -> Result<()> { - use std::hint::black_box; - - let iters = iters.unwrap_or(B::ITERS); - let d = B::preprocess()?; - let start = std::time::Instant::now(); - for _iter in 0..iters { - let _res = black_box(B::run_one(black_box(&d))?); - } - println!("{:?}", start.elapsed() / iters as u32); - Ok(()) -} - -#[derive(Subcommand, Debug, Clone)] -enum Task { - Conv1d, - Conv2d, - Matmul, - Qmatmul, - Softmax, -} - -#[derive(Parser, Debug)] -#[command(author, version, about, long_about = None)] -pub struct Args { - /// The benchmark to be run. - #[command(subcommand)] - task: Task, - - #[arg(long)] - iters: Option<usize>, -} - -fn main() -> Result<()> { - let args = Args::parse(); - match args.task { - Task::Conv1d => run::<Conv1d>(args.iters)?, - Task::Conv2d => run::<Conv2d>(args.iters)?, - Task::Matmul => run::<Matmul>(args.iters)?, - Task::Softmax => run::<Softmax>(args.iters)?, - Task::Qmatmul => run::<QMatMul>(args.iters)?, - } - Ok(()) -} |