summaryrefslogtreecommitdiff
path: root/candle-core/examples/cpu_benchmarks.rs
diff options
context:
space:
mode:
Diffstat (limited to 'candle-core/examples/cpu_benchmarks.rs')
-rw-r--r--candle-core/examples/cpu_benchmarks.rs166
1 files changed, 0 insertions, 166 deletions
diff --git a/candle-core/examples/cpu_benchmarks.rs b/candle-core/examples/cpu_benchmarks.rs
deleted file mode 100644
index 13175ac1..00000000
--- a/candle-core/examples/cpu_benchmarks.rs
+++ /dev/null
@@ -1,166 +0,0 @@
-/// This example contains some simple benchmarks so that it's easy to run them in perf etc.
-#[cfg(feature = "mkl")]
-extern crate intel_mkl_src;
-
-#[cfg(feature = "accelerate")]
-extern crate accelerate_src;
-
-use candle_core::quantized::GgmlType;
-use candle_core::{Device, Result, Tensor, D};
-use clap::{Parser, Subcommand};
-
-fn softmax<D: candle_core::shape::Dim>(xs: &Tensor, dim: D) -> Result<Tensor> {
- let dim = dim.to_index(xs.shape(), "softmax")?;
- let max = xs.max_keepdim(dim)?;
- let diff = xs.broadcast_sub(&max)?;
- let num = diff.exp()?;
- let den = num.sum_keepdim(dim)?;
- num.broadcast_div(&den)
-}
-
-trait Benchmark {
- type PreProcessData;
- type RunResult;
-
- fn preprocess() -> Result<Self::PreProcessData>;
- fn run_one(_: &Self::PreProcessData) -> Result<Self::RunResult>;
-
- const ITERS: usize;
-}
-
-// Conv1d example as used in whisper.
-struct Conv1d;
-impl Benchmark for Conv1d {
- type PreProcessData = (Tensor, Tensor);
- type RunResult = Tensor;
- fn preprocess() -> Result<Self::PreProcessData> {
- let inp = Tensor::randn(0f32, 1., (1, 384, 3000), &Device::Cpu)?;
- let w = Tensor::randn(0f32, 1., (384, 384, 3), &Device::Cpu)?;
- Ok((inp, w))
- }
-
- fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult> {
- d.0.conv1d(&d.1, 0, 1, 1, 1)
- }
-
- const ITERS: usize = 5;
-}
-
-// Conv2d example as used in stable-diffusion.
-struct Conv2d;
-impl Benchmark for Conv2d {
- type PreProcessData = (Tensor, Tensor);
- type RunResult = Tensor;
-
- fn preprocess() -> Result<Self::PreProcessData> {
- let inp = Tensor::randn(0f32, 1., (2, 320, 96, 96), &Device::Cpu)?;
- let w = Tensor::randn(0f32, 1., (320, 320, 3, 3), &Device::Cpu)?;
- Ok((inp, w))
- }
-
- fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult> {
- d.0.conv2d(&d.1, 0, 1, 1, 1)
- }
-
- const ITERS: usize = 1;
-}
-
-struct Matmul;
-impl Benchmark for Matmul {
- type PreProcessData = (Tensor, Tensor);
- type RunResult = Tensor;
- fn preprocess() -> Result<Self::PreProcessData> {
- let lhs = Tensor::randn(0f32, 1., (1024, 1024), &Device::Cpu)?;
- let rhs = Tensor::randn(0f32, 1., (1024, 1024), &Device::Cpu)?;
- Ok((lhs, rhs))
- }
-
- fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult> {
- d.0.matmul(&d.1)
- }
-
- const ITERS: usize = 100;
-}
-
-// This benchmark is similar to:
-// https://github.com/ggerganov/llama.cpp/blob/master/examples/benchmark/benchmark-matmult.cpp
-struct QMatMul;
-impl Benchmark for QMatMul {
- type PreProcessData = (candle_core::quantized::QMatMul, Tensor);
- type RunResult = Tensor;
- fn preprocess() -> Result<Self::PreProcessData> {
- let zeros = vec![candle_core::quantized::k_quants::BlockQ4_0::zeros(); 4096 * 11008 / 32];
- let mm = candle_core::quantized::QTensor::new(zeros, (4096, 11008))?;
- let mm = candle_core::quantized::QMatMul::from_qtensor(mm);
- let arg = Tensor::randn(0f32, 1., (128, 11008), &Device::Cpu)?;
- Ok((mm, arg))
- }
-
- fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult> {
- d.0.forward(&d.1)
- }
-
- const ITERS: usize = 100;
-}
-
-struct Softmax;
-impl Benchmark for Softmax {
- type PreProcessData = Tensor;
- type RunResult = Tensor;
- fn preprocess() -> Result<Self::PreProcessData> {
- // Typical whisper tiny size.
- let x = Tensor::randn(0f32, 1., (1, 6, 200, 1500), &Device::Cpu)?;
- Ok(x)
- }
-
- fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult> {
- softmax(d, D::Minus1)
- }
-
- const ITERS: usize = 100;
-}
-
-fn run<B: Benchmark>(iters: Option<usize>) -> Result<()> {
- use std::hint::black_box;
-
- let iters = iters.unwrap_or(B::ITERS);
- let d = B::preprocess()?;
- let start = std::time::Instant::now();
- for _iter in 0..iters {
- let _res = black_box(B::run_one(black_box(&d))?);
- }
- println!("{:?}", start.elapsed() / iters as u32);
- Ok(())
-}
-
-#[derive(Subcommand, Debug, Clone)]
-enum Task {
- Conv1d,
- Conv2d,
- Matmul,
- Qmatmul,
- Softmax,
-}
-
-#[derive(Parser, Debug)]
-#[command(author, version, about, long_about = None)]
-pub struct Args {
- /// The benchmark to be run.
- #[command(subcommand)]
- task: Task,
-
- #[arg(long)]
- iters: Option<usize>,
-}
-
-fn main() -> Result<()> {
- let args = Args::parse();
- match args.task {
- Task::Conv1d => run::<Conv1d>(args.iters)?,
- Task::Conv2d => run::<Conv2d>(args.iters)?,
- Task::Matmul => run::<Matmul>(args.iters)?,
- Task::Softmax => run::<Softmax>(args.iters)?,
- Task::Qmatmul => run::<QMatMul>(args.iters)?,
- }
- Ok(())
-}