Diffstat (limited to 'candle-nn/examples/cpu_benchmarks.rs')
-rw-r--r--  candle-nn/examples/cpu_benchmarks.rs  302
1 file changed, 302 insertions(+), 0 deletions(-)
diff --git a/candle-nn/examples/cpu_benchmarks.rs b/candle-nn/examples/cpu_benchmarks.rs
new file mode 100644
index 00000000..204a7109
--- /dev/null
+++ b/candle-nn/examples/cpu_benchmarks.rs
@@ -0,0 +1,302 @@
+//! This example contains some simple benchmarks so that it's easy to run them under perf etc.
+#[cfg(feature = "mkl")]
+extern crate intel_mkl_src;
+
+#[cfg(feature = "accelerate")]
+extern crate accelerate_src;
+
+use candle::quantized::GgmlType;
+use candle::{CpuStorage, Device, Layout, Result, Shape, Tensor, D};
+use clap::{Parser, Subcommand};
+
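+// When set, the Conv2dIm2Col benchmark also runs the built-in conv2d and
+// prints the mean squared difference between the two results.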
+const CHECK_CONV2D: bool = false;
+
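+/// Each benchmark builds its inputs once in `preprocess`; `run_one` is then
+/// timed over `ITERS` iterations (or the count passed via `--iters`).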
+trait Benchmark {
+ type PreProcessData;
+ type RunResult;
+
+ fn preprocess() -> Result<Self::PreProcessData>;
+ fn run_one(_: &Self::PreProcessData) -> Result<Self::RunResult>;
+
+ const ITERS: usize;
+}
+
+struct Im2Col {
+ h_k: usize,
+ w_k: usize,
+ stride: usize,
+ dilation: usize,
+ padding: usize,
+}
+
+impl Im2Col {
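+ // Output spatial size, using the standard convolution arithmetic:
+ //   out = (in + 2 * padding - dilation * (k - 1) - 1) / stride + 1
+ // e.g. in = 96, k = 3, stride = 1, dilation = 1, padding = 0 -> out = 94.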
+ fn hw_out(&self, h: usize, w: usize) -> (usize, usize) {
+ let h_out = (h + 2 * self.padding - self.dilation * (self.h_k - 1) - 1) / self.stride + 1;
+ let w_out = (w + 2 * self.padding - self.dilation * (self.w_k - 1) - 1) / self.stride + 1;
+ (h_out, w_out)
+ }
+}
+
+impl candle::CustomOp1 for Im2Col {
+ fn name(&self) -> &'static str {
+ "im2col"
+ }
+
+ fn cpu_fwd(&self, storage: &CpuStorage, layout: &Layout) -> Result<(CpuStorage, Shape)> {
+ let &Self {
+ h_k,
+ w_k,
+ stride,
+ dilation,
+ padding,
+ } = self;
+ let (b, c, h, w) = layout.shape().dims4()?;
+ let (h_out, w_out) = self.hw_out(h, w);
+ let slice = storage.as_slice::<f32>()?;
+ let src = &slice[layout.start_offset()..];
+ let mut dst = vec![0f32; b * h_out * w_out * c * h_k * w_k];
+ let (src_s0, src_s1, src_s2, src_s3) = {
+ let s = layout.stride();
+ (s[0], s[1], s[2], s[3])
+ };
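+ // Each row of dst gathers the receptive field of one output position, giving
+ // a (b * h_out * w_out, c * h_k * w_k) matrix so the convolution reduces to a
+ // single matmul with the flattened kernel; padded positions are skipped and
+ // keep their zero initialization.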
+ // TODO: provide specialized kernels for the common use cases.
+ // - h_k = w_k = 1
+ // - padding = 0
+ // - stride = 1
+ // - dilation = 1
+ for b_idx in 0..b {
+ let src_idx = b_idx * src_s0;
+ let dst_idx = b_idx * h_out * w_out * c * h_k * w_k;
+ for h_idx in 0..h_out {
+ let dst_idx = dst_idx + h_idx * w_out * c * h_k * w_k;
+ for w_idx in 0..w_out {
+ let dst_idx = dst_idx + w_idx * c * h_k * w_k;
+ for c_idx in 0..c {
+ let dst_idx = dst_idx + c_idx * h_k * w_k;
+ let src_idx = c_idx * src_s1 + src_idx;
+ for h_k_idx in 0..h_k {
+ let src_h = h_idx * stride + h_k_idx * dilation;
+ if padding != 0 && (src_h < padding || src_h >= h + padding) {
+ continue;
+ }
+ let src_h = src_h - padding;
+ let src_idx = src_idx + src_h * src_s2;
+ let dst_idx = dst_idx + h_k_idx * w_k;
+ for w_k_idx in 0..w_k {
+ let src_w = w_idx * stride + w_k_idx * dilation;
+ if padding != 0 && (src_w < padding || src_w >= w + padding) {
+ continue;
+ }
+ let src_w = src_w - padding;
+ let src_idx = src_idx + src_w * src_s3;
+ let dst_idx = dst_idx + w_k_idx;
+ dst[dst_idx] = src[src_idx];
+ }
+ }
+ }
+ }
+ }
+ }
+ let storage = candle::WithDType::to_cpu_storage_owned(dst);
+ Ok((storage, (b * h_out * w_out, c * h_k * w_k).into()))
+ }
+}
+
+// Conv1d example as used in whisper.
+struct Conv1d;
+impl Benchmark for Conv1d {
+ type PreProcessData = (Tensor, Tensor);
+ type RunResult = Tensor;
+ fn preprocess() -> Result<Self::PreProcessData> {
+ let inp = Tensor::randn(0f32, 1., (1, 384, 3000), &Device::Cpu)?;
+ let w = Tensor::randn(0f32, 1., (384, 384, 3), &Device::Cpu)?;
+ Ok((inp, w))
+ }
+
+ fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult> {
+ d.0.conv1d(&d.1, 0, 1, 1, 1)
+ }
+
+ const ITERS: usize = 5;
+}
+
+// Conv2d example as used in stable-diffusion.
+struct Conv2d;
+impl Benchmark for Conv2d {
+ type PreProcessData = (Tensor, Tensor);
+ type RunResult = Tensor;
+
+ fn preprocess() -> Result<Self::PreProcessData> {
+ let inp = Tensor::randn(0f32, 1., (2, 320, 96, 96), &Device::Cpu)?;
+ let w = Tensor::randn(0f32, 1., (320, 320, 3, 3), &Device::Cpu)?;
+ Ok((inp, w))
+ }
+
+ fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult> {
+ d.0.conv2d(&d.1, 0, 1, 1, 1)
+ }
+
+ const ITERS: usize = 5;
+}
+
+// Conv2d example as used in stable-diffusion, im2col implementation.
+struct Conv2dIm2Col;
+impl Benchmark for Conv2dIm2Col {
+ type PreProcessData = (Tensor, Tensor);
+ type RunResult = Tensor;
+
+ fn preprocess() -> Result<Self::PreProcessData> {
+ let inp = Tensor::randn(0f32, 1., (2, 320, 96, 96), &Device::Cpu)?;
+ let w = Tensor::randn(0f32, 1., (320, 320, 3, 3), &Device::Cpu)?;
+ Ok((inp, w))
+ }
+
+ fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult> {
+ // Same as d.0.conv2d(&d.1, 0, 1, 1, 1), but implemented via im2col + matmul.
+ let (b, _, h, w) = d.0.dims4()?;
+ let (_, _, h_k, w_k) = d.1.dims4()?;
+ let op = Im2Col {
+ h_k,
+ w_k,
+ stride: 1,
+ dilation: 1,
+ padding: 0,
+ };
+ let (h_out, w_out) = op.hw_out(h, w);
+ let col = d.0.apply_op1_no_bwd(&op)?;
+ let res = col.matmul(&d.1.flatten_from(1)?.t()?)?;
+ let res = res
+ .reshape((b, h_out, w_out, ()))?
+ .permute((0, 3, 1, 2))?
+ .contiguous()?;
+ if CHECK_CONV2D {
+ let res2 = d.0.conv2d(&d.1, op.padding, op.stride, op.dilation, 1);
+ let diff = (&res - res2)?.sqr()?.mean_all()?;
+ println!("{diff}");
+ }
+ Ok(res)
+ }
+
+ const ITERS: usize = 5;
+}
+
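+// Plain f32 matmul on square (1024, 1024) matrices: roughly 2 * 1024^3, about
+// 2.1 GFLOP per iteration, handy for converting the printed time into FLOP/s.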
+struct Matmul;
+impl Benchmark for Matmul {
+ type PreProcessData = (Tensor, Tensor);
+ type RunResult = Tensor;
+ fn preprocess() -> Result<Self::PreProcessData> {
+ let lhs = Tensor::randn(0f32, 1., (1024, 1024), &Device::Cpu)?;
+ let rhs = Tensor::randn(0f32, 1., (1024, 1024), &Device::Cpu)?;
+ Ok((lhs, rhs))
+ }
+
+ fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult> {
+ d.0.matmul(&d.1)
+ }
+
+ const ITERS: usize = 100;
+}
+
+// This benchmark is similar to:
+// https://github.com/ggerganov/llama.cpp/blob/master/examples/benchmark/benchmark-matmult.cpp
+struct QMatMul;
+impl Benchmark for QMatMul {
+ type PreProcessData = (candle::quantized::QMatMul, Tensor);
+ type RunResult = Tensor;
+ fn preprocess() -> Result<Self::PreProcessData> {
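+ // Each BlockQ4_0 quantizes a block of 32 weights, so a (4096, 11008) weight
+ // matrix takes 4096 * 11008 / 32 blocks.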
+ let zeros = vec![candle::quantized::k_quants::BlockQ4_0::zeros(); 4096 * 11008 / 32];
+ let mm = candle::quantized::QTensor::new(zeros, (4096, 11008))?;
+ let mm = candle::quantized::QMatMul::from_qtensor(mm);
+ let arg = Tensor::randn(0f32, 1., (128, 11008), &Device::Cpu)?;
+ Ok((mm, arg))
+ }
+
+ fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult> {
+ d.0.forward(&d.1)
+ }
+
+ const ITERS: usize = 100;
+}
+
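+// The two softmax benchmarks run on identical shapes so the generic
+// implementation can be compared against the specialized last-dim kernel below.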
+struct Softmax;
+impl Benchmark for Softmax {
+ type PreProcessData = Tensor;
+ type RunResult = Tensor;
+ fn preprocess() -> Result<Self::PreProcessData> {
+ // Typical whisper tiny size.
+ let x = Tensor::randn(0f32, 1., (1, 6, 200, 1500), &Device::Cpu)?;
+ Ok(x)
+ }
+
+ fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult> {
+ candle_nn::ops::softmax(d, D::Minus1)
+ }
+
+ const ITERS: usize = 100;
+}
+
+struct SoftmaxLastDim;
+impl Benchmark for SoftmaxLastDim {
+ type PreProcessData = Tensor;
+ type RunResult = Tensor;
+ fn preprocess() -> Result<Self::PreProcessData> {
+ // Typical whisper tiny size.
+ let x = Tensor::randn(0f32, 1., (1, 6, 200, 1500), &Device::Cpu)?;
+ Ok(x)
+ }
+
+ fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult> {
+ candle_nn::ops::softmax_last_dim(d)
+ }
+
+ const ITERS: usize = 100;
+}
+
+fn run<B: Benchmark>(iters: Option<usize>) -> Result<()> {
+ use std::hint::black_box;
+
+ let iters = iters.unwrap_or(B::ITERS);
+ let d = B::preprocess()?;
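+ // Time only the run_one calls; black_box prevents the compiler from hoisting
+ // the call out of the loop or discarding its result.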
+ let start = std::time::Instant::now();
+ for _iter in 0..iters {
+ let _res = black_box(B::run_one(black_box(&d))?);
+ }
+ println!("{:?}", start.elapsed() / iters as u32);
+ Ok(())
+}
+
+#[derive(Subcommand, Debug, Clone)]
+enum Task {
+ Conv1d,
+ Conv2d,
+ Conv2dIm2Col,
+ Matmul,
+ Qmatmul,
+ Softmax,
+ SoftmaxLastDim,
+}
+
+#[derive(Parser, Debug)]
+#[command(author, version, about, long_about = None)]
+pub struct Args {
+ /// The benchmark to be run.
+ #[command(subcommand)]
+ task: Task,
+
+ #[arg(long)]
+ iters: Option<usize>,
+}
+
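+// Typical invocation, assuming a release build for meaningful timings:
+//   cargo run --release --example cpu_benchmarks -- matmul --iters 1000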
+fn main() -> Result<()> {
+ let args = Args::parse();
+ match args.task {
+ Task::Conv1d => run::<Conv1d>(args.iters)?,
+ Task::Conv2d => run::<Conv2d>(args.iters)?,
+ Task::Conv2dIm2Col => run::<Conv2dIm2Col>(args.iters)?,
+ Task::Matmul => run::<Matmul>(args.iters)?,
+ Task::Softmax => run::<Softmax>(args.iters)?,
+ Task::SoftmaxLastDim => run::<SoftmaxLastDim>(args.iters)?,
+ Task::Qmatmul => run::<QMatMul>(args.iters)?,
+ }
+ Ok(())
+}