/// This example contains some simple benchmarks so that it's easy to run them in perf etc. #[cfg(feature = "mkl")] extern crate intel_mkl_src; #[cfg(feature = "accelerate")] extern crate accelerate_src; use candle::quantized::GgmlType; use candle::{CpuStorage, Device, Layout, Module, Result, Shape, Tensor, D}; use clap::{Parser, Subcommand}; const CHECK_CONV2D: bool = false; trait Benchmark { type PreProcessData; type RunResult; fn preprocess() -> Result; fn run_one(_: &Self::PreProcessData) -> Result; const ITERS: usize; } struct Im2Col { h_k: usize, w_k: usize, stride: usize, dilation: usize, padding: usize, } impl Im2Col { fn hw_out(&self, h: usize, w: usize) -> (usize, usize) { let h_out = (h + 2 * self.padding - self.dilation * (self.h_k - 1) - 1) / self.stride + 1; let w_out = (w + 2 * self.padding - self.dilation * (self.w_k - 1) - 1) / self.stride + 1; (h_out, w_out) } } impl candle::CustomOp1 for Im2Col { fn name(&self) -> &'static str { "im2col" } fn cpu_fwd(&self, storage: &CpuStorage, layout: &Layout) -> Result<(CpuStorage, Shape)> { let &Self { h_k, w_k, stride, dilation, padding, } = self; let (b, c, h, w) = layout.shape().dims4()?; let (h_out, w_out) = self.hw_out(h, w); let slice = storage.as_slice::()?; let src = &slice[layout.start_offset()..]; let mut dst = vec![0f32; b * h_out * w_out * c * h_k * w_k]; let (src_s0, src_s1, src_s2, src_s3) = { let s = layout.stride(); (s[0], s[1], s[2], s[3]) }; // TODO: provide specialized kernels for the common use cases. // - h_k = w_k = 1 // - padding = 0 // - stride = 1 // - dilation = 1 for b_idx in 0..b { let src_idx = b_idx * src_s0; let dst_idx = b_idx * h_out * w_out * c * h_k * w_k; for h_idx in 0..h_out { let dst_idx = dst_idx + h_idx * w_out * c * h_k * w_k; for w_idx in 0..w_out { let dst_idx = dst_idx + w_idx * c * h_k * w_k; for c_idx in 0..c { let dst_idx = dst_idx + c_idx * h_k * w_k; let src_idx = c_idx * src_s1 + src_idx; for h_k_idx in 0..h_k { let src_h = h_idx * stride + h_k_idx * dilation; if padding != 0 && (src_h < padding || src_h >= h + padding) { continue; } let src_h = src_h - padding; let src_idx = src_idx + src_h * src_s2; let dst_idx = dst_idx + h_k_idx * w_k; for w_k_idx in 0..w_k { let src_w = w_idx * stride + w_k_idx * dilation; if padding != 0 && (src_w < padding || src_w >= w + padding) { continue; } let src_w = src_w - padding; let src_idx = src_idx + src_w * src_s3; let dst_idx = dst_idx + w_k_idx; dst[dst_idx] = src[src_idx] } } } } } } let storage = candle::WithDType::to_cpu_storage_owned(dst); Ok((storage, (b * h_out * w_out, c * h_k * w_k).into())) } } // Conv1d example as used in whisper. struct Conv1d; impl Benchmark for Conv1d { type PreProcessData = (Tensor, Tensor); type RunResult = Tensor; fn preprocess() -> Result { let inp = Tensor::randn(0f32, 1., (1, 384, 3000), &Device::Cpu)?; let w = Tensor::randn(0f32, 1., (384, 384, 3), &Device::Cpu)?; Ok((inp, w)) } fn run_one(d: &Self::PreProcessData) -> Result { d.0.conv1d(&d.1, 0, 1, 1, 1) } const ITERS: usize = 5; } // Conv2d example as used in stable-diffusion. struct Conv2d; impl Benchmark for Conv2d { type PreProcessData = (Tensor, Tensor); type RunResult = Tensor; fn preprocess() -> Result { let inp = Tensor::randn(0f32, 1., (2, 320, 96, 96), &Device::Cpu)?; let w = Tensor::randn(0f32, 1., (320, 320, 3, 3), &Device::Cpu)?; Ok((inp, w)) } fn run_one(d: &Self::PreProcessData) -> Result { d.0.conv2d(&d.1, 0, 1, 1, 1) } const ITERS: usize = 5; } // Conv2d example as used in stable-diffusion, im2col implementation. struct Conv2dIm2Col; impl Benchmark for Conv2dIm2Col { type PreProcessData = (Tensor, Tensor); type RunResult = Tensor; fn preprocess() -> Result { let inp = Tensor::randn(0f32, 1., (2, 320, 96, 96), &Device::Cpu)?; let w = Tensor::randn(0f32, 1., (320, 320, 3, 3), &Device::Cpu)?; Ok((inp, w)) } fn run_one(d: &Self::PreProcessData) -> Result { // d.0.conv2d(&d.1, 0, 1, 1, 1) let (b, _, h, w) = d.0.dims4()?; let (_, _, h_k, w_k) = d.1.dims4()?; let op = Im2Col { h_k, w_k, stride: 1, dilation: 1, padding: 0, }; let (h_out, w_out) = op.hw_out(h, w); let col = d.0.apply_op1_no_bwd(&op)?; let res = col.matmul(&d.1.flatten_from(1)?.t()?)?; let res = res .reshape((b, h_out, w_out, ()))? .permute((0, 3, 1, 2))? .contiguous()?; if CHECK_CONV2D { let res2 = d.0.conv2d(&d.1, op.padding, op.stride, op.dilation, 1); let diff = (&res - res2)?.sqr()?.mean_all()?; println!("{diff}"); } Ok(res) } const ITERS: usize = 5; } struct MatMul; impl Benchmark for MatMul { type PreProcessData = (Tensor, Tensor); type RunResult = Tensor; fn preprocess() -> Result { let lhs = Tensor::randn(0f32, 1., (1024, 1024), &Device::Cpu)?; let rhs = Tensor::randn(0f32, 1., (1024, 1024), &Device::Cpu)?; Ok((lhs, rhs)) } fn run_one(d: &Self::PreProcessData) -> Result { d.0.matmul(&d.1) } const ITERS: usize = 100; } struct MatVec; impl Benchmark for MatVec { type PreProcessData = (Tensor, Tensor); type RunResult = Tensor; fn preprocess() -> Result { let lhs = Tensor::randn(0f32, 1., (1024 * 4, 1024 * 4), &Device::Cpu)?; let rhs = Tensor::randn(0f32, 1., (1024 * 4, 1), &Device::Cpu)?; Ok((lhs, rhs)) } fn run_one(d: &Self::PreProcessData) -> Result { d.0.matmul(&d.1) } const ITERS: usize = 100; } // This benchmark is similar to: // https://github.com/ggerganov/llama.cpp/blob/master/examples/benchmark/benchmark-matmult.cpp struct QMatMul; impl Benchmark for QMatMul { type PreProcessData = (candle::quantized::QMatMul, Tensor); type RunResult = Tensor; fn preprocess() -> Result { let zeros = vec![candle::quantized::k_quants::BlockQ4_0::zeros(); 4096 * 11008 / 32]; let mm = candle::quantized::QTensor::new( candle::quantized::QStorage::Cpu(Box::new(zeros)), (4096, 11008), )?; let mm = candle::quantized::QMatMul::from_qtensor(mm)?; let arg = Tensor::randn(0f32, 1., (128, 11008), &Device::Cpu)?; Ok((mm, arg)) } fn run_one(d: &Self::PreProcessData) -> Result { d.0.forward(&d.1) } const ITERS: usize = 100; } struct Cat; impl Benchmark for Cat { type PreProcessData = (Tensor, Tensor); type RunResult = Tensor; fn preprocess() -> Result { let lhs = Tensor::randn(0f32, 1., (1, 32, 2000, 128), &Device::Cpu)?; let rhs = Tensor::randn(0f32, 1., (1, 32, 1, 128), &Device::Cpu)?; Ok((lhs, rhs)) } fn run_one(d: &Self::PreProcessData) -> Result { Tensor::cat(&[&d.0, &d.1], 2) } const ITERS: usize = 1000; } struct Softmax; impl Benchmark for Softmax { type PreProcessData = Tensor; type RunResult = Tensor; fn preprocess() -> Result { // Typical whisper tiny size. let x = Tensor::randn(0f32, 1., (1, 6, 200, 1500), &Device::Cpu)?; Ok(x) } fn run_one(d: &Self::PreProcessData) -> Result { candle_nn::ops::softmax(d, D::Minus1) } const ITERS: usize = 100; } struct SoftmaxLastDim; impl Benchmark for SoftmaxLastDim { type PreProcessData = Tensor; type RunResult = Tensor; fn preprocess() -> Result { // Typical whisper tiny size. let x = Tensor::randn(0f32, 1., (1, 6, 200, 1500), &Device::Cpu)?; Ok(x) } fn run_one(d: &Self::PreProcessData) -> Result { candle_nn::ops::softmax_last_dim(d) } const ITERS: usize = 100; } fn run(iters: Option) -> Result<()> { use std::hint::black_box; let iters = iters.unwrap_or(B::ITERS); let d = B::preprocess()?; let start = std::time::Instant::now(); for _iter in 0..iters { let _res = black_box(B::run_one(black_box(&d))?); } println!("{:?}", start.elapsed() / iters as u32); Ok(()) } #[derive(Subcommand, Debug, Clone)] enum Task { Conv1d, Conv2d, Conv2dIm2Col, Matmul, Matvec, Qmatmul, Softmax, SoftmaxLastDim, Cat, } #[derive(Parser, Debug)] #[command(author, version, about, long_about = None)] pub struct Args { /// The benchmark to be run. #[command(subcommand)] task: Task, #[arg(long)] iters: Option, } fn main() -> Result<()> { let args = Args::parse(); match args.task { Task::Conv1d => run::(args.iters)?, Task::Conv2d => run::(args.iters)?, Task::Conv2dIm2Col => run::(args.iters)?, Task::Matmul => run::(args.iters)?, Task::Matvec => run::(args.iters)?, Task::Softmax => run::(args.iters)?, Task::SoftmaxLastDim => run::(args.iters)?, Task::Qmatmul => run::(args.iters)?, Task::Cat => run::(args.iters)?, } Ok(()) }