From ce9fbc368211815ef2dddff01575ca1f9d4eccd5 Mon Sep 17 00:00:00 2001
From: Laurent Mazare
Date: Sun, 17 Mar 2024 10:49:13 +0100
Subject: Optimize the cat operation on contiguous tensors (#1855)

* Add a specialized kernel for copy2d.

* Move the cat operations.

* Avoid transpositions in cat.

* Bugfix.

* Bugfix for the cuda kernel.

* Add a benchmark.

* Add more testing.

* Test fix.

* Faster kernel.

* Add the missing kernel.

* Tweak the test.

* Add a metal kernel.

* Fix for the metal kernel.

* Get the tests to pass on metal.

* Also use this opportunity to fix the metal kernel for ELU.

* Add some bf16 kernels.

* Clippy fixes.
---
 candle-nn/examples/cpu_benchmarks.rs | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'candle-nn')

diff --git a/candle-nn/examples/cpu_benchmarks.rs b/candle-nn/examples/cpu_benchmarks.rs
index 001be116..430316b8 100644
--- a/candle-nn/examples/cpu_benchmarks.rs
+++ b/candle-nn/examples/cpu_benchmarks.rs
@@ -238,6 +238,23 @@ impl Benchmark for QMatMul {
     const ITERS: usize = 100;
 }
 
+struct Cat;
+impl Benchmark for Cat {
+    type PreProcessData = (Tensor, Tensor);
+    type RunResult = Tensor;
+    fn preprocess() -> Result<Self::PreProcessData> {
+        let lhs = Tensor::randn(0f32, 1., (1, 32, 2000, 128), &Device::Cpu)?;
+        let rhs = Tensor::randn(0f32, 1., (1, 32, 1, 128), &Device::Cpu)?;
+        Ok((lhs, rhs))
+    }
+
+    fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult> {
+        Tensor::cat(&[&d.0, &d.1], 2)
+    }
+
+    const ITERS: usize = 1000;
+}
+
 struct Softmax;
 impl Benchmark for Softmax {
     type PreProcessData = Tensor;
@@ -295,6 +312,7 @@ enum Task {
     Qmatmul,
     Softmax,
     SoftmaxLastDim,
+    Cat,
 }
 
 #[derive(Parser, Debug)]
@@ -319,6 +337,7 @@ fn main() -> Result<()> {
         Task::Softmax => run::<Softmax>(args.iters)?,
         Task::SoftmaxLastDim => run::<SoftmaxLastDim>(args.iters)?,
         Task::Qmatmul => run::<QMatMul>(args.iters)?,
+        Task::Cat => run::<Cat>(args.iters)?,
     }
     Ok(())
 }
--
cgit v1.2.3
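
The Cat benchmark above concatenates a (1, 32, 2000, 128) tensor with a (1, 32, 1, 128) tensor along dim 2, i.e. contiguous inputs joined on a non-final axis, which is the case the copy2d kernel mentioned in the commit message targets. As a rough sketch of the idea only (not candle's actual kernel; the helper name, the f32-only signature and the (outer, inner) flattening are illustrative assumptions), such a concatenation reduces to one pair of contiguous row copies per outer index, with no transposition of the inputs:

    // Illustrative sketch, not candle's implementation: cat two contiguous
    // buffers along some axis by collapsing their shapes to (outer, inner)
    // around that axis. For the benchmark above, outer = 1 * 32 = 32,
    // lhs_inner = 2000 * 128 and rhs_inner = 1 * 128.
    fn cat_rows_f32(
        lhs: &[f32],
        rhs: &[f32],
        outer: usize,     // product of the dimensions before the cat axis
        lhs_inner: usize, // lhs cat-axis size times the trailing dimensions
        rhs_inner: usize, // rhs cat-axis size times the trailing dimensions
    ) -> Vec<f32> {
        assert_eq!(lhs.len(), outer * lhs_inner);
        assert_eq!(rhs.len(), outer * rhs_inner);
        let dst_inner = lhs_inner + rhs_inner;
        let mut dst = vec![0f32; outer * dst_inner];
        for o in 0..outer {
            // Two contiguous copies per output row: the copy2d access pattern.
            let d = o * dst_inner;
            dst[d..d + lhs_inner]
                .copy_from_slice(&lhs[o * lhs_inner..(o + 1) * lhs_inner]);
            dst[d + lhs_inner..d + dst_inner]
                .copy_from_slice(&rhs[o * rhs_inner..(o + 1) * rhs_inner]);
        }
        dst
    }

The new task itself can presumably be timed with something like
cargo run --release --example cpu_benchmarks -- cat, mirroring the existing
tasks dispatched in the match statement above.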