diff options
-rw-r--r-- | candle-core/examples/cuda_basics.rs | 34 | ||||
-rw-r--r-- | candle-core/examples/cuda_sum_benchmark.rs | 51 | ||||
-rw-r--r-- | candle-core/src/display.rs | 6 |
3 files changed, 56 insertions, 35 deletions
diff --git a/candle-core/examples/cuda_basics.rs b/candle-core/examples/cuda_basics.rs deleted file mode 100644 index 6050d793..00000000 --- a/candle-core/examples/cuda_basics.rs +++ /dev/null @@ -1,34 +0,0 @@ -#[cfg(feature = "mkl")] -extern crate intel_mkl_src; - -use anyhow::Result; -use candle::{Device, Tensor}; - -fn main() -> Result<()> { - let device = Device::new_cuda(0)?; - let ids = Tensor::new(&[0u32, 2u32, 1u32], &device)?; - let t = Tensor::new(&[[0f32, 1f32], [2f32, 3f32], [4f32, 5f32]], &device)?; - let hs = Tensor::embedding(&ids, &t)?; - println!("> {:?}", hs.to_vec2::<f32>()); - - let x = Tensor::new(&[3f32, 1., 4., 1., 5.], &device)?; - println!("{:?}", x.to_vec1::<f32>()?); - let y = Tensor::new(&[2f32, 7., 1., 8., 2.], &device)?; - let z = (y + x * 3.)?; - println!("{:?}", z.to_vec1::<f32>()?); - println!("{:?}", z.sqrt()?.to_vec1::<f32>()?); - let x = Tensor::new(&[[11f32, 22.], [33., 44.], [55., 66.], [77., 78.]], &device)?; - let y = Tensor::new(&[[1f32, 2., 3.], [4., 5., 6.]], &device)?; - println!("{:?}", y.to_vec2::<f32>()?); - let z = x.matmul(&y)?; - println!("{:?}", z.to_vec2::<f32>()?); - let x = Tensor::new( - &[[11f32, 22.], [33., 44.], [55., 66.], [77., 78.]], - &Device::Cpu, - )?; - let y = Tensor::new(&[[1f32, 2., 3.], [4., 5., 6.]], &Device::Cpu)?; - println!("{:?}", y.to_vec2::<f32>()?); - let z = x.matmul(&y)?; - println!("{:?}", z.to_vec2::<f32>()?); - Ok(()) -} diff --git a/candle-core/examples/cuda_sum_benchmark.rs b/candle-core/examples/cuda_sum_benchmark.rs new file mode 100644 index 00000000..09d0099d --- /dev/null +++ b/candle-core/examples/cuda_sum_benchmark.rs @@ -0,0 +1,51 @@ +#[cfg(feature = "mkl")] +extern crate intel_mkl_src; + +use std::str::FromStr; + +use anyhow::Result; +use candle::{Device, Tensor}; + +fn cos_sin(n: usize, device: &Device) -> Result<Tensor> { + let thetas: Vec<_> = (0..n).map(|i| (i as f32 / n as f32)).collect(); + let xs: Vec<_> = thetas.iter().map(|t| t.cos().abs()).collect(); + let ys: Vec<_> = thetas.iter().map(|t| t.sin().abs()).collect(); + let xs = Tensor::from_vec(xs, (n, 1), device)?; + let ys = Tensor::from_vec(ys, (1, n), device)?; + let ys = Tensor::cat(&[&ys, &ys, &ys, &ys, &ys, &ys], 1)?; + Ok(xs.matmul(&ys)?) +} + +fn main() -> Result<()> { + let device = Device::new_cuda(0)?; + let args = std::env::args().collect::<Vec<String>>(); + let n = if args.len() < 2 { + 2000usize + } else { + usize::from_str(&args[1])? + }; + let xys_cpu = cos_sin(n, &Device::Cpu)?; + let xys = cos_sin(n, &device)?; + println!("{xys_cpu:?} {xys:?}"); + let sum_cpu = xys_cpu.sum(&[1])?; + println!("{sum_cpu}"); + let sum = xys.sum(&[1])?; + println!("{sum}"); + let start = std::time::Instant::now(); + let n_iters = 100; + let mut v = 0f32; + for _i in 0..n_iters { + let sum = xys.sum(&[1])?; + let sum = sum.sum(&[0])?; + let sum: f32 = sum.reshape(&[])?.to_scalar()?; + v += sum; + } + let elapsed = start.elapsed(); + if v > 0. { + println!( + "ran {n_iters} iterations, time per iter: {:?} ({v})", + elapsed.div_f64(n_iters as f64) + ); + } + Ok(()) +} diff --git a/candle-core/src/display.rs b/candle-core/src/display.rs index 60907bb3..127e55b0 100644 --- a/candle-core/src/display.rs +++ b/candle-core/src/display.rs @@ -9,7 +9,11 @@ impl Tensor { &self, f: &mut std::fmt::Formatter, ) -> std::fmt::Result { - write!(f, "Tensor[")?; + let prefix = match self.device() { + crate::Device::Cpu => "Cpu", + crate::Device::Cuda(_) => "Cuda", + }; + write!(f, "{prefix}Tensor[")?; match self.dims() { [] => { if let Ok(v) = self.to_scalar::<T>() { |