From c654ecdb16733390eddef3f8d9ade5fa2988b14b Mon Sep 17 00:00:00 2001
From: laurent
Date: Wed, 21 Jun 2023 18:56:04 +0100
Subject: Add a specific example for cuda.

---
 examples/cuda_basics.rs | 9 +++++++++
 1 file changed, 9 insertions(+)
 create mode 100644 examples/cuda_basics.rs

(limited to 'examples/cuda_basics.rs')

diff --git a/examples/cuda_basics.rs b/examples/cuda_basics.rs
new file mode 100644
index 00000000..e1dca6a9
--- /dev/null
+++ b/examples/cuda_basics.rs
@@ -0,0 +1,9 @@
+use anyhow::Result;
+use candle::{DType, Device, Tensor};
+
+fn main() -> Result<()> {
+    let device = Device::new_cuda(0)?;
+    let x = Tensor::zeros(4, DType::F32, device)?;
+    println!("{:?}", x.to_vec1::<f32>()?);
+    Ok(())
+}
-- cgit v1.2.3
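On a machine without an NVIDIA GPU, the same program can target the CPU backend instead of `Device::new_cuda(0)`. A minimal sketch against the API as of this commit (the device is still passed by value here; the next patch in the series switches constructors to take a reference):

use anyhow::Result;
use candle::{DType, Device, Tensor};

fn main() -> Result<()> {
    // Same shape and dtype as the CUDA example, allocated on the host instead.
    let x = Tensor::zeros(4, DType::F32, Device::Cpu)?;
    println!("{:?}", x.to_vec1::<f32>()?); // [0.0, 0.0, 0.0, 0.0]
    Ok(())
}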
From fcb4e6b84fa755834e1c76b49e5d167c50c47d14 Mon Sep 17 00:00:00 2001
From: laurent
Date: Wed, 21 Jun 2023 19:55:57 +0100
Subject: Use a reference for the device.

---
 examples/basics.rs      |  2 +-
 examples/cuda_basics.rs |  7 +++++--
 src/tensor.rs           | 22 +++++++++++-----------
 tests/grad_tests.rs     |  2 +-
 tests/tensor_tests.rs   | 10 +++++-----
 5 files changed, 23 insertions(+), 20 deletions(-)

(limited to 'examples/cuda_basics.rs')

diff --git a/examples/basics.rs b/examples/basics.rs
index f01f7871..d5d4f72b 100644
--- a/examples/basics.rs
+++ b/examples/basics.rs
@@ -2,7 +2,7 @@ use anyhow::Result;
 use candle::{Device, Tensor};
 fn main() -> Result<()> {
-    let x = Tensor::var(&[3f32, 1., 4.], Device::Cpu)?;
+    let x = Tensor::var(&[3f32, 1., 4.], &Device::Cpu)?;
     let y = (((&x * &x)? + &x * 5f64)? + 4f64)?;
     println!("{:?}", y.to_vec1::<f32>()?);
     Ok(())
 }

diff --git a/examples/cuda_basics.rs b/examples/cuda_basics.rs
index e1dca6a9..a9647469 100644
--- a/examples/cuda_basics.rs
+++ b/examples/cuda_basics.rs
@@ -1,9 +1,12 @@
 use anyhow::Result;
-use candle::{DType, Device, Tensor};
+use candle::{Device, Tensor};
 
 fn main() -> Result<()> {
     let device = Device::new_cuda(0)?;
-    let x = Tensor::zeros(4, DType::F32, device)?;
+    let x = Tensor::new(&[3f32, 1., 4., 1., 5.], &device)?;
+    let y = Tensor::new(&[2f32, 7., 1., 8., 2.], &device)?;
     println!("{:?}", x.to_vec1::<f32>()?);
+    let z = (x + y)?;
+    println!("{:?}", z.to_vec1::<f32>()?);
     Ok(())
 }

diff --git a/src/tensor.rs b/src/tensor.rs
index be642329..a1262334 100644
--- a/src/tensor.rs
+++ b/src/tensor.rs
@@ -84,7 +84,7 @@ impl Tensor {
     fn ones_impl<S: Into<Shape>>(
         shape: S,
         dtype: DType,
-        device: Device,
+        device: &Device,
         is_variable: bool,
     ) -> Result<Self> {
         let shape = shape.into();
@@ -101,22 +101,22 @@ impl Tensor {
         Ok(Self(Arc::new(tensor_)))
     }
 
-    pub fn ones<S: Into<Shape>>(shape: S, dtype: DType, device: Device) -> Result<Self> {
+    pub fn ones<S: Into<Shape>>(shape: S, dtype: DType, device: &Device) -> Result<Self> {
         Self::ones_impl(shape, dtype, device, false)
     }
 
-    pub fn ones_var<S: Into<Shape>>(shape: S, dtype: DType, device: Device) -> Result<Self> {
+    pub fn ones_var<S: Into<Shape>>(shape: S, dtype: DType, device: &Device) -> Result<Self> {
         Self::ones_impl(shape, dtype, device, true)
     }
 
     pub fn ones_like(&self) -> Result<Self> {
-        Tensor::ones(self.shape(), self.dtype(), self.device())
+        Tensor::ones(self.shape(), self.dtype(), &self.device())
     }
 
     fn zeros_impl<S: Into<Shape>>(
         shape: S,
         dtype: DType,
-        device: Device,
+        device: &Device,
         is_variable: bool,
     ) -> Result<Self> {
         let shape = shape.into();
@@ -133,21 +133,21 @@ impl Tensor {
         Ok(Self(Arc::new(tensor_)))
     }
 
-    pub fn zeros<S: Into<Shape>>(shape: S, dtype: DType, device: Device) -> Result<Self> {
+    pub fn zeros<S: Into<Shape>>(shape: S, dtype: DType, device: &Device) -> Result<Self> {
         Self::zeros_impl(shape, dtype, device, false)
     }
 
-    pub fn zeros_var<S: Into<Shape>>(shape: S, dtype: DType, device: Device) -> Result<Self> {
+    pub fn zeros_var<S: Into<Shape>>(shape: S, dtype: DType, device: &Device) -> Result<Self> {
         Self::zeros_impl(shape, dtype, device, true)
     }
 
     pub fn zeros_like(&self) -> Result<Self> {
-        Tensor::zeros(self.shape(), self.dtype(), self.device())
+        Tensor::zeros(self.shape(), self.dtype(), &self.device())
     }
 
     pub fn new_impl<A: crate::device::NdArray>(
         array: A,
-        device: Device,
+        device: &Device,
         is_variable: bool,
     ) -> Result<Self> {
         let shape = array.shape()?;
@@ -164,11 +164,11 @@ impl Tensor {
         Ok(Self(Arc::new(tensor_)))
     }
 
-    pub fn new<A: crate::device::NdArray>(array: A, device: Device) -> Result<Self> {
+    pub fn new<A: crate::device::NdArray>(array: A, device: &Device) -> Result<Self> {
         Self::new_impl(array, device, false)
     }
 
-    pub fn var<A: crate::device::NdArray>(array: A, device: Device) -> Result<Self> {
+    pub fn var<A: crate::device::NdArray>(array: A, device: &Device) -> Result<Self> {
         Self::new_impl(array, device, true)
     }
 

diff --git a/tests/grad_tests.rs b/tests/grad_tests.rs
index 432b1520..56186e5d 100644
--- a/tests/grad_tests.rs
+++ b/tests/grad_tests.rs
@@ -3,7 +3,7 @@ use candle::{Device, Tensor};
 
 #[test]
 fn simple_grad() -> Result<()> {
-    let x = Tensor::var(&[3f32, 1., 4.], Device::Cpu)?;
+    let x = Tensor::var(&[3f32, 1., 4.], &Device::Cpu)?;
     let y = (((&x * &x)? + &x * 5f64)? + 4f64)?;
     let grads = y.backward()?;
     let grad_x = grads.get(&x).context("no grad for x")?;

diff --git a/tests/tensor_tests.rs b/tests/tensor_tests.rs
index fb2d84d9..81c2e801 100644
--- a/tests/tensor_tests.rs
+++ b/tests/tensor_tests.rs
@@ -2,7 +2,7 @@ use candle::{DType, Device, Result, Tensor};
 
 #[test]
 fn zeros() -> Result<()> {
-    let tensor = Tensor::zeros((5, 2), DType::F32, Device::Cpu)?;
+    let tensor = Tensor::zeros((5, 2), DType::F32, &Device::Cpu)?;
     let (dim1, dim2) = tensor.shape().r2()?;
     assert_eq!(dim1, 5);
     assert_eq!(dim2, 2);
@@ -11,7 +11,7 @@
 
 #[test]
 fn add_mul() -> Result<()> {
-    let tensor = Tensor::new(&[3f32, 1., 4.], Device::Cpu)?;
+    let tensor = Tensor::new(&[3f32, 1., 4.], &Device::Cpu)?;
     let dim1 = tensor.shape().r1()?;
     assert_eq!(dim1, 3);
     let content: Vec<f32> = tensor.to_vec1()?;
@@ -28,7 +28,7 @@
 #[test]
 fn tensor_2d() -> Result<()> {
     let data = &[[3f32, 1., 4., 1., 5.], [2., 1., 7., 8., 2.]];
-    let tensor = Tensor::new(data, Device::Cpu)?;
+    let tensor = Tensor::new(data, &Device::Cpu)?;
     let dims = tensor.shape().r2()?;
     assert_eq!(dims, (2, 5));
     let content: Vec<Vec<f32>> = tensor.to_vec2()?;
@@ -39,9 +39,9 @@
 #[test]
 fn binary_op() -> Result<()> {
     let data = &[[3f32, 1., 4., 1., 5.], [2., 1., 7., 8., 2.]];
-    let tensor = Tensor::new(data, Device::Cpu)?;
+    let tensor = Tensor::new(data, &Device::Cpu)?;
     let data2 = &[[5f32, 5., 5., 5., 5.], [2., 1., 7., 8., 2.]];
-    let tensor2 = Tensor::new(data2, Device::Cpu)?;
+    let tensor2 = Tensor::new(data2, &Device::Cpu)?;
     let tensor = (&tensor + (&tensor * &tensor)? / (&tensor + &tensor2))?;
     let dims = tensor.shape().r2()?;
     assert_eq!(dims, (2, 5));
-- cgit v1.2.3
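With this change every constructor borrows the device instead of consuming it, so a single `Device` value can be bound once and reused across allocations. A minimal CPU-only sketch using just the constructors that appear in this patch:

use anyhow::Result;
use candle::{DType, Device, Tensor};

fn main() -> Result<()> {
    let device = Device::Cpu;
    // The same borrowed device is reused for each constructor call.
    let x = Tensor::new(&[3f32, 1., 4.], &device)?;
    let y = Tensor::ones(3, DType::F32, &device)?;
    let z = (x + y)?;
    println!("{:?}", z.to_vec1::<f32>()?); // [4.0, 2.0, 5.0]
    Ok(())
}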
From 97d9142dee086faf2e348fe1189d422da26f0fe5 Mon Sep 17 00:00:00 2001
From: laurent
Date: Wed, 21 Jun 2023 20:48:22 +0100
Subject: Add a first kernel.

---
 examples/cuda_basics.rs |  4 ++--
 src/cuda_backend.rs     | 52 ++++++++++++++++++++++++++++++++++++++++++++++-
 src/storage.rs          |  5 ++++-
 3 files changed, 57 insertions(+), 4 deletions(-)

(limited to 'examples/cuda_basics.rs')

diff --git a/examples/cuda_basics.rs b/examples/cuda_basics.rs
index a9647469..0a4825fa 100644
--- a/examples/cuda_basics.rs
+++ b/examples/cuda_basics.rs
@@ -4,9 +4,9 @@ use candle::{Device, Tensor};
 fn main() -> Result<()> {
     let device = Device::new_cuda(0)?;
     let x = Tensor::new(&[3f32, 1., 4., 1., 5.], &device)?;
-    let y = Tensor::new(&[2f32, 7., 1., 8., 2.], &device)?;
     println!("{:?}", x.to_vec1::<f32>()?);
-    let z = (x + y)?;
+    let y = Tensor::new(&[2f32, 7., 1., 8., 2.], &device)?;
+    let z = (y * 3.)?;
     println!("{:?}", z.to_vec1::<f32>()?);
     Ok(())
 }

diff --git a/src/cuda_backend.rs b/src/cuda_backend.rs
index 5a80df08..06730251 100644
--- a/src/cuda_backend.rs
+++ b/src/cuda_backend.rs
@@ -1,11 +1,28 @@
 use crate::{CpuStorage, DType, Result, Shape};
-use cudarc::driver::CudaSlice;
+use cudarc::driver::{CudaSlice, LaunchAsync, LaunchConfig};
 
 pub(crate) type Error = cudarc::driver::DriverError;
 
 #[derive(Debug, Clone)]
 pub struct CudaDevice(std::sync::Arc<cudarc::driver::CudaDevice>);
 
+// TODO: Switch to pre-compiled PTX kernels rather than compiling on the fly.
+const AFFINE_CU: &str = r#"
+extern "C" __global__ void affine_f32(
+    const size_t numel,
+    const float *x,
+    float *y,
+    const float mul,
+    const float add
+) {
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= numel) {
+        return;
+    }
+    y[i] = x[i] * mul + add;
+}
+"#;
+
 impl CudaDevice {
     pub(crate) fn new(ordinal: usize) -> Result<Self> {
         let device = cudarc::driver::CudaDevice::new(ordinal)?;
@@ -65,6 +82,39 @@ impl CudaStorage {
         }
     }
 
+    pub(crate) fn affine_impl(
+        &self,
+        shape: &Shape,
+        _stride: &[usize],
+        mul: f64,
+        add: f64,
+    ) -> Result<Self> {
+        match self {
+            Self::F32(arg) => {
+                // TODO: Handle the stride.
+                let dev = arg.device();
+                let module_name = "affine_f32";
+                if !dev.has_func(module_name, module_name) {
+                    let ptx = cudarc::nvrtc::compile_ptx(AFFINE_CU).unwrap();
+                    dev.load_ptx(ptx, module_name, &[module_name])?;
+                }
+                let elem_count = shape.elem_count();
+                let fwd_fn = dev.get_func(module_name, module_name).unwrap();
+                let cfg = LaunchConfig::for_num_elems(elem_count as u32);
+                // SAFETY: if this function returns Ok(..), the kernel has been applied
+                // and has set the initially unset memory.
+                let out = unsafe { dev.alloc::<f32>(elem_count) }?;
+                let params = (elem_count, arg, &out, mul as f32, add as f32);
+                // SAFETY: well, well, well...
+                unsafe { fwd_fn.launch(cfg, params) }?;
+                Ok(Self::F32(out))
+            }
+            Self::F64(_) => {
+                todo!()
+            }
+        }
+    }
+
     pub(crate) fn to_cpu_storage(&self) -> Result<CpuStorage> {
         match self {
             Self::F32(slice) => {

diff --git a/src/storage.rs b/src/storage.rs
index 7230104e..573cf945 100644
--- a/src/storage.rs
+++ b/src/storage.rs
@@ -144,7 +144,10 @@ impl Storage {
                 let storage = storage.affine_impl(shape, stride, mul, add)?;
                 Ok(Self::Cpu(storage))
             }
-            Self::Cuda { .. } => todo!(),
+            Self::Cuda(storage) => {
+                let storage = storage.affine_impl(shape, stride, mul, add)?;
+                Ok(Self::Cuda(storage))
+            }
         }
     }
-- cgit v1.2.3
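The new kernel computes y[i] = x[i] * mul + add over a flat buffer, and the storage layer now routes CUDA affine calls to it; the example's `(y * 3.)` presumably lowers to this op with mul = 3 and add = 0, which would be why the example changed in the same commit. Below is a standalone sketch of the same runtime compile-and-launch pattern, written directly against cudarc rather than through candle. The `compile_ptx`, `load_ptx`, `get_func`, `LaunchConfig::for_num_elems`, `alloc`, and `launch` calls mirror the patch itself; the host-copy helpers (`htod_copy`, `dtoh_sync_copy`) and the exact cudarc version are assumptions.

use cudarc::driver::{CudaDevice, LaunchAsync, LaunchConfig};

// A scaled-down variant of the patch's affine kernel (mul only, no add).
const SCALE_CU: &str = r#"
extern "C" __global__ void scale_f32(const size_t numel, const float *x, float *y, const float mul) {
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < numel) {
        y[i] = x[i] * mul;
    }
}
"#;

fn main() -> anyhow::Result<()> {
    let dev = CudaDevice::new(0)?;
    // Compile CUDA C to PTX at runtime with NVRTC, then register the function.
    let ptx = cudarc::nvrtc::compile_ptx(SCALE_CU).unwrap();
    dev.load_ptx(ptx, "scale_f32", &["scale_f32"])?;
    let fwd_fn = dev.get_func("scale_f32", "scale_f32").unwrap();

    let x = dev.htod_copy(vec![3f32, 1., 4., 1., 5.])?;
    // SAFETY: the kernel writes every element of `y` before it is read back.
    let y = unsafe { dev.alloc::<f32>(5) }?;
    let cfg = LaunchConfig::for_num_elems(5);
    // SAFETY: the parameter tuple matches the kernel signature above.
    unsafe { fwd_fn.launch(cfg, (5usize, &x, &y, 3f32)) }?;
    println!("{:?}", dev.dtoh_sync_copy(&y)?); // [9.0, 3.0, 12.0, 3.0, 15.0]
    Ok(())
}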