use crate::op::{BackpropOp, Op}; use crate::tensor::from_storage; use crate::{CpuStorage, CudaStorage, Layout, MetalStorage, Result, Shape, Tensor}; use std::sync::Arc; /// Unary ops that can be defined in user-land. pub trait CustomOp1 { // Box does not support const yet, so use a function to get the name. fn name(&self) -> &'static str; /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides, /// offsets etc so the associated layout should be used to access it. fn cpu_fwd(&self, storage: &CpuStorage, layout: &Layout) -> Result<(CpuStorage, Shape)>; /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides, /// offsets etc so the associated layout should be used to access it. fn cuda_fwd(&self, _storage: &CudaStorage, _layout: &Layout) -> Result<(CudaStorage, Shape)> { Err(crate::Error::Cuda( format!("no cuda implementation for {}", self.name()).into(), )) } /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides, /// offsets etc so the associated layout should be used to access it. fn metal_fwd( &self, _storage: &MetalStorage, _layout: &Layout, ) -> Result<(MetalStorage, Shape)> { Err(crate::Error::Metal( format!("no metal implementation for {}", self.name()).into(), )) } /// This function takes as argument the argument `arg` used in the forward pass, the result /// produced by the forward operation `res` and the gradient of the result `grad_res`. /// The function should return the gradient of the argument. fn bwd(&self, _arg: &Tensor, _res: &Tensor, _grad_res: &Tensor) -> Result> { Err(crate::Error::BackwardNotSupported { op: self.name() }) } } pub trait CustomOp2 { fn name(&self) -> &'static str; /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides, /// offsets etc so the associated layout should be used to access it. fn cpu_fwd( &self, s1: &CpuStorage, l1: &Layout, s2: &CpuStorage, l2: &Layout, ) -> Result<(CpuStorage, Shape)>; /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides, /// offsets etc so the associated layout should be used to access it. fn cuda_fwd( &self, _: &CudaStorage, _: &Layout, _: &CudaStorage, _: &Layout, ) -> Result<(CudaStorage, Shape)> { Err(crate::Error::Cuda( format!("no cuda implementation for {}", self.name()).into(), )) } /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides, /// offsets etc so the associated layout should be used to access it. fn metal_fwd( &self, _: &MetalStorage, _: &Layout, _: &MetalStorage, _: &Layout, ) -> Result<(MetalStorage, Shape)> { Err(crate::Error::Metal( format!("no metal implementation for {}", self.name()).into(), )) } fn bwd( &self, _arg1: &Tensor, _arg2: &Tensor, _res: &Tensor, _grad_res: &Tensor, ) -> Result<(Option, Option)> { Err(crate::Error::BackwardNotSupported { op: self.name() }) } } pub trait CustomOp3 { fn name(&self) -> &'static str; /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides, /// offsets etc so the associated layout should be used to access it. fn cpu_fwd( &self, s1: &CpuStorage, l1: &Layout, s2: &CpuStorage, l2: &Layout, s3: &CpuStorage, l3: &Layout, ) -> Result<(CpuStorage, Shape)>; /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides, /// offsets etc so the associated layout should be used to access it. fn cuda_fwd( &self, _: &CudaStorage, _: &Layout, _: &CudaStorage, _: &Layout, _: &CudaStorage, _: &Layout, ) -> Result<(CudaStorage, Shape)> { Err(crate::Error::Cuda( format!("no cuda implementation for {}", self.name()).into(), )) } /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides, /// offsets etc so the associated layout should be used to access it. fn metal_fwd( &self, _: &MetalStorage, _: &Layout, _: &MetalStorage, _: &Layout, _: &MetalStorage, _: &Layout, ) -> Result<(MetalStorage, Shape)> { Err(crate::Error::Metal( format!("no metal implementation for {}", self.name()).into(), )) } fn bwd( &self, _arg1: &Tensor, _arg2: &Tensor, _arg3: &Tensor, _res: &Tensor, _grad_res: &Tensor, ) -> Result<(Option, Option, Option)> { Err(crate::Error::BackwardNotSupported { op: self.name() }) } } impl Tensor { /// Applies a unary custom op without backward support pub fn apply_op1_no_bwd(&self, c: &C) -> Result { let (storage, shape) = self.storage().apply_op1(self.layout(), c)?; Ok(from_storage(storage, shape, BackpropOp::none(), false)) } /// Applies a binary custom op without backward support pub fn apply_op2_no_bwd(&self, rhs: &Self, c: &C) -> Result { let (storage, shape) = self.storage() .apply_op2(self.layout(), &rhs.storage(), rhs.layout(), c)?; Ok(from_storage(storage, shape, BackpropOp::none(), false)) } /// Applies a ternary custom op without backward support pub fn apply_op3_no_bwd(&self, t2: &Self, t3: &Self, c: &C) -> Result { let (storage, shape) = self.storage().apply_op3( self.layout(), &t2.storage(), t2.layout(), &t3.storage(), t3.layout(), c, )?; Ok(from_storage(storage, shape, BackpropOp::none(), false)) } /// Applies a unary custom op. pub fn apply_op1_arc(&self, c: Arc>) -> Result { let (storage, shape) = self .storage() .apply_op1(self.layout(), c.as_ref().as_ref())?; let op = BackpropOp::new1(self, |s| Op::CustomOp1(s, c.clone())); Ok(from_storage(storage, shape, op, false)) } pub fn apply_op1(&self, c: C) -> Result { self.apply_op1_arc(Arc::new(Box::new(c))) } /// Applies a binary custom op. pub fn apply_op2_arc( &self, rhs: &Self, c: Arc>, ) -> Result { let (storage, shape) = self.storage().apply_op2( self.layout(), &rhs.storage(), rhs.layout(), c.as_ref().as_ref(), )?; let op = BackpropOp::new2(self, rhs, |t1, t2| Op::CustomOp2(t1, t2, c.clone())); Ok(from_storage(storage, shape, op, false)) } pub fn apply_op2(&self, r: &Self, c: C) -> Result { self.apply_op2_arc(r, Arc::new(Box::new(c))) } /// Applies a ternary custom op. pub fn apply_op3_arc( &self, t2: &Self, t3: &Self, c: Arc>, ) -> Result { let (storage, shape) = self.storage().apply_op3( self.layout(), &t2.storage(), t2.layout(), &t3.storage(), t3.layout(), c.as_ref().as_ref(), )?; let op = BackpropOp::new3(self, t2, t3, |t1, t2, t3| { Op::CustomOp3(t1, t2, t3, c.clone()) }); Ok(from_storage(storage, shape, op, false)) } pub fn apply_op3( &self, t2: &Self, t3: &Self, c: C, ) -> Result { self.apply_op3_arc(t2, t3, Arc::new(Box::new(c))) } } // In place ops. /// Unary ops that can be defined in user-land. /// These ops work in place and as such back-prop is unsupported. pub trait InplaceOp1 { // Box does not support const yet, so use a function to get the name. fn name(&self) -> &'static str; /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides, /// offsets etc so the associated layout should be used to access it. fn cpu_fwd(&self, storage: &mut CpuStorage, layout: &Layout) -> Result<()>; /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides, /// offsets etc so the associated layout should be used to access it. fn cuda_fwd(&self, _storage: &mut CudaStorage, _layout: &Layout) -> Result<()> { Err(crate::Error::Cuda( format!("no cuda implementation for {}", self.name()).into(), )) } /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides, /// offsets etc so the associated layout should be used to access it. fn metal_fwd(&self, _storage: &mut MetalStorage, _layout: &Layout) -> Result<()> { Err(crate::Error::Metal( format!("no metal implementation for {}", self.name()).into(), )) } } pub trait InplaceOp2 { fn name(&self) -> &'static str; /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides, /// offsets etc so the associated layout should be used to access it. fn cpu_fwd(&self, s1: &mut CpuStorage, l1: &Layout, s2: &CpuStorage, l2: &Layout) -> Result<()>; /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides, /// offsets etc so the associated layout should be used to access it. fn cuda_fwd(&self, _: &mut CudaStorage, _: &Layout, _: &CudaStorage, _: &Layout) -> Result<()> { Err(crate::Error::Cuda( format!("no cuda implementation for {}", self.name()).into(), )) } /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides, /// offsets etc so the associated layout should be used to access it. fn metal_fwd( &self, _: &mut MetalStorage, _: &Layout, _: &MetalStorage, _: &Layout, ) -> Result<()> { Err(crate::Error::Metal( format!("no metal implementation for {}", self.name()).into(), )) } } pub trait InplaceOp3 { fn name(&self) -> &'static str; /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides, /// offsets etc so the associated layout should be used to access it. fn cpu_fwd( &self, s1: &mut CpuStorage, l1: &Layout, s2: &CpuStorage, l2: &Layout, s3: &CpuStorage, l3: &Layout, ) -> Result<()>; /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides, /// offsets etc so the associated layout should be used to access it. fn cuda_fwd( &self, _: &mut CudaStorage, _: &Layout, _: &CudaStorage, _: &Layout, _: &CudaStorage, _: &Layout, ) -> Result<()> { Err(crate::Error::Cuda( format!("no cuda implementation for {}", self.name()).into(), )) } /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides, /// offsets etc so the associated layout should be used to access it. fn metal_fwd( &self, _: &mut MetalStorage, _: &Layout, _: &MetalStorage, _: &Layout, _: &MetalStorage, _: &Layout, ) -> Result<()> { Err(crate::Error::Metal( format!("no metal implementation for {}", self.name()).into(), )) } } impl Tensor { /// Applies a unary custom op in place. pub fn inplace_op1(&self, c: &C) -> Result<()> { self.storage_mut().inplace_op1(self.layout(), c) } /// Applies a unary custom op in place (for the first tensor). pub fn inplace_op2(&self, rhs: &Self, c: &C) -> Result<()> { self.storage_mut() .inplace_op2(self.layout(), &rhs.storage(), rhs.layout(), c) } /// Applies a ternary custom op in place (for the first tensor). pub fn inplace_op3(&self, t2: &Self, t3: &Self, c: &C) -> Result<()> { self.storage_mut().inplace_op3( self.layout(), &t2.storage(), t2.layout(), &t3.storage(), t3.layout(), c, ) } } pub struct UgIOp1 { name: &'static str, #[cfg(feature = "cuda")] func: cudarc::driver::CudaFunction, #[cfg(feature = "metal")] func: metal::ComputePipelineState, } impl UgIOp1 { #[allow(unused)] pub fn new( name: &'static str, kernel: ug::lang::ssa::Kernel, device: &crate::Device, ) -> Result { #[cfg(feature = "cuda")] { let device = device.as_cuda_device()?; let func = device.compile(name, kernel)?; Ok(Self { name, func }) } #[cfg(feature = "metal")] { let device = device.as_metal_device()?; let func = device.compile(name, kernel)?; Ok(Self { name, func }) } #[cfg(not(any(feature = "cuda", feature = "metal")))] { Ok(Self { name }) } } } impl InplaceOp1 for UgIOp1 { fn name(&self) -> &'static str { self.name } fn cpu_fwd(&self, _: &mut CpuStorage, _: &Layout) -> Result<()> { crate::bail!("ug ops are only supported on metal/cuda at the moment") } #[cfg(feature = "metal")] fn metal_fwd(&self, sto: &mut MetalStorage, layout: &Layout) -> Result<()> { use crate::backend::BackendStorage; use candle_metal_kernels::utils::EncoderProvider; let elem_count = layout.shape().elem_count(); if sto.dtype() != crate::DType::F32 { // TODO: support more dtypes. crate::bail!("input is not a f32 tensor") } let device = sto.device(); println!("here"); let command_buffer = device.command_buffer()?; let command_buffer = &command_buffer; let encoder = command_buffer.encoder(); let encoder = encoder.as_ref(); encoder.set_compute_pipeline_state(&self.func); let (g, b) = if elem_count % 32 == 0 { (elem_count / 32, 32) } else { (elem_count, 1) }; let grid_dims = metal::MTLSize { width: g as u64, height: 1, depth: 1, }; let group_dims = candle_metal_kernels::utils::get_block_dims(b as u64, 1, 1); candle_metal_kernels::utils::set_param(encoder, 0, (sto.buffer(), 0usize)); encoder.use_resource(sto.buffer(), metal::MTLResourceUsage::Write); encoder.dispatch_threads(grid_dims, group_dims); Ok(()) } #[cfg(feature = "cuda")] fn cuda_fwd(&self, sto: &mut CudaStorage, layout: &Layout) -> Result<()> { use crate::cuda_backend::WrapErr; use cudarc::driver::LaunchAsync; let elem_count = layout.shape().elem_count(); // TODO: support more dtypes. let sto = sto.as_cuda_slice::()?; let sto = match layout.contiguous_offsets() { None => crate::bail!("input has to be contiguous"), Some((o1, o2)) => sto.slice(o1..o2), }; let params = (&sto,); let (g, b) = if elem_count % 32 == 0 { (elem_count / 32, 32) } else { (elem_count, 1) }; let cfg = cudarc::driver::LaunchConfig { grid_dim: (g as u32, 1, 1), block_dim: (b as u32, 1, 1), shared_mem_bytes: 0, }; unsafe { self.func.clone().launch(cfg, params) }.w()?; Ok(()) } }