diff options
Diffstat (limited to 'candle-core')
-rw-r--r-- | candle-core/Cargo.toml | 4 | ||||
-rw-r--r-- | candle-core/benches/random.rs | 66 | ||||
-rw-r--r-- | candle-core/src/metal_backend.rs | 70 |
3 files changed, 127 insertions, 13 deletions
diff --git a/candle-core/Cargo.toml b/candle-core/Cargo.toml index 91655f57..8edfef5a 100644 --- a/candle-core/Cargo.toml +++ b/candle-core/Cargo.toml @@ -49,3 +49,7 @@ metal = ["dep:metal", "dep:candle-metal-kernels"] name = "matmul" harness = false +[[bench]] +name = "random" +harness = false + diff --git a/candle-core/benches/random.rs b/candle-core/benches/random.rs new file mode 100644 index 00000000..781d8b39 --- /dev/null +++ b/candle-core/benches/random.rs @@ -0,0 +1,66 @@ +use candle_core::{DType, Device, Tensor}; +use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput}; +use std::time::Instant; + +fn rand_uniform(a: &Tensor) { + a.rand_like(0.0, 1.0).unwrap(); +} + +fn rand_normal(a: &Tensor) { + a.randn_like(100.0, 15.0).unwrap(); +} + +fn criterion_benchmark(c: &mut Criterion) { + let b = 1; + + let rows = 2048; + let cols = 2048; + + let device = Device::new_metal(0).unwrap(); + let device2 = device.clone(); + let dtype = DType::F32; + let tensor = Tensor::zeros((b, rows, cols), dtype, &device).unwrap(); + + let flops = b * rows * cols; + + let mut group = c.benchmark_group("metal_random_uniform"); + group.throughput(Throughput::Bytes(flops as u64)); + group.bench_function("iter", move |benches| { + benches.iter_custom(|iters| { + let start = Instant::now(); + for _i in 0..iters { + rand_uniform(black_box(&tensor)); + } + if let Device::Metal(device) = &device { + device.wait_until_completed().unwrap(); + } else { + panic!("Expected metal device"); + } + start.elapsed() + }) + }); + group.finish(); + + let tensor = Tensor::zeros((b, rows, cols), dtype, &device2).unwrap(); + + let mut group = c.benchmark_group("metal_random_normal"); + group.throughput(Throughput::Bytes(flops as u64)); + group.bench_function("iter", move |benches| { + benches.iter_custom(|iters| { + let start = Instant::now(); + for _i in 0..iters { + rand_normal(black_box(&tensor)); + } + if let Device::Metal(device) = &device2 { + device.wait_until_completed().unwrap(); + } else { + panic!("Expected metal device"); + } + start.elapsed() + }) + }); + group.finish(); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/candle-core/src/metal_backend.rs b/candle-core/src/metal_backend.rs index c1c4aa4b..24beeb7a 100644 --- a/candle-core/src/metal_backend.rs +++ b/candle-core/src/metal_backend.rs @@ -8,7 +8,7 @@ use metal; use metal::{Buffer, CommandBuffer, CommandQueue, MTLResourceOptions, NSUInteger}; use std::collections::HashMap; use std::path::Path; -use std::sync::{Arc, RwLock, TryLockError}; +use std::sync::{Arc, Mutex, RwLock, TryLockError}; /// Simple way to catch lock error without /// depending on T @@ -106,6 +106,8 @@ pub struct MetalDevice { /// Whenever we actually allocate a new buffer, we make a full sweep to cleanup unused buffers /// (strong_count = 1). buffers: AllocatedBuffers, + /// Seed for random number generation. + seed: Arc<Mutex<u64>>, } impl std::fmt::Debug for MetalDevice { @@ -1483,6 +1485,7 @@ impl BackendDevice for MetalDevice { Ok(val) => val.parse()?, _ => 20, }; + let seed = Arc::new(Mutex::new(299792458)); Ok(Self { device, fence, @@ -1492,13 +1495,10 @@ impl BackendDevice for MetalDevice { compute_per_buffer, buffers, kernels, + seed, }) } - fn set_seed(&self, _seed: u64) -> Result<()> { - crate::bail!("Metal set_seed not implemented") - } - fn location(&self) -> crate::DeviceLocation { crate::DeviceLocation::Metal { gpu_id: self.registry_id() as usize, @@ -1551,12 +1551,31 @@ impl BackendDevice for MetalDevice { &self, shape: &Shape, dtype: DType, - mean: f64, - stddev: f64, + min: f64, + max: f64, ) -> Result<Self::Storage> { - // TODO is there a better way ? - let cpu_storage = crate::cpu_backend::CpuDevice.rand_uniform(shape, dtype, mean, stddev)?; - self.storage_from_cpu_storage(&cpu_storage) + let name = match dtype { + DType::F32 => "rand_uniform_f32", + DType::F16 => "rand_uniform_f16", + DType::BF16 => "rand_uniform_bf16", + dtype => crate::bail!("rand_uniform not implemented for {dtype:?}"), + }; + let buffer = self.new_buffer(shape.elem_count(), dtype, "rand_uniform")?; + let command_buffer = self.command_buffer()?; + candle_metal_kernels::call_random_uniform( + &self.device, + &command_buffer, + &self.kernels, + name, + *self.seed.lock().unwrap(), + min as f32, + max as f32, + shape.elem_count(), + &buffer, + ) + .map_err(MetalError::from)?; + + Ok(Self::Storage::new(buffer, self.clone(), dtype)) } fn rand_normal( @@ -1566,9 +1585,34 @@ impl BackendDevice for MetalDevice { mean: f64, stddev: f64, ) -> Result<Self::Storage> { - // TODO is there a better way ? - let cpu_storage = crate::cpu_backend::CpuDevice.rand_normal(shape, dtype, mean, stddev)?; - self.storage_from_cpu_storage(&cpu_storage) + let name = match dtype { + DType::F32 => "rand_normal_f32", + DType::F16 => "rand_normal_f16", + DType::BF16 => "rand_normal_bf16", + dtype => crate::bail!("rand_uniform not implemented for {dtype:?}"), + }; + let buffer = self.new_buffer(shape.elem_count(), dtype, "rand_normal")?; + let command_buffer = self.command_buffer()?; + candle_metal_kernels::call_random_normal( + &self.device, + &command_buffer, + &self.kernels, + name, + *self.seed.lock().unwrap(), + mean as f32, + stddev as f32, + shape.elem_count(), + &buffer, + ) + .map_err(MetalError::from)?; + + Ok(Self::Storage::new(buffer, self.clone(), dtype)) + } + + fn set_seed(&self, seed: u64) -> Result<()> { + let mut s = self.seed.try_lock().map_err(MetalError::from)?; + *s = seed; + Ok(()) } } |