diff options
-rw-r--r-- | candle-core/src/accelerate.rs | 82 | ||||
-rw-r--r-- | candle-core/src/op.rs | 18 |
2 files changed, 100 insertions, 0 deletions
diff --git a/candle-core/src/accelerate.rs b/candle-core/src/accelerate.rs
index 87e0ee8d..1cb34e19 100644
--- a/candle-core/src/accelerate.rs
+++ b/candle-core/src/accelerate.rs
@@ -370,6 +370,88 @@ pub fn vd_sqr(a: &[f64], y: &mut [f64]) {
     y.iter_mut().zip(a.iter()).for_each(|(y, a)| *y = *a * *a)
 }
 
+/// Applies `tanh` to every element of `y` in place via Accelerate's `vvtanhf`.
+///
+/// Accelerate takes the element count as an `i32`, so the slice is processed
+/// in chunks of at most `i32::MAX` elements; casting `y.len()` directly would
+/// silently truncate the count for longer slices.
+#[inline]
+pub fn vs_tanh_inplace(y: &mut [f32]) {
+    for chunk in y.chunks_mut(i32::MAX as usize) {
+        // Lossless: `chunks_mut` caps every chunk at `i32::MAX` elements.
+        let len = chunk.len() as i32;
+        // SAFETY: both pointers come from `chunk`, which holds exactly `len`
+        // elements. NOTE(review): relies on `vvtanhf` accepting the same
+        // buffer as input and output -- confirm against the vForce docs.
+        unsafe { ffi::vvtanhf(chunk.as_mut_ptr(), chunk.as_ptr(), &len) }
+    }
+}
+
+/// Applies `tanh` to every element of `y` in place via Accelerate's `vvtanh`.
+///
+/// Accelerate takes the element count as an `i32`, so the slice is processed
+/// in chunks of at most `i32::MAX` elements; casting `y.len()` directly would
+/// silently truncate the count for longer slices.
+#[inline]
+pub fn vd_tanh_inplace(y: &mut [f64]) {
+    for chunk in y.chunks_mut(i32::MAX as usize) {
+        // Lossless: `chunks_mut` caps every chunk at `i32::MAX` elements.
+        let len = chunk.len() as i32;
+        // SAFETY: both pointers come from `chunk`, which holds exactly `len`
+        // elements. NOTE(review): relies on `vvtanh` accepting the same
+        // buffer as input and output -- confirm against the vForce docs.
+        unsafe { ffi::vvtanh(chunk.as_mut_ptr(), chunk.as_ptr(), &len) }
+    }
+}
+
+/// Computes the tanh approximation of GELU for each element of `vs`, storing
+/// `0.5 * v * (1 + tanh(sqrt(2 / pi) * v * (1 + 0.044715 * v * v)))` in `ys`.
+///
+/// `ys` doubles as scratch space: it first receives the `tanh` argument, is
+/// then run through the in-place vectorised `tanh`, and is finally combined
+/// with `vs` to form the result.
+///
+/// # Panics
+///
+/// Panics if `vs` and `ys` have different lengths.
+#[inline]
+pub fn vs_gelu(vs: &[f32], ys: &mut [f32]) {
+    assert_eq!(vs.len(), ys.len(), "vs and ys have different lengths");
+    // Loop-invariant scale factor, computed once rather than per element.
+    let sqrt_2_over_pi = (2.0f32 / std::f32::consts::PI).sqrt();
+    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
+        *y = sqrt_2_over_pi * v * (1.0 + 0.044715 * v * v)
+    }
+    vs_tanh_inplace(ys);
+    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
+        *y = 0.5 * v * (1.0 + *y)
+    }
+}
+
+/// Computes the tanh approximation of GELU for each element of `vs`, storing
+/// `0.5 * v * (1 + tanh(sqrt(2 / pi) * v * (1 + 0.044715 * v * v)))` in `ys`.
+///
+/// `ys` doubles as scratch space: it first receives the `tanh` argument, is
+/// then run through the in-place vectorised `tanh`, and is finally combined
+/// with `vs` to form the result.
+///
+/// # Panics
+///
+/// Panics if `vs` and `ys` have different lengths.
+#[inline]
+pub fn vd_gelu(vs: &[f64], ys: &mut [f64]) {
+    assert_eq!(vs.len(), ys.len(), "vs and ys have different lengths");
+    // Loop-invariant scale factor, computed once rather than per element.
+    let sqrt_2_over_pi = (2.0f64 / std::f64::consts::PI).sqrt();
+    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
+        *y = sqrt_2_over_pi * v * (1.0 + 0.044715 * v * v)
+    }
+    vd_tanh_inplace(ys);
+    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
+        *y = 0.5 * v * (1.0 + *y)
+    }
+}
+
 macro_rules! binary_op {
     ($fn_name:ident, $ty:ty, $accelerate_name:ident) => {
         #[inline]
diff --git a/candle-core/src/op.rs b/candle-core/src/op.rs
index fbfc9c1a..9382b217 100644
--- a/candle-core/src/op.rs
+++ b/candle-core/src/op.rs
@@ -600,6 +600,24 @@ impl UnaryOpT for Gelu {
     fn f64_vec(xs: &[f64], ys: &mut [f64]) {
         crate::mkl::vd_gelu(xs, ys)
     }
+
+    #[cfg(feature = "accelerate")]
+    const F32_VEC: bool = true;
+
+    #[cfg(feature = "accelerate")]
+    #[inline(always)]
+    fn f32_vec(xs: &[f32], ys: &mut [f32]) {
+        crate::accelerate::vs_gelu(xs, ys)
+    }
+
+    #[cfg(feature = "accelerate")]
+    const F64_VEC: bool = true;
+
+    #[cfg(feature = "accelerate")]
+    #[inline(always)]
+    fn f64_vec(xs: &[f64], ys: &mut [f64]) {
+        crate::accelerate::vd_gelu(xs, ys)
+    }
 }
 
 impl UnaryOpT for Relu {