summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- candle-core/src/accelerate.rs 32
-rw-r--r-- candle-core/src/op.rs 18
2 files changed, 50 insertions, 0 deletions
diff --git a/candle-core/src/accelerate.rs b/candle-core/src/accelerate.rs
index 87e0ee8d..1cb34e19 100644
--- a/candle-core/src/accelerate.rs
+++ b/candle-core/src/accelerate.rs
@@ -370,6 +370,38 @@ pub fn vd_sqr(a: &[f64], y: &mut [f64]) {
y.iter_mut().zip(a.iter()).for_each(|(y, a)| *y = *a * *a)
}
+/// Applies `tanh` to every element of `y`, overwriting the slice in place.
+///
+/// The work is forwarded to `ffi::vvtanhf`, which receives its element count
+/// as an `i32`. Slices longer than `i32::MAX` elements are therefore handed
+/// over in several calls, so the count can never be truncated by the cast.
+/// An empty slice results in no FFI call at all.
+#[inline]
+pub fn vs_tanh_inplace(y: &mut [f32]) {
+    for chunk in y.chunks_mut(i32::MAX as usize) {
+        // Lossless: `chunks_mut` caps every chunk at `i32::MAX` elements.
+        let len = chunk.len() as i32;
+        // Derive both the destination and the source pointer from a single
+        // mutable borrow of the chunk.
+        let ptr = chunk.as_mut_ptr();
+        // SAFETY: `ptr` is valid for reads and writes of `len` elements since
+        // it comes from the live, exclusively borrowed `chunk`, and `&len`
+        // outlives the call. Source and destination are the same buffer,
+        // exactly as in the single-call form this replaces.
+        unsafe { ffi::vvtanhf(ptr, ptr as *const f32, &len) }
+    }
+}
+
+/// Applies `tanh` to every element of `y`, overwriting the slice in place.
+///
+/// The work is forwarded to `ffi::vvtanh`, which receives its element count
+/// as an `i32`. Slices longer than `i32::MAX` elements are therefore handed
+/// over in several calls, so the count can never be truncated by the cast.
+/// An empty slice results in no FFI call at all.
+#[inline]
+pub fn vd_tanh_inplace(y: &mut [f64]) {
+    for chunk in y.chunks_mut(i32::MAX as usize) {
+        // Lossless: `chunks_mut` caps every chunk at `i32::MAX` elements.
+        let len = chunk.len() as i32;
+        // Derive both the destination and the source pointer from a single
+        // mutable borrow of the chunk.
+        let ptr = chunk.as_mut_ptr();
+        // SAFETY: `ptr` is valid for reads and writes of `len` elements since
+        // it comes from the live, exclusively borrowed `chunk`, and `&len`
+        // outlives the call. Source and destination are the same buffer,
+        // exactly as in the single-call form this replaces.
+        unsafe { ffi::vvtanh(ptr, ptr as *const f64, &len) }
+    }
+}
+
+/// Writes the tanh-based GELU approximation of each element of `vs` into `ys`.
+///
+/// The computation runs in three passes:
+/// 1. `ys[i] = sqrt(2/π) * v * (1 + 0.044715 * v * v)` with `v = vs[i]`,
+/// 2. `vs_tanh_inplace` replaces every element of `ys` by its `tanh`,
+/// 3. `ys[i] = 0.5 * v * (1 + ys[i])`.
+///
+/// Passes 1 and 3 pair the slices with `zip`, so they touch only the first
+/// `min(vs.len(), ys.len())` elements; pass 2 runs over all of `ys`.
+#[inline]
+pub fn vs_gelu(vs: &[f32], ys: &mut [f32]) {
+    // sqrt(2/π), evaluated once instead of once per element.
+    let scale = (2.0f32 / std::f32::consts::PI).sqrt();
+    ys.iter_mut().zip(vs.iter()).for_each(|(out, &x)| {
+        *out = scale * x * (1.0 + 0.044715 * x * x);
+    });
+    vs_tanh_inplace(ys);
+    ys.iter_mut().zip(vs.iter()).for_each(|(out, &x)| {
+        *out = 0.5 * x * (1.0 + *out);
+    });
+}
+
+/// Writes the tanh-based GELU approximation of each element of `vs` into `ys`.
+///
+/// The computation runs in three passes:
+/// 1. `ys[i] = sqrt(2/π) * v * (1 + 0.044715 * v * v)` with `v = vs[i]`,
+/// 2. `vd_tanh_inplace` replaces every element of `ys` by its `tanh`,
+/// 3. `ys[i] = 0.5 * v * (1 + ys[i])`.
+///
+/// Passes 1 and 3 pair the slices with `zip`, so they touch only the first
+/// `min(vs.len(), ys.len())` elements; pass 2 runs over all of `ys`.
+#[inline]
+pub fn vd_gelu(vs: &[f64], ys: &mut [f64]) {
+    // sqrt(2/π), evaluated once instead of once per element.
+    let scale = (2.0f64 / std::f64::consts::PI).sqrt();
+    ys.iter_mut().zip(vs.iter()).for_each(|(out, &x)| {
+        *out = scale * x * (1.0 + 0.044715 * x * x);
+    });
+    vd_tanh_inplace(ys);
+    ys.iter_mut().zip(vs.iter()).for_each(|(out, &x)| {
+        *out = 0.5 * x * (1.0 + *out);
+    });
+}
+
macro_rules! binary_op {
($fn_name:ident, $ty:ty, $accelerate_name:ident) => {
#[inline]
diff --git a/candle-core/src/op.rs b/candle-core/src/op.rs
index fbfc9c1a..9382b217 100644
--- a/candle-core/src/op.rs
+++ b/candle-core/src/op.rs
@@ -600,6 +600,24 @@ impl UnaryOpT for Gelu {
fn f64_vec(xs: &[f64], ys: &mut [f64]) {
crate::mkl::vd_gelu(xs, ys)
}
+
+    // NOTE(review): `F32_VEC` presumably tells callers that `f32_vec` below is
+    // a real slice-level implementation; confirm against the trait definition.
+    #[cfg(feature = "accelerate")]
+    const F32_VEC: bool = true;
+
+    /// Computes the op over the whole `xs` slice into `ys` by delegating to
+    /// `crate::accelerate::vs_gelu`. Only compiled with the `accelerate`
+    /// feature enabled.
+    #[cfg(feature = "accelerate")]
+    #[inline(always)]
+    fn f32_vec(xs: &[f32], ys: &mut [f32]) {
+        use crate::accelerate::vs_gelu;
+        vs_gelu(xs, ys);
+    }
+
+    // NOTE(review): `F64_VEC` presumably tells callers that `f64_vec` below is
+    // a real slice-level implementation; confirm against the trait definition.
+    #[cfg(feature = "accelerate")]
+    const F64_VEC: bool = true;
+
+    /// Computes the op over the whole `xs` slice into `ys` by delegating to
+    /// `crate::accelerate::vd_gelu`. Only compiled with the `accelerate`
+    /// feature enabled.
+    #[cfg(feature = "accelerate")]
+    #[inline(always)]
+    fn f64_vec(xs: &[f64], ys: &mut [f64]) {
+        use crate::accelerate::vd_gelu;
+        vd_gelu(xs, ys);
+    }
}
impl UnaryOpT for Relu {