1 files changed, 56 insertions, 0 deletions
diff --git a/candle-metal-kernels/src/reduce.metal b/candle-metal-kernels/src/reduce.metal
index 93dac662..3c3cbc14 100644
--- a/candle-metal-kernels/src/reduce.metal
+++ b/candle-metal-kernels/src/reduce.metal
@@ -260,6 +260,59 @@ kernel void NAME(
     }                                                                             \
 }                                                                                 \
 
+#define RMSNORM(NAME, T)                                                          \
+kernel void NAME(                                                                 \
+    constant size_t &src_numel,                                                   \
+    constant size_t &el_to_sum_per_block,                                         \
+    device const T *src,                                                          \
+    device T *dst,                                                                \
+    device const T *alpha,                                                        \
+    constant float &eps,                                                          \
+                                                                                  \
+    uint id [[ thread_position_in_grid ]],                                        \
+    uint tid [[ thread_index_in_threadgroup ]],                                   \
+    uint dst_id [[ threadgroup_position_in_grid ]],                               \
+    uint block_dim [[ threads_per_threadgroup ]]                                  \
+) {                                                                               \
+    threadgroup float shared_memory[THREADGROUP_SIZE];                            \
+    shared_memory[tid] = 0;                                                       \
+    size_t start_idx = dst_id * el_to_sum_per_block;                              \
+    size_t stop_idx = min(start_idx + el_to_sum_per_block, src_numel);            \
+    size_t idx = start_idx + tid;                                                 \
+                                                                                  \
+                                                                                  \
+    float tmp = 0;                                                                \
+    while (idx < stop_idx) {                                                      \
+        tmp = tmp + float(src[idx]) * float(src[idx]);                            \
+        idx += block_dim;                                                         \
+    }                                                                             \
+    shared_memory[tid] = tmp;                                                     \
+                                                                                  \
+    threadgroup_barrier(mem_flags::mem_threadgroup);                              \
+                                                                                  \
+    for (uint s = block_dim / 2; s > 0; s >>= 1) {                                \
+        if (tid < s) {                                                            \
+            shared_memory[tid] = shared_memory[tid] + shared_memory[tid + s];     \
+        }                                                                         \
+        threadgroup_barrier(mem_flags::mem_threadgroup);                          \
+    }                                                                             \
+                                                                                  \
+    /* wait for shared_memory[0] to be filled */ \
+    threadgroup_barrier(mem_flags::mem_threadgroup);                              \
+                                                                                  \
+    float norm = sqrt(shared_memory[0] / float(el_to_sum_per_block) + eps);       \
+    float inv_norm = 1.0f / norm;                                                 \
+    idx = start_idx + tid;                                                        \
+    while (idx < stop_idx) {                                                      \
+        float val = float(src[idx]) * inv_norm;                                   \
+        if (alpha != nullptr) {                                                   \
+            val *= float(alpha[idx - start_idx]);                                 \
+        }                                                                         \
+        dst[idx] = T(val);                                                        \
+        idx += block_dim;                                                         \
+    }                                                                             \
+}                                                                                 \
+
 REDUCE(x + y, fast_sum_f32_strided, float, 0)
 REDUCE(x + y, fast_sum_u32_strided, uint, 0)
 REDUCE(x + y, fast_sum_f16_strided, half, 0)
@@ -286,6 +339,8 @@ ARGMAX(fast_argmax_u8_strided, uint8_t, 0)
 
 SOFTMAX(softmax_f32, float)
 SOFTMAX(softmax_f16, half)
+RMSNORM(rmsnorm_f32, float)
+RMSNORM(rmsnorm_f16, half)
 
 #if __METAL_VERSION__ >= 220
 REDUCE(x + y, fast_sum_i64_strided, int64_t, 0)
@@ -303,4 +358,5 @@ REDUCE(MIN(x, y), fast_min_bf16, bfloat, HUGE_VALBF)
 ARGMIN(fast_argmin_bf16, bfloat, HUGE_VALBF)
 ARGMAX(fast_argmax_bf16, bfloat, -HUGE_VALBF)
 SOFTMAX(softmax_bf16, bfloat)
+RMSNORM(rmsnorm_bf16, bfloat)
 #endif