author | Laurent Mazare <laurent.mazare@gmail.com> | 2023-07-26 14:16:37 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-07-26 14:16:37 +0100 |
commit | 2ce5f12513d0dafb04c7e345da9d4fba566cfa16 | |
tree | d8370aa035f667905e6f033e99e08fd93e677041 /candle-examples/examples/llama | |
parent | fa2b64d678ca83e2fbc3dabdecffbc778d5b067d | |
Again set a few extra params in flash-attn. (#245)
* Again set a few extra params.
* Use the appropriate kernel sizes.
* Add all the kernel sizes.
* Parallel compiling.
* Reduce the amount of parallelism.
* Add the missing kernel.
* Fix a typo.
* Remove bf16 support for now.
Diffstat (limited to 'candle-examples/examples/llama')
-rw-r--r-- | candle-examples/examples/llama/model.rs | 6 |
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/candle-examples/examples/llama/model.rs b/candle-examples/examples/llama/model.rs
index 0e850b6a..049d0c38 100644
--- a/candle-examples/examples/llama/model.rs
+++ b/candle-examples/examples/llama/model.rs
@@ -220,8 +220,12 @@ impl CausalSelfAttention {
         let v = self.repeat_kv(v)?;
 
         let y = if self.use_flash_attn {
+            // flash-attn expects (b_sz, seq_len, nheads, head_dim)
+            let q = q.transpose(1, 2)?;
+            let k = k.transpose(1, 2)?;
+            let v = v.transpose(1, 2)?;
             let softmax_scale = 1f32 / (self.head_dim as f32).sqrt();
-            flash_attn(softmax_scale, &q, &k, &v)?
+            flash_attn(softmax_scale, &q, &k, &v)?.transpose(1, 2)?
         } else {
             let in_dtype = q.dtype();
             let q = q.to_dtype(DType::F32)?;
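For readers following the change itself: the patch is a pure layout adjustment. The example keeps q, k and v in (b_sz, num_heads, seq_len, head_dim) order for the non-flash attention path, while the flash-attn kernel wants (b_sz, seq_len, num_heads, head_dim), so the code transposes dims 1 and 2 on the way in and transposes the output back. Below is a minimal sketch of that reshuffle using candle's Tensor API; the crate import name and the concrete shapes are assumptions for illustration, and the flash_attn call itself is left as a comment rather than invoked.

// Minimal sketch of the layout swap around the flash-attn call (not the
// example's actual code). Assumes the core crate is used as `candle_core`;
// inside the candle workspace it may be imported under a different name.
use candle_core::{DType, Device, Result, Tensor};

fn main() -> Result<()> {
    let dev = Device::Cpu;

    // Hypothetical shapes: b_sz = 1, num_heads = 32, seq_len = 16,
    // head_dim = 64, i.e. the (b_sz, num_heads, seq_len, head_dim) layout
    // used by the fallback attention path.
    let q = Tensor::zeros((1, 32, 16, 64), DType::F32, &dev)?;

    // flash-attn expects (b_sz, seq_len, num_heads, head_dim), so swap
    // dims 1 and 2 before calling it.
    let q_flash = q.transpose(1, 2)?;
    assert_eq!(q_flash.dims(), &[1, 16, 32, 64]);

    // ... k and v get the same treatment, then (per the diff above):
    // let y = flash_attn(softmax_scale, &q_flash, &k_flash, &v_flash)?;

    // The output comes back in (b_sz, seq_len, num_heads, head_dim), so it
    // is transposed once more to restore the original layout. q_flash is
    // used as a stand-in for the flash-attn output here.
    let y = q_flash.transpose(1, 2)?;
    assert_eq!(y.dims(), &[1, 32, 16, 64]);
    Ok(())
}

Note that transpose in candle is a metadata-only operation (it permutes strides rather than copying data), so the extra transposes around the kernel call are cheap.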