summaryrefslogtreecommitdiff
path: root/candle-examples/examples/llama
diff options
context:
space:
mode:
authorLaurent Mazare <laurent.mazare@gmail.com>2023-08-22 19:41:10 +0100
committerGitHub <noreply@github.com>2023-08-22 19:41:10 +0100
commitf9ecc8447753d759e776e762ba9309bb90b76bb3 (patch)
tree311d0e2f4dad33ea8174225cc1bfa5bf429ba713 /candle-examples/examples/llama
parent07067b01dce3c63b45fe4bdeb8d972f279e88b45 (diff)
downloadcandle-f9ecc8447753d759e776e762ba9309bb90b76bb3.tar.gz
candle-f9ecc8447753d759e776e762ba9309bb90b76bb3.tar.bz2
candle-f9ecc8447753d759e776e762ba9309bb90b76bb3.zip
GQA support in the quantized model. (#555)
* GQA support in the quantized model. * Fix the reshaping. * Fix the main llama model. * Infer the proper gqa from the model kind.
Diffstat (limited to 'candle-examples/examples/llama')
-rw-r--r--candle-examples/examples/llama/model.rs2
1 file changed, 1 insertion, 1 deletion
diff --git a/candle-examples/examples/llama/model.rs b/candle-examples/examples/llama/model.rs
index 86d13bdb..561c2939 100644
--- a/candle-examples/examples/llama/model.rs
+++ b/candle-examples/examples/llama/model.rs
@@ -291,7 +291,7 @@ impl CausalSelfAttention {
let x = x
.unsqueeze(2)?
.expand((b_sz, n_kv_head, n_rep, seq_len, head_dim))?
- .reshape((b_sz, n_kv_head, n_rep, seq_len, head_dim))?;
+ .reshape((b_sz, n_kv_head * n_rep, seq_len, head_dim))?;
Ok(x)
}
}