From f9ecc8447753d759e776e762ba9309bb90b76bb3 Mon Sep 17 00:00:00 2001
From: Laurent Mazare <laurent.mazare@gmail.com>
Date: Tue, 22 Aug 2023 19:41:10 +0100
Subject: GQA support in the quantized model. (#555)

* GQA support in the quantized model.

* Fix the reshaping.

* Fix the main llama model.

* Infer the proper GQA from the model kind.
---
 candle-examples/examples/llama/model.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'candle-examples/examples/llama')

diff --git a/candle-examples/examples/llama/model.rs b/candle-examples/examples/llama/model.rs
index 86d13bdb..561c2939 100644
--- a/candle-examples/examples/llama/model.rs
+++ b/candle-examples/examples/llama/model.rs
@@ -291,7 +291,7 @@ impl CausalSelfAttention {
             let x = x
                 .unsqueeze(2)?
                 .expand((b_sz, n_kv_head, n_rep, seq_len, head_dim))?
-                .reshape((b_sz, n_kv_head, n_rep, seq_len, head_dim))?;
+                .reshape((b_sz, n_kv_head * n_rep, seq_len, head_dim))?;
             Ok(x)
         }
     }
-- 
cgit v1.2.3