-rw-r--r--  README.md                              |  4 ++--
-rw-r--r--  candle-examples/examples/bert/main.rs  | 14 ++++++++++++++
2 files changed, 16 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -48,8 +48,8 @@ For llama2, run the following command to retrieve the weight files and start a
 test server:
 ```bash
 cd candle-wasm-examples/llama2-c
-wget https://karpathy.ai/llama2c/model.bin
-wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin
+wget https://huggingface.co/spaces/lmz/candle-llama2/resolve/main/model.bin
+wget https://huggingface.co/spaces/lmz/candle-llama2/resolve/main/tokenizer.json
 trunk serve --release --public-url /candle-llama2/ --port 8081
 ```
 And then browse to
diff --git a/candle-examples/examples/bert/main.rs b/candle-examples/examples/bert/main.rs
index 79c78968..574755ed 100644
--- a/candle-examples/examples/bert/main.rs
+++ b/candle-examples/examples/bert/main.rs
@@ -39,6 +39,10 @@ struct Args {
     /// The number of times to run the prompt.
     #[arg(long, default_value = "1")]
     n: usize,
+
+    /// L2 normalization for embeddings.
+    #[arg(long, default_value = "true")]
+    normalize_embeddings: bool,
 }
 
 impl Args {
@@ -164,7 +168,13 @@ fn main() -> Result<()> {
     // Apply some avg-pooling by taking the mean embedding value for all tokens (including padding)
     let (_n_sentence, n_tokens, _hidden_size) = embeddings.dims3()?;
     let embeddings = (embeddings.sum(1)? / (n_tokens as f64))?;
+    let embeddings = if args.normalize_embeddings {
+        normalize_l2(&embeddings)?
+    } else {
+        embeddings
+    };
     println!("pooled embeddings {:?}", embeddings.shape());
+
     let mut similarities = vec![];
     for i in 0..n_sentences {
         let e_i = embeddings.get(i)?;
@@ -184,3 +194,7 @@ fn main() -> Result<()> {
     }
     Ok(())
 }
+
+pub fn normalize_l2(v: &Tensor) -> Result<Tensor> {
+    Ok(v.broadcast_div(&v.sqr()?.sum_keepdim(1)?.sqrt()?)?)
+}
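For reference, here is a minimal standalone sketch of what the new `normalize_l2` helper does: it divides each row of the pooled embedding matrix by its L2 norm, so the cosine similarities computed later in `main` reduce to plain dot products. This is not part of the diff; it assumes the `candle_core` crate name and uses a made-up 2x3 input standing in for two pooled sentence embeddings.

```rust
// Sketch only: exercises normalize_l2 from this diff on a toy input.
use candle_core::{Device, Result, Tensor};

pub fn normalize_l2(v: &Tensor) -> Result<Tensor> {
    // Divide each row by its L2 norm: sqrt of the row-wise sum of squares,
    // broadcast back across the row.
    Ok(v.broadcast_div(&v.sqr()?.sum_keepdim(1)?.sqrt()?)?)
}

fn main() -> Result<()> {
    // Two 3-dimensional "embeddings" with L2 norms 5 and 13.
    let v = Tensor::new(&[[3f32, 4., 0.], [0., 5., 12.]], &Device::Cpu)?;
    let n = normalize_l2(&v)?;
    // Rows become [0.6, 0.8, 0.0] and [0.0, 5/13, 12/13], each with unit norm.
    println!("{n}");
    Ok(())
}
```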