diff options
author | Laurent Mazare <laurent.mazare@gmail.com> | 2023-07-31 20:43:57 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-07-31 20:43:57 +0100 |
commit | 6b98b66eb36a484f1a65fbc1c528a8e0b90a1419 (patch) | |
tree | 3604078a30e0fa634bd3cf2d56096a847bb8b46b /candle-examples/examples/llama2-c/main.rs | |
parent | 9ae1f6afeecca7b424b0943d591809481dc88dbc (diff) | |
download | candle-6b98b66eb36a484f1a65fbc1c528a8e0b90a1419.tar.gz candle-6b98b66eb36a484f1a65fbc1c528a8e0b90a1419.tar.bz2 candle-6b98b66eb36a484f1a65fbc1c528a8e0b90a1419.zip |
Remove the end of text tokens. (#289)
Diffstat (limited to 'candle-examples/examples/llama2-c/main.rs')
-rw-r--r-- | candle-examples/examples/llama2-c/main.rs | 3 |
1 files changed, 2 insertions, 1 deletions
```diff
diff --git a/candle-examples/examples/llama2-c/main.rs b/candle-examples/examples/llama2-c/main.rs
index 65641b3c..d710652f 100644
--- a/candle-examples/examples/llama2-c/main.rs
+++ b/candle-examples/examples/llama2-c/main.rs
@@ -266,7 +266,8 @@ fn run_eval(tokenizer: Tokenizer, config_path: &std::path::PathBuf, args: Args)
     let file = std::io::BufReader::new(file);
     let mut tokens = vec![];
     for line in file.lines() {
-        let line = tokenizer.encode(line?, false).map_err(E::msg)?;
+        let line = line?.replace("<|endoftext|>", "");
+        let line = tokenizer.encode(line, false).map_err(E::msg)?;
         tokens.push(line.get_ids().to_vec())
     }
     let tokens = tokens.concat();
```