1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
|
use candle::{DType, Error, Result, Tensor};
use rand::{distributions::Distribution, SeedableRng};
/// Picks the next token index from a tensor of logits.
///
/// Depending on its configuration the processor either takes the argmax of
/// the logits, samples from the temperature-scaled softmax distribution, or
/// applies top-p (nucleus) sampling on that distribution.
pub struct LogitsProcessor {
/// Random number generator used for the multinomial draws; it is seeded
/// from the `seed` argument of `new`.
rng: rand::rngs::StdRng,
/// Divisor applied to the logits before the softmax. `None` selects
/// argmax decoding, in which case no randomness is involved.
temperature: Option<f64>,
/// Cumulative-probability threshold for top-p sampling. When it is `None`,
/// `<= 0.0` or `>= 1.0`, the full distribution is sampled instead.
top_p: Option<f64>,
}
impl LogitsProcessor {
pub fn new(seed: u64, temperature: Option<f64>, top_p: Option<f64>) -> Self {
let temperature = if temperature.map_or(true, |v| v < 1e-7) {
None
} else {
temperature
};
Self {
rng: rand::rngs::StdRng::seed_from_u64(seed),
temperature,
top_p,
}
}
fn sample_argmax(&mut self, logits: Tensor) -> Result<u32> {
let logits_v: Vec<f32> = logits.to_vec1()?;
let next_token = logits_v
.iter()
.enumerate()
.max_by(|(_, u), (_, v)| u.total_cmp(v))
.map(|(i, _)| i as u32)
.unwrap();
Ok(next_token)
}
fn sample_multinomial(&mut self, prs: &Vec<f32>) -> Result<u32> {
let distr = rand::distributions::WeightedIndex::new(prs).map_err(Error::wrap)?;
let next_token = distr.sample(&mut self.rng) as u32;
Ok(next_token)
}
fn sample_topp(&mut self, prs: &mut Vec<f32>, top_p: f32) -> Result<u32> {
// top-p sampling (or "nucleus sampling") samples from the smallest set of
// tokens that exceed probability top_p. This way we never sample tokens that
// have very low probabilities and are less likely to go "off the rails".
let mut argsort_indices = (0..prs.len()).collect::<Vec<_>>();
// Sort by descending probability.
argsort_indices.sort_by(|&i, &j| prs[j].partial_cmp(&prs[i]).unwrap());
// Clamp smaller probabilities to zero.
let mut cumsum = 0.;
for index in &argsort_indices {
if cumsum >= top_p {
prs[*index] = 0.0;
} else {
cumsum += prs[*index];
}
}
// Sample with clamped probabilities.
self.sample_multinomial(prs)
}
pub fn sample(&mut self, logits: &Tensor) -> Result<u32> {
let logits = logits.to_dtype(DType::F32)?;
let next_token = match self.temperature {
None => self.sample_argmax(logits)?,
Some(temperature) => {
let logits = &(&logits / temperature)?;
let prs = candle_nn::ops::softmax_last_dim(logits)?;
let mut prs: Vec<f32> = prs.to_vec1()?;
let top_p = self.top_p.unwrap_or(1.);
if top_p <= 0.0 || top_p >= 1.0 {
// simply sample from the predicted probability distribution
self.sample_multinomial(&prs)?
} else {
// top-p (nucleus) sampling, clamping the least likely tokens to zero
self.sample_topp(&mut prs, top_p as f32)?
}
}
};
Ok(next_token)
}
}
|