diff options
Diffstat (limited to 'candle-examples/examples/whisper/audio.rs')
-rw-r--r-- | candle-examples/examples/whisper/audio.rs | 214 |
1 files changed, 0 insertions, 214 deletions
diff --git a/candle-examples/examples/whisper/audio.rs b/candle-examples/examples/whisper/audio.rs deleted file mode 100644 index 2ceed065..00000000 --- a/candle-examples/examples/whisper/audio.rs +++ /dev/null @@ -1,214 +0,0 @@ -// Audio processing code, adapted from whisper.cpp -// https://github.com/ggerganov/whisper.cpp - -pub trait Float: num_traits::Float + num_traits::FloatConst + num_traits::NumAssign {} - -impl Float for f32 {} -impl Float for f64 {} - -// https://github.com/ggerganov/whisper.cpp/blob/4774d2feb01a772a15de81ffc34b34a1f294f020/whisper.cpp#L2357 -fn fft<T: Float>(inp: &[T]) -> Vec<T> { - let n = inp.len(); - let zero = T::zero(); - if n == 1 { - return vec![inp[0], zero]; - } - if n % 2 == 1 { - return dft(inp); - } - let mut out = vec![zero; n * 2]; - - let mut even = Vec::with_capacity(n / 2); - let mut odd = Vec::with_capacity(n / 2); - - for (i, &inp) in inp.iter().enumerate() { - if i % 2 == 0 { - even.push(inp) - } else { - odd.push(inp); - } - } - - let even_fft = fft(&even); - let odd_fft = fft(&odd); - - let two_pi = T::PI() + T::PI(); - let n_t = T::from(n).unwrap(); - for k in 0..n / 2 { - let k_t = T::from(k).unwrap(); - let theta = two_pi * k_t / n_t; - let re = theta.cos(); - let im = -theta.sin(); - - let re_odd = odd_fft[2 * k]; - let im_odd = odd_fft[2 * k + 1]; - - out[2 * k] = even_fft[2 * k] + re * re_odd - im * im_odd; - out[2 * k + 1] = even_fft[2 * k + 1] + re * im_odd + im * re_odd; - - out[2 * (k + n / 2)] = even_fft[2 * k] - re * re_odd + im * im_odd; - out[2 * (k + n / 2) + 1] = even_fft[2 * k + 1] - re * im_odd - im * re_odd; - } - out -} - -// https://github.com/ggerganov/whisper.cpp/blob/4774d2feb01a772a15de81ffc34b34a1f294f020/whisper.cpp#L2337 -fn dft<T: Float>(inp: &[T]) -> Vec<T> { - let zero = T::zero(); - let n = inp.len(); - let two_pi = T::PI() + T::PI(); - - let mut out = Vec::new(); - out.reserve(2 * n); - let n_t = T::from(n).unwrap(); - for k in 0..n { - let k_t = T::from(k).unwrap(); - let mut re = zero; - let mut im = zero; - - for (j, &inp) in inp.iter().enumerate() { - let j_t = T::from(j).unwrap(); - let angle = two_pi * k_t * j_t / n_t; - re += inp * angle.cos(); - im -= inp * angle.sin(); - } - - out.push(re); - out.push(im); - } - out -} - -#[allow(clippy::too_many_arguments)] -// https://github.com/ggerganov/whisper.cpp/blob/4774d2feb01a772a15de81ffc34b34a1f294f020/whisper.cpp#L2414 -fn log_mel_spectrogram_w<T: Float>( - ith: usize, - hann: &[T], - samples: &[T], - filters: &[T], - fft_size: usize, - fft_step: usize, - speed_up: bool, - n_len: usize, - n_mel: usize, - n_threads: usize, -) -> Vec<T> { - let n_fft = if speed_up { - 1 + fft_size / 4 - } else { - 1 + fft_size / 2 - }; - - let zero = T::zero(); - let half = T::from(0.5).unwrap(); - let mut fft_in = vec![zero; fft_size]; - let mut mel = vec![zero; n_len * n_mel]; - - for i in (ith..n_len).step_by(n_threads) { - let offset = i * fft_step; - - // apply Hanning window - for j in 0..fft_size { - fft_in[j] = if offset + j < samples.len() { - hann[j] * samples[offset + j] - } else { - zero - } - } - - // FFT -> mag^2 - let mut fft_out: Vec<T> = fft(&fft_in); - - for j in 0..fft_size { - fft_out[j] = fft_out[2 * j] * fft_out[2 * j] + fft_out[2 * j + 1] * fft_out[2 * j + 1]; - } - for j in 1..fft_size / 2 { - let v = fft_out[fft_size - j]; - fft_out[j] += v; - } - - if speed_up { - // scale down in the frequency domain results in a speed up in the time domain - for j in 0..n_fft { - fft_out[j] = half * (fft_out[2 * j] + fft_out[2 * j + 1]); - } - } - - // mel spectrogram - for j in 0..n_mel { - let mut sum = zero; - for k in 0..n_fft { - sum += fft_out[k] * filters[j * n_fft + k]; - } - mel[j * n_len + i] = T::max(sum, T::from(1e-10).unwrap()).log10(); - } - } - mel -} - -fn log_mel_spectrogram_<T: Float + std::fmt::Display>( - samples: &[T], - filters: &[T], - fft_size: usize, - fft_step: usize, - n_mel: usize, - speed_up: bool, -) -> Vec<T> { - let zero = T::zero(); - let two_pi = T::PI() + T::PI(); - let half = T::from(0.5).unwrap(); - let one = T::from(1.0).unwrap(); - let four = T::from(4.0).unwrap(); - let fft_size_t = T::from(fft_size).unwrap(); - - let hann: Vec<T> = (0..fft_size) - .map(|i| half * (one - ((two_pi * T::from(i).unwrap()) / fft_size_t).cos())) - .collect(); - let n_len = samples.len() / fft_step; - - // pad audio with at least one extra chunk of zeros - let pad = 100 * super::CHUNK_LENGTH / 2; - let n_len = if n_len % pad != 0 { - (n_len / pad + 1) * pad - } else { - n_len - }; - let n_len = n_len + pad; - let samples = { - let mut samples_padded = samples.to_vec(); - let to_add = n_len * fft_step - samples.len(); - samples_padded.extend(std::iter::repeat(zero).take(to_add)); - samples_padded - }; - - // Use a single thread for now. - let mut mel = log_mel_spectrogram_w( - 0, &hann, &samples, filters, fft_size, fft_step, speed_up, n_len, n_mel, 1, - ); - let mmax = mel - .iter() - .max_by(|&u, &v| u.partial_cmp(v).unwrap_or(std::cmp::Ordering::Greater)) - .copied() - .unwrap_or(zero) - - T::from(8).unwrap(); - for m in mel.iter_mut() { - let v = T::max(*m, mmax); - *m = v / four + one - } - mel -} - -pub fn pcm_to_mel<T: Float + std::fmt::Display>( - samples: &[T], - filters: &[T], -) -> anyhow::Result<Vec<T>> { - let mel = log_mel_spectrogram_( - samples, - filters, - super::N_FFT, - super::HOP_LENGTH, - super::N_MELS, - false, - ); - Ok(mel) -} |