// Audio processing code, adapted from whisper.cpp // https://github.com/ggerganov/whisper.cpp pub trait Float: num_traits::Float + num_traits::FloatConst + num_traits::NumAssign {} impl Float for f32 {} impl Float for f64 {} // https://github.com/ggerganov/whisper.cpp/blob/4774d2feb01a772a15de81ffc34b34a1f294f020/whisper.cpp#L2357 fn fft(inp: &[T]) -> Vec { let n = inp.len(); let zero = T::zero(); if n == 1 { return vec![inp[0], zero]; } if n % 2 == 1 { return dft(inp); } let mut out = vec![zero; n * 2]; let mut even = Vec::with_capacity(n / 2); let mut odd = Vec::with_capacity(n / 2); for (i, &inp) in inp.iter().enumerate() { if i % 2 == 0 { even.push(inp) } else { odd.push(inp); } } let even_fft = fft(&even); let odd_fft = fft(&odd); let two_pi = T::PI() + T::PI(); let n_t = T::from(n).unwrap(); for k in 0..n / 2 { let k_t = T::from(k).unwrap(); let theta = two_pi * k_t / n_t; let re = theta.cos(); let im = -theta.sin(); let re_odd = odd_fft[2 * k]; let im_odd = odd_fft[2 * k + 1]; out[2 * k] = even_fft[2 * k] + re * re_odd - im * im_odd; out[2 * k + 1] = even_fft[2 * k + 1] + re * im_odd + im * re_odd; out[2 * (k + n / 2)] = even_fft[2 * k] - re * re_odd + im * im_odd; out[2 * (k + n / 2) + 1] = even_fft[2 * k + 1] - re * im_odd - im * re_odd; } out } // https://github.com/ggerganov/whisper.cpp/blob/4774d2feb01a772a15de81ffc34b34a1f294f020/whisper.cpp#L2337 fn dft(inp: &[T]) -> Vec { let zero = T::zero(); let n = inp.len(); let two_pi = T::PI() + T::PI(); let mut out = Vec::new(); out.reserve(2 * n); let n_t = T::from(n).unwrap(); for k in 0..n { let k_t = T::from(k).unwrap(); let mut re = zero; let mut im = zero; for (j, &inp) in inp.iter().enumerate() { let j_t = T::from(j).unwrap(); let angle = two_pi * k_t * j_t / n_t; re += inp * angle.cos(); im -= inp * angle.sin(); } out.push(re); out.push(im); } out } #[allow(clippy::too_many_arguments)] // https://github.com/ggerganov/whisper.cpp/blob/4774d2feb01a772a15de81ffc34b34a1f294f020/whisper.cpp#L2414 fn log_mel_spectrogram_w( ith: usize, hann: &[T], samples: &[T], filters: &[T], fft_size: usize, fft_step: usize, speed_up: bool, n_len: usize, n_mel: usize, n_threads: usize, ) -> Vec { let n_fft = if speed_up { 1 + fft_size / 4 } else { 1 + fft_size / 2 }; let zero = T::zero(); let half = T::from(0.5).unwrap(); let mut fft_in = vec![zero; fft_size]; let mut mel = vec![zero; n_len * n_mel]; for i in (ith..n_len).step_by(n_threads) { let offset = i * fft_step; // apply Hanning window for j in 0..fft_size { fft_in[j] = if offset + j < samples.len() { hann[j] * samples[offset + j] } else { zero } } // FFT -> mag^2 let mut fft_out: Vec = fft(&fft_in); for j in 0..fft_size { fft_out[j] = fft_out[2 * j] * fft_out[2 * j] + fft_out[2 * j + 1] * fft_out[2 * j + 1]; } for j in 1..fft_size / 2 { let v = fft_out[fft_size - j]; fft_out[j] += v; } if speed_up { // scale down in the frequency domain results in a speed up in the time domain for j in 0..n_fft { fft_out[j] = half * (fft_out[2 * j] + fft_out[2 * j + 1]); } } // mel spectrogram for j in 0..n_mel { let mut sum = zero; for k in 0..n_fft { sum += fft_out[k] * filters[j * n_fft + k]; } mel[j * n_len + i] = T::max(sum, T::from(1e-10).unwrap()).log10(); } } mel } fn log_mel_spectrogram_( samples: &[T], filters: &[T], fft_size: usize, fft_step: usize, n_mel: usize, speed_up: bool, ) -> Vec { let zero = T::zero(); let two_pi = T::PI() + T::PI(); let half = T::from(0.5).unwrap(); let one = T::from(1.0).unwrap(); let four = T::from(4.0).unwrap(); let fft_size_t = T::from(fft_size).unwrap(); let hann: Vec = (0..fft_size) .map(|i| half * (one - ((two_pi * T::from(i).unwrap()) / fft_size_t).cos())) .collect(); let n_len = samples.len() / fft_step; // pad audio with at least one extra chunk of zeros let pad = 100 * super::CHUNK_LENGTH / 2; let n_len = if n_len % pad != 0 { (n_len / pad + 1) * pad } else { n_len }; let n_len = n_len + pad; let samples = { let mut samples_padded = samples.to_vec(); let to_add = n_len * fft_step - samples.len(); samples_padded.extend(std::iter::repeat(zero).take(to_add)); samples_padded }; // Use a single thread for now. let mut mel = log_mel_spectrogram_w( 0, &hann, &samples, filters, fft_size, fft_step, speed_up, n_len, n_mel, 1, ); let mmax = mel .iter() .max_by(|&u, &v| u.partial_cmp(v).unwrap_or(std::cmp::Ordering::Greater)) .copied() .unwrap_or(zero) - T::from(8).unwrap(); for m in mel.iter_mut() { let v = T::max(*m, mmax); *m = v / four + one } mel } pub fn pcm_to_mel( samples: &[T], filters: &[T], ) -> anyhow::Result> { let mel = log_mel_spectrogram_( samples, filters, super::N_FFT, super::HOP_LENGTH, super::N_MELS, false, ); Ok(mel) }