summaryrefslogtreecommitdiff
path: root/candle-transformers/src/models/whisper/mod.rs
diff options
context:
space:
mode:
Diffstat (limited to 'candle-transformers/src/models/whisper/mod.rs')
-rw-r--r--candle-transformers/src/models/whisper/mod.rs26
1 files changed, 26 insertions, 0 deletions
diff --git a/candle-transformers/src/models/whisper/mod.rs b/candle-transformers/src/models/whisper/mod.rs
new file mode 100644
index 00000000..7dc8107b
--- /dev/null
+++ b/candle-transformers/src/models/whisper/mod.rs
@@ -0,0 +1,26 @@
+pub mod audio;
+pub mod model;
+
+pub const DTYPE: candle::DType = candle::DType::F32;
+
+// Audio parameters.
+pub const SAMPLE_RATE: usize = 16000;
+pub const N_FFT: usize = 400;
+pub const N_MELS: usize = 80;
+pub const HOP_LENGTH: usize = 160;
+pub const CHUNK_LENGTH: usize = 30;
+pub const N_SAMPLES: usize = CHUNK_LENGTH * SAMPLE_RATE; // 480000 samples in a 30-second chunk
+pub const N_FRAMES: usize = N_SAMPLES / HOP_LENGTH; // 3000 frames in a mel spectrogram input
+
+pub const NO_SPEECH_THRESHOLD: f64 = 0.6;
+pub const LOGPROB_THRESHOLD: f64 = -1.0;
+pub const TEMPERATURES: [f64; 6] = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0];
+pub const COMPRESSION_RATIO_THRESHOLD: f64 = 2.4;
+
+// Tokenizer dependent bits.
+pub const SOT_TOKEN: &str = "<|startoftranscript|>";
+pub const TRANSCRIBE_TOKEN: &str = "<|transcribe|>";
+pub const TRANSLATE_TOKEN: &str = "<|translate|>";
+pub const NO_TIMESTAMPS_TOKEN: &str = "<|notimestamps|>";
+pub const EOT_TOKEN: &str = "<|endoftext|>";
+pub const NO_SPEECH_TOKEN: &str = "<|nocaptions|>";