summaryrefslogtreecommitdiff
path: root/candle-examples/examples/quantized
diff options
context:
space:
mode:
author Laurent Mazare <laurent.mazare@gmail.com> 2024-04-01 00:15:48 +0200
committer GitHub <noreply@github.com> 2024-04-01 00:15:48 +0200
commit cd29c7ccd420a840d883361c290ee92d06b9b96c (patch)
tree d387a1f1af623de2e50751d493d541eb3789684c /candle-examples/examples/quantized
parent f9954b73bac9fed91a9a08d952adc1cfb836a568 (diff)
download candle-cd29c7ccd420a840d883361c290ee92d06b9b96c.tar.gz
candle-cd29c7ccd420a840d883361c290ee92d06b9b96c.tar.bz2
candle-cd29c7ccd420a840d883361c290ee92d06b9b96c.zip
More ggml cuda kernels (#1977)
* Add more cuda kernels for quantized matmul.
* Add the vec-dot bits.
* Expose the quantized matmul-vec kernels.
* Also include the quantize-q8-1 kernel.
* Glue code for the q8-1 quantization.
* mm-vec product via q8-1 quantization.
* Add a test.
* Add a mm test.
* Get the test to return some sensible results.
* Also test dmmv.
* Fix the launch params.
* Allow for tweaking the force_dmmv parameter while it's experimental.
Diffstat (limited to 'candle-examples/examples/quantized')
-rw-r--r-- candle-examples/examples/quantized/main.rs 8
1 files changed, 8 insertions, 0 deletions
diff --git a/candle-examples/examples/quantized/main.rs b/candle-examples/examples/quantized/main.rs
index 96344a49..3cabc3a4 100644
--- a/candle-examples/examples/quantized/main.rs
+++ b/candle-examples/examples/quantized/main.rs
@@ -235,6 +235,10 @@ struct Args {
/// Group-Query Attention, use 8 for the 70B version of LLaMAv2.
#[arg(long)]
gqa: Option<usize>,
+
+ /// Use the (experimental) fast cuda kernels.
+ #[arg(long)]
+ fast_cuda: bool,
}
impl Args {
@@ -341,6 +345,10 @@ fn main() -> anyhow::Result<()> {
use tracing_subscriber::prelude::*;
let args = Args::parse();
+
+ #[cfg(feature = "cuda")]
+ candle::quantized::cuda::set_force_dmmv(!args.fast_cuda);
+
let temperature = if args.temperature == 0. {
None
} else {