diff options
author | Laurent Mazare <laurent.mazare@gmail.com> | 2024-04-01 00:15:48 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-04-01 00:15:48 +0200 |
commit | cd29c7ccd420a840d883361c290ee92d06b9b96c (patch) | |
tree | d387a1f1af623de2e50751d493d541eb3789684c /candle-examples/examples/quantized | |
parent | f9954b73bac9fed91a9a08d952adc1cfb836a568 (diff) | |
download | candle-cd29c7ccd420a840d883361c290ee92d06b9b96c.tar.gz candle-cd29c7ccd420a840d883361c290ee92d06b9b96c.tar.bz2 candle-cd29c7ccd420a840d883361c290ee92d06b9b96c.zip |
More ggml cuda kernels (#1977)
* Add more cuda kernels for quantized matmul.
* Add the vec-dot bits.
* Expose the quantized matmul-vec kernels.
* Also include the quantize-q8-1 kernel.
* Glue code for the q8-1 quantization.
* mm-vec product via q8-1 quantization.
* Add a test.
* Add a mm test.
* Get the test to return some sensible results.
* Also test dmmv.
* Fix the launch params.
* Allow for tweaking the force_dmmv parameter while it's experimental.
Diffstat (limited to 'candle-examples/examples/quantized')
-rw-r--r-- | candle-examples/examples/quantized/main.rs | 8 |
1 file changed, 8 insertions, 0 deletions
```diff
diff --git a/candle-examples/examples/quantized/main.rs b/candle-examples/examples/quantized/main.rs
index 96344a49..3cabc3a4 100644
--- a/candle-examples/examples/quantized/main.rs
+++ b/candle-examples/examples/quantized/main.rs
@@ -235,6 +235,10 @@ struct Args {
     /// Group-Query Attention, use 8 for the 70B version of LLaMAv2.
     #[arg(long)]
     gqa: Option<usize>,
+
+    /// Use the (experimental) fast cuda kernels.
+    #[arg(long)]
+    fast_cuda: bool,
 }
 
 impl Args {
@@ -341,6 +345,10 @@ fn main() -> anyhow::Result<()> {
     use tracing_subscriber::prelude::*;
 
     let args = Args::parse();
+
+    #[cfg(feature = "cuda")]
+    candle::quantized::cuda::set_force_dmmv(!args.fast_cuda);
+
     let temperature = if args.temperature == 0. {
         None
     } else {
```