path: root/candle-core/src/quantized/ggml_file.rs
author     Laurent Mazare <laurent.mazare@gmail.com>    2024-02-25 18:11:47 +0100
committer  GitHub <noreply@github.com>                  2024-02-25 18:11:47 +0100
commit     2f22afd80ef6bc3e0ac7f6d55e4a4dc4dd480190 (patch)
tree       a0fca7887e011d5c8fc75c10c6fb2fd7d90d56cb /candle-core/src/quantized/ggml_file.rs
parent     8d04f70f4d1bd67c42fb7d63e7031d49cf780a61 (diff)
Cuda acceleration for quantized model. (#1754)
* Boilerplate for the quantized cuda support.
* More basic cuda support.
* More cuda quantization (quantize on cpu for now).
* Add the dequantization bit.
* Start adding some dedicated cuda kernels from llama.cpp.
* Move the kernel code.
* Start interfacing with the kernel.
* Tweak the kernel launch params.
* Bugfix for quantized metal.
* Fix some clippy lints.
* Tweak the launch parameters.
* Tweak cuda basics to perform a quantized matmul.
* Perform the dequantization on the cpu + use cublas for matmul.
* Add the dequantization kernel.
* Test the qmatmul.
* More kernels.
* Matmul-vec kernel.
* Add a couple kernels.
* More dequantization kernels.
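As a usage sketch only (not part of this patch): the `Device::new_cuda` and `Content::read` calls below are assumed from candle's public API around this commit, and "model.ggml" is a placeholder path. Loading a quantized GGML file directly onto a CUDA device would look roughly like this:

// Hedged sketch, not code from this patch: API names are assumed from
// candle_core's public interface and "model.ggml" is a placeholder path.
use candle_core::quantized::ggml_file::Content;
use candle_core::Device;

fn load_on_cuda() -> candle_core::Result<()> {
    // Ordinal 0 selects the first visible CUDA device.
    let device = Device::new_cuda(0)?;
    let mut file = std::fs::File::open("model.ggml")?;
    // With this change, the quantized tensors end up in CUDA storage instead
    // of hitting the previous `unimplemented!` fallback for non-CPU devices.
    let content = Content::read(&mut file, &device)?;
    println!("loaded {} quantized tensors", content.tensors.len());
    Ok(())
}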
Diffstat (limited to 'candle-core/src/quantized/ggml_file.rs')
-rw-r--r--  candle-core/src/quantized/ggml_file.rs  11
1 file changed, 2 insertions(+), 9 deletions(-)
diff --git a/candle-core/src/quantized/ggml_file.rs b/candle-core/src/quantized/ggml_file.rs
index e6f5791c..99200bbd 100644
--- a/candle-core/src/quantized/ggml_file.rs
+++ b/candle-core/src/quantized/ggml_file.rs
@@ -1,7 +1,5 @@
 //! Support for the GGML file format.
 
-#[cfg(feature = "metal")]
-use super::metal::load_quantized_metal;
 use super::{k_quants, GgmlDType, QStorage};
 use crate::{Device, Result};
 use byteorder::{LittleEndian, ReadBytesExt};
@@ -130,13 +128,8 @@ fn from_raw_data<T: super::GgmlType + Send + Sync + 'static>(
     let data = unsafe { std::slice::from_raw_parts(raw_data_ptr as *const T, n_blocks) };
     let data: QStorage = match device {
         Device::Cpu => QStorage::Cpu(Box::new(data.to_vec())),
-        #[cfg(feature = "metal")]
-        Device::Metal(metal) => load_quantized_metal(metal, data)?,
-        #[cfg(not(feature = "metal"))]
-        Device::Metal(_metal) => {
-            crate::bail!("Metal backend requires `metal` feature")
-        }
-        device => unimplemented!("Implement quantized tensor for device {device:?}"),
+        Device::Metal(metal) => super::metal::load_quantized(metal, data)?,
+        Device::Cuda(cuda) => super::cuda::load_quantized(cuda, data)?,
     };
     super::QTensor::new(data, dims)
 }
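Follow-up sketch (again assuming candle's public `QTensor`/`QMatMul` API rather than code from this patch; the tensor names are placeholders): once a tensor is loaded through the new `Device::Cuda` arm, dequantization and quantized matmul can dispatch to the CUDA kernels this PR introduces:

// Hedged sketch: tensor names are placeholders and the Content::remove,
// QTensor::dequantize and QMatMul::from_qtensor signatures are assumed
// from candle_core's public API, not taken from this diff.
use candle_core::quantized::{ggml_file::Content, QMatMul};
use candle_core::{DType, Device, Result, Tensor};

fn run_quantized(mut content: Content, device: &Device) -> Result<()> {
    // Take ownership of one quantized tensor parsed from the GGML file.
    let qtensor = content.remove("tok_embeddings.weight")?;
    let (_out_dim, in_dim) = qtensor.shape().dims2()?;
    // Dequantize back to a plain tensor (the path the new dequantization kernels serve).
    let _full = qtensor.dequantize(device)?;
    // Or keep it quantized and run a quantized matmul through QMatMul.
    let qmatmul = QMatMul::from_qtensor(qtensor)?;
    let x = Tensor::zeros((1, in_dim), DType::F32, device)?;
    let _y = qmatmul.forward(&x)?;
    Ok(())
}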