summary | refs | log | tree | commit | diff
path: root/candle-core
diff options
context:
space:
mode:
author Laurent Mazare <laurent.mazare@gmail.com> 2023-09-28 09:05:29 +0100
committer GitHub <noreply@github.com> 2023-09-28 09:05:29 +0100
commit 5e1c595e00721a11bb46e9187ea7d86ea4ace0e3 (patch)
tree 53613fca3defe5f9f1905b57107ec63ad7f309ab /candle-core
parent 8a49e01b9d9e13732d83f79afa2f850e2ba7fdae (diff)
download candle-5e1c595e00721a11bb46e9187ea7d86ea4ace0e3.tar.gz
candle-5e1c595e00721a11bb46e9187ea7d86ea4ace0e3.tar.bz2
candle-5e1c595e00721a11bb46e9187ea7d86ea4ace0e3.zip
Optimize the index-select cuda kernel. (#976)
Diffstat (limited to 'candle-core')
-rw-r--r-- candle-core/src/cuda_backend.rs 8
1 files changed, 4 insertions, 4 deletions
diff --git a/candle-core/src/cuda_backend.rs b/candle-core/src/cuda_backend.rs
index 00fd1d04..d1187b1c 100644
--- a/candle-core/src/cuda_backend.rs
+++ b/candle-core/src/cuda_backend.rs
@@ -884,8 +884,6 @@ impl<'a> Map1 for IndexSelect<'a> {
};
let ids_shape = ids_l.shape();
let ids_dims = ids_shape.dims();
- let ids_el = ids_shape.elem_count();
- let cfg = LaunchConfig::for_num_elems(ids_el as u32);
let ds = dev.htod_copy([ids_dims, ids_l.stride()].concat()).w()?;
let src = match src_l.contiguous_offsets() {
Some((o1, o2)) => src.slice(o1..o2),
@@ -894,11 +892,13 @@ impl<'a> Map1 for IndexSelect<'a> {
let left_size: usize = src_l.dims()[..self.2].iter().product();
let right_size: usize = src_l.dims()[self.2 + 1..].iter().product();
let dim_size = src_l.dims()[self.2];
+ let dst_el = ids_shape.elem_count() * left_size * right_size;
+ let cfg = LaunchConfig::for_num_elems(dst_el as u32);
let func = dev.get_or_load_func(&kernel_name::<T>(name), kernels::INDEXING)?;
// SAFETY: Set later by running the kernel.
- let out = unsafe { dev.alloc::<T>(ids_el * left_size * right_size) }.w()?;
+ let out = unsafe { dev.alloc::<T>(dst_el) }.w()?;
let params = (
- ids_el,
+ dst_el,
ids_dims.len(),
&ds,
ids,