diff options
author | Laurent Mazare <laurent.mazare@gmail.com> | 2024-01-07 17:18:46 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-01-07 17:18:46 +0100 |
commit | 89b5a068585b73193d2004a7293d5b2fa6c30bfd (patch) | |
tree | 1bb39bbcf60bf4140c704baa827c382693d3497e /candle-examples | |
parent | 30313c308106fff7b20fc8cb2b27eb79800cb818 (diff) | |
download | candle-89b5a068585b73193d2004a7293d5b2fa6c30bfd.tar.gz candle-89b5a068585b73193d2004a7293d5b2fa6c30bfd.tar.bz2 candle-89b5a068585b73193d2004a7293d5b2fa6c30bfd.zip |
Use bindgen-cuda for the custom-kernel example. (#1536)
* Use bindgen-cuda for the custom-kernel example.
* Only depend on the kernels when cuda is enabled.
* Skip rustfmt.
Diffstat (limited to 'candle-examples')
-rw-r--r-- | candle-examples/Cargo.toml | 3 | ||||
-rw-r--r-- | candle-examples/build.rs | 247 | ||||
-rw-r--r-- | candle-examples/examples/custom-ops/cuda_kernels.rs | 3 | ||||
-rw-r--r-- | candle-examples/examples/custom-ops/main.rs | 3 |
4 files changed, 20 insertions, 236 deletions
```diff
diff --git a/candle-examples/Cargo.toml b/candle-examples/Cargo.toml
index 439116f8..00340d08 100644
--- a/candle-examples/Cargo.toml
+++ b/candle-examples/Cargo.toml
@@ -49,11 +49,12 @@ tokio = "1.29.1"
 
 [build-dependencies]
 anyhow = { workspace = true }
+bindgen_cuda = { version = "0.1.1", optional = true }
 
 [features]
 default = []
 accelerate = ["dep:accelerate-src", "candle/accelerate", "candle-nn/accelerate", "candle-transformers/accelerate"]
-cuda = ["candle/cuda", "candle-nn/cuda", "candle-transformers/cuda"]
+cuda = ["candle/cuda", "candle-nn/cuda", "candle-transformers/cuda", "dep:bindgen_cuda"]
 cudnn = ["candle/cudnn"]
 flash-attn = ["cuda", "candle-transformers/flash-attn", "dep:candle-flash-attn"]
 mkl = ["dep:intel-mkl-src", "candle/mkl", "candle-nn/mkl", "candle-transformers/mkl"]
diff --git a/candle-examples/build.rs b/candle-examples/build.rs
index 0af3a6a4..ba40aeb4 100644
--- a/candle-examples/build.rs
+++ b/candle-examples/build.rs
@@ -4,251 +4,34 @@ use std::io::Write;
 use std::path::PathBuf;
 
 struct KernelDirectories {
-    kernel_dir: &'static str,
+    kernel_glob: &'static str,
     rust_target: &'static str,
     include_dirs: &'static [&'static str],
 }
 
-const DIRS: [KernelDirectories; 1] = [KernelDirectories {
-    kernel_dir: "examples/custom-ops/kernels/",
+const KERNEL_DIRS: [KernelDirectories; 1] = [KernelDirectories {
+    kernel_glob: "examples/custom-ops/kernels/*.cu",
     rust_target: "examples/custom-ops/cuda_kernels.rs",
     include_dirs: &[],
 }];
 
-impl KernelDirectories {
-    fn maybe_build_ptx(
-        &self,
-        cu_file: &std::path::Path,
-        ptx_file: &std::path::Path,
-        compute_cap: usize,
-    ) -> Result<()> {
-        let should_compile = if ptx_file.exists() {
-            let ptx_modified = ptx_file.metadata()?.modified()?;
-            let cu_modified = cu_file.metadata()?.modified()?;
-            cu_modified.duration_since(ptx_modified).is_ok()
-        } else {
-            true
-        };
-        if should_compile {
-            #[cfg(feature = "cuda")]
-            {
-                let ccbin_env = std::env::var("CANDLE_NVCC_CCBIN");
-                println!("cargo:rerun-if-env-changed=CANDLE_NVCC_CCBIN");
-                let mut command = std::process::Command::new("nvcc");
-                let out_dir = ptx_file.parent().context("no parent for ptx file")?;
-                let include_dirs: Vec<String> =
-                    self.include_dirs.iter().map(|c| format!("-I{c}")).collect();
-                command
-                    .arg(format!("--gpu-architecture=sm_{compute_cap}"))
-                    .arg("--ptx")
-                    .args(["--default-stream", "per-thread"])
-                    .args(["--output-directory", out_dir.to_str().unwrap()])
-                    .arg(format!("-I/{}", self.kernel_dir))
-                    .args(include_dirs)
-                    .arg(cu_file);
-                if let Ok(ccbin_path) = &ccbin_env {
-                    command
-                        .arg("-allow-unsupported-compiler")
-                        .args(["-ccbin", ccbin_path]);
-                }
-                let output = command
-                    .spawn()
-                    .context("failed spawning nvcc")?
-                    .wait_with_output()?;
-                if !output.status.success() {
-                    anyhow::bail!(
-                        "nvcc error while compiling {cu_file:?}:\n\n# stdout\n{:#}\n\n# stderr\n{:#}",
-                        String::from_utf8_lossy(&output.stdout),
-                        String::from_utf8_lossy(&output.stderr)
-                    )
-                }
-            }
-            #[cfg(not(feature = "cuda"))]
-            std::fs::OpenOptions::new()
-                .create(true)
-                .write(true)
-                .open(ptx_file)?;
-        }
-        Ok(())
-    }
-    fn process(&self, out_dir: &std::path::Path, compute_cap: usize) -> Result<()> {
-        println!("cargo:rerun-if-changed={}", self.kernel_dir);
-        let kernel_dir = PathBuf::from(self.kernel_dir);
-        let out_dir = out_dir.join(self.kernel_dir);
-        if !out_dir.exists() {
-            std::fs::create_dir_all(&out_dir)?;
-        }
-        let mut cu_files = vec![];
-        let mut cuh_files = vec![];
-        for file in std::fs::read_dir(kernel_dir)?.flatten() {
-            let file = file.path();
-            match file.extension().and_then(|v| v.to_str()) {
-                Some("cu") => cu_files.push(file),
-                Some("cuh") => cuh_files.push(file),
-                _ => {}
-            }
-        }
-
-        let mut ptx_paths = vec![];
-        for cu_file in cu_files.iter() {
-            let file_stem = cu_file
-                .file_stem()
-                .with_context(|| format!("no stem {cu_file:?}"))?;
-            let file_stem = file_stem.to_string_lossy().into_owned();
-            let ptx_file = out_dir.join(&format!("{file_stem}.ptx"));
-            self.maybe_build_ptx(cu_file, &ptx_file, compute_cap)?;
-            ptx_paths.push(ptx_file);
-        }
-
-        let regenerate_rs_file = true;
-        if regenerate_rs_file {
-            let mut file = std::fs::File::create(self.rust_target)?;
-            for ptx_path in ptx_paths {
-                let name = ptx_path
-                    .file_stem()
-                    .context("empty stem")?
-                    .to_string_lossy();
-                file.write_all(b"#[rustfmt::skip]\n")?;
-                let const_definition = format!(
-                    r#"pub const {}: &str = include_str!(concat!(env!("OUT_DIR"), "/{}/{name}.ptx"));"#,
-                    name.to_uppercase().replace('.', "_"),
-                    self.kernel_dir,
-                );
-                file.write_all(const_definition.as_bytes())?;
-                file.write_all(b"\n")?;
-            }
-        }
-        Ok(())
-    }
-}
-
 fn main() -> Result<()> {
     println!("cargo:rerun-if-changed=build.rs");
 
-    let out_dir = std::env::var("OUT_DIR").context("OUT_DIR not set")?;
-    let out_dir = PathBuf::from(out_dir);
-    #[cfg(feature = "cuda")]
-    set_cuda_include_dir()?;
     #[cfg(feature = "cuda")]
-    let compute_cap = compute_cap()?;
-    #[cfg(not(feature = "cuda"))]
-    let compute_cap = 0;
-    for d in DIRS {
-        d.process(&out_dir, compute_cap)?
-    }
-    Ok(())
-}
-
-fn set_cuda_include_dir() -> Result<()> {
-    // NOTE: copied from cudarc build.rs.
-    let env_vars = [
-        "CUDA_PATH",
-        "CUDA_ROOT",
-        "CUDA_TOOLKIT_ROOT_DIR",
-        "CUDNN_LIB",
-    ];
-    let env_vars = env_vars
-        .into_iter()
-        .map(std::env::var)
-        .filter_map(Result::ok)
-        .map(Into::<PathBuf>::into);
-
-    let roots = [
-        "/usr",
-        "/usr/local/cuda",
-        "/opt/cuda",
-        "/usr/lib/cuda",
-        "C:/Program Files/NVIDIA GPU Computing Toolkit",
-        "C:/CUDA",
-    ];
-    let roots = roots.into_iter().map(Into::<PathBuf>::into);
-    let root = env_vars
-        .chain(roots)
-        .find(|path| path.join("include").join("cuda.h").is_file())
-        .context("cannot find include/cuda.h")?;
-    println!(
-        "cargo:rustc-env=CUDA_INCLUDE_DIR={}",
-        root.join("include").display()
-    );
-    Ok(())
-}
-
-#[allow(unused)]
-fn compute_cap() -> Result<usize> {
-    println!("cargo:rerun-if-env-changed=CUDA_COMPUTE_CAP");
-
-    // Try to parse compute cap from env
-    let mut compute_cap = if let Ok(compute_cap_str) = std::env::var("CUDA_COMPUTE_CAP") {
-        println!("cargo:rustc-env=CUDA_COMPUTE_CAP={compute_cap_str}");
-        compute_cap_str
-            .parse::<usize>()
-            .context("Could not parse code")?
-    } else {
-        // Grab compute cap from nvidia-smi
-        let out = std::process::Command::new("nvidia-smi")
-            .arg("--query-gpu=compute_cap")
-            .arg("--format=csv")
-            .output()
-            .context("`nvidia-smi` failed. Ensure that you have CUDA installed and that `nvidia-smi` is in your PATH.")?;
-        let out = std::str::from_utf8(&out.stdout).context("stdout is not a utf8 string")?;
-        let mut lines = out.lines();
-        assert_eq!(
-            lines.next().context("missing line in stdout")?,
-            "compute_cap"
-        );
-        let cap = lines
-            .next()
-            .context("missing line in stdout")?
-            .replace('.', "");
-        println!("cargo:rustc-env=CUDA_COMPUTE_CAP={cap}");
-        cap.parse::<usize>()
-            .with_context(|| format!("cannot parse as int {cap}"))?
-    };
-
-    // Grab available GPU codes from nvcc and select the highest one
-    let max_nvcc_code = {
-        let out = std::process::Command::new("nvcc")
-            .arg("--list-gpu-code")
-            .output()
-            .expect("`nvcc` failed. Ensure that you have CUDA installed and that `nvcc` is in your PATH.");
-        let out = std::str::from_utf8(&out.stdout).unwrap();
-
-        let out = out.lines().collect::<Vec<&str>>();
-        let mut codes = Vec::with_capacity(out.len());
-        for code in out {
-            let code = code.split('_').collect::<Vec<&str>>();
-            if !code.is_empty() && code.contains(&"sm") {
-                if let Ok(num) = code[1].parse::<usize>() {
-                    codes.push(num);
-                }
-            }
+    {
+        for kdir in KERNEL_DIRS.iter() {
+            let builder = bindgen_cuda::Builder::default().kernel_paths_glob(kdir.kernel_glob);
+            println!("cargo:info={builder:?}");
+            let bindings = builder.build_ptx().unwrap();
+            bindings.write(kdir.rust_target).unwrap()
         }
-        codes.sort();
-        if !codes.contains(&compute_cap) {
-            anyhow::bail!(
-                "nvcc cannot target gpu arch {compute_cap}. Available nvcc targets are {codes:?}."
-            );
-        }
-        *codes.last().unwrap()
-    };
-
-    // If nvidia-smi compute_cap is higher than the highest gpu code from nvcc,
-    // then choose the highest gpu code in nvcc
-    if compute_cap > max_nvcc_code {
-        println!(
-            "cargo:warning=Lowering gpu arch {compute_cap} to max nvcc target {max_nvcc_code}."
-        );
-        compute_cap = max_nvcc_code;
     }
-
-    println!("cargo:rerun-if-env-changed=CUDA_COMPUTE_CAP");
-
-    if let Ok(compute_cap_str) = std::env::var("CUDA_COMPUTE_CAP") {
-        compute_cap = compute_cap_str
-            .parse::<usize>()
-            .with_context(|| format!("cannot parse as usize '{compute_cap_str}'"))?;
-        println!("cargo:warning=Using gpu arch {compute_cap} from $CUDA_COMPUTE_CAP");
+    #[cfg(not(feature = "cuda"))]
+    {
+        for kdir in KERNEL_DIRS.iter() {
+            let _file = std::fs::File::create(kdir.rust_target)?;
+        }
     }
-    println!("cargo:rustc-env=CUDA_COMPUTE_CAP=sm_{compute_cap}");
-    Ok(compute_cap)
+    Ok(())
 }
diff --git a/candle-examples/examples/custom-ops/cuda_kernels.rs b/candle-examples/examples/custom-ops/cuda_kernels.rs
index 0bee73aa..c00b601b 100644
--- a/candle-examples/examples/custom-ops/cuda_kernels.rs
+++ b/candle-examples/examples/custom-ops/cuda_kernels.rs
@@ -1,2 +1 @@
-#[rustfmt::skip]
-pub const LAYERNORM_KERNELS: &str = include_str!(concat!(env!("OUT_DIR"), "/examples/custom-ops/kernels//layernorm_kernels.ptx"));
+pub const LAYERNORM_KERNELS: &str = include_str!(concat!(env!("OUT_DIR"), "/layernorm_kernels.ptx"));
diff --git a/candle-examples/examples/custom-ops/main.rs b/candle-examples/examples/custom-ops/main.rs
index f2f534dc..30e413c1 100644
--- a/candle-examples/examples/custom-ops/main.rs
+++ b/candle-examples/examples/custom-ops/main.rs
@@ -6,7 +6,8 @@
 #[cfg(feature = "mkl")]
 extern crate intel_mkl_src;
 
-#[allow(unused)]
+#[rustfmt::skip]
+#[cfg(feature = "cuda")]
 mod cuda_kernels;
 
 use clap::Parser;
```