summary | refs | log | tree | commit | diff
path: root/candle-examples
diff options
context:
space:
mode:
author: Laurent Mazare <laurent.mazare@gmail.com>, 2024-01-07 17:18:46 +0100
committer: GitHub <noreply@github.com>, 2024-01-07 17:18:46 +0100
commit: 89b5a068585b73193d2004a7293d5b2fa6c30bfd (patch)
tree: 1bb39bbcf60bf4140c704baa827c382693d3497e /candle-examples
parent: 30313c308106fff7b20fc8cb2b27eb79800cb818 (diff)
download: candle-89b5a068585b73193d2004a7293d5b2fa6c30bfd.tar.gz
candle-89b5a068585b73193d2004a7293d5b2fa6c30bfd.tar.bz2
candle-89b5a068585b73193d2004a7293d5b2fa6c30bfd.zip
Use bindgen-cuda for the custom-kernel example. (#1536)
* Use bindgen-cuda for the custom-kernel example.
* Only depend on the kernels when cuda is enabled.
* Skip rustfmt.
Diffstat (limited to 'candle-examples')
-rw-r--r-- candle-examples/Cargo.toml | 3
-rw-r--r-- candle-examples/build.rs | 247
-rw-r--r-- candle-examples/examples/custom-ops/cuda_kernels.rs | 3
-rw-r--r-- candle-examples/examples/custom-ops/main.rs | 3
4 files changed, 20 insertions, 236 deletions
diff --git a/candle-examples/Cargo.toml b/candle-examples/Cargo.toml
index 439116f8..00340d08 100644
--- a/candle-examples/Cargo.toml
+++ b/candle-examples/Cargo.toml
@@ -49,11 +49,12 @@ tokio = "1.29.1"
[build-dependencies]
anyhow = { workspace = true }
+bindgen_cuda = { version = "0.1.1", optional = true }
[features]
default = []
accelerate = ["dep:accelerate-src", "candle/accelerate", "candle-nn/accelerate", "candle-transformers/accelerate"]
-cuda = ["candle/cuda", "candle-nn/cuda", "candle-transformers/cuda"]
+cuda = ["candle/cuda", "candle-nn/cuda", "candle-transformers/cuda", "dep:bindgen_cuda"]
cudnn = ["candle/cudnn"]
flash-attn = ["cuda", "candle-transformers/flash-attn", "dep:candle-flash-attn"]
mkl = ["dep:intel-mkl-src", "candle/mkl", "candle-nn/mkl", "candle-transformers/mkl"]
diff --git a/candle-examples/build.rs b/candle-examples/build.rs
index 0af3a6a4..ba40aeb4 100644
--- a/candle-examples/build.rs
+++ b/candle-examples/build.rs
@@ -4,251 +4,34 @@ use std::io::Write;
use std::path::PathBuf;
struct KernelDirectories {
- kernel_dir: &'static str,
+ kernel_glob: &'static str,
rust_target: &'static str,
include_dirs: &'static [&'static str],
}
-const DIRS: [KernelDirectories; 1] = [KernelDirectories {
- kernel_dir: "examples/custom-ops/kernels/",
+const KERNEL_DIRS: [KernelDirectories; 1] = [KernelDirectories {
+ kernel_glob: "examples/custom-ops/kernels/*.cu",
rust_target: "examples/custom-ops/cuda_kernels.rs",
include_dirs: &[],
}];
-impl KernelDirectories {
- fn maybe_build_ptx(
- &self,
- cu_file: &std::path::Path,
- ptx_file: &std::path::Path,
- compute_cap: usize,
- ) -> Result<()> {
- let should_compile = if ptx_file.exists() {
- let ptx_modified = ptx_file.metadata()?.modified()?;
- let cu_modified = cu_file.metadata()?.modified()?;
- cu_modified.duration_since(ptx_modified).is_ok()
- } else {
- true
- };
- if should_compile {
- #[cfg(feature = "cuda")]
- {
- let ccbin_env = std::env::var("CANDLE_NVCC_CCBIN");
- println!("cargo:rerun-if-env-changed=CANDLE_NVCC_CCBIN");
- let mut command = std::process::Command::new("nvcc");
- let out_dir = ptx_file.parent().context("no parent for ptx file")?;
- let include_dirs: Vec<String> =
- self.include_dirs.iter().map(|c| format!("-I{c}")).collect();
- command
- .arg(format!("--gpu-architecture=sm_{compute_cap}"))
- .arg("--ptx")
- .args(["--default-stream", "per-thread"])
- .args(["--output-directory", out_dir.to_str().unwrap()])
- .arg(format!("-I/{}", self.kernel_dir))
- .args(include_dirs)
- .arg(cu_file);
- if let Ok(ccbin_path) = &ccbin_env {
- command
- .arg("-allow-unsupported-compiler")
- .args(["-ccbin", ccbin_path]);
- }
- let output = command
- .spawn()
- .context("failed spawning nvcc")?
- .wait_with_output()?;
- if !output.status.success() {
- anyhow::bail!(
- "nvcc error while compiling {cu_file:?}:\n\n# stdout\n{:#}\n\n# stderr\n{:#}",
- String::from_utf8_lossy(&output.stdout),
- String::from_utf8_lossy(&output.stderr)
- )
- }
- }
- #[cfg(not(feature = "cuda"))]
- std::fs::OpenOptions::new()
- .create(true)
- .write(true)
- .open(ptx_file)?;
- }
- Ok(())
- }
- fn process(&self, out_dir: &std::path::Path, compute_cap: usize) -> Result<()> {
- println!("cargo:rerun-if-changed={}", self.kernel_dir);
- let kernel_dir = PathBuf::from(self.kernel_dir);
- let out_dir = out_dir.join(self.kernel_dir);
- if !out_dir.exists() {
- std::fs::create_dir_all(&out_dir)?;
- }
- let mut cu_files = vec![];
- let mut cuh_files = vec![];
- for file in std::fs::read_dir(kernel_dir)?.flatten() {
- let file = file.path();
- match file.extension().and_then(|v| v.to_str()) {
- Some("cu") => cu_files.push(file),
- Some("cuh") => cuh_files.push(file),
- _ => {}
- }
- }
-
- let mut ptx_paths = vec![];
- for cu_file in cu_files.iter() {
- let file_stem = cu_file
- .file_stem()
- .with_context(|| format!("no stem {cu_file:?}"))?;
- let file_stem = file_stem.to_string_lossy().into_owned();
- let ptx_file = out_dir.join(&format!("{file_stem}.ptx"));
- self.maybe_build_ptx(cu_file, &ptx_file, compute_cap)?;
- ptx_paths.push(ptx_file);
- }
-
- let regenerate_rs_file = true;
- if regenerate_rs_file {
- let mut file = std::fs::File::create(self.rust_target)?;
- for ptx_path in ptx_paths {
- let name = ptx_path
- .file_stem()
- .context("empty stem")?
- .to_string_lossy();
- file.write_all(b"#[rustfmt::skip]\n")?;
- let const_definition = format!(
- r#"pub const {}: &str = include_str!(concat!(env!("OUT_DIR"), "/{}/{name}.ptx"));"#,
- name.to_uppercase().replace('.', "_"),
- self.kernel_dir,
- );
- file.write_all(const_definition.as_bytes())?;
- file.write_all(b"\n")?;
- }
- }
- Ok(())
- }
-}
-
fn main() -> Result<()> {
println!("cargo:rerun-if-changed=build.rs");
- let out_dir = std::env::var("OUT_DIR").context("OUT_DIR not set")?;
- let out_dir = PathBuf::from(out_dir);
- #[cfg(feature = "cuda")]
- set_cuda_include_dir()?;
#[cfg(feature = "cuda")]
- let compute_cap = compute_cap()?;
- #[cfg(not(feature = "cuda"))]
- let compute_cap = 0;
- for d in DIRS {
- d.process(&out_dir, compute_cap)?
- }
- Ok(())
-}
-
-fn set_cuda_include_dir() -> Result<()> {
- // NOTE: copied from cudarc build.rs.
- let env_vars = [
- "CUDA_PATH",
- "CUDA_ROOT",
- "CUDA_TOOLKIT_ROOT_DIR",
- "CUDNN_LIB",
- ];
- let env_vars = env_vars
- .into_iter()
- .map(std::env::var)
- .filter_map(Result::ok)
- .map(Into::<PathBuf>::into);
-
- let roots = [
- "/usr",
- "/usr/local/cuda",
- "/opt/cuda",
- "/usr/lib/cuda",
- "C:/Program Files/NVIDIA GPU Computing Toolkit",
- "C:/CUDA",
- ];
- let roots = roots.into_iter().map(Into::<PathBuf>::into);
- let root = env_vars
- .chain(roots)
- .find(|path| path.join("include").join("cuda.h").is_file())
- .context("cannot find include/cuda.h")?;
- println!(
- "cargo:rustc-env=CUDA_INCLUDE_DIR={}",
- root.join("include").display()
- );
- Ok(())
-}
-
-#[allow(unused)]
-fn compute_cap() -> Result<usize> {
- println!("cargo:rerun-if-env-changed=CUDA_COMPUTE_CAP");
-
- // Try to parse compute cap from env
- let mut compute_cap = if let Ok(compute_cap_str) = std::env::var("CUDA_COMPUTE_CAP") {
- println!("cargo:rustc-env=CUDA_COMPUTE_CAP={compute_cap_str}");
- compute_cap_str
- .parse::<usize>()
- .context("Could not parse code")?
- } else {
- // Grab compute cap from nvidia-smi
- let out = std::process::Command::new("nvidia-smi")
- .arg("--query-gpu=compute_cap")
- .arg("--format=csv")
- .output()
- .context("`nvidia-smi` failed. Ensure that you have CUDA installed and that `nvidia-smi` is in your PATH.")?;
- let out = std::str::from_utf8(&out.stdout).context("stdout is not a utf8 string")?;
- let mut lines = out.lines();
- assert_eq!(
- lines.next().context("missing line in stdout")?,
- "compute_cap"
- );
- let cap = lines
- .next()
- .context("missing line in stdout")?
- .replace('.', "");
- println!("cargo:rustc-env=CUDA_COMPUTE_CAP={cap}");
- cap.parse::<usize>()
- .with_context(|| format!("cannot parse as int {cap}"))?
- };
-
- // Grab available GPU codes from nvcc and select the highest one
- let max_nvcc_code = {
- let out = std::process::Command::new("nvcc")
- .arg("--list-gpu-code")
- .output()
- .expect("`nvcc` failed. Ensure that you have CUDA installed and that `nvcc` is in your PATH.");
- let out = std::str::from_utf8(&out.stdout).unwrap();
-
- let out = out.lines().collect::<Vec<&str>>();
- let mut codes = Vec::with_capacity(out.len());
- for code in out {
- let code = code.split('_').collect::<Vec<&str>>();
- if !code.is_empty() && code.contains(&"sm") {
- if let Ok(num) = code[1].parse::<usize>() {
- codes.push(num);
- }
- }
+ {
+ for kdir in KERNEL_DIRS.iter() {
+ let builder = bindgen_cuda::Builder::default().kernel_paths_glob(kdir.kernel_glob);
+ println!("cargo:info={builder:?}");
+ let bindings = builder.build_ptx().unwrap();
+ bindings.write(kdir.rust_target).unwrap()
}
- codes.sort();
- if !codes.contains(&compute_cap) {
- anyhow::bail!(
- "nvcc cannot target gpu arch {compute_cap}. Available nvcc targets are {codes:?}."
- );
- }
- *codes.last().unwrap()
- };
-
- // If nvidia-smi compute_cap is higher than the highest gpu code from nvcc,
- // then choose the highest gpu code in nvcc
- if compute_cap > max_nvcc_code {
- println!(
- "cargo:warning=Lowering gpu arch {compute_cap} to max nvcc target {max_nvcc_code}."
- );
- compute_cap = max_nvcc_code;
}
-
- println!("cargo:rerun-if-env-changed=CUDA_COMPUTE_CAP");
-
- if let Ok(compute_cap_str) = std::env::var("CUDA_COMPUTE_CAP") {
- compute_cap = compute_cap_str
- .parse::<usize>()
- .with_context(|| format!("cannot parse as usize '{compute_cap_str}'"))?;
- println!("cargo:warning=Using gpu arch {compute_cap} from $CUDA_COMPUTE_CAP");
+ #[cfg(not(feature = "cuda"))]
+ {
+ for kdir in KERNEL_DIRS.iter() {
+ let _file = std::fs::File::create(kdir.rust_target)?;
+ }
}
- println!("cargo:rustc-env=CUDA_COMPUTE_CAP=sm_{compute_cap}");
- Ok(compute_cap)
+ Ok(())
}
diff --git a/candle-examples/examples/custom-ops/cuda_kernels.rs b/candle-examples/examples/custom-ops/cuda_kernels.rs
index 0bee73aa..c00b601b 100644
--- a/candle-examples/examples/custom-ops/cuda_kernels.rs
+++ b/candle-examples/examples/custom-ops/cuda_kernels.rs
@@ -1,2 +1 @@
-#[rustfmt::skip]
-pub const LAYERNORM_KERNELS: &str = include_str!(concat!(env!("OUT_DIR"), "/examples/custom-ops/kernels//layernorm_kernels.ptx"));
+pub const LAYERNORM_KERNELS: &str = include_str!(concat!(env!("OUT_DIR"), "/layernorm_kernels.ptx"));
diff --git a/candle-examples/examples/custom-ops/main.rs b/candle-examples/examples/custom-ops/main.rs
index f2f534dc..30e413c1 100644
--- a/candle-examples/examples/custom-ops/main.rs
+++ b/candle-examples/examples/custom-ops/main.rs
@@ -6,7 +6,8 @@
#[cfg(feature = "mkl")]
extern crate intel_mkl_src;
-#[allow(unused)]
+#[rustfmt::skip]
+#[cfg(feature = "cuda")]
mod cuda_kernels;
use clap::Parser;