author     OlivierDehaene <Olivier.dehaene@gmail.com>  2023-10-16 16:37:38 +0200
committer  GitHub <noreply@github.com>                 2023-10-16 15:37:38 +0100
commit     75629981bc2b101400a301803c027da2362a4ff9 (patch)
tree       105781868b6024facddbf05492dacd33873d4903 /candle-kernels/build.rs
parent     0106b0b04c3505a1155b3eab65ac212977c6c3dd (diff)
feat: parse Cuda compute cap from env (#1066)
* feat: add support for multiple compute caps
* Revert to one compute cap
* fmt
* fix
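In short, the build script now honors an explicit CUDA_COMPUTE_CAP environment variable (e.g. `CUDA_COMPUTE_CAP=86 cargo build` for sm_86) and only shells out to `nvidia-smi` when it is unset. A condensed sketch of that lookup order, simplified from the actual `compute_cap()` in the diff below (error handling collapsed into `Option`, so this is illustrative rather than the shipped code):

    // Sketch: explicit env override first, nvidia-smi fallback second.
    fn resolve_compute_cap() -> Option<usize> {
        // 1. Explicit override, e.g. CUDA_COMPUTE_CAP=86 for sm_86.
        if let Ok(cap) = std::env::var("CUDA_COMPUTE_CAP") {
            return cap.trim().parse().ok();
        }
        // 2. Otherwise query the driver through nvidia-smi; expected output is
        //    a "compute_cap" header line followed by a value such as "8.6".
        let out = std::process::Command::new("nvidia-smi")
            .args(["--query-gpu=compute_cap", "--format=csv"])
            .output()
            .ok()?;
        let text = String::from_utf8(out.stdout).ok()?;
        text.lines().nth(1)?.trim().replace('.', "").parse().ok()
    }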
Diffstat (limited to 'candle-kernels/build.rs')
-rw-r--r--  candle-kernels/build.rs | 196
1 file changed, 108 insertions(+), 88 deletions(-)
diff --git a/candle-kernels/build.rs b/candle-kernels/build.rs
index ad084671..17a0bf9c 100644
--- a/candle-kernels/build.rs
+++ b/candle-kernels/build.rs
@@ -1,4 +1,5 @@
use std::io::Write;
+
fn main() {
println!("cargo:rerun-if-changed=build.rs");
@@ -23,6 +24,8 @@ fn main() {
}
mod cuda {
+ use anyhow::{Context, Result};
+
pub fn set_include_dir() {
use std::path::PathBuf;
// NOTE: copied from cudarc build.rs.
@@ -100,34 +103,112 @@ mod cuda {
include_directories.sort();
include_directories.dedup();
+ let compute_cap = compute_cap().expect("Could not get Cuda compute cap");
+
#[allow(unused)]
let include_options: Vec<String> = include_directories
.into_iter()
.map(|s| "-I".to_string() + &s.into_os_string().into_string().unwrap())
.collect::<Vec<_>>();
- // let start = std::time::Instant::now();
+ let ccbin_env = std::env::var("CANDLE_NVCC_CCBIN");
+ println!("cargo:rerun-if-env-changed=CANDLE_NVCC_CCBIN");
+ let children = kernel_paths
+ .par_iter()
+ .flat_map(|p| {
+ let mut output = p.clone();
+ output.set_extension("ptx");
+ let output_filename = std::path::Path::new(&out_dir).to_path_buf().join("out").with_file_name(output.file_name().unwrap());
+
+ let ignore = if output_filename.exists() {
+ let out_modified = output_filename.metadata().unwrap().modified().unwrap();
+ let in_modified = p.metadata().unwrap().modified().unwrap();
+ out_modified.duration_since(in_modified).is_ok()
+ } else {
+ false
+ };
+ if ignore {
+ None
+ } else {
+ let mut command = std::process::Command::new("nvcc");
+ command.arg(format!("--gpu-architecture=sm_{compute_cap}"))
+ .arg("--ptx")
+ .args(["--default-stream", "per-thread"])
+ .args(["--output-directory", &out_dir])
+ // Flash attention only
+ // .arg("--expt-relaxed-constexpr")
+ .args(&include_options);
+ if let Ok(ccbin_path) = &ccbin_env {
+ command
+ .arg("-allow-unsupported-compiler")
+ .args(["-ccbin", ccbin_path]);
+ }
+ command.arg(p);
+ Some((p, command.spawn()
+ .expect("nvcc failed to start. Ensure that you have CUDA installed and that `nvcc` is in your PATH.").wait_with_output()))
+ }
+ })
+ .collect::<Vec<_>>();
+
+ let ptx_paths: Vec<PathBuf> = glob::glob(&format!("{out_dir}/**/*.ptx"))
+ .unwrap()
+ .map(|p| p.unwrap())
+ .collect();
+ // We should rewrite `src/lib.rs` only if there are some newly compiled kernels, or removed
+ // some old ones
+ let write = !children.is_empty() || kernel_paths.len() < ptx_paths.len();
+ for (kernel_path, child) in children {
+ let output = child.expect("nvcc failed to run. Ensure that you have CUDA installed and that `nvcc` is in your PATH.");
+ assert!(
+ output.status.success(),
+ "nvcc error while compiling {kernel_path:?}:\n\n# stdout\n{:#}\n\n# stderr\n{:#}",
+ String::from_utf8_lossy(&output.stdout),
+ String::from_utf8_lossy(&output.stderr)
+ );
+ }
+ (write, kernel_paths)
+ }
+
+ #[allow(unused)]
+ fn compute_cap() -> Result<usize> {
+ println!("cargo:rerun-if-env-changed=CUDA_COMPUTE_CAP");
- // Grab compute code from nvidia-smi
- let mut compute_cap = {
+ // Try to parse compute caps from env
+ let mut compute_cap = if let Ok(compute_cap_str) = std::env::var("CUDA_COMPUTE_CAP") {
+ println!("cargo:rustc-env=CUDA_COMPUTE_CAP={compute_cap_str}");
+ compute_cap_str
+ .parse::<usize>()
+ .context("Could not parse code")?
+ } else {
+ // Use nvidia-smi to get the current compute cap
let out = std::process::Command::new("nvidia-smi")
- .arg("--query-gpu=compute_cap")
- .arg("--format=csv")
- .output()
- .expect("`nvidia-smi` failed. Ensure that you have CUDA installed and that `nvidia-smi` is in your PATH.");
- let out = std::str::from_utf8(&out.stdout).unwrap();
+ .arg("--query-gpu=compute_cap")
+ .arg("--format=csv")
+ .output()
+ .context("`nvidia-smi` failed. Ensure that you have CUDA installed and that `nvidia-smi` is in your PATH.")?;
+ let out = std::str::from_utf8(&out.stdout).context("stdout is not a utf8 string")?;
let mut lines = out.lines();
- assert_eq!(lines.next().unwrap(), "compute_cap");
- let cap = lines.next().unwrap().replace('.', "");
- cap.parse::<usize>().unwrap()
+ assert_eq!(
+ lines.next().context("missing line in stdout")?,
+ "compute_cap"
+ );
+ let cap = lines
+ .next()
+ .context("missing line in stdout")?
+ .replace('.', "");
+ let cap = cap
+ .parse::<usize>()
+ .with_context(|| format!("cannot parse as int {cap}"))?;
+ println!("cargo:rustc-env=CUDA_COMPUTE_CAP={cap}");
+ cap
};
// Grab available GPU codes from nvcc and select the highest one
- let max_nvcc_code = {
+ let (supported_nvcc_codes, max_nvcc_code) = {
let out = std::process::Command::new("nvcc")
- .arg("--list-gpu-code")
- .output()
- .expect("`nvcc` failed. Ensure that you have CUDA installed and that `nvcc` is in your PATH.");
+ .arg("--list-gpu-code")
+ .output()
+ .expect("`nvcc` failed. Ensure that you have CUDA installed and that `nvcc` is in your PATH.");
let out = std::str::from_utf8(&out.stdout).unwrap();
let out = out.lines().collect::<Vec<&str>>();
@@ -141,83 +222,22 @@ mod cuda {
}
}
codes.sort();
- if !codes.contains(&compute_cap) {
- panic!("nvcc cannot target gpu arch {compute_cap}. Available nvcc targets are {codes:?}.");
- }
- *codes.last().unwrap()
+ let max_nvcc_code = *codes.last().context("no gpu codes parsed from nvcc")?;
+ (codes, max_nvcc_code)
};
- // If nvidia-smi compute_cap is higher than the highest gpu code from nvcc,
- // then choose the highest gpu code in nvcc
- if compute_cap > max_nvcc_code {
- println!(
- "cargo:warning=Lowering gpu arch {compute_cap} to max nvcc target {max_nvcc_code}."
- );
- compute_cap = max_nvcc_code;
+ // Check that nvcc supports the asked compute caps
+ if !supported_nvcc_codes.contains(&compute_cap) {
+ anyhow::bail!(
+ "nvcc cannot target gpu arch {compute_cap}. Available nvcc targets are {supported_nvcc_codes:?}."
+ );
}
-
- println!("cargo:rerun-if-env-changed=CUDA_COMPUTE_CAP");
- if let Ok(compute_cap_str) = std::env::var("CUDA_COMPUTE_CAP") {
- compute_cap = compute_cap_str.parse::<usize>().unwrap();
- println!("cargo:warning=Using gpu arch {compute_cap} from $CUDA_COMPUTE_CAP");
+ if compute_cap > max_nvcc_code {
+ anyhow::bail!(
+ "CUDA compute cap {compute_cap} is higher than the highest gpu code from nvcc {max_nvcc_code}"
+ );
}
- println!("cargo:rustc-env=CUDA_COMPUTE_CAP=sm_{compute_cap}");
-
- let ccbin_env = std::env::var("CANDLE_NVCC_CCBIN");
- println!("cargo:rerun-if-env-changed=CANDLE_NVCC_CCBIN");
- let children = kernel_paths
- .par_iter()
- .flat_map(|p| {
- let mut output = p.clone();
- output.set_extension("ptx");
- let output_filename = std::path::Path::new(&out_dir).to_path_buf().join("out").with_file_name(output.file_name().unwrap());
-
- let ignore = if output_filename.exists() {
- let out_modified = output_filename.metadata().unwrap().modified().unwrap();
- let in_modified = p.metadata().unwrap().modified().unwrap();
- out_modified.duration_since(in_modified).is_ok()
- }else{
- false
- };
- if ignore{
- None
- }else{
- let mut command = std::process::Command::new("nvcc");
- command.arg(format!("--gpu-architecture=sm_{compute_cap}"))
- .arg("--ptx")
- .args(["--default-stream", "per-thread"])
- .args(["--output-directory", &out_dir])
- // Flash attention only
- // .arg("--expt-relaxed-constexpr")
- .args(&include_options);
- if let Ok(ccbin_path) = &ccbin_env {
- command
- .arg("-allow-unsupported-compiler")
- .args(["-ccbin", ccbin_path]);
- }
- command.arg(p);
- Some((p, command.spawn()
- .expect("nvcc failed to start. Ensure that you have CUDA installed and that `nvcc` is in your PATH.").wait_with_output()))
- }})
- .collect::<Vec<_>>();
-
- let ptx_paths: Vec<PathBuf> = glob::glob(&format!("{out_dir}/**/*.ptx"))
- .unwrap()
- .map(|p| p.unwrap())
- .collect();
- // We should rewrite `src/lib.rs` only if there are some newly compiled kernels, or removed
- // some old ones
- let write = !children.is_empty() || kernel_paths.len() < ptx_paths.len();
- for (kernel_path, child) in children {
- let output = child.expect("nvcc failed to run. Ensure that you have CUDA installed and that `nvcc` is in your PATH.");
- assert!(
- output.status.success(),
- "nvcc error while compiling {kernel_path:?}:\n\n# stdout\n{:#}\n\n# stderr\n{:#}",
- String::from_utf8_lossy(&output.stdout),
- String::from_utf8_lossy(&output.stderr)
- );
- }
- (write, kernel_paths)
+ Ok(compute_cap)
}
}
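Note that the resolved cap is re-exported with `cargo:rustc-env=CUDA_COMPUTE_CAP={cap}`, which makes the value visible to the crate's own sources at compile time. A hypothetical consumer, not part of this commit, could read it through the `env!` macro:

    // Hypothetical: `cargo:rustc-env` exposes the value to the compiled crate.
    const CUDA_COMPUTE_CAP: &str = env!("CUDA_COMPUTE_CAP");

    fn describe_target() -> String {
        format!("kernels compiled for sm_{CUDA_COMPUTE_CAP}")
    }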