diff --git a/docs/source/builder-cli.md b/docs/source/builder-cli.md index a4c80025..5a150328 100644 --- a/docs/source/builder-cli.md +++ b/docs/source/builder-cli.md @@ -244,6 +244,10 @@ Generate CMake files for a kernel extension build * `-f`, `--force` — Force-overwrite existing files * `--unique-id ` — This is an optional unique identifier that is suffixed to the kernel name to avoid name collisions. (e.g. Git SHA) +* `--kernel-sha ` — Full commit SHA of the kernel source, recorded in the build metadata. When absent, it is detected from the kernel's git repository (used by Nix builds where the source has no `.git`) +* `--kernel-dirty` — Mark the kernel source as having uncommitted changes in the build metadata. Only meaningful together with `--kernel-sha` +* `--kernel-builder-sha ` — Full commit SHA of the `kernel-builder` source, recorded in the build metadata. When absent, the SHA baked in at compile time is used (Nix builds pass this since the sandbox has no `.git`) +* `--kernel-builder-dirty` — Mark `kernel-builder` as having uncommitted changes in the build metadata. Only meaningful together with `--kernel-builder-sha` diff --git a/docs/source/kernel-requirements.md b/docs/source/kernel-requirements.md index eac3ead6..8e83ea65 100644 --- a/docs/source/kernel-requirements.md +++ b/docs/source/kernel-requirements.md @@ -81,6 +81,26 @@ metadata. Currently the following top-level keys are supported: - `digest` (`Digest`, required): hash digest of the kernel files. - `python-depends` (`list[str]`, optional): list of Python dependencies from a curated set of Python dependencies. +- `build-info` (`dict`, optional): provenance of the build, used to flag + non-reproducible (dirty) builds. 
It contains two optional sub-objects: + - `kernel-builder`: the `kernel-builder` that produced the build, with its + `version` (`str`), the `sha` (`str`) of the `kernel-builder` source it was + built from (when known), and a `dirty` (`bool`) flag that is `true` when + `kernel-builder` was built from a source tree with uncommitted changes. + - `kernel`: the kernel source that was built, with its commit `sha` (`str`) + and a `dirty` (`bool`) flag that is `true` when the kernel source had + uncommitted changes. + + When either `dirty` flag is set, the kernel was built from uncommitted + sources and cannot be reliably reproduced. `kernels` warns when loading such + a build, `kernel-builder` warns when uploading it, and a warning banner is + added to the Hub repository's README. + + > **Note:** For Nix builds, dirtiness follows Nix's flake tree status, which + > also counts **untracked** files (including an uncommitted `flake.lock`). + > Commit your `flake.lock` (and avoid stray untracked files) so that clean + > builds are not flagged as dirty. Local `create-pyproject` runs only + > consider changes to tracked files. Example `metadata.json`: diff --git a/flake.nix b/flake.nix index aa4b29c5..00c03479 100644 --- a/flake.nix +++ b/flake.nix @@ -37,6 +37,14 @@ ; inherit (import ./nix-builder/lib/cache.nix) mkForCache; + # Git provenance of `kernel-builder` itself, recorded in the build + # metadata of kernels it builds. `self` here is the `kernel-builder` + # flake (as resolved in the consuming kernel's lock). It is captured + # here because the `self` argument of `genKernelFlakeOutputs` below + # shadows it with the *kernel's* `self`. + builderRev = self.rev or self.dirtyRev or null; + builderDirty = !(self ? 
rev); + systems = with flake-utils.lib.system; [ aarch64-darwin aarch64-linux @@ -109,6 +117,8 @@ path rev self + builderRev + builderDirty doGetKernelCheck pythonCheckInputs pythonNativeCheckInputs diff --git a/kernel-builder/build.rs b/kernel-builder/build.rs index 3acdfe01..4022e5df 100644 --- a/kernel-builder/build.rs +++ b/kernel-builder/build.rs @@ -1,3 +1,52 @@ +use std::process::Command; + fn main() { minijinja_embed::embed_templates!("src/pyproject/templates"); + emit_git_info(); +} + +fn emit_git_info() { + let sha = env_var("KERNEL_BUILDER_GIT_SHA").or_else(|| git_output(&["rev-parse", "HEAD"])); + if let Some(sha) = sha { + println!("cargo:rustc-env=KERNEL_BUILDER_GIT_SHA={sha}"); + } + + let dirty = match env_var("KERNEL_BUILDER_GIT_DIRTY") { + Some(value) => is_truthy(&value), + // Only consider tracked files; untracked files (e.g. generated build + // artifacts) should not mark the build as dirty. + None => git_output(&["status", "--porcelain", "--untracked-files=no"]) + .map(|out| !out.is_empty()) + .unwrap_or(false), + }; + println!( + "cargo:rustc-env=KERNEL_BUILDER_GIT_DIRTY={}", + if dirty { "1" } else { "0" } + ); + + println!("cargo:rerun-if-env-changed=KERNEL_BUILDER_GIT_SHA"); + println!("cargo:rerun-if-env-changed=KERNEL_BUILDER_GIT_DIRTY"); + println!("cargo:rerun-if-changed=../.git/HEAD"); + println!("cargo:rerun-if-changed=../.git/index"); +} + +fn env_var(name: &str) -> Option { + std::env::var(name).ok().filter(|s| !s.is_empty()) +} + +fn is_truthy(value: &str) -> bool { + value == "1" || value.eq_ignore_ascii_case("true") +} + +fn git_output(args: &[&str]) -> Option { + let output = Command::new("git").args(args).output().ok()?; + if !output.status.success() { + return None; + } + let s = String::from_utf8_lossy(&output.stdout).trim().to_string(); + if s.is_empty() { + None + } else { + Some(s) + } } diff --git a/kernel-builder/src/main.rs b/kernel-builder/src/main.rs index 1a7a7a8b..c26c2e48 100644 --- a/kernel-builder/src/main.rs 
+++ b/kernel-builder/src/main.rs @@ -174,6 +174,28 @@ enum Commands { /// kernel name to avoid name collisions. (e.g. Git SHA) #[arg(long)] unique_id: Option, + + /// Full commit SHA of the kernel source, recorded in the build + /// metadata. When absent, it is detected from the kernel's git + /// repository (used by Nix builds where the source has no `.git`). + #[arg(long)] + kernel_sha: Option, + + /// Mark the kernel source as having uncommitted changes in the build + /// metadata. Only meaningful together with `--kernel-sha`. + #[arg(long)] + kernel_dirty: bool, + + /// Full commit SHA of the `kernel-builder` source, recorded in the + /// build metadata. When absent, the SHA baked in at compile time is + /// used (Nix builds pass this since the sandbox has no `.git`). + #[arg(long)] + kernel_builder_sha: Option, + + /// Mark `kernel-builder` as having uncommitted changes in the build + /// metadata. Only meaningful together with `--kernel-builder-sha`. + #[arg(long)] + kernel_builder_dirty: bool, }, /// Spawn a kernel development shell. 
@@ -365,7 +387,20 @@ fn main() -> Result<()> { force, target_dir, unique_id, - } => create_pyproject(kernel_dir, target_dir, force, unique_id), + kernel_sha, + kernel_dirty, + kernel_builder_sha, + kernel_builder_dirty, + } => create_pyproject( + kernel_dir, + target_dir, + force, + unique_id, + kernel_sha, + kernel_dirty, + kernel_builder_sha, + kernel_builder_dirty, + ), Commands::Devshell { kernel_dir, variant, diff --git a/kernel-builder/src/pyproject/common.rs b/kernel-builder/src/pyproject/common.rs index c03ddb2e..16ba5d13 100644 --- a/kernel-builder/src/pyproject/common.rs +++ b/kernel-builder/src/pyproject/common.rs @@ -4,7 +4,7 @@ use eyre::Result; use itertools::Itertools; use kernels_data::config::{Backend, General}; -use kernels_data::metadata::{BackendInfo, Metadata}; +use kernels_data::metadata::{BackendInfo, BuildInfo, KernelBuilderInfo, Metadata}; use crate::pyproject::ops_identifier::KernelIdentifier; use crate::pyproject::FileSet; @@ -20,11 +20,30 @@ pub fn write_compat_py(file_set: &mut FileSet) -> Result<()> { Ok(()) } +fn kernel_builder_info() -> KernelBuilderInfo { + KernelBuilderInfo { + version: env!("CARGO_PKG_VERSION").to_owned(), + sha: option_env!("KERNEL_BUILDER_GIT_SHA").map(str::to_owned), + dirty: matches!(option_env!("KERNEL_BUILDER_GIT_DIRTY"), Some("1")), + } +} + pub fn write_metadata( general: &General, kernel_id: &KernelIdentifier, file_set: &mut FileSet, ) -> Result<()> { + // Prefer externally-provided `kernel-builder` provenance (e.g. from Nix), + // falling back to the provenance baked in at compile time. 
+ let kernel_builder = kernel_id + .kernel_builder() + .cloned() + .unwrap_or_else(kernel_builder_info); + let build_info = BuildInfo { + kernel_builder: Some(kernel_builder), + kernel: kernel_id.git_info().cloned(), + }; + for backend in &Backend::all() { let writer = file_set.entry(format!("metadata-{backend}.json")); @@ -51,6 +70,7 @@ pub fn write_metadata( backend_type: *backend, }, digest: None, + build_info: Some(build_info.clone()), }; serde_json::to_writer_pretty(writer, &metadata)?; diff --git a/kernel-builder/src/pyproject/mod.rs b/kernel-builder/src/pyproject/mod.rs index 5582d5b6..be33e6be 100644 --- a/kernel-builder/src/pyproject/mod.rs +++ b/kernel-builder/src/pyproject/mod.rs @@ -6,6 +6,7 @@ use std::{ use eyre::{bail, Result}; use kernels_data::config::{Build, Framework}; +use kernels_data::metadata::{GitInfo, KernelBuilderInfo}; use minijinja::Environment; use crate::{ @@ -39,16 +40,38 @@ pub fn create_pyproject_file_set(build: Build, kernel_id: &KernelIdentifier) -> Ok(file_set) } +#[allow(clippy::too_many_arguments)] pub fn create_pyproject( kernel_dir: Option, target_dir: Option, force: bool, unique_id: Option, + kernel_sha: Option, + kernel_dirty: bool, + kernel_builder_sha: Option, + kernel_builder_dirty: bool, ) -> Result<()> { let kernel_dir = check_or_infer_kernel_dir(kernel_dir)?; let target_dir = check_or_infer_target_dir(&kernel_dir, target_dir)?; let build = parse_build(&kernel_dir)?; - let kernel_id = KernelIdentifier::new(&kernel_dir, build.general.name.python_name(), unique_id); + let git_override = kernel_sha.map(|sha| GitInfo { + sha, + dirty: kernel_dirty, + }); + // The version is always that of the running `kernel-builder`; only the + // git provenance is supplied externally (e.g. by Nix). 
+ let kernel_builder_override = kernel_builder_sha.map(|sha| KernelBuilderInfo { + version: env!("CARGO_PKG_VERSION").to_owned(), + sha: Some(sha), + dirty: kernel_builder_dirty, + }); + let kernel_id = KernelIdentifier::new( + &kernel_dir, + build.general.name.python_name(), + unique_id, + git_override, + kernel_builder_override, + ); let file_set = create_pyproject_file_set(build, &kernel_id)?; file_set.write(&target_dir, force)?; @@ -65,7 +88,14 @@ pub fn clean_pyproject( let kernel_dir = check_or_infer_kernel_dir(kernel_dir)?; let target_dir = check_or_infer_target_dir(&kernel_dir, target_dir)?; let build = parse_build(&kernel_dir)?; - let kernel_id = KernelIdentifier::new(&kernel_dir, build.general.name.python_name(), unique_id); + // Provenance is irrelevant when computing the set of files to clean. + let kernel_id = KernelIdentifier::new( + &kernel_dir, + build.general.name.python_name(), + unique_id, + None, + None, + ); let generated_files = create_pyproject_file_set(build, &kernel_id)?.into_names(); diff --git a/kernel-builder/src/pyproject/ops_identifier.rs b/kernel-builder/src/pyproject/ops_identifier.rs index 46157260..5c8b7ecf 100644 --- a/kernel-builder/src/pyproject/ops_identifier.rs +++ b/kernel-builder/src/pyproject/ops_identifier.rs @@ -3,6 +3,7 @@ use std::path::Path; use eyre::{Result, WrapErr}; use git2::Repository; use kernels_data::config::Backend; +use kernels_data::metadata::{GitInfo, KernelBuilderInfo}; use rand::Rng; pub fn random_identifier() -> String { @@ -28,9 +29,25 @@ pub fn git_identifier(target_dir: impl AsRef) -> Result { Ok(if dirty { format!("{rev}_dirty") } else { rev }) } +pub fn git_info(target_dir: impl AsRef) -> Option { + let repo = Repository::discover(target_dir.as_ref()).ok()?; + let head = repo.head().ok()?; + let commit = head.peel_to_commit().ok()?; + let sha = commit.id().to_string(); + + let mut status_options = git2::StatusOptions::new(); + status_options.include_untracked(false); // Ignore untracked files (like 
generated CMake files) + status_options.exclude_submodules(true); + let dirty = !repo.statuses(Some(&mut status_options)).ok()?.is_empty(); + + Some(GitInfo { sha, dirty }) +} + pub struct KernelIdentifier { name: String, unique_id: String, + git_info: Option, + kernel_builder: Option, } impl KernelIdentifier { @@ -41,19 +58,44 @@ impl KernelIdentifier { /// source revision. If this identefier is not provided, a Git short /// hash is extracted from the repository of `target_dir`. If that /// fails, a random identifier is generated. - pub fn new(kernel_dir: impl AsRef, name: String, unique_id: Option) -> Self { + pub fn new( + kernel_dir: impl AsRef, + name: String, + unique_id: Option, + git_override: Option, + kernel_builder: Option, + ) -> Self { + // Prefer an explicitly provided git provenance (e.g. passed by Nix + // builds, where the source tree has no `.git`); fall back to detecting + // it from the kernel's git repository. + let git_info = git_override.or_else(|| git_info(kernel_dir.as_ref())); let unique_id = unique_id.unwrap_or_else(|| match git_identifier(kernel_dir.as_ref()) { Ok(rev) => rev, Err(_) => random_identifier(), }); - Self { name, unique_id } + Self { + name, + unique_id, + git_info, + kernel_builder, + } } pub fn name(&self) -> &str { &self.name } + pub fn git_info(&self) -> Option<&GitInfo> { + self.git_info.as_ref() + } + + /// Externally-provided `kernel-builder` provenance, when available. When + /// `None`, the compile-time baked provenance is used instead. + pub fn kernel_builder(&self) -> Option<&KernelBuilderInfo> { + self.kernel_builder.as_ref() + } + /// Create the kernel identifier string for a given backend. 
pub fn to_string_for_backend(&self, backend: Backend) -> String { format!("_{}_{}_{}", self.name, backend, self.unique_id) @@ -63,3 +105,59 @@ impl KernelIdentifier { &self.unique_id } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn explicit_provenance_overrides_are_stored() { + let tmp = tempfile::tempdir().unwrap(); + let git = GitInfo { + sha: "a".repeat(40), + dirty: true, + }; + let kernel_builder = KernelBuilderInfo { + version: "0.16.0-dev0".to_owned(), + sha: Some("b".repeat(40)), + dirty: false, + }; + + let id = KernelIdentifier::new( + tmp.path(), + "relu".to_owned(), + Some("rev123".to_owned()), + Some(git), + Some(kernel_builder), + ); + + assert_eq!(id.unique_id(), "rev123"); + assert_eq!(id.to_string_for_backend(Backend::Cuda), "_relu_cuda_rev123"); + + let git = id.git_info().expect("kernel git info"); + assert_eq!(git.sha, "a".repeat(40)); + assert!(git.dirty); + + let kb = id.kernel_builder().expect("kernel-builder info"); + assert_eq!(kb.sha.as_deref(), Some(&"b".repeat(40)[..])); + assert!(!kb.dirty); + } + + #[test] + fn no_provenance_in_non_git_dir() { + let tmp = tempfile::tempdir().unwrap(); + let id = KernelIdentifier::new( + tmp.path(), + "relu".to_owned(), + Some("x".to_owned()), + None, + None, + ); + + // No overrides supplied and the temp dir is not a git repository, so + // no kernel provenance is recorded. The kernel-builder provenance is + // filled in later from the compile-time default in `write_metadata`. 
+ assert!(id.git_info().is_none()); + assert!(id.kernel_builder().is_none()); + } +} diff --git a/kernel-builder/src/upload.rs b/kernel-builder/src/upload.rs index 2113c2f3..ebfa06d2 100644 --- a/kernel-builder/src/upload.rs +++ b/kernel-builder/src/upload.rs @@ -150,6 +150,11 @@ fn run_upload_typed(args: UploadArgs) -> Result<()> { build_dir.display() ); + let dirty_build = detect_dirty_build(&variants); + if dirty_build { + eprintln!("{DIRTY_BUILD_WARNING}"); + } + let (repo_id, branch) = get_repo_and_branch(&kernel_dir, args.repo_id, args.branch, &variants)?; let repo_url = match api @@ -207,6 +212,7 @@ fn run_upload_typed(args: UploadArgs) -> Result<()> { collect_readme_commit_ops( &kernel_dir, + dirty_build, operations_by_branch .entry(MAIN_BRANCH.to_owned()) .or_default(), @@ -363,16 +369,51 @@ fn collect_benchmark_commit_ops( } /// Collect README commit operation: upload build/CARD.md as README.md. -fn collect_readme_commit_ops(kernel_dir: &Path, operations: &mut Vec) { +fn collect_readme_commit_ops( + kernel_dir: &Path, + dirty_build: bool, + operations: &mut Vec, +) { let Ok(card_path) = discover_build_file(kernel_dir, "CARD.md") else { return; }; + + let source = if dirty_build { + match fs::read_to_string(&card_path) { + Ok(content) => AddSource::Bytes(prepend_dirty_banner(&content).into_bytes()), + Err(err) => { + eprintln!( + "Warning: cannot read `{}` to add dirty-build banner: {err}", + card_path.display() + ); + AddSource::File(card_path) + } + } + } else { + AddSource::File(card_path) + }; + operations.push(CommitOperation::Add { path_in_repo: "README.md".to_owned(), - source: AddSource::File(card_path), + source, }); } +fn prepend_dirty_banner(content: &str) -> String { + let banner = format!("{DIRTY_BUILD_BANNER}\n\n"); + + // Keep the YAML front matter (delimited by `---`) at the very top. 
+ if let Some(rest) = content.strip_prefix("---\n") { + if let Some(end) = rest.find("\n---\n") { + let split = "---\n".len() + end + "\n---\n".len(); + let (front_matter, body) = content.split_at(split); + return format!("{front_matter}\n{banner}{}", body.trim_start_matches('\n')); + } + } + + format!("{banner}{content}") +} + /// Collect build artifact commit operations: add variant files, delete stale ones. fn collect_build_commit_ops( build_dir: &Path, @@ -456,6 +497,29 @@ fn discover_build_file( ); } +const DIRTY_BUILD_WARNING: &str = "\ +Warning: one or more build variants were built from uncommitted (dirty) \ +sources.\n Such builds are not reproducible. A warning banner will be \ +added to the\n repository README."; + +/// Markdown banner prepended to the Hub README for dirty builds. +const DIRTY_BUILD_BANNER: &str = "> [!WARNING]\n\ + > This kernel was built from uncommitted (dirty) sources and is therefore \ + **not reproducible**. It should not be relied upon for production use."; + +fn detect_dirty_build(variants: &[PathBuf]) -> bool { + variants.iter().any(|variant| { + let metadata_path = variant.join("metadata.json"); + let Ok(file) = File::open(&metadata_path) else { + return false; + }; + match Metadata::from_reader(BufReader::new(file)) { + Ok(metadata) => metadata.build_info.as_ref().is_some_and(|bi| bi.is_dirty()), + Err(_) => false, + } + }) +} + /// Determine the branch name (`v{version}`) from variant metadata. fn detect_branch_from_metadata(variants: &[PathBuf]) -> Result> { let mut versions: HashSet = HashSet::new(); @@ -513,22 +577,68 @@ mod tests { fs::write(kernel_dir.join("build/CARD.md"), "# Readme").unwrap(); let mut operations = vec![]; - collect_readme_commit_ops(kernel_dir, &mut operations); + collect_readme_commit_ops(kernel_dir, false, &mut operations); assert_eq!(operations.len(), 1); match &operations[0] { - CommitOperation::Add { path_in_repo, .. 
} => { + CommitOperation::Add { + path_in_repo, + source, + } => { assert_eq!(path_in_repo, "README.md"); + assert!(matches!(source, AddSource::File(_))); } _ => panic!("Expected Add operation"), } } + #[test] + fn test_collect_readme_commit_ops_dirty() { + let temp_dir = tempfile::tempdir().unwrap(); + let kernel_dir = temp_dir.path(); + + fs::create_dir_all(kernel_dir.join("build")).unwrap(); + fs::write(kernel_dir.join("build/CARD.md"), "# Readme").unwrap(); + + let mut operations = vec![]; + collect_readme_commit_ops(kernel_dir, true, &mut operations); + + assert_eq!(operations.len(), 1); + match &operations[0] { + CommitOperation::Add { + path_in_repo, + source, + } => { + assert_eq!(path_in_repo, "README.md"); + match source { + AddSource::Bytes(bytes) => { + let content = String::from_utf8(bytes.clone()).unwrap(); + assert!(content.contains("[!WARNING]")); + assert!(content.contains("# Readme")); + } + _ => panic!("Expected Bytes source for dirty build"), + } + } + _ => panic!("Expected Add operation"), + } + } + + #[test] + fn test_prepend_dirty_banner_with_front_matter() { + let content = "---\nlicense: apache-2.0\n---\n# Title\n\nBody."; + let result = prepend_dirty_banner(content); + assert!(result.starts_with("---\nlicense: apache-2.0\n---\n")); + assert!(result.contains("[!WARNING]")); + let banner_pos = result.find("[!WARNING]").unwrap(); + let title_pos = result.find("# Title").unwrap(); + assert!(banner_pos < title_pos); + } + #[test] fn test_collect_readme_commit_ops_no_card() { let temp_dir = tempfile::tempdir().unwrap(); let mut operations = vec![]; - collect_readme_commit_ops(temp_dir.path(), &mut operations); + collect_readme_commit_ops(temp_dir.path(), false, &mut operations); assert!(operations.is_empty()); } diff --git a/kernels-data/bindings/python/kernels_data.pyi b/kernels-data/bindings/python/kernels_data.pyi index f0397c8e..9feb36eb 100644 --- a/kernels-data/bindings/python/kernels_data.pyi +++ 
b/kernels-data/bindings/python/kernels_data.pyi @@ -7,7 +7,10 @@ from typing import Optional, final __all__ = [ "Backend", "BackendInfo", + "BuildInfo", "DigestAlgorithm", + "GitInfo", + "KernelBuilderInfo", "KernelName", "Metadata", "Digest", @@ -63,6 +66,64 @@ class BackendInfo: def __repr__(self) -> str: ... +@final +class GitInfo: + """Git provenance (commit SHA and dirty state) of a source tree.""" + + @property + def sha(self) -> str: + """Full 40-character commit SHA.""" + ... + + @property + def dirty(self) -> bool: + """Whether the working tree had uncommitted changes to tracked files.""" + ... + + def __repr__(self) -> str: ... + +@final +class KernelBuilderInfo: + """Provenance of the `kernel-builder` that produced a build.""" + + @property + def version(self) -> str: + """`kernel-builder` package version.""" + ... + + @property + def sha(self) -> Optional[str]: + """Commit SHA of the `kernel-builder` source, when known.""" + ... + + @property + def dirty(self) -> bool: + """Whether `kernel-builder` was built from a dirty source tree.""" + ... + + def __repr__(self) -> str: ... + +@final +class BuildInfo: + """Build provenance: git state of the `kernel-builder` and kernel source.""" + + @property + def kernel_builder(self) -> Optional[KernelBuilderInfo]: + """Provenance of the `kernel-builder` that produced the build.""" + ... + + @property + def kernel(self) -> Optional[GitInfo]: + """Git provenance of the kernel source that was built.""" + ... + + @property + def dirty(self) -> bool: + """Whether either the `kernel-builder` or the kernel source was dirty.""" + ... + + def __repr__(self) -> str: ... + @final class Version: """A dotted numeric version (e.g. `12.8.0`). @@ -263,4 +324,6 @@ class Metadata: def backend(self) -> BackendInfo: ... @property def digest(self) -> Optional[Digest]: ... + @property + def build_info(self) -> Optional[BuildInfo]: ... def __repr__(self) -> str: ... 
diff --git a/kernels-data/bindings/python/src/lib.rs b/kernels-data/bindings/python/src/lib.rs index 25d234ca..a34518c9 100644 --- a/kernels-data/bindings/python/src/lib.rs +++ b/kernels-data/bindings/python/src/lib.rs @@ -6,7 +6,7 @@ use std::str::FromStr; use kernels_data::config::{Backend, KernelName}; use kernels_data::digest::{Digest, DigestAlgorithm, DigestViolation}; -use kernels_data::metadata::{BackendInfo, Metadata}; +use kernels_data::metadata::{BackendInfo, BuildInfo, GitInfo, KernelBuilderInfo, Metadata}; use kernels_data::version::Version; use pyo3::Bound as PyBound; use pyo3::exceptions::{PyException, PyOSError, PyRuntimeError, PyValueError}; @@ -188,6 +188,130 @@ impl PyBackendInfo { } } +#[pyclass(name = "GitInfo", frozen)] +#[derive(Clone, Debug)] +struct PyGitInfo { + sha: String, + dirty: bool, +} + +impl From for PyGitInfo { + fn from(g: GitInfo) -> Self { + Self { + sha: g.sha, + dirty: g.dirty, + } + } +} + +#[pymethods] +impl PyGitInfo { + #[getter] + fn sha(&self) -> &str { + &self.sha + } + + #[getter] + fn dirty(&self) -> bool { + self.dirty + } + + fn __repr__(&self) -> String { + format!("GitInfo(sha={:?}, dirty={})", self.sha, self.dirty) + } +} + +#[pyclass(name = "KernelBuilderInfo", frozen)] +#[derive(Clone, Debug)] +struct PyKernelBuilderInfo { + version: String, + sha: Option, + dirty: bool, +} + +impl From for PyKernelBuilderInfo { + fn from(kb: KernelBuilderInfo) -> Self { + Self { + version: kb.version, + sha: kb.sha, + dirty: kb.dirty, + } + } +} + +#[pymethods] +impl PyKernelBuilderInfo { + #[getter] + fn version(&self) -> &str { + &self.version + } + + #[getter] + fn sha(&self) -> Option<&str> { + self.sha.as_deref() + } + + #[getter] + fn dirty(&self) -> bool { + self.dirty + } + + fn __repr__(&self) -> String { + format!( + "KernelBuilderInfo(version={:?}, sha={:?}, dirty={})", + self.version, self.sha, self.dirty + ) + } +} + +#[pyclass(name = "BuildInfo", frozen)] +#[derive(Clone, Debug)] +struct PyBuildInfo { + 
kernel_builder: Option, + kernel: Option, +} + +impl From for PyBuildInfo { + fn from(b: BuildInfo) -> Self { + Self { + kernel_builder: b.kernel_builder.map(Into::into), + kernel: b.kernel.map(Into::into), + } + } +} + +#[pymethods] +impl PyBuildInfo { + #[getter] + fn kernel_builder(&self) -> Option { + self.kernel_builder.clone() + } + + #[getter] + fn kernel(&self) -> Option { + self.kernel.clone() + } + + /// Whether either the `kernel-builder` or the kernel source was dirty. + #[getter] + fn dirty(&self) -> bool { + self.kernel_builder.as_ref().is_some_and(|kb| kb.dirty) + || self.kernel.as_ref().is_some_and(|k| k.dirty) + } + + fn __repr__(&self) -> String { + format!( + "BuildInfo(kernel_builder={}, kernel={})", + self.kernel_builder + .as_ref() + .map_or("None".to_string(), |kb| kb.__repr__()), + self.kernel + .as_ref() + .map_or("None".to_string(), |k| k.__repr__()) + ) + } +} + /// Parsed `metadata.json` for a kernel build variant. #[pyclass(name = "Metadata", frozen)] #[derive(Clone, Debug)] @@ -201,6 +325,7 @@ struct PyMetadata { python_depends: Vec, backend: PyBackendInfo, digest: Option, + build_info: Option, } impl From for PyMetadata { @@ -215,6 +340,7 @@ impl From for PyMetadata { python_depends: m.python_depends, backend: m.backend.into(), digest: m.digest.map(Into::into), + build_info: m.build_info.map(Into::into), } } } @@ -293,9 +419,14 @@ impl PyMetadata { self.digest.clone() } + #[getter] + fn build_info(&self) -> Option { + self.build_info.clone() + } + fn __repr__(&self) -> String { format!( - "Metadata(id={}, name={:?}, version={:?}, license={:?}, upstream={:?}, source={:?}, python_depends={:?}, backend={}, digest={})", + "Metadata(id={}, name={:?}, version={:?}, license={:?}, upstream={:?}, source={:?}, python_depends={:?}, backend={}, digest={}, build_info={})", self.id, self.name, self.version, @@ -306,7 +437,10 @@ impl PyMetadata { self.backend.__repr__(), self.digest .as_ref() - .map_or("None".to_string(), |sd| sd.__repr__()) + 
.map_or("None".to_string(), |sd| sd.__repr__()), + self.build_info + .as_ref() + .map_or("None".to_string(), |bi| bi.__repr__()) ) } } @@ -530,6 +664,9 @@ impl PyDigest { fn kernels_data_py(m: &PyBound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; diff --git a/kernels-data/src/metadata.rs b/kernels-data/src/metadata.rs index 109c93fd..bb88f68b 100644 --- a/kernels-data/src/metadata.rs +++ b/kernels-data/src/metadata.rs @@ -15,6 +15,40 @@ pub struct BackendInfo { pub archs: Option>, } +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(rename_all = "kebab-case")] +pub struct GitInfo { + pub sha: String, + pub dirty: bool, +} + +/// Provenance of the `kernel-builder` that produced a build. +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(rename_all = "kebab-case")] +pub struct KernelBuilderInfo { + pub version: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub sha: Option, + pub dirty: bool, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(rename_all = "kebab-case")] +pub struct BuildInfo { + #[serde(skip_serializing_if = "Option::is_none")] + pub kernel_builder: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub kernel: Option, +} + +impl BuildInfo { + /// Whether either the `kernel-builder` or the kernel source was dirty. + pub fn is_dirty(&self) -> bool { + self.kernel_builder.as_ref().is_some_and(|kb| kb.dirty) + || self.kernel.as_ref().is_some_and(|k| k.dirty) + } +} + /// Kernel metadata. #[derive(Debug, Deserialize, Serialize)] #[serde(rename_all = "kebab-case")] @@ -31,6 +65,8 @@ pub struct Metadata { pub backend: BackendInfo, #[serde(skip_serializing_if = "Option::is_none")] pub digest: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub build_info: Option, } impl Metadata { @@ -52,3 +88,80 @@ impl FromStr for Metadata { Ok(serde_json::from_str(s)?) 
} } + +#[cfg(test)] +mod tests { + use super::*; + + const METADATA_NO_BUILD_INFO: &str = r#"{ + "name": "relu", + "id": "_relu_cuda_abc1234", + "version": 1, + "license": "Apache-2.0", + "python-depends": [], + "backend": { "type": "cuda" } + }"#; + + const METADATA_WITH_BUILD_INFO: &str = r#"{ + "name": "relu", + "id": "_relu_cuda_abc1234", + "version": 1, + "license": "Apache-2.0", + "python-depends": [], + "backend": { "type": "cuda" }, + "build-info": { + "kernel-builder": { "version": "0.16.0-dev0", "sha": "1111111111111111111111111111111111111111", "dirty": true }, + "kernel": { "sha": "2222222222222222222222222222222222222222", "dirty": false } + } + }"#; + + #[test] + fn parses_metadata_without_build_info() { + let metadata: Metadata = METADATA_NO_BUILD_INFO.parse().unwrap(); + assert!(metadata.build_info.is_none()); + } + + #[test] + fn parses_and_reports_dirty_build_info() { + let metadata: Metadata = METADATA_WITH_BUILD_INFO.parse().unwrap(); + let build_info = metadata.build_info.expect("build-info should be present"); + assert!(build_info.is_dirty()); + + let kernel_builder = build_info.kernel_builder.unwrap(); + assert_eq!(kernel_builder.version, "0.16.0-dev0"); + assert!(kernel_builder.dirty); + assert_eq!(kernel_builder.sha.as_deref(), Some(&"1".repeat(40)[..])); + + let kernel = build_info.kernel.unwrap(); + assert!(!kernel.dirty); + assert_eq!(kernel.sha, "2".repeat(40)); + } + + #[test] + fn build_info_round_trips_with_kebab_case_keys() { + let metadata: Metadata = METADATA_WITH_BUILD_INFO.parse().unwrap(); + let json = serde_json::to_string(&metadata).unwrap(); + assert!(json.contains("\"build-info\"")); + assert!(json.contains("\"kernel-builder\"")); + + // Re-parsing the serialized form yields the same dirtiness. 
+ let reparsed: Metadata = json.parse().unwrap(); + assert!(reparsed.build_info.unwrap().is_dirty()); + } + + #[test] + fn build_info_is_not_dirty_when_all_clean() { + let build_info = BuildInfo { + kernel_builder: Some(KernelBuilderInfo { + version: "0.16.0".to_owned(), + sha: None, + dirty: false, + }), + kernel: Some(GitInfo { + sha: "abc".to_owned(), + dirty: false, + }), + }; + assert!(!build_info.is_dirty()); + } +} diff --git a/kernels/src/kernels/utils.py b/kernels/src/kernels/utils.py index c54a5922..650443f1 100644 --- a/kernels/src/kernels/utils.py +++ b/kernels/src/kernels/utils.py @@ -204,6 +204,28 @@ def _validate_variant_dependencies(variant_path: Path) -> None: validate_dependencies(metadata.name.python_name, metadata.python_depends, _backend()) +def _warn_if_dirty_build(metadata: Metadata, repo_info: "RepoInfo | None") -> None: + build_info = metadata.build_info + if build_info is None or not build_info.dirty: + return + + where = f" of `{repo_info.repo_id}`" if repo_info is not None else "" + reasons = [] + kernel_builder = build_info.kernel_builder + if kernel_builder is not None and kernel_builder.dirty: + reasons.append("the `kernel-builder` had uncommitted changes") + kernel = build_info.kernel + if kernel is not None and kernel.dirty: + reasons.append("the kernel source had uncommitted changes") + + warnings.warn( + f"Kernel `{metadata.id}`{where} was built from a dirty source tree " + f"({' and '.join(reasons)}). 
Such builds are not reproducible and " + f"should not be relied upon.", + stacklevel=2, + ) + + def _import_from_path(variant_path: Path, repo_info: RepoInfo | None = None) -> ModuleType: if (loaded_kernel := _loaded_kernels.get(variant_path)) is not None: return loaded_kernel.module @@ -211,6 +233,8 @@ def _import_from_path(variant_path: Path, repo_info: RepoInfo | None = None) -> metadata = Metadata.read_from_file(variant_path / "metadata.json") module_name = metadata.name.python_name + _warn_if_dirty_build(metadata, repo_info) + file_path = variant_path / "__init__.py" if not file_path.exists(): file_path = variant_path / module_name / "__init__.py" diff --git a/kernels/tests/test_dirty_build.py b/kernels/tests/test_dirty_build.py new file mode 100644 index 00000000..c55c7639 --- /dev/null +++ b/kernels/tests/test_dirty_build.py @@ -0,0 +1,118 @@ +import json +import warnings + +import pytest +from kernels_data import Metadata + +from kernels.utils import RepoInfo, _import_from_path, _loaded_kernels, _warn_if_dirty_build + + +def _metadata(build_info: dict | None) -> Metadata: + data: dict = { + "name": "relu", + "id": "_relu_cuda_abc1234", + "version": 1, + "license": "Apache-2.0", + "python-depends": [], + "backend": {"type": "cuda"}, + } + if build_info is not None: + data["build-info"] = build_info + return Metadata.from_bytes(json.dumps(data).encode("utf-8")) + + +def _clean_builder() -> dict: + return {"version": "0.16.0-dev0", "sha": "a" * 40, "dirty": False} + + +def _dirty_builder() -> dict: + return {"version": "0.16.0-dev0", "sha": "a" * 40, "dirty": True} + + +def _clean_kernel() -> dict: + return {"sha": "b" * 40, "dirty": False} + + +def _dirty_kernel() -> dict: + return {"sha": "b" * 40, "dirty": True} + + +@pytest.fixture +def fresh_registry(): + """Run the test against a clean loaded-kernel registry, restore on teardown.""" + saved = _loaded_kernels.copy() + _loaded_kernels.clear() + yield + _loaded_kernels.clear() + 
def test_import_from_path_warns_on_dirty_build(tmp_path, fresh_registry):
    """Importing a variant whose metadata flags a dirty builder warns exactly once."""
    # Lay out a minimal on-disk variant: <variant>/relu/__init__.py + metadata.json.
    variant_dir = tmp_path / "variant"
    package_dir = variant_dir / "relu"
    package_dir.mkdir(parents=True)
    (package_dir / "__init__.py").write_text("VALUE = 42\n")

    build_info = {"kernel-builder": _dirty_builder(), "kernel": _clean_kernel()}
    payload = {
        "name": "relu",
        "id": "_relu_cuda_abc1234",
        "version": 1,
        "license": "Apache-2.0",
        "python-depends": [],
        "backend": {"type": "cuda"},
        "build-info": build_info,
    }
    variant_dir.joinpath("metadata.json").write_text(json.dumps(payload))

    # Record every warning, bypassing the once-per-location default filter.
    with warnings.catch_warnings(record=True) as recorded:
        warnings.simplefilter("always")
        loaded = _import_from_path(variant_dir)

    # The module must still import, and the dirty-build warning fires once.
    assert loaded.VALUE == 42
    dirty_count = sum("dirty source tree" in str(entry.message) for entry in recorded)
    assert dirty_count == 1
"", }: let kernelConfig = readKernelConfig path; @@ -350,6 +359,7 @@ rec { kernelConfig rev doGetKernelCheck + provenanceArgs ; }; testPython = diff --git a/nix-builder/lib/extension/torch/arch.nix b/nix-builder/lib/extension/torch/arch.nix index d2365591..d16df70f 100644 --- a/nix-builder/lib/extension/torch/arch.nix +++ b/nix-builder/lib/extension/torch/arch.nix @@ -71,6 +71,10 @@ # Revision to bake into the ops name. rev, + # Extra `create-pyproject` flags recording git provenance (commit SHA and + # dirty state) of the kernel source and `kernel-builder`. + provenanceArgs ? "", + src, }: @@ -132,7 +136,7 @@ stdenv.mkDerivation (prevAttrs: { mkdir -p $out cp -r --no-preserve=mode ${src}/* $out/ ${pkgs.kernel-builder}/bin/kernel-builder create-pyproject \ - --unique-id ${rev} $out + --unique-id ${rev} ${provenanceArgs} $out ''; preConfigure = diff --git a/nix-builder/lib/extension/torch/no-arch.nix b/nix-builder/lib/extension/torch/no-arch.nix index f8921115..c3264734 100644 --- a/nix-builder/lib/extension/torch/no-arch.nix +++ b/nix-builder/lib/extension/torch/no-arch.nix @@ -41,6 +41,10 @@ pythonDeps, backendPythonDeps, + + # Extra `create-pyproject` flags recording git provenance (commit SHA and + # dirty state) of the kernel source and `kernel-builder`. + provenanceArgs ? "", }: # Extra validation - the environment should correspind to the build config. @@ -79,7 +83,7 @@ stdenv.mkDerivation (prevAttrs: { mkdir -p $out cp -r --no-preserve=mode ${src}/* $out/ ${pkgs.kernel-builder}/bin/kernel-builder create-pyproject \ - --unique-id ${rev} $out + --unique-id ${rev} ${provenanceArgs} $out ''; framework = "torch"; diff --git a/nix-builder/lib/extension/tvm-ffi/arch.nix b/nix-builder/lib/extension/tvm-ffi/arch.nix index ffc877cd..6c32a238 100644 --- a/nix-builder/lib/extension/tvm-ffi/arch.nix +++ b/nix-builder/lib/extension/tvm-ffi/arch.nix @@ -68,6 +68,10 @@ # Revision to bake into the ops name. 
# Command-line flags appended to `kernel-builder create-pyproject` so that the
# build metadata records the commit SHA and dirty state of both the kernel
# source and `kernel-builder`. They are passed explicitly because the Nix
# sandbox has no `.git` to inspect.
provenanceArgs =
  let
    # Flake `dirtyRev` values carry a `-dirty` suffix; the CLI wants the bare SHA.
    stripDirty = lib.removeSuffix "-dirty";

    haveKernel = self != null;
    kernelSha =
      if haveKernel && self ? rev then
        self.rev
      else if haveKernel && self ? dirtyRev then
        stripDirty self.dirtyRev
      else
        null;
    # A flake without `rev` was evaluated from a tree with local changes.
    kernelDirty = haveKernel && !(self ? rev);

    builderSha = if builderRev != null then stripDirty builderRev else null;

    # The dirty flags are only emitted alongside their corresponding SHA.
    kernelFlags = lib.optionals (kernelSha != null) (
      [ "--kernel-sha ${kernelSha}" ] ++ lib.optional kernelDirty "--kernel-dirty"
    );
    builderFlags = lib.optionals (builderSha != null) (
      [ "--kernel-builder-sha ${builderSha}" ]
      ++ lib.optional builderDirty "--kernel-builder-dirty"
    );
  in
  lib.concatStringsSep " " (kernelFlags ++ builderFlags);
buildSets = headOrEmpty ( builtins.filter (set: set.buildConfig.framework == backend) buildSetsSorted @@ -277,7 +313,7 @@ in }; redistributable = build.mkDistTorchExtensions { - inherit path doGetKernelCheck; + inherit path doGetKernelCheck provenanceArgs; bundleOnly = false; rev = revUnderscored; buildSets = applicableBuildSets;