diff --git a/.github/ISSUE_TEMPLATE/skills-failure.md b/.github/ISSUE_TEMPLATE/skills-failure.md new file mode 100644 index 000000000..95b543a79 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/skills-failure.md @@ -0,0 +1,33 @@ +--- +name: Skills deploy gate failed +about: Auto-opened by the deploy-registry verify job when catalogue or scan fails. +title: "[skills-gate] Catalogue or scan failure blocking deploy" +labels: ["skills-gate"] +--- + +The pre-deploy verify job for the agent-skills catalogue failed. +Most recent run: + +{{ env.WORKFLOW_URL }} + +Trigger: `{{ env.RUN_TRIGGER }}` + +This is a single rolling tracker issue. The deploy workflow updates the +same open issue on every subsequent failure until it is closed. Closing +this issue without fixing the underlying problem reopens (or creates) +the next time the gate fails. + +Likely causes: + +- A `sources[].skills[]` override in `registry//skills/README.md` + no longer matches a `skills//SKILL.md` upstream (renamed, + deleted, or moved). +- A declared `owner/repo@ref` no longer clones (repo renamed, deleted, + flipped to private, or the branch ref is gone). +- An upstream `SKILL.md` is missing the required `name` or `description` + frontmatter per the agentskills.io v0.2.0 specification. +- A SkillSpector critical-severity finding on upstream content. Open + alerts are listed under the repository's Security tab, Code scanning. + +See the run logs and any new Code scanning alerts for specifics, then +land a PR that updates the catalogue or the upstream source repo. diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 71ecc55f2..8c4d929ad 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -102,14 +102,69 @@ jobs: # We want to do some basic README checks first before we try analyzing the # contents needs: validate-style + outputs: + skills_changed: ${{ steps.filter.outputs.skills }} steps: - name: Check out code uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - name: Detect changed files + uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d # v4.0.1 + id: filter + with: + filters: | + skills: + - 'registry/**/skills/**' + - 'cmd/readmevalidation/**' + - 'go.mod' + - 'go.sum' + - '.github/workflows/ci.yaml' - name: Set up Go uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6 with: go-version: "1.24.0" - name: Validate contributors run: go build ./cmd/readmevalidation && ./readmevalidation + - name: Validate skill sources (online) + if: steps.filter.outputs.skills == 'true' + env: + READMEVALIDATION_ONLINE: "1" + run: ./readmevalidation - name: Remove build file artifact run: rm ./readmevalidation + + scan-skills: + name: Scan skill sources with SkillSpector + runs-on: ubuntu-latest + needs: validate-readme-files + if: needs.validate-readme-files.outputs.skills_changed == 'true' + permissions: + contents: read + security-events: write + steps: + - name: Check out code + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - name: Set up Python + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: "3.12" + - name: Set up yq + uses: mikefarah/yq@b534aa9ee5d38001fba3cd8fe254a037e4847b37 # v4.44.6 + - name: Install SkillSpector + run: | + python -m pip install --upgrade pip + pip install "skillspector @ git+https://github.com/NVIDIA/SkillSpector.git@2eb844780ab163f01468ecf142c40a2ec0fcaec0" + - name: Scan declared skill sources + run: ./scripts/scan-skill-sources.sh ./sarif + - name: Upload SARIF to code scanning + uses: github/codeql-action/upload-sarif@9e8d0789d4a0fa9ceb6b1738f7e269594bdd67f0 # v3.28.9 + with: + sarif_file: sarif + # Initial policy: warn-only. SkillSpector flags real patterns in + # coder/skills/setup that are intentional for an installer skill + # (GitHub device-flow credential access). The gate prints findings + # but does not fail the job. Flip continue-on-error to false in a + # follow-up once the coder/skills baseline is triaged. + - name: Report critical findings + continue-on-error: true + run: | + jq -e '[.runs[].results[]? | select((.level // "") == "error")] | length == 0' sarif/*.sarif diff --git a/.github/workflows/deploy-registry.yaml b/.github/workflows/deploy-registry.yaml index eb54665c1..38979e4c5 100644 --- a/.github/workflows/deploy-registry.yaml +++ b/.github/workflows/deploy-registry.yaml @@ -19,14 +19,58 @@ on: - ".icons/**" jobs: - deploy: + verify: + name: Verify catalogue and scan skills runs-on: ubuntu-latest + permissions: + contents: read + security-events: write + steps: + - name: Checkout code + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - name: Set up Go + uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6 + with: + go-version: "1.24.0" + cache: false + - name: Verify catalogue against upstream sources + env: + READMEVALIDATION_ONLINE: "1" + run: go run ./cmd/readmevalidation + - name: Set up Python + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: "3.12" + - name: Set up yq + uses: mikefarah/yq@b534aa9ee5d38001fba3cd8fe254a037e4847b37 # v4.44.6 + - name: Install SkillSpector + run: | + python -m pip install --upgrade pip + pip install "skillspector @ git+https://github.com/NVIDIA/SkillSpector.git@2eb844780ab163f01468ecf142c40a2ec0fcaec0" + - name: Scan declared skill sources + run: ./scripts/scan-skill-sources.sh ./sarif + - name: Upload SARIF to code scanning + uses: github/codeql-action/upload-sarif@9e8d0789d4a0fa9ceb6b1738f7e269594bdd67f0 # v3.28.9 + with: + sarif_file: sarif + # Initial policy: warn-only. SkillSpector flags real patterns in + # coder/skills/setup that are intentional for an installer skill + # (GitHub device-flow credential access). The gate prints findings + # but does not fail the deploy. Flip continue-on-error to false in + # a follow-up once the coder/skills baseline is triaged. + - name: Report critical findings + continue-on-error: true + run: | + jq -e '[.runs[].results[]? | select((.level // "") == "error")] | length == 0' sarif/*.sarif + deploy: + name: Deploy registry + needs: verify + runs-on: ubuntu-latest # Set id-token permission for gcloud permissions: contents: read id-token: write - steps: - name: Checkout code uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -41,3 +85,24 @@ jobs: run: gcloud builds triggers run 29818181-126d-4f8a-a937-f228b27d3d34 --branch main - name: Deploy to registry.coder.com run: gcloud builds triggers run 106610ff-41fb-4bd0-90a2-7643583fb9c0 --tag production + + open-issue-on-failure: + name: Open or update skills-gate tracker issue + needs: verify + if: failure() + runs-on: ubuntu-latest + permissions: + issues: write + steps: + - name: Checkout code + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - name: Open or update tracker issue + uses: JasonEtco/create-an-issue@1b14a70e4d8dc185e5cc76d3bec9eab20257b2c5 # v2.9.2 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + WORKFLOW_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + RUN_TRIGGER: ${{ github.event_name }} + with: + filename: .github/ISSUE_TEMPLATE/skills-failure.md + update_existing: true + search_existing: open diff --git a/cmd/readmevalidation/coderskills.go b/cmd/readmevalidation/coderskills.go index a2b75b211..66ea386a2 100644 --- a/cmd/readmevalidation/coderskills.go +++ b/cmd/readmevalidation/coderskills.go @@ -4,10 +4,14 @@ import ( "bufio" "context" "errors" + "fmt" "os" + "os/exec" "path" + "path/filepath" "regexp" "slices" + "sort" "strings" "golang.org/x/xerrors" @@ -315,25 +319,284 @@ func aggregateSkillsReadmeFiles() ([]readme, error) { } func validateAllCoderSkills() error { - allReadmeFiles, err := aggregateSkillsReadmeFiles() + readmes, err := parseAllCoderSkillsReadmes() if err != nil { return err } + if len(readmes) == 0 { + return nil + } + + if err := validateAllCoderSkillsReadmes(readmes); err != nil { + return err + } + + logger.Info(context.Background(), "processed all skills README files", "num_files", len(readmes)) + return nil +} + +// parseAllCoderSkillsReadmes walks every registry//skills/README.md +// and returns the parsed structures. Callers can pass the result to the +// offline validator (validateAllCoderSkillsReadmes) and, when network access +// is available, also to the online validator (validateAllCoderSkillsOnline) +// so each README is only read and parsed once per run. +func parseAllCoderSkillsReadmes() ([]coderSkillsReadme, error) { + allReadmeFiles, err := aggregateSkillsReadmeFiles() + if err != nil { + return nil, err + } + logger.Info(context.Background(), "processing skills README files", "num_files", len(allReadmeFiles)) if len(allReadmeFiles) == 0 { + return nil, nil + } + + return parseCoderSkillsReadmeFiles(allReadmeFiles) +} + +// skillsSourceKey identifies one unique upstream source so the online +// verifier clones a given repo@ref exactly once across all namespaces. +type skillsSourceKey struct { + repo string // "owner/repo" + ref string // resolved ref; defaults to "main" when unspecified. +} + +// skillsSourceContent is the result of cloning and walking one upstream +// source repo. Slugs are the directory names under skills/ that contain +// a SKILL.md file. +type skillsSourceContent struct { + slugs map[string]bool + skillMDPath map[string]string // slug -> absolute path to skills//SKILL.md +} + +// skillMarkdownFrontmatter is the minimal SKILL.md frontmatter shape required +// by the agentskills.io v0.2.0 specification. Both fields are required and +// must be non-empty. +type skillMarkdownFrontmatter struct { + Name string `yaml:"name"` + Description string `yaml:"description"` +} + +// validateAllCoderSkillsOnline verifies that every slug declared under +// sources[].skills in a skills README actually exists upstream as +// skills//SKILL.md, that each upstream SKILL.md parses with the +// required agentskills.io frontmatter, and that no slug is claimed by more +// than one source within the same namespace. +// +// This function performs network I/O (shallow Git clones). Callers should +// only invoke it when network access is expected to be available. +func validateAllCoderSkillsOnline(readmes []coderSkillsReadme) error { + if len(readmes) == 0 { return nil } - readmes, err := parseCoderSkillsReadmeFiles(allReadmeFiles) - if err != nil { - return err + keys, _ := collectUniqueSkillsSources(readmes) + logger.Info(context.Background(), "verifying upstream skill sources", + "num_unique_sources", len(keys)) + + content := make(map[skillsSourceKey]skillsSourceContent, len(keys)) + var cleanups []func() + defer func() { + for _, c := range cleanups { + c() + } + }() + + var errs []error + for _, k := range keys { + src, cleanup, err := fetchSkillsSourceContent(k.repo, k.ref) + if cleanup != nil { + cleanups = append(cleanups, cleanup) + } + if err != nil { + errs = append(errs, err) + continue + } + content[k] = src } - if err := validateAllCoderSkillsReadmes(readmes); err != nil { - return err + if len(errs) > 0 { + return validationPhaseError{phase: validationPhaseUpstream, errors: errs} } - logger.Info(context.Background(), "processed all skills README files", "num_files", len(readmes)) + for _, rm := range readmes { + rErrs := verifyReadmeAgainstSources(rm, content) + errs = append(errs, rErrs...) + } + + if len(errs) > 0 { + return validationPhaseError{phase: validationPhaseUpstream, errors: errs} + } + + logger.Info(context.Background(), "upstream skill source verification passed", + "num_files", len(readmes), "num_sources", len(keys)) return nil } + +// collectUniqueSkillsSources returns every unique repo@ref declared across +// all skills READMEs in deterministic order. +func collectUniqueSkillsSources(readmes []coderSkillsReadme) ([]skillsSourceKey, map[skillsSourceKey]bool) { + seen := make(map[skillsSourceKey]bool) + var keys []skillsSourceKey + for _, rm := range readmes { + for _, src := range rm.frontmatter.Sources { + repo, ref, hasRef := strings.Cut(src.Repo, "@") + if !hasRef || ref == "" { + ref = "main" + } + k := skillsSourceKey{repo: repo, ref: ref} + if seen[k] { + continue + } + seen[k] = true + keys = append(keys, k) + } + } + sort.Slice(keys, func(i, j int) bool { + if keys[i].repo != keys[j].repo { + return keys[i].repo < keys[j].repo + } + return keys[i].ref < keys[j].ref + }) + return keys, seen +} + +// fetchSkillsSourceContent shallow-clones a single upstream source repo at +// the given ref and returns the set of skill slugs found under skills// +// that contain a SKILL.md file. The returned cleanup function removes the +// temp clone directory. +// +// Today only branch refs are supported. This matches the behavior of the +// registry-server build pipeline, which clones via plumbing.NewBranchReferenceName. +// When that pipeline grows tag/SHA support, this function should grow it too. +// +// Uses the system git binary instead of a Go Git library so we do not have +// to vendor one for a single shallow clone per source. Every CI runner and +// developer laptop already has git on PATH. +func fetchSkillsSourceContent(repo, ref string) (skillsSourceContent, func(), error) { + dir, err := os.MkdirTemp("", strings.ReplaceAll(repo, "/", "_")+"_*") + if err != nil { + return skillsSourceContent{}, nil, xerrors.Errorf("creating temp dir for %s@%s: %w", repo, ref, err) + } + cleanup := func() { + if rmErr := os.RemoveAll(dir); rmErr != nil { + logger.Warn(context.Background(), "could not remove temp clone dir", + "path", dir, "error", rmErr.Error()) + } + } + + url := "https://github.com/" + repo + ".git" + cmd := exec.Command("git", "clone", "--depth=1", "--branch", ref, "--single-branch", url, dir) + cmd.Env = append(os.Environ(), "GIT_TERMINAL_PROMPT=0") + if out, err := cmd.CombinedOutput(); err != nil { + return skillsSourceContent{}, cleanup, xerrors.Errorf("cloning %s@%s: %v: %s", repo, ref, err, strings.TrimSpace(string(out))) + } + + skillsDir := filepath.Join(dir, "skills") + entries, err := os.ReadDir(skillsDir) + if err != nil { + if errors.Is(err, os.ErrNotExist) { + return skillsSourceContent{ + slugs: map[string]bool{}, + skillMDPath: map[string]string{}, + }, cleanup, nil + } + return skillsSourceContent{}, cleanup, xerrors.Errorf("reading skills directory of %s@%s: %w", repo, ref, err) + } + + content := skillsSourceContent{ + slugs: make(map[string]bool, len(entries)), + skillMDPath: make(map[string]string, len(entries)), + } + for _, e := range entries { + if !e.IsDir() { + continue + } + skillMD := filepath.Join(skillsDir, e.Name(), "SKILL.md") + if _, statErr := os.Stat(skillMD); statErr != nil { + continue + } + content.slugs[e.Name()] = true + content.skillMDPath[e.Name()] = skillMD + } + return content, cleanup, nil +} + +// verifyReadmeAgainstSources checks one parsed skills README against the +// upstream content already fetched for each declared source. +func verifyReadmeAgainstSources(rm coderSkillsReadme, content map[skillsSourceKey]skillsSourceContent) []error { + var errs []error + + // Track every slug across every source in this namespace so we can + // surface duplicates across sources before the registry-server build + // pipeline silently picks one. + slugOrigin := make(map[string]string) + + for i, src := range rm.frontmatter.Sources { + repo, ref, hasRef := strings.Cut(src.Repo, "@") + if !hasRef || ref == "" { + ref = "main" + } + key := skillsSourceKey{repo: repo, ref: ref} + upstream, ok := content[key] + if !ok { + // fetchSkillsSourceContent already produced an error for this key. + continue + } + + for slug := range src.Skills { + if !upstream.slugs[slug] { + errs = append(errs, addFilePathToError(rm.filePath, + xerrors.Errorf("sources[%d].skills[%q]: slug not found upstream at %s/skills/%s/SKILL.md (repo=%s ref=%s)", + i, slug, repo, slug, repo, ref))) + } + } + + for slug := range upstream.slugs { + origin := fmt.Sprintf("%s@%s", repo, ref) + if prev, dup := slugOrigin[slug]; dup { + errs = append(errs, addFilePathToError(rm.filePath, + xerrors.Errorf("slug %q is provided by multiple sources in the same namespace (%s and %s)", + slug, prev, origin))) + continue + } + slugOrigin[slug] = origin + } + + for slug, mdPath := range upstream.skillMDPath { + for _, fmErr := range validateSkillMarkdownFrontmatter(mdPath) { + errs = append(errs, addFilePathToError(rm.filePath, + xerrors.Errorf("sources[%d] %s@%s skill %q: %v", i, repo, ref, slug, fmErr))) + } + } + } + + return errs +} + +// validateSkillMarkdownFrontmatter reads a SKILL.md file and confirms its +// YAML frontmatter contains a non-empty name and description per the +// agentskills.io v0.2.0 specification. +func validateSkillMarkdownFrontmatter(absPath string) []error { + raw, err := os.ReadFile(absPath) + if err != nil { + return []error{xerrors.Errorf("could not read SKILL.md: %v", err)} + } + fm, _, err := separateSkillsFrontmatter(string(raw)) + if err != nil { + return []error{xerrors.Errorf("SKILL.md frontmatter parse error: %v", err)} + } + var yml skillMarkdownFrontmatter + if err := yaml.Unmarshal([]byte(fm), &yml); err != nil { + return []error{xerrors.Errorf("SKILL.md frontmatter YAML invalid: %v", err)} + } + var errs []error + if strings.TrimSpace(yml.Name) == "" { + errs = append(errs, xerrors.New("SKILL.md is missing required frontmatter field 'name'")) + } + if strings.TrimSpace(yml.Description) == "" { + errs = append(errs, xerrors.New("SKILL.md is missing required frontmatter field 'description'")) + } + return errs +} diff --git a/cmd/readmevalidation/main.go b/cmd/readmevalidation/main.go index f3c732242..16e834beb 100644 --- a/cmd/readmevalidation/main.go +++ b/cmd/readmevalidation/main.go @@ -44,6 +44,15 @@ func main() { errs = append(errs, err) } + if os.Getenv("READMEVALIDATION_ONLINE") == "1" { + skillsReadmes, err := parseAllCoderSkillsReadmes() + if err != nil { + errs = append(errs, err) + } else if err := validateAllCoderSkillsOnline(skillsReadmes); err != nil { + errs = append(errs, err) + } + } + if len(errs) == 0 { logger.Info(context.Background(), "processed all READMEs in directory", "dir", rootRegistryPath) os.Exit(0) diff --git a/cmd/readmevalidation/readmefiles.go b/cmd/readmevalidation/readmefiles.go index 70580a777..9acae3cd0 100644 --- a/cmd/readmevalidation/readmefiles.go +++ b/cmd/readmevalidation/readmefiles.go @@ -35,6 +35,12 @@ const ( // is having all its relative URLs be validated for whether they point to // valid resources. validationPhaseCrossReference validationPhase = "Cross-referencing relative asset URLs" + + // validationPhaseUpstream indicates when external skill source repos + // declared in a skills README's frontmatter are being cloned and walked + // to confirm every catalogue override slug exists upstream and every + // upstream SKILL.md has the required agentskills.io v0.2.0 frontmatter. + validationPhaseUpstream validationPhase = "Upstream skill source verification" // --- end of validationPhases ---. ) diff --git a/scripts/scan-skill-sources.sh b/scripts/scan-skill-sources.sh new file mode 100755 index 000000000..dc09bd52e --- /dev/null +++ b/scripts/scan-skill-sources.sh @@ -0,0 +1,72 @@ +#!/usr/bin/env bash +# Usage: scripts/scan-skill-sources.sh [SARIF_OUT_DIR] +# +# Walks every registry//skills/README.md, extracts each unique +# owner/repo@ref from the YAML frontmatter under sources[].repo, and runs +# NVIDIA SkillSpector over the upstream GitHub repository. One SARIF file +# is written to SARIF_OUT_DIR (default ./sarif) per unique source. +# +# This script does NOT decide pass or fail on findings. It exits non-zero +# only when skillspector itself crashes. Severity gating lives in the +# workflow so the policy is visible alongside the deploy gate. + +set -euo pipefail + +OUT_DIR="${1:-./sarif}" +mkdir -p "${OUT_DIR}" + +for bin in skillspector yq; do + if ! command -v "${bin}" > /dev/null 2>&1; then + echo "Required binary '${bin}' is not installed or not on PATH." >&2 + exit 2 + fi +done + +declare -a readme_files=() +for f in registry/*/skills/README.md; do + if [[ -f "${f}" ]]; then + readme_files+=("${f}") + fi +done + +declare -a sources=() +if ((${#readme_files[@]} > 0)); then + raw_sources="" + for f in "${readme_files[@]}"; do + frontmatter="$(awk '/^---$/{c++; next} c==1{print} c>=2{exit}' "${f}")" + extracted="$(printf '%s\n' "${frontmatter}" | yq -r '.sources[].repo')" + raw_sources+="${extracted}"$'\n' + done + sorted_sources="$(printf '%s' "${raw_sources}" | sort -u | sed '/^$/d')" + mapfile -t sources <<< "${sorted_sources}" +fi + +if ((${#sources[@]} == 0)); then + echo "No skills sources declared; nothing to scan." + exit 0 +fi + +scan_failed=0 +for src in "${sources[@]}"; do + repo="${src%@*}" + ref="${src#*@}" + if [[ "${ref}" == "${src}" ]]; then + ref="main" + fi + safe="${repo//\//_}__${ref}" + out_file="${OUT_DIR}/${safe}.sarif" + echo "==> Scanning ${repo}@${ref}" + # SkillSpector exits non-zero when findings are present even though it + # still writes the SARIF file. Treat a missing SARIF as the only true + # crash; severity gating happens in the workflow against the SARIF. + skillspector scan "https://github.com/${repo}" \ + --no-llm \ + --format sarif \ + --output "${out_file}" || true + if [[ ! -s "${out_file}" ]]; then + echo "skillspector did not produce SARIF for ${repo}@${ref}" >&2 + scan_failed=1 + fi +done + +exit "${scan_failed}"