vllm-project · LucasWilkinson · May 12, 2026 · Apr 1, 2026 · Apr 1, 2026 · Apr 2, 2026
diff --git a/.github/actions/gpu-test/action.yml b/.github/actions/gpu-test/action.yml
@@ -0,0 +1,67 @@
+name: GPU Test
+description: >-
+  Compile and run FA4 tests (pull SIF from Docker Hub, cache by tag)
+
+# Values supplied by the calling workflow. Both image references are
+# mandatory; the test filter and the compile parallelism have defaults.
+inputs:
+  test-filter:
+    description: 'pytest -k filter expression'
+    default: ''
+    required: false
+  compile-workers:
+    description: 'parallel workers for Pass 1 kernel compilation'
+    default: '64'
+    required: false
+  fa4_image_cu129:
+    description: 'Docker image for CUDA 12.9 (used when driver does not support CUDA 13.0)'
+    required: true
+  fa4_image_cu130:
+    description: 'Docker image for CUDA 13.0 (used when driver supports CUDA 13.0)'
+    required: true
+
+runs:
+  using: composite
+  steps:
+    # Picks the container image matching the driver's maximum CUDA version and
+    # exports it to later steps as FA4_IMAGE.
+    - name: Select FA4 image based on CUDA version
+      shell: bash
+      env:
+        # Inputs are handed over as environment variables instead of being
+        # expanded into the script text, so their contents are never parsed
+        # as shell syntax.
+        FA4_IMAGE_CU129: ${{ inputs.fa4_image_cu129 }}
+        FA4_IMAGE_CU130: ${{ inputs.fa4_image_cu130 }}
+      run: |
+        # Read max supported CUDA version from nvidia-smi header, e.g. "CUDA Version: 12.9"
+        # Fail with an explicit message when nvidia-smi is missing or its
+        # output has no version, rather than exiting silently.
+        if ! CUDA_VER=$(nvidia-smi | grep -oP "CUDA Version: \K[0-9]+\.[0-9]+"); then
+          echo "::error::Could not read a CUDA version from nvidia-smi output"
+          exit 1
+        fi
+        # Keep only the part before the first dot.
+        CUDA_MAJOR="${CUDA_VER%%.*}"
+        echo "Detected max CUDA version: $CUDA_VER"
+        if [ "$CUDA_MAJOR" -ge 13 ]; then
+          echo "Using cu130 image"
+          echo "FA4_IMAGE=$FA4_IMAGE_CU130" >> "$GITHUB_ENV"
+        else
+          echo "Using cu129 image"
+          echo "FA4_IMAGE=$FA4_IMAGE_CU129" >> "$GITHUB_ENV"
+        fi
+
+    # Makes the image named by FA4_IMAGE available as a local SIF file, reusing
+    # an existing file when present, and exports its path as FA4_SIF.
+    - name: Pull FA4 SIF
+      shell: bash
+      run: |
+        CI_WORK_DIR="${CI_WORK_DIR:-/scratch/user/$USER}"
+        # File name derived from the image reference: '/', ':' and ' ' become '-'.
+        sif_stem=$(tr '/: ' '---' <<< "$FA4_IMAGE")
+        sif_path="$CI_WORK_DIR/${sif_stem}.sif"
+        # Apptainer doesn't support tag@digest refs — strip the tag, keep digest only.
+        pull_ref=$(sed 's/:[^@]*@/@/' <<< "$FA4_IMAGE")
+        echo "PULL_REF=$pull_ref"
+        echo "SIF=$sif_path"
+        mkdir -p "$CI_WORK_DIR/apptainer_cache" /tmp/apptainer_tmp
+        if [ -f "$sif_path" ]; then
+          echo "Using cached SIF: $sif_path"
+        else
+          echo "Pulling $pull_ref → $sif_path"
+          APPTAINER_TMPDIR="/tmp/apptainer_tmp" \
+          APPTAINER_CACHEDIR="$CI_WORK_DIR/apptainer_cache" \
+          apptainer pull "$sif_path" "docker://$pull_ref"
+        fi
+        # Remove stale SIFs from previous image versions to prevent unbounded disk growth.
+        keep_name=$(basename "$sif_path")
+        find "$CI_WORK_DIR" -maxdepth 1 -name "*.sif" ! -name "$keep_name" -delete
+        echo "FA4_SIF=$sif_path" >> "$GITHUB_ENV"
+
+    # Invokes the CI driver script from the checked-out repository.
+    - name: Compile and run tests
+      shell: bash
+      env:
+        # Inputs travel through the environment so that quotes or shell
+        # metacharacters inside them cannot alter the command below.
+        FA4_TEST_FILTER_ARG: ${{ inputs.test-filter }}
+        FA4_COMPILE_WORKERS_ARG: ${{ inputs.compile-workers }}
+      run: |
+        python3 "$GITHUB_WORKSPACE/tools/ci/run_fa4_ci.py" \
+          --repo-root "$GITHUB_WORKSPACE" \
+          --test-filter "$FA4_TEST_FILTER_ARG" \
+          --compile-workers "$FA4_COMPILE_WORKERS_ARG"
diff --git a/.github/scripts/test_ci_local.sh b/.github/scripts/test_ci_local.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+# Runs the FA4 CI driver against this checkout; any extra command-line
+# arguments are forwarded to the driver unchanged.
+set -euo pipefail
+
+# Resolve the directory two levels above this script to an absolute path.
+here=$(dirname -- "${BASH_SOURCE[0]}")
+repo_root=$(cd -- "$here/../../" && pwd)
+
+python3 "$repo_root/tools/ci/run_fa4_ci.py" \
+  --repo-root "$repo_root" \
+  "$@"
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -0,0 +1,45 @@
+name: CI
+
+# Runs only for pushes to the listed branches.
+on:
+  push:
+    branches:
+      - main
+      - ci-fix
+
+# Grants the workflow token read access to repository contents.
+permissions:
+  contents: read
+
+env:
+  # The repository variable CI_WORK_DIR wins; otherwise a path under
+  # /scratch/user/ is built from the triggering actor's login.
+  # NOTE(review): github.actor is a GitHub login, not the runner's OS
+  # account — confirm this directory is expected to exist on the runner.
+  CI_WORK_DIR: ${{ vars.CI_WORK_DIR || format('/scratch/user/{0}', github.actor) }}
+  # Forwarded to the GPU test action as its test-filter input.
+  FA4_TEST_FILTER: >-
+    1024-1024-128-True-0-0.0-False-False-False-mha-dtype0 or
+    1024-1024-128-False-0-0.0-False-False-False-mha-dtype0
+
+jobs:
+  # Ruff lint and format checks for flash_attn/cute/ on a GitHub-hosted runner.
+  lint:
+    runs-on: ubuntu-latest
+    env:
+      # Comma-separated files skipped by both ruff invocations below.
+      LINT_SKIPPED_FILES: flash_attn/cute/flash_bwd.py,flash_attn/cute/flash_fwd.py,flash_attn/cute/flash_fwd_sm100.py,flash_attn/cute/interface.py
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install ruff
+        run: pip install ruff
+      - name: Ruff check
+        run: ruff check flash_attn/cute/ --extend-exclude "$LINT_SKIPPED_FILES"
+      - name: Ruff format
+        run: ruff format --check flash_attn/cute/ --exclude "$LINT_SKIPPED_FILES"
+
+  # Runs the local gpu-test action on self-hosted runners, once per GPU label
+  # in the matrix.
+  fa4-correctness-and-benchmark:
+    name: fa4-correctness-and-benchmark (${{ matrix.gpu }})
+    runs-on:
+      - self-hosted
+      - "${{ matrix.gpu }}"
+    timeout-minutes: 60
+    strategy:
+      fail-fast: false
+      matrix:
+        gpu:
+          - b200
+    steps:
+      - uses: actions/checkout@v4
+      - name: Log in to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+      - uses: ./.github/actions/gpu-test
+        with:
+          test-filter: ${{ env.FA4_TEST_FILTER }}
+          # Both image references carry a tag and a sha256 digest.
+          fa4_image_cu129: 'togethercomputer/training-performance:flash-attn-cu12.9-26.03.25@sha256:304a5c3d2b3a75b151cd2a964cd26d444e0d8b5686d63943df13378c9705f943'
+          fa4_image_cu130: 'togethercomputer/training-performance:flash-attn-cu13.0-26.04.01@sha256:56e50b056eb4d671410846c3483e843ee7bd0f5b13cb45b6f0d7eb8bd27694a5'
diff --git a/.gitignore b/.gitignore
@@ -36,6 +36,8 @@ var/
 
 # Dev
 venv
+agent_space/
+benchmarks/results/
 
 # compile-time generated file
 flash_attn_config.py
diff --git a/AGENTS.md b/AGENTS.md
@@ -0,0 +1 @@
+CLAUDE.md
diff --git a/AI/CLC_TRACE_DEBUG.md b/AI/CLC_TRACE_DEBUG.md
@@ -0,0 +1,82 @@
+# CLC Trace Debugging
+
+Use this when you suspect the CLC work scheduler is making surprising tile assignment decisions and you want a raw scheduler trace from the current kernel.
+
+## Current trace format
+
+SM100 forward kernels emit one trace line per scheduler-warp query at `FA_LOG_LEVEL=3`:
+
+```text
+[CLC] query sm=<smid> cta=<blockIdx.x> (m_blk=<m>,h=<h>,b=<b>,s=<s>) valid=<0|1>
+```
+
+Current emit sites:
+- `flash_attn/cute/flash_fwd_sm100.py`
+- `flash_attn/cute/flash_fwd_mla_sm100.py`
+
+## How to capture a trace
+
+Important:
+- `FA_LOG_LEVEL=3` is needed for the `[CLC] query ...` device-side prints.
+- `FA_CLC=1` only requests CLC; the kernel may still fall back if the shape/features disable it.
+
+Minimal repro pattern:
+
+```bash
+FA_LOG_LEVEL=3 FA_CLC=1 CUDA_VISIBLE_DEVICES=0 python - <<'PY' \
+  > agent_space/clc_trace.log 2>&1
+import torch
+from flash_attn.cute.interface import flash_attn_func
+
+torch.manual_seed(0)
+q = torch.randn(1, 512, 16, 128, device='cuda', dtype=torch.bfloat16)
+k = torch.randn(1, 512, 1, 128, device='cuda', dtype=torch.bfloat16)
+v = torch.randn(1, 512, 1, 128, device='cuda', dtype=torch.bfloat16)
+flash_attn_func(q, k, v, causal=True)
+torch.cuda.synchronize()
+PY
+```
+
+To confirm from the same capture whether CLC was actually selected, keep the host-side `[FA]` log line as well:
+
+```text
+[FA] TileScheduler=SingleTileLPTScheduler, scheduling_mode=CLC, USE_2CTA=False
+```
+
+## What to look for
+
+- `scheduling_mode=CLC` in host logs confirms the shape actually used the CLC path.
+- `valid=1` means the returned work tile is valid.
+- `valid=0` means the scheduler is exhausted for that CTA/scheduler warp query.
+- `m_blk`, `h`, `b`, `s` are the logical work coordinates after the scheduler mapping.
+- `cta` is the physical `blockIdx.x`; for clustered launches multiple CTAs may participate in the same logical tile.
+
+## Parse the trace
+
+A lightweight parser lives in `AI/parse_clc_log.py`.
+
+Text summary:
+
+```bash
+python AI/parse_clc_log.py agent_space/clc_trace.log
+```
+
+HTML view:
+
+```bash
+python AI/parse_clc_log.py agent_space/clc_trace.log --html -o agent_space/clc_trace.html
+```
+
+## Suggested workflow
+
+1. Reproduce the surprising case with `FA_LOG_LEVEL=3 FA_CLC=1`.
+2. Save stdout/stderr to `agent_space/clc_trace.log`.
+3. Run `AI/parse_clc_log.py` on that log to get a compact per-SM / per-CTA summary.
+4. If the trace still looks suspicious, attach or paste that log in the investigation thread / agent notes.
+5. Compare against the relevant mapping logic in `flash_attn/cute/tile_scheduler.py`.
+
+## Caveats
+
+- The trace is noisy and expensive; use a single small shape first.
+- Because a line is printed for every scheduler query, many lines may be terminal `valid=0` queries issued after the work is exhausted.
+- Dense noncausal and varlen MHA may intentionally fall back from CLC to another scheduling mode, depending on the current heuristic in `flash_attn/cute/interface.py`.