pytorch · huydhn · Jun 23, 2026
@@ -110,6 +110,16 @@ defaults:
       - "x86_64"
       - "aarch64"
     target_manylinux: "2_28"
+  hf_cache:
+    # Shared HuggingFace model cache (modules/hf-cache). Not enabled on any
+    # cluster by default — add "hf-cache" to a cluster's modules list to turn it
+    # on. The model data lives in a single shared, private S3 bucket
+    # (pytorch-hf-model-cache in us-east-2); these are just the knobs.
+    #
+    # modules/hf-cache/deploy.sh reads each key below and carries its own
+    # fallback value for every one of them — keep the two in sync when
+    # changing a default here.
+    # Namespace the module's ServiceAccounts, ConfigMaps, DaemonSet and CronJob
+    # are deployed into.
+    namespace: hf-cache
+    bucket_region: us-east-2          # region of the shared model-cache bucket
+    # Image substituted into both the mount DaemonSet and refresh CronJob templates.
+    rclone_image: "rclone/rclone:1.69.1"
+    vfs_cache_max_size: 200G          # per-node NVMe cap for the rclone read cache
+    refresh_schedule: "0 7 * * *"     # daily 07:00 UTC
 
 clusters:
   meta-staging-aws-uw1:

@@ -173,13 +173,26 @@ def resolve_max_runners(value, def_file, cluster_id):
     return value
 
 
-def generate_runner(def_file, template_content, cluster_config, output_dir, module_name, pypi_cache_enabled=True):
+def generate_runner(
+    def_file,
+    template_content,
+    cluster_config,
+    output_dir,
+    module_name,
+    pypi_cache_enabled=True,
+    hf_cache_enabled=False,
+):
     """Generate a single runner config from its definition.
 
     pypi_cache_enabled controls whether the `# BEGIN_PYPI_CACHE` / `# END_PYPI_CACHE`
     block in the template is preserved (True) or stripped (False). Strip when the
     cluster does not deploy the pypi-cache module — the env vars would otherwise
     point at a Service that doesn't exist on this cluster.
+
+    hf_cache_enabled does the same for the `# BEGIN_HF_CACHE` / `# END_HF_CACHE`
+    block (HF_HOME env + the read-only /mnt/hf_cache hostPath mount). Strip when the
+    cluster does not deploy the hf-cache module — the hostPath would otherwise be
+    empty and HF_HUB_OFFLINE=1 would make every model load fail.
     """
     with open(def_file) as f:
         data = yaml.safe_load(f)
@@ -338,6 +351,7 @@ def generate_runner(def_file, template_content, cluster_config, output_dir, modu
     # Replace all template placeholders
     output_content = template_content
     output_content = strip_conditional_block(output_content, "PYPI_CACHE", keep=pypi_cache_enabled)
+    output_content = strip_conditional_block(output_content, "HF_CACHE", keep=hf_cache_enabled)
     replacements = {
         "{{GITHUB_CONFIG_URL}}": github_url,
         "{{GITHUB_SECRET_NAME}}": k8s_secret_ref,
@@ -499,9 +513,22 @@ def main():
     # that doesn't exist on this cluster.
     pypi_cache_enabled = "pypi-cache" in (cluster_cfg.get("modules") or [])
 
+    # hf-cache module is cluster-scoped: when absent, strip the HF_CACHE env vars
+    # and the /mnt/hf_cache hostPath mount from the workflow pod template so jobs
+    # don't mount an empty path and fail offline model loads.
+    hf_cache_enabled = "hf-cache" in (cluster_cfg.get("modules") or [])
+
     count = 0
     for def_file in def_files:
-        if generate_runner(def_file, template_content, cluster_config, output_dir, module_name, pypi_cache_enabled):
+        if generate_runner(
+            def_file,
+            template_content,
+            cluster_config,
+            output_dir,
+            module_name,
+            pypi_cache_enabled,
+            hf_cache_enabled,
+        ):
             count += 1
 
     print()

@@ -437,6 +437,18 @@ data:
             - name: PYPI_CACHE_WHL_URL
               value: "http://pypi-cache-cpu.pypi-cache.svc.cluster.local:8080/whl/cpu/"
             # END_PYPI_CACHE
+            # BEGIN_HF_CACHE
+            - name: HF_HOME
+              value: "/mnt/hf_cache"
+            - name: HF_HUB_CACHE
+              value: "/mnt/hf_cache/hub"
+            - name: HF_HUB_OFFLINE
+              value: "1"
+            - name: TRANSFORMERS_OFFLINE
+              value: "1"
+            - name: HF_DATASETS_OFFLINE
+              value: "1"
+            # END_HF_CACHE
             - name: TORCH_CI_MAX_MEMORY
               value: "{{MEMORY_BYTES}}"
           # Workflow container gets the actual compute resources
@@ -450,6 +462,15 @@ data:
               memory: "{{MEMORY}}"
               ephemeral-storage: "{{DISK_SIZE}}"{{GPU_LIMIT}}
           volumeMounts:
+            # BEGIN_HF_CACHE
+            # Shared HuggingFace model cache. The hf-cache mount DaemonSet keeps
+            # this FUSE mount alive on the host; HostToContainer propagation lets
+            # the job pod see it. Read-only — refresh is a separate writer job.
+            - name: hf-cache
+              mountPath: /mnt/hf_cache
+              readOnly: true
+              mountPropagation: HostToContainer
+            # END_HF_CACHE
             # K8s default /dev/shm is 64Mi (container runtime tmpfs). NCCL
             # blows past that on multi-GPU workloads; PyTorch docker-based CI
             # also runs with --shm-size=1g-2g, so match that ceiling for all
@@ -461,6 +482,12 @@ data:
           emptyDir:
             medium: Memory
             sizeLimit: 2Gi
+        # BEGIN_HF_CACHE
+        - name: hf-cache
+          hostPath:
+            path: /mnt/hf_cache
+            type: DirectoryOrCreate
+        # END_HF_CACHE
   wrapper.js: |
     #!/usr/bin/env node
     'use strict';

@@ -0,0 +1,77 @@
+# hf-cache — shared HuggingFace model cache
+
+Gives OSDC runners a shared, read-only HuggingFace model cache at `/mnt/hf_cache`,
+the OSDC equivalent of the old EC2 CI `/mnt/hf_cache` mount. Jobs read model
+weights from a local cache instead of downloading from the Hub on every run.
+
+## Design in one paragraph
+
+The model cache is stored as **plain, symlink-free HuggingFace cache-layout files
+in a single shared S3 bucket** (the portable source of truth — any object store
+can host the same layout). A privileged per-node **rclone FUSE mount**
+(`mount-daemonset`) exposes that bucket **read-only** at the host path
+`/mnt/hf_cache`; reads are lazy and cached on node-local NVMe, so a cold Karpenter
+node only pulls the models its jobs touch. Job pods (ARC kubernetes mode) get the
+path bind-mounted via the gated `# BEGIN_HF_CACHE` block in
+`modules/arc-runners/templates/runner.yaml.tpl`. A **refresh CronJob** is the only
+writer: it downloads the curated model set and publishes a symlink-free copy to S3.
+
+No metadata engine, no EFS — just S3 + rclone, which keeps it cloud-portable.
+
+## Components
+
+| Component | What it does |
+|-----------|--------------|
+| `terraform/hf-cache-bucket/` | Shared, private S3 bucket `pytorch-hf-model-cache` (one-time, manual apply) |
+| `terraform/` | Per-cluster IRSA roles: `hf-cache-mount` (read-only), `hf-cache-refresh` (read/write) |
+| `kubernetes/mount-daemonset.yaml.tpl` | rclone FUSE mount → read-only `/mnt/hf_cache` on every runner/workflow node |
+| `kubernetes/refresh-cronjob.yaml.tpl` | Downloads the models listed in `models.txt` from the Hub and publishes a symlink-free copy to S3 |
+| `scripts/python/refresh_cache.py` | Refresh driver (download + `rclone copy -L`, dropping `blobs/`) |
+| `models.txt` | Curated model manifest (kept in sync with pytorch/pytorch CI pins) |
+
+## Runner consumption
+
+When `hf-cache` is in a cluster's `modules:` list, `generate_runners.py` keeps the
+`# BEGIN_HF_CACHE` block, which adds to every job pod:
+
+- volume + read-only `hostPath` mount of `/mnt/hf_cache` (`HostToContainer` propagation)
+- env: `HF_HOME=/mnt/hf_cache`, `HF_HUB_CACHE=/mnt/hf_cache/hub`, `HF_HUB_OFFLINE=1`,
+  `TRANSFORMERS_OFFLINE=1`, `HF_DATASETS_OFFLINE=1`
+
+`from_pretrained(...)` / `vllm.LLM(model=...)` then resolve from the cache with no
+code changes. When the module is absent the block is stripped, so this is a no-op
+for clusters that don't enable it.
+
+## Enable on a cluster
+
+1. One-time (per account): apply the shared bucket
+   ```
+   cd modules/hf-cache/terraform/hf-cache-bucket
+   tofu init -backend-config=... && tofu apply
+   ```
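+2. (Optional) create the gated/private-model token Secret. The `hf-cache`
+   namespace is created by the module deploy in step 3, so on a first-time
+   enable run this after that deploy (or create the namespace first):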
+   ```
+   kubectl create secret generic hf-cache-token -n hf-cache --from-literal=token=hf_xxx
+   ```
+3. Add `hf-cache` to the cluster's `modules:` list in `clusters.yaml` (after
+   `arc-runners`), then redeploy:
+   ```
+   just deploy-module <cluster> hf-cache
+   just deploy-module <cluster> arc-runners   # re-render job pods with the HF_CACHE block
+   ```
+4. Populate the cache immediately (otherwise it waits for the CronJob):
+   ```
+   kubectl create job -n hf-cache --from=cronjob/hf-cache-refresh hf-cache-refresh-manual
+   ```
+
+## Open items (see PR description)
+
+- **Symlink-free layout**: `from_pretrained` is assumed to resolve models
+  transparently from the published layout (`rclone copy -L` with `blobs/`
+  dropped) — this needs a validation spike before enabling on a real cluster.
+- The mount DaemonSet is **privileged** (FUSE + Bidirectional propagation); confirm
+  this is acceptable under the cluster's Pod Security posture.
+- Single shared bucket in `us-east-2` means cross-region S3 reads for other regions
+  (node-local cache absorbs repeats). Per-region buckets / replication is a follow-up.
+- Strict-offline (`HF_HUB_OFFLINE=1`): an uncached model errors out (matches EC2).
+  Graceful online fallback (overlay) is a possible enhancement.
@@ -0,0 +1,115 @@
+#!/usr/bin/env bash
+set -euo pipefail
+#
+# HF cache module deploy script.
+# Called by: just deploy-module <cluster> hf-cache
+# Args: $1=cluster-id  $2=cluster-name  $3=region
+#
+# Deploys:
+#   1. hf-cache namespace + ServiceAccounts (kustomize)
+#   2. IRSA annotations on the mount/refresh ServiceAccounts (terraform outputs)
+#   3. ConfigMaps (refresh script + model manifest)
+#   4. Mount DaemonSet (rclone FUSE → read-only /mnt/hf_cache on each runner node)
+#   5. Refresh CronJob (downloads curated models → publishes to shared S3 bucket)
+#
+# The model cache data lives in a single shared S3 bucket (see
+# terraform/hf-cache-bucket/). This per-cluster deploy only wires the IRSA roles,
+# the node mount, and the refresh job.
+
+CLUSTER="$1"
+export CNAME="$2"
+export REGION="$3"
+MODULE_DIR="$(cd "$(dirname "$0")" && pwd)"
+REPO_ROOT="${OSDC_ROOT:-$(cd "$MODULE_DIR/../.." && pwd)}"
+UPSTREAM_ROOT="${OSDC_UPSTREAM:-$REPO_ROOT}"
+# shellcheck source=/dev/null
+source "$UPSTREAM_ROOT/scripts/mise-activate.sh"
+# shellcheck source=/dev/null
+source "$UPSTREAM_ROOT/scripts/kubectl-apply.sh"
+# shellcheck source=/dev/null
+source "$UPSTREAM_ROOT/scripts/state-config.sh"
+: "${STATE_REGION:?state-config.sh did not export STATE_REGION}"
+CFG="$UPSTREAM_ROOT/scripts/cluster-config.py"
+
+# --- Read hf_cache config (with defaults) ---
+# read_cfg KEY [FALLBACK] — look KEY up for $CLUSTER through cluster-config.py,
+# forwarding the optional fallback argument unchanged.
+read_cfg() {
+  uv run "$CFG" "$CLUSTER" "$@"
+}
+
+NAMESPACE=$(read_cfg hf_cache.namespace hf-cache)
+BUCKET_REGION=$(read_cfg hf_cache.bucket_region us-east-2)
+RCLONE_IMAGE=$(read_cfg hf_cache.rclone_image "rclone/rclone:1.69.1")
+VFS_CACHE_MAX_SIZE=$(read_cfg hf_cache.vfs_cache_max_size 200G)
+REFRESH_SCHEDULE=$(read_cfg hf_cache.refresh_schedule "0 7 * * *")
+# Terraform state bucket — read without a fallback argument.
+BUCKET_CFG=$(read_cfg state_bucket)
+
+# --- Read terraform outputs: IRSA role ARNs + bucket name ---
+echo "[hf-cache] Reading terraform outputs..."
+cd "$MODULE_DIR/terraform"
+tofu init -reconfigure \
+  -backend-config="bucket=${BUCKET_CFG}" \
+  -backend-config="key=${CLUSTER}/hf-cache/terraform.tfstate" \
+  -backend-config="region=${STATE_REGION}" \
+  -backend-config="dynamodb_table=ciforge-terraform-locks" \
+  >/dev/null 2>&1
+MOUNT_ROLE_ARN=$(tofu output -raw mount_role_arn)
+REFRESH_ROLE_ARN=$(tofu output -raw refresh_role_arn)
+BUCKET=$(tofu output -raw hf_cache_bucket)
+cd - >/dev/null
+echo "[hf-cache] Bucket: ${BUCKET} (${BUCKET_REGION})"
+echo "[hf-cache] Mount role ARN: ${MOUNT_ROLE_ARN}"
+echo "[hf-cache] Refresh role ARN: ${REFRESH_ROLE_ARN}"
+
+# --- Apply base k8s resources (namespace, ServiceAccounts) ---
+echo "[hf-cache] Applying base resources (namespace, ServiceAccounts)..."
+kubectl_apply_if_changed -k "$MODULE_DIR/kubernetes/"
+
+# bind_irsa_role SA ROLE_ARN — set (or overwrite) the EKS role-arn annotation
+# on ServiceAccount SA in $NAMESPACE.
+bind_irsa_role() {
+  local sa_name="$1" role_arn="$2"
+  kubectl annotate sa "$sa_name" -n "$NAMESPACE" \
+    eks.amazonaws.com/role-arn="$role_arn" --overwrite
+}
+
+echo "[hf-cache] Annotating ServiceAccounts with IRSA roles..."
+bind_irsa_role hf-cache-mount "$MOUNT_ROLE_ARN"
+bind_irsa_role hf-cache-refresh "$REFRESH_ROLE_ARN"
+
+# --- ConfigMaps (refresh script + model manifest) ---
+# publish_configmap NAME KEY FILE — create or update ConfigMap NAME in
+# $NAMESPACE with FILE stored under KEY (client-side dry-run rendered to YAML,
+# then applied), and stamp it with the module label.
+publish_configmap() {
+  local cm_name="$1" data_key="$2" src_file="$3"
+  echo "[hf-cache] Creating ${cm_name} ConfigMap..."
+  kubectl create configmap "$cm_name" \
+    --from-file="${data_key}=${src_file}" \
+    -n "$NAMESPACE" \
+    --dry-run=client -o yaml | kubectl apply -f -
+  kubectl label configmap "$cm_name" -n "$NAMESPACE" \
+    osdc.io/module=hf-cache --overwrite
+}
+
+publish_configmap hf-cache-refresh-scripts refresh_cache.py \
+  "$MODULE_DIR/scripts/python/refresh_cache.py"
+
+publish_configmap hf-cache-models models.txt "$MODULE_DIR/models.txt"
+
+# --- Render + apply the mount DaemonSet ---
+echo "[hf-cache] Applying mount DaemonSet..."
+sed -e "s|__NAMESPACE__|${NAMESPACE}|g" \
+  -e "s|__BUCKET__|${BUCKET}|g" \
+  -e "s|__REGION__|${BUCKET_REGION}|g" \
+  -e "s|__RCLONE_IMAGE__|${RCLONE_IMAGE}|g" \
+  -e "s|__VFS_CACHE_MAX_SIZE__|${VFS_CACHE_MAX_SIZE}|g" \
+  "$MODULE_DIR/kubernetes/mount-daemonset.yaml.tpl" \
+  | kubectl_apply_if_changed -f -
+
+# --- Render + apply the refresh CronJob ---
+echo "[hf-cache] Applying refresh CronJob..."
+sed -e "s|__NAMESPACE__|${NAMESPACE}|g" \
+  -e "s|__BUCKET__|${BUCKET}|g" \
+  -e "s|__REGION__|${BUCKET_REGION}|g" \
+  -e "s|__RCLONE_IMAGE__|${RCLONE_IMAGE}|g" \
+  -e "s|__SCHEDULE__|${REFRESH_SCHEDULE}|g" \
+  "$MODULE_DIR/kubernetes/refresh-cronjob.yaml.tpl" \
+  | kubectl_apply_if_changed -f -
+
+# --- Wait for the mount DaemonSet to roll out ---
+echo "[hf-cache] Waiting for mount DaemonSet rollout..."
+kubectl rollout status daemonset hf-cache-mount \
+  -n "$NAMESPACE" --timeout=300s || {
+  echo "[hf-cache] WARNING: mount DaemonSet rollout did not complete within timeout"
+  echo "[hf-cache] Check: kubectl get pods -n $NAMESPACE -l app=hf-cache-mount"
+}
+
+echo "[hf-cache] Deployed — rclone mount serving read-only /mnt/hf_cache on runner nodes."
+echo "[hf-cache] To populate now: kubectl create job -n $NAMESPACE --from=cronjob/hf-cache-refresh hf-cache-refresh-manual"
@@ -0,0 +1,9 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+# Base resources only. The mount DaemonSet and refresh CronJob are rendered
+# from templates by deploy.sh (they need terraform outputs and per-cluster
+# config substituted in), so they are not part of this kustomization.
+#
+# deploy.sh applies this directory first, then annotates the hf-cache-mount and
+# hf-cache-refresh ServiceAccounts with their IRSA role ARNs — presumably both
+# are defined in serviceaccounts.yaml (verify when renaming either one).
+resources:
+  - namespace.yaml
+  - serviceaccounts.yaml