Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions osdc/clusters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,16 @@ defaults:
- "x86_64"
- "aarch64"
target_manylinux: "2_28"
hf_cache:
# Shared HuggingFace model cache (modules/hf-cache). Not enabled on any
# cluster by default — add "hf-cache" to a cluster's modules list to turn it
# on. The model data lives in a single shared, private S3 bucket
# (pytorch-hf-model-cache in us-east-2); these are just the knobs.
namespace: hf-cache
bucket_region: us-east-2 # region of the shared model-cache bucket
rclone_image: "rclone/rclone:1.69.1"
vfs_cache_max_size: 200G # per-node NVMe cap for the rclone read cache
refresh_schedule: "0 7 * * *" # daily 07:00 UTC

clusters:
meta-staging-aws-uw1:
Expand Down
31 changes: 29 additions & 2 deletions osdc/modules/arc-runners/scripts/python/generate_runners.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,13 +173,26 @@ def resolve_max_runners(value, def_file, cluster_id):
return value


def generate_runner(def_file, template_content, cluster_config, output_dir, module_name, pypi_cache_enabled=True):
def generate_runner(
def_file,
template_content,
cluster_config,
output_dir,
module_name,
pypi_cache_enabled=True,
hf_cache_enabled=False,
):
"""Generate a single runner config from its definition.

pypi_cache_enabled controls whether the `# BEGIN_PYPI_CACHE` / `# END_PYPI_CACHE`
block in the template is preserved (True) or stripped (False). Strip when the
cluster does not deploy the pypi-cache module — the env vars would otherwise
point at a Service that doesn't exist on this cluster.

hf_cache_enabled does the same for the `# BEGIN_HF_CACHE` / `# END_HF_CACHE`
block (HF_HOME env + the read-only /mnt/hf_cache hostPath mount). Strip when the
cluster does not deploy the hf-cache module — the hostPath would otherwise be
empty and HF_HUB_OFFLINE=1 would make every model load fail.
"""
with open(def_file) as f:
data = yaml.safe_load(f)
Expand Down Expand Up @@ -338,6 +351,7 @@ def generate_runner(def_file, template_content, cluster_config, output_dir, modu
# Replace all template placeholders
output_content = template_content
output_content = strip_conditional_block(output_content, "PYPI_CACHE", keep=pypi_cache_enabled)
output_content = strip_conditional_block(output_content, "HF_CACHE", keep=hf_cache_enabled)
replacements = {
"{{GITHUB_CONFIG_URL}}": github_url,
"{{GITHUB_SECRET_NAME}}": k8s_secret_ref,
Expand Down Expand Up @@ -499,9 +513,22 @@ def main():
# that doesn't exist on this cluster.
pypi_cache_enabled = "pypi-cache" in (cluster_cfg.get("modules") or [])

# hf-cache module is cluster-scoped: when absent, strip the HF_CACHE env vars
# and the /mnt/hf_cache hostPath mount from the workflow pod template so jobs
# don't mount an empty path and fail offline model loads.
hf_cache_enabled = "hf-cache" in (cluster_cfg.get("modules") or [])

count = 0
for def_file in def_files:
if generate_runner(def_file, template_content, cluster_config, output_dir, module_name, pypi_cache_enabled):
if generate_runner(
def_file,
template_content,
cluster_config,
output_dir,
module_name,
pypi_cache_enabled,
hf_cache_enabled,
):
count += 1

print()
Expand Down
27 changes: 27 additions & 0 deletions osdc/modules/arc-runners/templates/runner.yaml.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,18 @@ data:
- name: PYPI_CACHE_WHL_URL
value: "http://pypi-cache-cpu.pypi-cache.svc.cluster.local:8080/whl/cpu/"
# END_PYPI_CACHE
# BEGIN_HF_CACHE
- name: HF_HOME
value: "/mnt/hf_cache"
- name: HF_HUB_CACHE
value: "/mnt/hf_cache/hub"
- name: HF_HUB_OFFLINE
value: "1"
- name: TRANSFORMERS_OFFLINE
value: "1"
- name: HF_DATASETS_OFFLINE
value: "1"
# END_HF_CACHE
- name: TORCH_CI_MAX_MEMORY
value: "{{MEMORY_BYTES}}"
# Workflow container gets the actual compute resources
Expand All @@ -450,6 +462,15 @@ data:
memory: "{{MEMORY}}"
ephemeral-storage: "{{DISK_SIZE}}"{{GPU_LIMIT}}
volumeMounts:
# BEGIN_HF_CACHE
# Shared HuggingFace model cache. The hf-cache mount DaemonSet keeps
# this FUSE mount alive on the host; HostToContainer propagation lets
# the job pod see it. Read-only — refresh is a separate writer job.
- name: hf-cache
mountPath: /mnt/hf_cache
readOnly: true
mountPropagation: HostToContainer
# END_HF_CACHE
# K8s default /dev/shm is 64Mi (container runtime tmpfs). NCCL
# blows past that on multi-GPU workloads; PyTorch docker-based CI
# also runs with --shm-size=1g-2g, so match that ceiling for all
Expand All @@ -461,6 +482,12 @@ data:
emptyDir:
medium: Memory
sizeLimit: 2Gi
# BEGIN_HF_CACHE
- name: hf-cache
hostPath:
path: /mnt/hf_cache
type: DirectoryOrCreate
# END_HF_CACHE
wrapper.js: |
#!/usr/bin/env node
'use strict';
Expand Down
77 changes: 77 additions & 0 deletions osdc/modules/hf-cache/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# hf-cache — shared HuggingFace model cache

Gives OSDC runners a shared, read-only HuggingFace model cache at `/mnt/hf_cache`,
the OSDC equivalent of the old EC2 CI `/mnt/hf_cache` mount. Jobs read model
weights from a local cache instead of downloading from the Hub on every run.

## Design in one paragraph

The model cache is stored as **plain, symlink-free HuggingFace cache-layout files
in a single shared S3 bucket** (the portable source of truth — any object store
can host the same layout). A privileged per-node **rclone FUSE mount**
(`mount-daemonset`) exposes that bucket **read-only** at the host path
`/mnt/hf_cache`; reads are lazy and cached on node-local NVMe, so a cold Karpenter
node only pulls the models its jobs touch. Job pods (ARC kubernetes mode) get the
path bind-mounted via the gated `# BEGIN_HF_CACHE` block in
`modules/arc-runners/templates/runner.yaml.tpl`. A **refresh CronJob** is the only
writer: it downloads the curated model set and publishes a symlink-free copy to S3.

No metadata engine, no EFS — just S3 + rclone, which keeps it cloud-portable.

## Components

| Component | What it does |
|-----------|--------------|
| `terraform/hf-cache-bucket/` | Shared, private S3 bucket `pytorch-hf-model-cache` (one-time, manual apply) |
| `terraform/` | Per-cluster IRSA roles: `hf-cache-mount` (read-only), `hf-cache-refresh` (read/write) |
| `kubernetes/mount-daemonset.yaml.tpl` | rclone FUSE mount → read-only `/mnt/hf_cache` on every runner/workflow node |
| `kubernetes/refresh-cronjob.yaml.tpl` | Downloads the models listed in `models.txt` from the Hub, publishes a symlink-free copy to S3 |
| `scripts/python/refresh_cache.py` | Refresh driver (download + `rclone copy -L`, dropping `blobs/`) |
| `models.txt` | Curated model manifest (kept in sync with pytorch/pytorch CI pins) |

## Runner consumption

When `hf-cache` is in a cluster's `modules:` list, `generate_runners.py` keeps the
`# BEGIN_HF_CACHE` block, which adds to every job pod:

- volume + read-only `hostPath` mount of `/mnt/hf_cache` (`HostToContainer` propagation)
- env: `HF_HOME=/mnt/hf_cache`, `HF_HUB_CACHE=/mnt/hf_cache/hub`, `HF_HUB_OFFLINE=1`,
`TRANSFORMERS_OFFLINE=1`, `HF_DATASETS_OFFLINE=1`

`from_pretrained(...)` / `vllm.LLM(model=...)` then resolve from the cache with no
code changes. When the module is absent the block is stripped, so this is a no-op
for clusters that don't enable it.

## Enable on a cluster

1. One-time (per account): apply the shared bucket
```
cd modules/hf-cache/terraform/hf-cache-bucket
tofu init -backend-config=... && tofu apply
```
2. (Optional) create the gated/private-model token Secret:
```
kubectl create secret generic hf-cache-token -n hf-cache --from-literal=token=hf_xxx
```
3. Add `hf-cache` to the cluster's `modules:` list in `clusters.yaml` (after
`arc-runners`), then redeploy:
```
just deploy-module <cluster> hf-cache
just deploy-module <cluster> arc-runners # re-render job pods with the HF_CACHE block
```
4. Populate the cache immediately (otherwise it waits for the CronJob):
```
kubectl create job -n hf-cache --from=cronjob/hf-cache-refresh hf-cache-refresh-manual
```

## Open items (see PR description)

- **Symlink-free layout**: the published layout (`rclone copy -L`, with `blobs/`
excluded) is assumed to resolve transparently via `from_pretrained`. This needs
a validation spike before enabling on a real cluster.
- The mount DaemonSet is **privileged** (FUSE + Bidirectional propagation); confirm
this is acceptable under the cluster's Pod Security posture.
- Single shared bucket in `us-east-2` means cross-region S3 reads for clusters in other
regions (the node-local cache absorbs repeats). Per-region buckets / replication are a follow-up.
- Strict-offline (`HF_HUB_OFFLINE=1`): an uncached model errors out (matches EC2).
Graceful online fallback (overlay) is a possible enhancement.
115 changes: 115 additions & 0 deletions osdc/modules/hf-cache/deploy.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
#!/usr/bin/env bash
set -euo pipefail
#
# HF cache module deploy script.
# Called by: just deploy-module <cluster> hf-cache
# Args: $1=cluster-id  $2=cluster-name  $3=region
#
# Deploys:
#   1. hf-cache namespace + ServiceAccounts (kustomize)
#   2. IRSA annotations on the mount/refresh ServiceAccounts (terraform outputs)
#   3. ConfigMaps (refresh script + model manifest)
#   4. Mount DaemonSet (rclone FUSE → read-only /mnt/hf_cache on each runner node)
#   5. Refresh CronJob (downloads curated models → publishes to shared S3 bucket)
#
# The model cache data lives in a single shared S3 bucket (see
# terraform/hf-cache-bucket/). This per-cluster deploy only wires the IRSA roles,
# the node mount, and the refresh job.
#
# Exit behaviour: any failing command aborts the deploy (set -e), except the
# final DaemonSet rollout wait, which only prints a warning on timeout.

# Fail with a usage message instead of bash's terse "$1: unbound variable"
# (set -u) when the script is invoked by hand with too few arguments.
if [[ $# -lt 3 ]]; then
    echo "Usage: $(basename "$0") <cluster-id> <cluster-name> <region>" >&2
    exit 2
fi

CLUSTER="$1"
# Exported rather than plain assignments — presumably read by the helper
# scripts sourced below; confirm before un-exporting.
export CNAME="$2"
export REGION="$3"
MODULE_DIR="$(cd "$(dirname "$0")" && pwd)"
# OSDC_ROOT / OSDC_UPSTREAM, when set, override where the repo root and the
# shared scripts are looked up; otherwise both resolve relative to this module.
REPO_ROOT="${OSDC_ROOT:-$(cd "$MODULE_DIR/../.." && pwd)}"
UPSTREAM_ROOT="${OSDC_UPSTREAM:-$REPO_ROOT}"
# shellcheck source=/dev/null
source "$UPSTREAM_ROOT/scripts/mise-activate.sh"
# shellcheck source=/dev/null
source "$UPSTREAM_ROOT/scripts/kubectl-apply.sh"
# shellcheck source=/dev/null
source "$UPSTREAM_ROOT/scripts/state-config.sh"
: "${STATE_REGION:?state-config.sh did not export STATE_REGION}"
CFG="$UPSTREAM_ROOT/scripts/cluster-config.py"

# --- Read hf_cache config (with defaults) ---
# The trailing argument on each call is the fallback used when the key is not
# set for this cluster; the values mirror the hf_cache defaults in clusters.yaml.
NAMESPACE=$(uv run "$CFG" "$CLUSTER" hf_cache.namespace hf-cache)
BUCKET_REGION=$(uv run "$CFG" "$CLUSTER" hf_cache.bucket_region us-east-2)
RCLONE_IMAGE=$(uv run "$CFG" "$CLUSTER" hf_cache.rclone_image "rclone/rclone:1.69.1")
VFS_CACHE_MAX_SIZE=$(uv run "$CFG" "$CLUSTER" hf_cache.vfs_cache_max_size 200G)
REFRESH_SCHEDULE=$(uv run "$CFG" "$CLUSTER" hf_cache.refresh_schedule "0 7 * * *")
BUCKET_CFG=$(uv run "$CFG" "$CLUSTER" state_bucket)

# --- Read terraform outputs: IRSA role ARNs + bucket name ---
echo "[hf-cache] Reading terraform outputs..."
cd "$MODULE_DIR/terraform"
# Keep init quiet on success, but capture its output so that a failure (bad
# backend config, missing credentials, state lock, ...) is reported instead of
# the script dying silently under `set -e`.
if ! TOFU_INIT_LOG=$(tofu init -reconfigure \
    -backend-config="bucket=${BUCKET_CFG}" \
    -backend-config="key=${CLUSTER}/hf-cache/terraform.tfstate" \
    -backend-config="region=${STATE_REGION}" \
    -backend-config="dynamodb_table=ciforge-terraform-locks" \
    2>&1); then
    echo "[hf-cache] ERROR: tofu init failed:" >&2
    printf '%s\n' "$TOFU_INIT_LOG" >&2
    exit 1
fi
MOUNT_ROLE_ARN=$(tofu output -raw mount_role_arn)
REFRESH_ROLE_ARN=$(tofu output -raw refresh_role_arn)
BUCKET=$(tofu output -raw hf_cache_bucket)
cd - >/dev/null
echo "[hf-cache] Bucket: ${BUCKET} (${BUCKET_REGION})"
echo "[hf-cache] Mount role ARN: ${MOUNT_ROLE_ARN}"
echo "[hf-cache] Refresh role ARN: ${REFRESH_ROLE_ARN}"

# --- Apply base k8s resources (namespace, ServiceAccounts) ---
echo "[hf-cache] Applying base resources (namespace, ServiceAccounts)..."
kubectl_apply_if_changed -k "$MODULE_DIR/kubernetes/"

# The role ARNs only exist as terraform outputs, so they are stamped onto the
# ServiceAccounts here rather than living in the static kustomize manifests.
echo "[hf-cache] Annotating ServiceAccounts with IRSA roles..."
kubectl annotate sa hf-cache-mount -n "$NAMESPACE" \
    eks.amazonaws.com/role-arn="$MOUNT_ROLE_ARN" --overwrite
kubectl annotate sa hf-cache-refresh -n "$NAMESPACE" \
    eks.amazonaws.com/role-arn="$REFRESH_ROLE_ARN" --overwrite

# --- ConfigMaps (refresh script + model manifest) ---
# `create --dry-run=client -o yaml | apply` makes the ConfigMap creation
# idempotent: it creates on first deploy and updates on later ones.
echo "[hf-cache] Creating hf-cache-refresh-scripts ConfigMap..."
kubectl create configmap hf-cache-refresh-scripts \
    --from-file=refresh_cache.py="$MODULE_DIR/scripts/python/refresh_cache.py" \
    -n "$NAMESPACE" \
    --dry-run=client -o yaml | kubectl apply -f -
kubectl label configmap hf-cache-refresh-scripts -n "$NAMESPACE" \
    osdc.io/module=hf-cache --overwrite

echo "[hf-cache] Creating hf-cache-models ConfigMap..."
kubectl create configmap hf-cache-models \
    --from-file=models.txt="$MODULE_DIR/models.txt" \
    -n "$NAMESPACE" \
    --dry-run=client -o yaml | kubectl apply -f -
kubectl label configmap hf-cache-models -n "$NAMESPACE" \
    osdc.io/module=hf-cache --overwrite

# --- Render + apply the mount DaemonSet ---
# `|` is the sed delimiter because the image reference contains `/`.
# NOTE: __REGION__ is the *bucket* region (BUCKET_REGION), not the cluster
# region passed as $3.
echo "[hf-cache] Applying mount DaemonSet..."
sed -e "s|__NAMESPACE__|${NAMESPACE}|g" \
    -e "s|__BUCKET__|${BUCKET}|g" \
    -e "s|__REGION__|${BUCKET_REGION}|g" \
    -e "s|__RCLONE_IMAGE__|${RCLONE_IMAGE}|g" \
    -e "s|__VFS_CACHE_MAX_SIZE__|${VFS_CACHE_MAX_SIZE}|g" \
    "$MODULE_DIR/kubernetes/mount-daemonset.yaml.tpl" \
    | kubectl_apply_if_changed -f -

# --- Render + apply the refresh CronJob ---
echo "[hf-cache] Applying refresh CronJob..."
sed -e "s|__NAMESPACE__|${NAMESPACE}|g" \
    -e "s|__BUCKET__|${BUCKET}|g" \
    -e "s|__REGION__|${BUCKET_REGION}|g" \
    -e "s|__RCLONE_IMAGE__|${RCLONE_IMAGE}|g" \
    -e "s|__SCHEDULE__|${REFRESH_SCHEDULE}|g" \
    "$MODULE_DIR/kubernetes/refresh-cronjob.yaml.tpl" \
    | kubectl_apply_if_changed -f -

# --- Wait for the mount DaemonSet to roll out ---
# Best-effort: a slow rollout is reported as a warning and does not fail the
# deploy (the `|| { ... }` keeps `set -e` from aborting here).
echo "[hf-cache] Waiting for mount DaemonSet rollout..."
kubectl rollout status daemonset hf-cache-mount \
    -n "$NAMESPACE" --timeout=300s || {
    echo "[hf-cache] WARNING: mount DaemonSet rollout did not complete within timeout"
    echo "[hf-cache] Check: kubectl get pods -n $NAMESPACE -l app=hf-cache-mount"
}

echo "[hf-cache] Deployed — rclone mount serving read-only /mnt/hf_cache on runner nodes."
echo "[hf-cache] To populate now: kubectl create job -n $NAMESPACE --from=cronjob/hf-cache-refresh hf-cache-refresh-manual"
9 changes: 9 additions & 0 deletions osdc/modules/hf-cache/kubernetes/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

# Base resources only. The mount DaemonSet and refresh CronJob are rendered
# from templates by deploy.sh (they need terraform outputs and per-cluster
# config substituted in), so they are not part of this kustomization.
resources:
- namespace.yaml
- serviceaccounts.yaml
Loading
Loading