diff --git a/arena/notebooks/cks_kubernetes.py b/arena/notebooks/cks_kubernetes.py
new file mode 100644
index 0000000..6e8e23d
--- /dev/null
+++ b/arena/notebooks/cks_kubernetes.py
@@ -0,0 +1,716 @@
+# /// script
+# requires-python = ">=3.12"
+# dependencies = [
+# "kubernetes==35.0.0",
+# "marimo>=0.23.6",
+# "ruamel-yaml>=0.19.1",
+# ]
+# ///
+
+import marimo
+
+__generated_with = "0.23.6"
+app = marimo.App(width="medium", app_title="CoreWeave ARENA")
+
+with app.setup:
+ import os
+
+ import marimo as mo
+ from lib.k8s import ensure_kubectl_access
+ from lib.remote_execution_helpers import shell
+ from lib.ui import about, banner, security_disclaimer, table_of_contents
+
+ MANIFEST_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "cks_kubernetes")
+
+ LAB_NAMESPACE = "arena-k8s-lab"
+ LAB_LABEL_KEY = "app.kubernetes.io/part-of"
+ LAB_LABEL_VALUE = "arena-k8s-lab"
+
+ CONFIGMAP_NAME = "arena-k8s-config"
+ DEPLOYMENT_NAME = "arena-k8s-web"
+ SERVICE_NAME = "arena-k8s-web"
+ JOB_NAME = "arena-k8s-job"
+
+ def truncate(text: str, max_chars: int = 4000) -> str:
+ """Limit command output to a readable size for notebook display."""
+ if len(text) <= max_chars:
+ return text
+ return f"{text[:max_chars]}\n...output truncated..."
+
+ def step_cell(title: str, cmd: str, btn, clicked: bool, explanation: str = "") -> mo.Html:
+ """Render an imperative step: optional explanation, title + button, command, and result if clicked."""
+ _output = None
+ if clicked:
+ _r = shell(cmd, quiet=True, timeout=180)
+ _status = "SUCCESS" if _r.ok else "FAILED"
+ _stdout_text = _r.strip()
+ _stdout = truncate(_stdout_text) if _stdout_text else "(no output)"
+ _stderr_block = ""
+ _stderr_text = _r.stderr.strip()
+ if _stderr_text and not _r.ok:
+ _stderr_block = f"\n**stderr:**\n```\n{truncate(_stderr_text)}\n```"
+ _output = mo.md(f"`{_status}`\n```\n{_stdout}\n```{_stderr_block}")
+ parts = [
+ mo.hstack([mo.md(f"**{title}**"), btn], justify="space-between", align="center"),
+ ]
+ if explanation:
+ parts.append(mo.md(explanation))
+ parts.append(mo.md(f"```\n{cmd}\n```"))
+ if _output is not None:
+ parts.append(_output)
+ return mo.vstack(parts)
+
+ def manifest_step_cell(
+ title: str,
+ form,
+ namespace: str | None,
+ explanation: str = "",
+ ) -> mo.Html:
+ """Render a declarative step: title, optional explanation, YAML form, result on submit.
+
+ `form` is a `mo.ui.code_editor(...).form()` — `form.value` is None until the user
+ clicks the form's submit button, then becomes the current editor contents. Each
+ subsequent submit re-applies the (possibly edited) manifest; edits made without
+ clicking submit do not trigger a re-apply.
+ """
+ _output = None
+ manifest = form.value
+ if manifest is not None:
+ ns_flag = f"-n {namespace} " if namespace else ""
+ _r = shell(f"kubectl {ns_flag}apply -f -", quiet=True, timeout=180, input=manifest)
+ _status = "SUCCESS" if _r.ok else "FAILED"
+ _stdout_text = _r.strip()
+ _stdout = truncate(_stdout_text) if _stdout_text else "(no output)"
+ _stderr_block = ""
+ _stderr_text = _r.stderr.strip()
+ if _stderr_text and not _r.ok:
+ _stderr_block = f"\n**stderr:**\n```\n{truncate(_stderr_text)}\n```"
+ _output = mo.md(f"`{_status}`\n```\n{_stdout}\n```{_stderr_block}")
+ ns_label = f" *(namespace: `{namespace}`)*" if namespace else ""
+ parts = [mo.md(f"**{title}**{ns_label}")]
+ if explanation:
+ parts.append(mo.md(explanation))
+ parts.append(form)
+ if _output is not None:
+ parts.append(_output)
+ return mo.vstack(parts)
+
+ def load_manifest(filename: str) -> str:
+ """Load a YAML manifest from the `cks_kubernetes/` directory next to this notebook."""
+ with open(os.path.join(MANIFEST_DIR, filename), "r", encoding="utf-8") as file:
+ return file.read()
+
+ RUN_BUTTON = {"value": 0, "on_click": lambda v: v + 1, "label": "Run"}
+
+ # ----------------------------------------------------------------
+ # Manifests (the source of truth for each lab object).
+ # Stored as real YAML files in `cks_kubernetes/` so they're easy to edit locally.
+ # ----------------------------------------------------------------
+
+ CONFIGMAP_MANIFEST = load_manifest("configmap.yaml")
+ DEPLOYMENT_MANIFEST = load_manifest("deployment.yaml")
+ SERVICE_MANIFEST = load_manifest("service.yaml")
+ JOB_MANIFEST = load_manifest("job.yaml")
+
+
+# ============================================================
+# Header
+# ============================================================
+
+
+@app.cell(hide_code=True)
+def _():
+ mo.vstack(
+ [
+ banner(),
+ about(
+ "Kubernetes",
+ """This notebook is a hands-on introduction to Kubernetes on CoreWeave Kubernetes Service (CKS).
+ Each section presents the actual YAML manifest, applies it declaratively, and then observes what the controllers do.
+ _If you are running this notebook in edit mode, start by running all cells in the bottom right._
+ """,
+ ),
+ table_of_contents(
+ [
+ {"title": "Cluster Access", "description": "Verify kubectl connectivity."},
+ {"title": "Environment and Namespace Setup", "description": "Create the lab namespace."},
+ {"title": "ConfigMap Lab", "description": "Apply, read, patch, and delete a ConfigMap."},
+ {"title": "Deployment Lab", "description": "Apply, scale, watch self-healing, and roll an update."},
+ {"title": "Service Lab", "description": "Expose a deployment and inspect endpoints."},
+ {"title": "Job Lab", "description": "Run a batch job to completion and inspect logs."},
+ {"title": "Cleanup", "description": "Delete lab resources and namespace."},
+ ]
+ ),
+ security_disclaimer(),
+ ]
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _():
+ kubectl_ready, access_message = ensure_kubectl_access()
+ mo.md(f"""
+---
+## Cluster Access
+
+**kubectl status:** `{"READY" if kubectl_ready else "NOT READY"}`
+
+{access_message}
+""")
+ return (kubectl_ready,)
+
+
+# ============================================================
+# Environment and Namespace Setup
+# ============================================================
+
+
+@app.cell(hide_code=True)
+def _(kubectl_ready: bool):
+ mo.stop(not kubectl_ready)
+ mo.md(
+ "---\n## Environment and Namespace Setup\n\n"
+ "Create and verify a dedicated lab namespace. Every object below will live inside it, "
+ "so deleting the namespace at the end cleans everything up at once."
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(kubectl_ready: bool):
+ mo.stop(not kubectl_ready)
+ env_b1 = mo.ui.button(**RUN_BUTTON)
+ env_b2 = mo.ui.button(**RUN_BUTTON)
+ env_b3 = mo.ui.button(**RUN_BUTTON)
+ env_b4 = mo.ui.button(**RUN_BUTTON)
+ return env_b1, env_b2, env_b3, env_b4
+
+
+@app.cell(hide_code=True)
+def _(env_b1):
+ step_cell(
+ "1. Check API Server",
+ "kubectl cluster-info",
+ env_b1,
+ env_b1.value > 0,
+ explanation=(
+ "`cluster-info` hits the Kubernetes API server and shows its address. "
+ "The API server is the only component you talk to directly — everything else (controllers, kubelet, scheduler) "
+ "reads and writes through it."
+ ),
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(env_b2):
+ step_cell(
+ "2. List Namespaces",
+ "kubectl get namespace",
+ env_b2,
+ env_b2.value > 0,
+ explanation="Namespaces partition cluster objects. You'll see system namespaces (`kube-system`, `kube-public`) alongside any others.",
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(env_b3):
+ step_cell(
+ "3. Create Lab Namespace",
+ f"kubectl create namespace {LAB_NAMESPACE} --dry-run=client -o yaml | kubectl apply -f -",
+ env_b3,
+ env_b3.value > 0,
+ explanation=(
+ f"Idempotently create `{LAB_NAMESPACE}`. The `--dry-run=client -o yaml | apply` pattern produces a manifest "
+ "client-side and applies it — re-running won't error if the namespace already exists."
+ ),
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(env_b4):
+ step_cell(
+ "4. Verify Namespace Labels",
+ f"kubectl get namespace {LAB_NAMESPACE} --show-labels",
+ env_b4,
+ env_b4.value > 0,
+ )
+ return
+
+
+# ============================================================
+# ConfigMap Lab
+# ============================================================
+
+
+@app.cell(hide_code=True)
+def _(kubectl_ready: bool):
+ mo.stop(not kubectl_ready)
+ mo.md(
+ "---\n## ConfigMap Lab\n\n"
+ "A ConfigMap stores non-secret key/value config. Below is the actual manifest — "
+ "the API object Kubernetes stores. `kubectl create configmap ...` is just a shortcut "
+ "that builds this YAML for you."
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(kubectl_ready: bool):
+ mo.stop(not kubectl_ready)
+ cm_form = mo.ui.code_editor(value=CONFIGMAP_MANIFEST, language="yaml", min_height=200).form(
+ submit_button_label="Apply", bordered=False
+ )
+ cm_b2 = mo.ui.button(**RUN_BUTTON)
+ cm_b3 = mo.ui.button(**RUN_BUTTON)
+ cm_b4 = mo.ui.button(**RUN_BUTTON)
+ cm_b5 = mo.ui.button(**RUN_BUTTON)
+ return cm_form, cm_b2, cm_b3, cm_b4, cm_b5
+
+
+@app.cell(hide_code=True)
+def _(cm_form):
+ manifest_step_cell(
+ "1. Apply ConfigMap Manifest",
+ cm_form,
+ LAB_NAMESPACE,
+ explanation=(
+ "Two top-level fields matter: `metadata` (identity — name, namespace, labels) "
+ "and `data` (the key/value payload). Edit the YAML and click **Apply** to see how "
+ "`kubectl apply` reconciles your change."
+ ),
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(cm_b2):
+ step_cell(
+ "2. Read ConfigMap",
+ f"kubectl -n {LAB_NAMESPACE} get configmap {CONFIGMAP_NAME} -o yaml",
+ cm_b2,
+ cm_b2.value > 0,
+ explanation=(
+ "Notice the server added fields you didn't write — `creationTimestamp`, `resourceVersion`, `uid`. "
+ "These are managed by the API server, not you."
+ ),
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(cm_b3):
+ step_cell(
+ "3. Patch ConfigMap Data (imperative drift)",
+ f"""kubectl -n {LAB_NAMESPACE} patch configmap {CONFIGMAP_NAME} --type merge -p '{{"data":{{"MODE":"advanced","TOPIC":"crud"}}}}'""",
+ cm_b3,
+ cm_b3.value > 0,
+ explanation=(
+ "`patch` mutates the live object in-place. After this, the cluster state diverges from the YAML in step 1 — "
+ "re-applying the manifest would revert `MODE` and remove `TOPIC`. This is the declarative-vs-imperative tension."
+ ),
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(cm_b4):
+ step_cell(
+ "4. Verify Patched ConfigMap",
+ f"kubectl -n {LAB_NAMESPACE} get configmap {CONFIGMAP_NAME} -o yaml",
+ cm_b4,
+ cm_b4.value > 0,
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(cm_b5):
+ step_cell(
+ "5. Delete ConfigMap",
+ f"kubectl -n {LAB_NAMESPACE} delete configmap {CONFIGMAP_NAME} --ignore-not-found=true",
+ cm_b5,
+ cm_b5.value > 0,
+ )
+ return
+
+
+# ============================================================
+# Deployment Lab
+# ============================================================
+
+
+@app.cell(hide_code=True)
+def _(kubectl_ready: bool):
+ mo.stop(not kubectl_ready)
+ mo.md(
+ "---\n## Deployment Lab\n\n"
+ 'A Deployment is a controller. You give it a desired state — _"I want N pods running this image"_ — '
+ "and it creates a ReplicaSet, which in turn creates Pods. The controller continuously reconciles "
+ "actual cluster state toward the spec. We'll watch that happen explicitly in step 4."
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(kubectl_ready: bool):
+ mo.stop(not kubectl_ready)
+ dep_form = mo.ui.code_editor(value=DEPLOYMENT_MANIFEST, language="yaml", min_height=320).form(
+ submit_button_label="Apply", bordered=False
+ )
+ dep_b2 = mo.ui.button(**RUN_BUTTON)
+ dep_b3 = mo.ui.button(**RUN_BUTTON)
+ dep_b4 = mo.ui.button(**RUN_BUTTON)
+ dep_b5 = mo.ui.button(**RUN_BUTTON)
+ dep_b6 = mo.ui.button(**RUN_BUTTON)
+ dep_b7 = mo.ui.button(**RUN_BUTTON)
+ dep_b8 = mo.ui.button(**RUN_BUTTON)
+ return dep_form, dep_b2, dep_b3, dep_b4, dep_b5, dep_b6, dep_b7, dep_b8
+
+
+@app.cell(hide_code=True)
+def _(dep_form):
+ manifest_step_cell(
+ "1. Apply Deployment Manifest",
+ dep_form,
+ LAB_NAMESPACE,
+ explanation=(
+ "Three nested layers: the Deployment's `selector.matchLabels` finds Pods whose labels match. "
+ "`template.metadata.labels` *stamps* those labels onto every Pod it creates. "
+ "If the selector and template labels disagree, the Deployment refuses to start — try it."
+ ),
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(dep_b2):
+ step_cell(
+ "2. Scale to 2 Replicas",
+ f"kubectl -n {LAB_NAMESPACE} scale deployment {DEPLOYMENT_NAME} --replicas=2",
+ dep_b2,
+ dep_b2.value > 0,
+ explanation=(
+ "Scaling changes the `spec.replicas` field on the Deployment. The controller notices the new desired count "
+ "and creates one more Pod to match. (`scale` is imperative — equivalent to patching the field directly.)"
+ ),
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(dep_b3):
+ step_cell(
+ "3. Wait for Rollout",
+ f"kubectl -n {LAB_NAMESPACE} rollout status deployment/{DEPLOYMENT_NAME} --timeout=120s",
+ dep_b3,
+ dep_b3.value > 0,
+ explanation="Block until the Deployment reports that all desired Pods are ready.",
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(dep_b4):
+ reconcile_cmd = (
+ f"echo '=== pods BEFORE delete ==='; "
+ f"kubectl -n {LAB_NAMESPACE} get pods -l app={DEPLOYMENT_NAME} -o wide; "
+ f"POD=$(kubectl -n {LAB_NAMESPACE} get pods -l app={DEPLOYMENT_NAME} "
+ f"-o jsonpath='{{.items[0].metadata.name}}'); "
+ f'echo; echo "=== deleting pod: $POD ==="; '
+ f'kubectl -n {LAB_NAMESPACE} delete pod "$POD" --wait=false; '
+ f"echo; echo '=== waiting 6s for the ReplicaSet controller to react ==='; sleep 6; "
+ f"echo; echo '=== pods AFTER delete ==='; "
+ f"kubectl -n {LAB_NAMESPACE} get pods -l app={DEPLOYMENT_NAME} -o wide"
+ )
+ step_cell(
+ "4. Watch Self-Healing (delete a pod, observe replacement)",
+ reconcile_cmd,
+ dep_b4,
+ dep_b4.value > 0,
+ explanation=(
+ "**This is the core idea of Kubernetes.** You deleted a Pod — but the Deployment's "
+ "ReplicaSet still wants 2 Pods running. The controller detects the mismatch and creates a new Pod "
+ "with a fresh name. Compare the `NAME` column before and after: one of the names will be new."
+ ),
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(dep_b5):
+ step_cell(
+ "5. Update Deployment Image",
+ f"kubectl -n {LAB_NAMESPACE} set image deployment/{DEPLOYMENT_NAME} '*=nginx:1.27.5'",
+ dep_b5,
+ dep_b5.value > 0,
+ explanation=(
+ "Changes the image in `spec.template.spec.containers[*]`. The Deployment then creates a new ReplicaSet "
+ "for the new image and scales the old one down — a rolling update, again driven by the controller."
+ ),
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(dep_b6):
+ step_cell(
+ "6. Wait for Updated Rollout",
+ f"kubectl -n {LAB_NAMESPACE} rollout status deployment/{DEPLOYMENT_NAME} --timeout=120s",
+ dep_b6,
+ dep_b6.value > 0,
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(dep_b7):
+ step_cell(
+ "7. Read Deployment",
+ f"kubectl -n {LAB_NAMESPACE} get deployment {DEPLOYMENT_NAME}",
+ dep_b7,
+ dep_b7.value > 0,
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(dep_b8):
+ step_cell(
+ "8. Read Deployment Pods",
+ f"kubectl -n {LAB_NAMESPACE} get pods -l app={DEPLOYMENT_NAME}",
+ dep_b8,
+ dep_b8.value > 0,
+ )
+ return
+
+
+# ============================================================
+# Service Lab
+# ============================================================
+
+
+@app.cell(hide_code=True)
+def _(kubectl_ready: bool):
+ mo.stop(not kubectl_ready)
+ mo.md(
+ "---\n## Service Lab\n\n"
+ "A Service gives Pods a stable virtual IP and DNS name. Its `selector` is what wires it to Pods — "
+ "the endpoints object is regenerated whenever matching Pods come and go."
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(kubectl_ready: bool):
+ mo.stop(not kubectl_ready)
+ svc_form = mo.ui.code_editor(value=SERVICE_MANIFEST, language="yaml", min_height=220).form(
+ submit_button_label="Apply", bordered=False
+ )
+ svc_b2 = mo.ui.button(**RUN_BUTTON)
+ svc_b3 = mo.ui.button(**RUN_BUTTON)
+ svc_b4 = mo.ui.button(**RUN_BUTTON)
+ svc_b5 = mo.ui.button(**RUN_BUTTON)
+ return svc_form, svc_b2, svc_b3, svc_b4, svc_b5
+
+
+@app.cell(hide_code=True)
+def _(svc_form):
+ manifest_step_cell(
+ "1. Apply Service Manifest",
+ svc_form,
+ LAB_NAMESPACE,
+ explanation=(
+ f"`spec.selector: app: {DEPLOYMENT_NAME}` is the only link between this Service and the Deployment's Pods — "
+ "no foreign key, just label matching. Try changing the selector to a label no Pod has and click **Apply**; "
+ "the Endpoints in step 3 will go empty."
+ ),
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(svc_b2):
+ step_cell(
+ "2. Read Service",
+ f"kubectl -n {LAB_NAMESPACE} get service {SERVICE_NAME} -o wide",
+ svc_b2,
+ svc_b2.value > 0,
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(svc_b3):
+ step_cell(
+ "3. Read Endpoints",
+ f"kubectl -n {LAB_NAMESPACE} get endpoints {SERVICE_NAME}",
+ svc_b3,
+ svc_b3.value > 0,
+ explanation=(
+ "Endpoints are the actual Pod IPs the Service routes to. They were populated automatically "
+ "from the selector — and they update as Pods come and go (try deleting a Pod and re-running this)."
+ ),
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(svc_b4):
+ step_cell(
+ "4. Patch Service Session Affinity",
+ f"""kubectl -n {LAB_NAMESPACE} patch service {SERVICE_NAME} --type merge -p '{{"spec":{{"sessionAffinity":"ClientIP"}}}}'""",
+ svc_b4,
+ svc_b4.value > 0,
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(svc_b5):
+ step_cell(
+ "5. Read Service YAML",
+ f"kubectl -n {LAB_NAMESPACE} get service {SERVICE_NAME} -o yaml",
+ svc_b5,
+ svc_b5.value > 0,
+ )
+ return
+
+
+# ============================================================
+# Job Lab
+# ============================================================
+
+
+@app.cell(hide_code=True)
+def _(kubectl_ready: bool):
+ mo.stop(not kubectl_ready)
+ mo.md(
+ "---\n## Job Lab\n\n"
+ "A Job runs Pods to **completion** — the opposite of a Deployment, which restarts them forever. "
+ "When a Job's Pod exits 0, the Job is done and the Pod stays around for log inspection."
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(kubectl_ready: bool):
+ mo.stop(not kubectl_ready)
+ job_form = mo.ui.code_editor(value=JOB_MANIFEST, language="yaml", min_height=280).form(
+ submit_button_label="Apply", bordered=False
+ )
+ job_b1 = mo.ui.button(**RUN_BUTTON)
+ job_b3 = mo.ui.button(**RUN_BUTTON)
+ job_b4 = mo.ui.button(**RUN_BUTTON)
+ return job_form, job_b1, job_b3, job_b4
+
+
+@app.cell(hide_code=True)
+def _(job_b1):
+ step_cell(
+ "1. Delete Old Job (if any)",
+ f"kubectl -n {LAB_NAMESPACE} delete job {JOB_NAME} --ignore-not-found=true",
+ job_b1,
+ job_b1.value > 0,
+ explanation="Jobs are immutable once `spec.template` is set, so we delete any previous run before re-applying.",
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(job_form):
+ manifest_step_cell(
+ "2. Apply Job Manifest",
+ job_form,
+ LAB_NAMESPACE,
+ explanation=(
+ "`restartPolicy: Never` is required for Jobs (or `OnFailure`) — it tells the kubelet not to restart "
+ "a Pod that exits. `backoffLimit: 2` caps retries if the Pod fails."
+ ),
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(job_b3):
+ step_cell(
+ "3. Wait for Completion",
+ f"kubectl -n {LAB_NAMESPACE} wait --for=condition=complete --timeout=120s job/{JOB_NAME}",
+ job_b3,
+ job_b3.value > 0,
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(job_b4):
+ step_cell(
+ "4. Read Job Logs",
+ f"kubectl -n {LAB_NAMESPACE} logs job/{JOB_NAME}",
+ job_b4,
+ job_b4.value > 0,
+ explanation="The completed Pod is preserved so you can still read its logs after the Job is done.",
+ )
+ return
+
+
+# ============================================================
+# Cleanup
+# ============================================================
+
+
+@app.cell(hide_code=True)
+def _(kubectl_ready: bool):
+ mo.stop(not kubectl_ready)
+ mo.md("---\n## Cleanup\n\nDelete the lab namespace — every object inside it is cascaded away.")
+ return
+
+
+@app.cell(hide_code=True)
+def _(kubectl_ready: bool):
+ mo.stop(not kubectl_ready)
+ cln_b1 = mo.ui.button(**RUN_BUTTON)
+ cln_b2 = mo.ui.button(**RUN_BUTTON)
+ cln_b3 = mo.ui.button(**RUN_BUTTON)
+ return cln_b1, cln_b2, cln_b3
+
+
+@app.cell(hide_code=True)
+def _(cln_b1):
+ step_cell(
+ "1. List Remaining Namespace Resources",
+ f"kubectl -n {LAB_NAMESPACE} get all || true",
+ cln_b1,
+ cln_b1.value > 0,
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(cln_b2):
+ step_cell(
+ "2. Delete Lab Namespace",
+ f"kubectl delete namespace {LAB_NAMESPACE} --ignore-not-found=true",
+ cln_b2,
+ cln_b2.value > 0,
+ explanation="Deleting the namespace cascades to every object inside it — Pods, ReplicaSets, Services, Endpoints, ConfigMaps, Jobs.",
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(cln_b3):
+ step_cell(
+ "3. Verify Namespace Removal",
+ f"kubectl get namespace {LAB_NAMESPACE} || true",
+ cln_b3,
+ cln_b3.value > 0,
+ )
+ return
+
+
+if __name__ == "__main__":
+ app.run()
diff --git a/arena/notebooks/cks_kubernetes/configmap.yaml b/arena/notebooks/cks_kubernetes/configmap.yaml
new file mode 100644
index 0000000..1cb820e
--- /dev/null
+++ b/arena/notebooks/cks_kubernetes/configmap.yaml
@@ -0,0 +1,9 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: arena-k8s-config
+ labels:
+ app.kubernetes.io/part-of: arena-k8s-lab
+data:
+ MODE: learning
+ OWNER: arena
diff --git a/arena/notebooks/cks_kubernetes/deployment.yaml b/arena/notebooks/cks_kubernetes/deployment.yaml
new file mode 100644
index 0000000..5d71df4
--- /dev/null
+++ b/arena/notebooks/cks_kubernetes/deployment.yaml
@@ -0,0 +1,22 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: arena-k8s-web
+ labels:
+ app.kubernetes.io/part-of: arena-k8s-lab
+spec:
+ replicas: 1
+ selector:
+ matchLabels:
+ app: arena-k8s-web
+ template:
+ metadata:
+ labels:
+ app: arena-k8s-web
+ app.kubernetes.io/part-of: arena-k8s-lab
+ spec:
+ containers:
+ - name: web
+ image: nginx:1.27
+ ports:
+ - containerPort: 80
diff --git a/arena/notebooks/cks_kubernetes/job.yaml b/arena/notebooks/cks_kubernetes/job.yaml
new file mode 100644
index 0000000..4e7bb27
--- /dev/null
+++ b/arena/notebooks/cks_kubernetes/job.yaml
@@ -0,0 +1,18 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+ name: arena-k8s-job
+ labels:
+ app.kubernetes.io/part-of: arena-k8s-lab
+spec:
+ backoffLimit: 2
+ template:
+ metadata:
+ labels:
+ app.kubernetes.io/part-of: arena-k8s-lab
+ spec:
+ restartPolicy: Never
+ containers:
+ - name: worker
+ image: busybox:1.36
+ command: ["/bin/sh", "-c", "echo hello-from-cks; sleep 3; echo completed"]
diff --git a/arena/notebooks/cks_kubernetes/service.yaml b/arena/notebooks/cks_kubernetes/service.yaml
new file mode 100644
index 0000000..7c27f7d
--- /dev/null
+++ b/arena/notebooks/cks_kubernetes/service.yaml
@@ -0,0 +1,13 @@
+apiVersion: v1
+kind: Service
+metadata:
+ name: arena-k8s-web
+ labels:
+ app.kubernetes.io/part-of: arena-k8s-lab
+spec:
+ type: ClusterIP
+ selector:
+ app: arena-k8s-web
+ ports:
+ - port: 80
+ targetPort: 80
diff --git a/arena/notebooks/lib/k8s.py b/arena/notebooks/lib/k8s.py
index a75e683..267bb72 100644
--- a/arena/notebooks/lib/k8s.py
+++ b/arena/notebooks/lib/k8s.py
@@ -7,9 +7,15 @@
from kubernetes.client.rest import ApiException
from ruamel.yaml import YAML
+from .remote_execution_helpers import shell
+
# We don't have a good way to get this off node labels currently
AVAILABILITY_ZONE = os.getenv("AVAILABILITY_ZONE", "A")
+SERVICE_ACCOUNT_TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token"
+SERVICE_ACCOUNT_CA_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
+KUBECTL_INCLUSTER_KUBECONFIG_PATH = "/tmp/arena-incluster-kubectl.yaml"
+
class KubernetesError(Exception):
"""Base exception for k8s helper errors."""
@@ -543,6 +549,85 @@ def delete_by_label(self, namespace: str, label_selector: str) -> dict[str, list
return results
+def _write_incluster_kubeconfig(kubeconfig_path: str) -> None:
+ """Write a kubeconfig that references the mounted service-account token via `tokenFile:`.
+
+ Using `tokenFile:` (rather than inlining `token:`) keeps the bearer token in the
+ kernel-protected `/var/run/secrets/...` mount and picks up token rotation automatically.
+
+ Assumes `config.load_incluster_config()` has already validated the mount exists.
+ """
+ host = os.environ["KUBERNETES_SERVICE_HOST"]
+ port = os.getenv("KUBERNETES_SERVICE_PORT_HTTPS") or os.getenv("KUBERNETES_SERVICE_PORT", "443")
+ kubeconfig = f"""apiVersion: v1
+kind: Config
+clusters:
+- name: in-cluster
+ cluster:
+ server: https://{host}:{port}
+ certificate-authority: {SERVICE_ACCOUNT_CA_PATH}
+users:
+- name: service-account
+ user:
+ tokenFile: {SERVICE_ACCOUNT_TOKEN_PATH}
+contexts:
+- name: in-cluster
+ context:
+ cluster: in-cluster
+ user: service-account
+current-context: in-cluster
+"""
+ with open(kubeconfig_path, "w", encoding="utf-8") as file:
+ file.write(kubeconfig)
+ os.chmod(kubeconfig_path, 0o600)
+
+
+def ensure_kubectl_access() -> tuple[bool, str]:
+ """Make sure `kubectl` has a usable context, without disturbing the user's existing KUBECONFIG.
+
+ Order of preference:
+ 1. An existing `kubectl config current-context` (local or already-configured) — leave env alone.
+ 2. An in-cluster service-account identity — write a bridge kubeconfig at
+ `KUBECTL_INCLUSTER_KUBECONFIG_PATH` and set `KUBECONFIG` so kubectl picks it up.
+ """
+ if not shell("kubectl version --client", quiet=True).ok:
+ return (
+ False,
+ "`kubectl` is not available. Install it in this environment and rerun all cells.",
+ )
+
+ local_context = shell("kubectl config current-context", quiet=True)
+ cluster_probe = None
+ if local_context.ok:
+ cluster_probe = shell("kubectl cluster-info", quiet=True, timeout=15)
+ if cluster_probe.ok:
+ # Non-fatal Python-side connectivity probe — kubectl will still work even if this fails.
+ try:
+ K8s().validate_config()
+ except KubernetesConfigError:
+ pass
+ return True, f"Using local kubectl context `{local_context.strip()}`."
+
+ # No local kubectl context — try in-cluster. `load_incluster_config()` is the canonical
+ # detection: it succeeds only when the SA token + CA + service env vars are all in place.
+ try:
+ config.load_incluster_config()
+ except config.ConfigException as e:
+ details = (
+ cluster_probe.stderr.strip() if cluster_probe is not None else local_context.stderr.strip()
+ ) or str(e)
+ return (
+ False,
+ "kubectl is installed but no cluster context is usable. "
+ "If running locally, set a context with `kubectl config use-context `. "
+ f"Details: `{details}`",
+ )
+
+ _write_incluster_kubeconfig(KUBECTL_INCLUSTER_KUBECONFIG_PATH)
+ os.environ["KUBECONFIG"] = KUBECTL_INCLUSTER_KUBECONFIG_PATH
+ return True, f"Using in-cluster service-account context via `{KUBECTL_INCLUSTER_KUBECONFIG_PATH}`."
+
+
def kubeconfig_input() -> tuple[mo.Html | None, mo.ui.form | None]:
"""Create a form for a user to input their kubeconfig path.
diff --git a/arena/notebooks/lib/remote_execution_helpers.py b/arena/notebooks/lib/remote_execution_helpers.py
index 3147dc0..253fc8f 100644
--- a/arena/notebooks/lib/remote_execution_helpers.py
+++ b/arena/notebooks/lib/remote_execution_helpers.py
@@ -205,9 +205,54 @@ def ssh(command: str, verbose: bool = True, stream: bool = False) -> str:
return result.stdout
-def shell(command: str | list, quiet: bool = False, check: bool = False, stream: bool = False) -> str: # noqa: C901
+class ShellResult(str):
+ """String-like result from shell()/bash() that also carries structured fields.
+
+ Subclasses str so existing callers that treat the return value as a string
+ keep working. New callers can read .returncode, .stderr, .ok, .command for
+ structured access.
+ """
+
+ returncode: int
+ stderr: str
+ command: str
+ ok: bool
+
+ def __new__(
+ cls,
+ stdout: str,
+ *,
+ returncode: int = 0,
+ stderr: str = "",
+ command: str = "",
+ ok: bool = True,
+ ) -> "ShellResult":
+ """Create a string result with subprocess metadata attached."""
+ instance = super().__new__(cls, stdout)
+ instance.returncode = returncode
+ instance.stderr = stderr
+ instance.command = command
+ instance.ok = ok
+ return instance
+
+ def __bool__(self) -> bool:
+ """True iff the command exited successfully (returncode 0)."""
+ return self.ok
+
+
+def shell( # noqa: C901
+ command: str | list,
+ quiet: bool = False,
+ check: bool = False,
+ stream: bool = False,
+ timeout: int | None = None,
+ input: str | None = None,
+) -> ShellResult:
"""Run a local shell command with clean output.
+ Returns a ShellResult — a str subclass equal to the captured stdout, with
+ .returncode, .stderr, .ok, .command for callers that need structured access.
+
Args:
command: Command string or list of arguments
quiet: If True, suppress status messages (only show command output)
@@ -215,15 +260,18 @@ def shell(command: str | list, quiet: bool = False, check: bool = False, stream:
stream: If True, stream stdout/stderr in real-time (for long-running commands).
Note: In Marimo notebooks, streaming may still buffer until
cell completion due to Marimo's output capture mechanism.
-
- Returns:
- The stdout output as a string (empty if streaming)
+ timeout: Optional timeout in seconds (non-stream mode only). On timeout
+ the result has returncode=124 and ok=False.
+ input: Optional string piped to the command's stdin (non-stream mode only).
Example:
- shell("aws s3 ls")
- shell("aws configure set s3.addressing_style virtual")
+ result = shell("aws s3 ls")
+ if not result.ok:
+ print(result.stderr)
+
shell(["kubectl", "get", "pods"])
- shell("s5cmd cp ...", stream=True) # Stream output in real-time
+ shell("kubectl apply -f -", input=manifest_yaml)
+ shell("s5cmd cp ...", stream=True)
"""
if isinstance(command, str):
cmd_display = command
@@ -261,44 +309,92 @@ def shell(command: str | list, quiet: bool = False, check: bool = False, stream:
output_lines.append(line)
proc.wait()
+ stdout = "".join(output_lines)
if proc.returncode == 0:
if not quiet:
sys.stdout.write("✓ Done (exit: 0)\n")
sys.stdout.flush()
else:
- sys.stdout.write(f"✗ Failed (exit: {proc.returncode})\n")
- sys.stdout.flush()
+ if not quiet:
+ sys.stdout.write(f"✗ Failed (exit: {proc.returncode})\n")
+ sys.stdout.flush()
if check:
raise subprocess.CalledProcessError(proc.returncode, command)
- return "".join(output_lines)
+ return ShellResult(
+ stdout,
+ returncode=proc.returncode,
+ stderr="",
+ command=cmd_display,
+ ok=proc.returncode == 0,
+ )
- else:
- # Capture output (original behavior)
+ try:
result = subprocess.run(
command,
shell=shell_mode,
capture_output=True,
text=True,
+ timeout=timeout,
+ input=input,
)
-
- if result.returncode == 0:
- if not quiet:
- print(f"✓ {cmd_display}")
- if result.stdout.strip():
- print(result.stdout.rstrip())
- return result.stdout
- else:
+ except subprocess.TimeoutExpired as err:
+ out = err.stdout or ""
+ err_text = err.stderr or ""
+ if isinstance(out, bytes):
+ out = out.decode("utf-8", errors="replace")
+ if isinstance(err_text, bytes):
+ err_text = err_text.decode("utf-8", errors="replace")
+ stderr_msg = f"Command timed out after {timeout}s. {err_text.strip()}".strip()
+ if not quiet:
print(f"✗ {cmd_display}")
- if result.stderr.strip():
- print(f" Error: {result.stderr.strip()}")
- if check:
- raise subprocess.CalledProcessError(result.returncode, command)
- return ""
+ print(f" Error: {stderr_msg}")
+ if check:
+ raise
+ return ShellResult(
+ out,
+ returncode=124,
+ stderr=stderr_msg,
+ command=cmd_display,
+ ok=False,
+ )
+ if result.returncode == 0:
+ if not quiet:
+ print(f"✓ {cmd_display}")
+ if result.stdout.strip():
+ print(result.stdout.rstrip())
+ return ShellResult(
+ result.stdout,
+ returncode=0,
+ stderr=result.stderr,
+ command=cmd_display,
+ ok=True,
+ )
+
+ if not quiet:
+ print(f"✗ {cmd_display}")
+ if result.stderr.strip():
+ print(f" Error: {result.stderr.strip()}")
+ if check:
+ raise subprocess.CalledProcessError(result.returncode, command)
+ return ShellResult(
+ result.stdout,
+ returncode=result.returncode,
+ stderr=result.stderr,
+ command=cmd_display,
+ ok=False,
+ )
-def bash(command: str, quiet: bool = False, stream: bool = False) -> str:
+
+def bash(
+ command: str,
+ quiet: bool = False,
+ stream: bool = False,
+ timeout: int | None = None,
+ input: str | None = None,
+) -> ShellResult:
"""Alias for shell() - run a bash command.
Example:
@@ -306,7 +402,7 @@ def bash(command: str, quiet: bool = False, stream: bool = False) -> str:
bash("aws s3 ls")
bash("s5cmd cp ...", stream=True)
"""
- return shell(command, quiet=quiet, stream=stream)
+ return shell(command, quiet=quiet, stream=stream, timeout=timeout, input=input)
if __name__ == "__main__":