From 7bf5dc388e720d55cfd06e85b8663e04675bc52c Mon Sep 17 00:00:00 2001
From: David Flanagan <dflanagan@coreweave.com>
Date: Wed, 8 Apr 2026 15:40:45 +0100
Subject: [PATCH 1/4] feat(arena): introduction to kubernetes notebook

---
 arena/notebooks/cks_kubernetes.py             | 850 ++++++++++++++++++
 arena/notebooks/cks_kubernetes/configmap.yaml |   9 +
 .../notebooks/cks_kubernetes/deployment.yaml  |  22 +
 arena/notebooks/cks_kubernetes/job.yaml       |  18 +
 arena/notebooks/cks_kubernetes/service.yaml   |  13 +
 5 files changed, 912 insertions(+)
 create mode 100644 arena/notebooks/cks_kubernetes.py
 create mode 100644 arena/notebooks/cks_kubernetes/configmap.yaml
 create mode 100644 arena/notebooks/cks_kubernetes/deployment.yaml
 create mode 100644 arena/notebooks/cks_kubernetes/job.yaml
 create mode 100644 arena/notebooks/cks_kubernetes/service.yaml

diff --git a/arena/notebooks/cks_kubernetes.py b/arena/notebooks/cks_kubernetes.py
new file mode 100644
index 0000000..69d40a0
--- /dev/null
+++ b/arena/notebooks/cks_kubernetes.py
@@ -0,0 +1,850 @@
+# /// script
+# requires-python = ">=3.12"
+# dependencies = [
+#     "kubernetes==35.0.0",
+#     "marimo>=0.23.6",
+#     "ruamel-yaml>=0.19.1",
+# ]
+# ///
+
+import marimo
+
+__generated_with = "0.23.6"
+app = marimo.App(width="medium", app_title="CoreWeave ARENA")
+
+with app.setup:
+    import os
+    import subprocess
+
+    import marimo as mo
+    from lib.k8s import K8s, KubernetesConfigError
+    from lib.ui import about, banner, security_disclaimer, table_of_contents
+
+    MANIFEST_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "cks_kubernetes")
+
+    LAB_NAMESPACE = "arena-k8s-lab"
+    LAB_LABEL_KEY = "app.kubernetes.io/part-of"
+    LAB_LABEL_VALUE = "arena-k8s-lab"
+
+    CONFIGMAP_NAME = "arena-k8s-config"
+    DEPLOYMENT_NAME = "arena-k8s-web"
+    SERVICE_NAME = "arena-k8s-web"
+    JOB_NAME = "arena-k8s-job"
+
+    SERVICE_ACCOUNT_TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token"
+    SERVICE_ACCOUNT_CA_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
+    KUBECTL_KUBECONFIG_PATH = "/tmp/arena-incluster-kubectl.yaml"
+
+    def truncate(text: str, max_chars: int = 4000) -> str:
+        if len(text) <= max_chars:
+            return text
+        return f"{text[:max_chars]}\n...output truncated..."
+
+    def run_command(command: str, timeout_seconds: int = 180) -> dict:
+        """Run a shell command and return structured result."""
+        try:
+            result = subprocess.run(command, shell=True, capture_output=True, text=True, timeout=timeout_seconds)
+            return {
+                "command": command,
+                "returncode": result.returncode,
+                "stdout": result.stdout.strip(),
+                "stderr": result.stderr.strip(),
+                "ok": result.returncode == 0,
+            }
+        except subprocess.TimeoutExpired as err:
+            stdout = str(err.stdout or "").strip() if err.stdout else ""
+            stderr = str(err.stderr or "").strip() if err.stderr else ""
+            return {
+                "command": command,
+                "returncode": 124,
+                "stdout": stdout,
+                "stderr": f"Command timed out after {timeout_seconds}s. {stderr}".strip(),
+                "ok": False,
+            }
+
+    def apply_manifest(manifest: str, namespace: str | None = None, timeout_seconds: int = 180) -> dict:
+        """Apply a YAML manifest by piping it to `kubectl apply -f -` via stdin."""
+        ns_flag = f"-n {namespace} " if namespace else ""
+        command = f"kubectl {ns_flag}apply -f -"
+        try:
+            result = subprocess.run(
+                command,
+                shell=True,
+                input=manifest,
+                capture_output=True,
+                text=True,
+                timeout=timeout_seconds,
+            )
+            return {
+                "command": f"{command}   # manifest piped via stdin",
+                "returncode": result.returncode,
+                "stdout": result.stdout.strip(),
+                "stderr": result.stderr.strip(),
+                "ok": result.returncode == 0,
+            }
+        except subprocess.TimeoutExpired:
+            return {
+                "command": command,
+                "returncode": 124,
+                "stdout": "",
+                "stderr": f"Apply timed out after {timeout_seconds}s.",
+                "ok": False,
+            }
+
+    def write_incluster_kubeconfig(kubeconfig_path: str) -> tuple[bool, str]:
+        """Write a kubeconfig that references the mounted service-account token via `tokenFile:`.
+
+        Using `tokenFile:` (rather than inlining `token:`) keeps the bearer token in the
+        kernel-protected `/var/run/secrets/...` mount and picks up token rotation automatically.
+        """
+        host = os.getenv("KUBERNETES_SERVICE_HOST", "")
+        port = os.getenv("KUBERNETES_SERVICE_PORT_HTTPS", "") or os.getenv("KUBERNETES_SERVICE_PORT", "443")
+
+        if not host:
+            return False, "KUBERNETES_SERVICE_HOST is not set."
+        if not os.path.exists(SERVICE_ACCOUNT_TOKEN_PATH):
+            return False, f"Service account token not found at {SERVICE_ACCOUNT_TOKEN_PATH}."
+        if not os.path.exists(SERVICE_ACCOUNT_CA_PATH):
+            return False, f"Service account CA cert not found at {SERVICE_ACCOUNT_CA_PATH}."
+
+        kubeconfig = f"""apiVersion: v1
+kind: Config
+clusters:
+- name: in-cluster
+  cluster:
+    server: https://{host}:{port}
+    certificate-authority: {SERVICE_ACCOUNT_CA_PATH}
+users:
+- name: service-account
+  user:
+    tokenFile: {SERVICE_ACCOUNT_TOKEN_PATH}
+contexts:
+- name: in-cluster
+  context:
+    cluster: in-cluster
+    user: service-account
+current-context: in-cluster
+"""
+        with open(kubeconfig_path, "w", encoding="utf-8") as file:
+            file.write(kubeconfig)
+        return True, kubeconfig_path
+
+    def ensure_kubectl_access() -> tuple[bool, str]:
+        """Make sure kubectl has a usable context, without disturbing the user's existing KUBECONFIG.
+
+        Order of preference:
+          1. An existing `kubectl config current-context` (local or already-configured) — leave env alone.
+          2. Generated in-cluster kubeconfig at `KUBECTL_KUBECONFIG_PATH` — only then set `KUBECONFIG`.
+        """
+        kubectl_check = run_command("kubectl version --client")
+        if not bool(kubectl_check["ok"]):
+            return (
+                False,
+                "`kubectl` is not available. Install it in this environment and rerun all cells.",
+            )
+
+        local_context = run_command("kubectl config current-context")
+        if bool(local_context["ok"]):
+            # Probe Python-side connectivity via the shared K8s helper. Failure is non-fatal — the
+            # lab's shell-kubectl commands will still work.
+            try:
+                K8s().validate_config()
+            except KubernetesConfigError:
+                pass
+            return True, f"Using local kubectl context `{local_context['stdout']}`."
+
+        incluster_ok, incluster_message = write_incluster_kubeconfig(KUBECTL_KUBECONFIG_PATH)
+        if incluster_ok:
+            os.environ["KUBECONFIG"] = KUBECTL_KUBECONFIG_PATH
+            return True, f"Using in-cluster service-account context via `{KUBECTL_KUBECONFIG_PATH}`."
+
+        details = str(local_context["stderr"]) if local_context["stderr"] else incluster_message
+        return (
+            False,
+            "kubectl is installed but no cluster context is usable. "
+            "If running locally, set a context with `kubectl config use-context <name>`. "
+            f"Details: `{details}`",
+        )
+
+    def step_cell(title: str, cmd: str, btn, clicked: bool, explanation: str = "") -> mo.Html:
+        """Render an imperative step: optional explanation, title + button, command, and result if clicked."""
+        _output = None
+        if clicked:
+            _r = run_command(cmd)
+            _status = "SUCCESS" if _r["ok"] else "FAILED"
+            _stdout = truncate(str(_r["stdout"])) if _r["stdout"] else "(no output)"
+            _stderr_block = ""
+            if _r["stderr"] and not _r["ok"]:
+                _stderr_block = f"\n**stderr:**\n```\n{truncate(str(_r['stderr']))}\n```"
+            _output = mo.md(f"`{_status}`\n```\n{_stdout}\n```{_stderr_block}")
+        parts = [
+            mo.hstack([mo.md(f"**{title}**"), btn], justify="space-between", align="center"),
+        ]
+        if explanation:
+            parts.append(mo.md(explanation))
+        parts.append(mo.md(f"```\n{cmd}\n```"))
+        if _output is not None:
+            parts.append(_output)
+        return mo.vstack(parts)
+
+    def manifest_step_cell(
+        title: str,
+        editor,
+        namespace: str | None,
+        btn,
+        clicked: bool,
+        explanation: str = "",
+    ) -> mo.Html:
+        """Render a declarative step: title + button, optional explanation, editable YAML, result on click.
+
+        `editor` is a `mo.ui.code_editor` whose current `.value` is applied to the cluster
+        when the Run button has been clicked. Editing the YAML and re-clicking re-applies.
+        """
+        _output = None
+        if clicked:
+            _r = apply_manifest(editor.value, namespace)
+            _status = "SUCCESS" if _r["ok"] else "FAILED"
+            _stdout = truncate(str(_r["stdout"])) if _r["stdout"] else "(no output)"
+            _stderr_block = ""
+            if _r["stderr"] and not _r["ok"]:
+                _stderr_block = f"\n**stderr:**\n```\n{truncate(str(_r['stderr']))}\n```"
+            _output = mo.md(f"`{_status}`\n```\n{_stdout}\n```{_stderr_block}")
+        ns_label = f"  *(namespace: `{namespace}`)*" if namespace else ""
+        parts = [
+            mo.hstack(
+                [mo.md(f"**{title}**{ns_label}"), btn],
+                justify="space-between",
+                align="center",
+            ),
+        ]
+        if explanation:
+            parts.append(mo.md(explanation))
+        parts.append(editor)
+        parts.append(mo.md("_Edit the manifest above if you like, then click **Run** to_ `kubectl apply -f -` _it._"))
+        if _output is not None:
+            parts.append(_output)
+        return mo.vstack(parts)
+
+    def load_manifest(filename: str) -> str:
+        """Load a YAML manifest from the `cks_kubernetes/` directory next to this notebook."""
+        with open(os.path.join(MANIFEST_DIR, filename), "r", encoding="utf-8") as file:
+            return file.read()
+
+    RUN_BUTTON = dict(value=0, on_click=lambda v: v + 1, label="Run")
+
+    # ----------------------------------------------------------------
+    # Manifests (the source of truth for each lab object).
+    # Stored as real YAML files in `cks_kubernetes/` so they're easy to edit locally.
+    # ----------------------------------------------------------------
+
+    CONFIGMAP_MANIFEST = load_manifest("configmap.yaml")
+    DEPLOYMENT_MANIFEST = load_manifest("deployment.yaml")
+    SERVICE_MANIFEST = load_manifest("service.yaml")
+    JOB_MANIFEST = load_manifest("job.yaml")
+
+
+# ============================================================
+# Header
+# ============================================================
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.vstack(
+        [
+            banner(),
+            about(
+                "Kubernetes",
+                """This notebook is a hands-on introduction to Kubernetes on CoreWeave Kubernetes Service (CKS).<br>
+               Each section presents the actual YAML manifest, applies it declaratively, and then observes what the controllers do.<br>
+               _If you are running this notebook in edit mode, start by running all cells in the bottom right._
+            """,
+            ),
+            table_of_contents(
+                [
+                    {"title": "Cluster Access", "description": "Verify kubectl connectivity."},
+                    {"title": "Environment and Namespace Setup", "description": "Create the lab namespace."},
+                    {"title": "ConfigMap Lab", "description": "Apply, read, patch, and delete a ConfigMap."},
+                    {"title": "Deployment Lab", "description": "Apply, scale, watch self-healing, and roll an update."},
+                    {"title": "Service Lab", "description": "Expose a deployment and inspect endpoints."},
+                    {"title": "Job Lab", "description": "Run a batch job to completion and inspect logs."},
+                    {"title": "Cleanup", "description": "Delete lab resources and namespace."},
+                ]
+            ),
+            security_disclaimer(),
+        ]
+    )
+    return
+
+
+@app.cell(hide_code=True)
+def _():
+    kubectl_ready, access_message = ensure_kubectl_access()
+    mo.md(f"""
+---
+## Cluster Access
+
+**kubectl status:** `{"READY" if kubectl_ready else "NOT READY"}`
+
+{access_message}
+""")
+    return (kubectl_ready,)
+
+
+# ============================================================
+# Environment and Namespace Setup
+# ============================================================
+
+
+@app.cell(hide_code=True)
+def _(kubectl_ready: bool):
+    mo.stop(not kubectl_ready)
+    mo.md(
+        "---\n## Environment and Namespace Setup\n\n"
+        "Create and verify a dedicated lab namespace. Every object below will live inside it, "
+        "so deleting the namespace at the end cleans everything up at once."
+    )
+    return
+
+
+@app.cell(hide_code=True)
+def _(kubectl_ready: bool):
+    mo.stop(not kubectl_ready)
+    env_b1 = mo.ui.button(**RUN_BUTTON)
+    env_b2 = mo.ui.button(**RUN_BUTTON)
+    env_b3 = mo.ui.button(**RUN_BUTTON)
+    env_b4 = mo.ui.button(**RUN_BUTTON)
+    return env_b1, env_b2, env_b3, env_b4
+
+
+@app.cell(hide_code=True)
+def _(env_b1):
+    step_cell(
+        "1. Check API Server",
+        "kubectl cluster-info",
+        env_b1,
+        env_b1.value > 0,
+        explanation=(
+            "`cluster-info` hits the Kubernetes API server and shows its address. "
+            "The API server is the only component you talk to directly — everything else (controllers, kubelet, scheduler) "
+            "reads and writes through it."
+        ),
+    )
+    return
+
+
+@app.cell(hide_code=True)
+def _(env_b2):
+    step_cell(
+        "2. List Namespaces",
+        "kubectl get namespace",
+        env_b2,
+        env_b2.value > 0,
+        explanation="Namespaces partition cluster objects. You'll see system namespaces (`kube-system`, `kube-public`) alongside any others.",
+    )
+    return
+
+
+@app.cell(hide_code=True)
+def _(env_b3):
+    step_cell(
+        "3. Create Lab Namespace",
+        f"kubectl create namespace {LAB_NAMESPACE} --dry-run=client -o yaml | kubectl apply -f -",
+        env_b3,
+        env_b3.value > 0,
+        explanation=(
+            f"Idempotently create `{LAB_NAMESPACE}`. The `--dry-run=client -o yaml | apply` pattern produces a manifest "
+            "client-side and applies it — re-running won't error if the namespace already exists."
+        ),
+    )
+    return
+
+
+@app.cell(hide_code=True)
+def _(env_b4):
+    step_cell(
+        "4. Verify Namespace Labels",
+        f"kubectl get namespace {LAB_NAMESPACE} --show-labels",
+        env_b4,
+        env_b4.value > 0,
+    )
+    return
+
+
+# ============================================================
+# ConfigMap Lab
+# ============================================================
+
+
+@app.cell(hide_code=True)
+def _(kubectl_ready: bool):
+    mo.stop(not kubectl_ready)
+    mo.md(
+        "---\n## ConfigMap Lab\n\n"
+        "A ConfigMap stores non-secret key/value config. Below is the actual manifest — "
+        "the API object Kubernetes stores. `kubectl create configmap ...` is just a shortcut "
+        "that builds this YAML for you."
+    )
+    return
+
+
+@app.cell(hide_code=True)
+def _(kubectl_ready: bool):
+    mo.stop(not kubectl_ready)
+    cm_editor = mo.ui.code_editor(value=CONFIGMAP_MANIFEST, language="yaml", min_height=200, debounce=True)
+    cm_b1 = mo.ui.button(**RUN_BUTTON)
+    cm_b2 = mo.ui.button(**RUN_BUTTON)
+    cm_b3 = mo.ui.button(**RUN_BUTTON)
+    cm_b4 = mo.ui.button(**RUN_BUTTON)
+    cm_b5 = mo.ui.button(**RUN_BUTTON)
+    return cm_editor, cm_b1, cm_b2, cm_b3, cm_b4, cm_b5
+
+
+@app.cell(hide_code=True)
+def _(cm_editor, cm_b1):
+    manifest_step_cell(
+        "1. Apply ConfigMap Manifest",
+        cm_editor,
+        LAB_NAMESPACE,
+        cm_b1,
+        cm_b1.value > 0,
+        explanation=(
+            "Two top-level fields matter: `metadata` (identity — name, namespace, labels) "
+            "and `data` (the key/value payload). Edit the YAML and re-click **Run** to see how "
+            "`kubectl apply` reconciles your change."
+        ),
+    )
+    return
+
+
+@app.cell(hide_code=True)
+def _(cm_b2):
+    step_cell(
+        "2. Read ConfigMap",
+        f"kubectl -n {LAB_NAMESPACE} get configmap {CONFIGMAP_NAME} -o yaml",
+        cm_b2,
+        cm_b2.value > 0,
+        explanation=(
+            "Notice the server added fields you didn't write — `creationTimestamp`, `resourceVersion`, `uid`. "
+            "These are managed by the API server, not you."
+        ),
+    )
+    return
+
+
+@app.cell(hide_code=True)
+def _(cm_b3):
+    step_cell(
+        "3. Patch ConfigMap Data (imperative drift)",
+        f"""kubectl -n {LAB_NAMESPACE} patch configmap {CONFIGMAP_NAME} --type merge -p '{{"data":{{"MODE":"advanced","TOPIC":"crud"}}}}'""",
+        cm_b3,
+        cm_b3.value > 0,
+        explanation=(
+            "`patch` mutates the live object in-place. After this, the cluster state diverges from the YAML in step 1 — "
+            "re-applying the manifest would revert `MODE` and remove `TOPIC`. This is the declarative-vs-imperative tension."
+        ),
+    )
+    return
+
+
+@app.cell(hide_code=True)
+def _(cm_b4):
+    step_cell(
+        "4. Verify Patched ConfigMap",
+        f"kubectl -n {LAB_NAMESPACE} get configmap {CONFIGMAP_NAME} -o yaml",
+        cm_b4,
+        cm_b4.value > 0,
+    )
+    return
+
+
+@app.cell(hide_code=True)
+def _(cm_b5):
+    step_cell(
+        "5. Delete ConfigMap",
+        f"kubectl -n {LAB_NAMESPACE} delete configmap {CONFIGMAP_NAME} --ignore-not-found=true",
+        cm_b5,
+        cm_b5.value > 0,
+    )
+    return
+
+
+# ============================================================
+# Deployment Lab
+# ============================================================
+
+
+@app.cell(hide_code=True)
+def _(kubectl_ready: bool):
+    mo.stop(not kubectl_ready)
+    mo.md(
+        "---\n## Deployment Lab\n\n"
+        'A Deployment is a controller. You give it a desired state — _"I want N pods running this image"_ — '
+        "and it creates a ReplicaSet, which in turn creates Pods. The controller continuously reconciles "
+        "actual cluster state toward the spec. We'll watch that happen explicitly in step 4."
+    )
+    return
+
+
+@app.cell(hide_code=True)
+def _(kubectl_ready: bool):
+    mo.stop(not kubectl_ready)
+    dep_editor = mo.ui.code_editor(value=DEPLOYMENT_MANIFEST, language="yaml", min_height=320, debounce=True)
+    dep_b1 = mo.ui.button(**RUN_BUTTON)
+    dep_b2 = mo.ui.button(**RUN_BUTTON)
+    dep_b3 = mo.ui.button(**RUN_BUTTON)
+    dep_b4 = mo.ui.button(**RUN_BUTTON)
+    dep_b5 = mo.ui.button(**RUN_BUTTON)
+    dep_b6 = mo.ui.button(**RUN_BUTTON)
+    dep_b7 = mo.ui.button(**RUN_BUTTON)
+    dep_b8 = mo.ui.button(**RUN_BUTTON)
+    return dep_editor, dep_b1, dep_b2, dep_b3, dep_b4, dep_b5, dep_b6, dep_b7, dep_b8
+
+
+@app.cell(hide_code=True)
+def _(dep_editor, dep_b1):
+    manifest_step_cell(
+        "1. Apply Deployment Manifest",
+        dep_editor,
+        LAB_NAMESPACE,
+        dep_b1,
+        dep_b1.value > 0,
+        explanation=(
+            "Three nested layers: the Deployment's `selector.matchLabels` finds Pods whose labels match. "
+            "`template.metadata.labels` *stamps* those labels onto every Pod it creates. "
+            "If the selector and template labels disagree, the Deployment refuses to start — try it."
+        ),
+    )
+    return
+
+
+@app.cell(hide_code=True)
+def _(dep_b2):
+    step_cell(
+        "2. Scale to 2 Replicas",
+        f"kubectl -n {LAB_NAMESPACE} scale deployment {DEPLOYMENT_NAME} --replicas=2",
+        dep_b2,
+        dep_b2.value > 0,
+        explanation=(
+            "Scaling changes the `spec.replicas` field on the Deployment. The controller notices the new desired count "
+            "and creates one more Pod to match. (`scale` is imperative — equivalent to patching the field directly.)"
+        ),
+    )
+    return
+
+
+@app.cell(hide_code=True)
+def _(dep_b3):
+    step_cell(
+        "3. Wait for Rollout",
+        f"kubectl -n {LAB_NAMESPACE} rollout status deployment/{DEPLOYMENT_NAME} --timeout=120s",
+        dep_b3,
+        dep_b3.value > 0,
+        explanation="Block until the Deployment reports that all desired Pods are ready.",
+    )
+    return
+
+
+@app.cell(hide_code=True)
+def _(dep_b4):
+    reconcile_cmd = (
+        f"echo '=== pods BEFORE delete ==='; "
+        f"kubectl -n {LAB_NAMESPACE} get pods -l app={DEPLOYMENT_NAME} -o wide; "
+        f"POD=$(kubectl -n {LAB_NAMESPACE} get pods -l app={DEPLOYMENT_NAME} "
+        f"-o jsonpath='{{.items[0].metadata.name}}'); "
+        f'echo; echo "=== deleting pod: $POD ==="; '
+        f'kubectl -n {LAB_NAMESPACE} delete pod "$POD" --wait=false; '
+        f"echo; echo '=== waiting 6s for the ReplicaSet controller to react ==='; sleep 6; "
+        f"echo; echo '=== pods AFTER delete ==='; "
+        f"kubectl -n {LAB_NAMESPACE} get pods -l app={DEPLOYMENT_NAME} -o wide"
+    )
+    step_cell(
+        "4. Watch Self-Healing (delete a pod, observe replacement)",
+        reconcile_cmd,
+        dep_b4,
+        dep_b4.value > 0,
+        explanation=(
+            "**This is the core idea of Kubernetes.** You deleted a Pod — but the Deployment's "
+            "ReplicaSet still wants 2 Pods running. The controller detects the mismatch and creates a new Pod "
+            "with a fresh name. Compare the `NAME` column before and after: one of the names will be new."
+        ),
+    )
+    return
+
+
+@app.cell(hide_code=True)
+def _(dep_b5):
+    step_cell(
+        "5. Update Deployment Image",
+        f"kubectl -n {LAB_NAMESPACE} set image deployment/{DEPLOYMENT_NAME} '*=nginx:1.27.5'",
+        dep_b5,
+        dep_b5.value > 0,
+        explanation=(
+            "Changes the image in `spec.template.spec.containers[*]`. The Deployment then creates a new ReplicaSet "
+            "for the new image and scales the old one down — a rolling update, again driven by the controller."
+        ),
+    )
+    return
+
+
+@app.cell(hide_code=True)
+def _(dep_b6):
+    step_cell(
+        "6. Wait for Updated Rollout",
+        f"kubectl -n {LAB_NAMESPACE} rollout status deployment/{DEPLOYMENT_NAME} --timeout=120s",
+        dep_b6,
+        dep_b6.value > 0,
+    )
+    return
+
+
+@app.cell(hide_code=True)
+def _(dep_b7):
+    step_cell(
+        "7. Read Deployment",
+        f"kubectl -n {LAB_NAMESPACE} get deployment {DEPLOYMENT_NAME}",
+        dep_b7,
+        dep_b7.value > 0,
+    )
+    return
+
+
+@app.cell(hide_code=True)
+def _(dep_b8):
+    step_cell(
+        "8. Read Deployment Pods",
+        f"kubectl -n {LAB_NAMESPACE} get pods -l app={DEPLOYMENT_NAME}",
+        dep_b8,
+        dep_b8.value > 0,
+    )
+    return
+
+
+# ============================================================
+# Service Lab
+# ============================================================
+
+
+@app.cell(hide_code=True)
+def _(kubectl_ready: bool):
+    mo.stop(not kubectl_ready)
+    mo.md(
+        "---\n## Service Lab\n\n"
+        "A Service gives Pods a stable virtual IP and DNS name. Its `selector` is what wires it to Pods — "
+        "the endpoints object is regenerated whenever matching Pods come and go."
+    )
+    return
+
+
+@app.cell(hide_code=True)
+def _(kubectl_ready: bool):
+    mo.stop(not kubectl_ready)
+    svc_editor = mo.ui.code_editor(value=SERVICE_MANIFEST, language="yaml", min_height=220, debounce=True)
+    svc_b1 = mo.ui.button(**RUN_BUTTON)
+    svc_b2 = mo.ui.button(**RUN_BUTTON)
+    svc_b3 = mo.ui.button(**RUN_BUTTON)
+    svc_b4 = mo.ui.button(**RUN_BUTTON)
+    svc_b5 = mo.ui.button(**RUN_BUTTON)
+    return svc_editor, svc_b1, svc_b2, svc_b3, svc_b4, svc_b5
+
+
+@app.cell(hide_code=True)
+def _(svc_editor, svc_b1):
+    manifest_step_cell(
+        "1. Apply Service Manifest",
+        svc_editor,
+        LAB_NAMESPACE,
+        svc_b1,
+        svc_b1.value > 0,
+        explanation=(
+            f"`spec.selector: app: {DEPLOYMENT_NAME}` is the only link between this Service and the Deployment's Pods — "
+            "no foreign key, just label matching. Try changing the selector to a label no Pod has and re-run; "
+            "the Endpoints in step 3 will go empty."
+        ),
+    )
+    return
+
+
+@app.cell(hide_code=True)
+def _(svc_b2):
+    step_cell(
+        "2. Read Service",
+        f"kubectl -n {LAB_NAMESPACE} get service {SERVICE_NAME} -o wide",
+        svc_b2,
+        svc_b2.value > 0,
+    )
+    return
+
+
+@app.cell(hide_code=True)
+def _(svc_b3):
+    step_cell(
+        "3. Read Endpoints",
+        f"kubectl -n {LAB_NAMESPACE} get endpoints {SERVICE_NAME}",
+        svc_b3,
+        svc_b3.value > 0,
+        explanation=(
+            "Endpoints are the actual Pod IPs the Service routes to. They were populated automatically "
+            "from the selector — and they update as Pods come and go (try deleting a Pod and re-running this)."
+        ),
+    )
+    return
+
+
+@app.cell(hide_code=True)
+def _(svc_b4):
+    step_cell(
+        "4. Patch Service Session Affinity",
+        f"""kubectl -n {LAB_NAMESPACE} patch service {SERVICE_NAME} --type merge -p '{{"spec":{{"sessionAffinity":"ClientIP"}}}}'""",
+        svc_b4,
+        svc_b4.value > 0,
+    )
+    return
+
+
+@app.cell(hide_code=True)
+def _(svc_b5):
+    step_cell(
+        "5. Read Service YAML",
+        f"kubectl -n {LAB_NAMESPACE} get service {SERVICE_NAME} -o yaml",
+        svc_b5,
+        svc_b5.value > 0,
+    )
+    return
+
+
+# ============================================================
+# Job Lab
+# ============================================================
+
+
+@app.cell(hide_code=True)
+def _(kubectl_ready: bool):
+    mo.stop(not kubectl_ready)
+    mo.md(
+        "---\n## Job Lab\n\n"
+        "A Job runs Pods to **completion** — the opposite of a Deployment, which restarts them forever. "
+        "When a Job's Pod exits 0, the Job is done and the Pod stays around for log inspection."
+    )
+    return
+
+
+@app.cell(hide_code=True)
+def _(kubectl_ready: bool):
+    mo.stop(not kubectl_ready)
+    job_editor = mo.ui.code_editor(value=JOB_MANIFEST, language="yaml", min_height=280, debounce=True)
+    job_b1 = mo.ui.button(**RUN_BUTTON)
+    job_b2 = mo.ui.button(**RUN_BUTTON)
+    job_b3 = mo.ui.button(**RUN_BUTTON)
+    job_b4 = mo.ui.button(**RUN_BUTTON)
+    return job_editor, job_b1, job_b2, job_b3, job_b4
+
+
+@app.cell(hide_code=True)
+def _(job_b1):
+    step_cell(
+        "1. Delete Old Job (if any)",
+        f"kubectl -n {LAB_NAMESPACE} delete job {JOB_NAME} --ignore-not-found=true",
+        job_b1,
+        job_b1.value > 0,
+        explanation="Jobs are immutable once `spec.template` is set, so we delete any previous run before re-applying.",
+    )
+    return
+
+
+@app.cell(hide_code=True)
+def _(job_editor, job_b2):
+    manifest_step_cell(
+        "2. Apply Job Manifest",
+        job_editor,
+        LAB_NAMESPACE,
+        job_b2,
+        job_b2.value > 0,
+        explanation=(
+            "`restartPolicy: Never` is required for Jobs (or `OnFailure`) — it tells the kubelet not to restart "
+            "a Pod that exits. `backoffLimit: 2` caps retries if the Pod fails."
+        ),
+    )
+    return
+
+
+@app.cell(hide_code=True)
+def _(job_b3):
+    step_cell(
+        "3. Wait for Completion",
+        f"kubectl -n {LAB_NAMESPACE} wait --for=condition=complete --timeout=120s job/{JOB_NAME}",
+        job_b3,
+        job_b3.value > 0,
+    )
+    return
+
+
+@app.cell(hide_code=True)
+def _(job_b4):
+    step_cell(
+        "4. Read Job Logs",
+        f"kubectl -n {LAB_NAMESPACE} logs job/{JOB_NAME}",
+        job_b4,
+        job_b4.value > 0,
+        explanation="The completed Pod is preserved so you can still read its logs after the Job is done.",
+    )
+    return
+
+
+# ============================================================
+# Cleanup
+# ============================================================
+
+
+@app.cell(hide_code=True)
+def _(kubectl_ready: bool):
+    mo.stop(not kubectl_ready)
+    mo.md("---\n## Cleanup\n\nDelete the lab namespace — every object inside it is cascaded away.")
+    return
+
+
+@app.cell(hide_code=True)
+def _(kubectl_ready: bool):
+    mo.stop(not kubectl_ready)
+    cln_b1 = mo.ui.button(**RUN_BUTTON)
+    cln_b2 = mo.ui.button(**RUN_BUTTON)
+    cln_b3 = mo.ui.button(**RUN_BUTTON)
+    return cln_b1, cln_b2, cln_b3
+
+
+@app.cell(hide_code=True)
+def _(cln_b1):
+    step_cell(
+        "1. List Remaining Namespace Resources",
+        f"kubectl -n {LAB_NAMESPACE} get all || true",
+        cln_b1,
+        cln_b1.value > 0,
+    )
+    return
+
+
+@app.cell(hide_code=True)
+def _(cln_b2):
+    step_cell(
+        "2. Delete Lab Namespace",
+        f"kubectl delete namespace {LAB_NAMESPACE} --ignore-not-found=true",
+        cln_b2,
+        cln_b2.value > 0,
+        explanation="Deleting the namespace cascades to every object inside it — Pods, ReplicaSets, Services, Endpoints, ConfigMaps, Jobs.",
+    )
+    return
+
+
+@app.cell(hide_code=True)
+def _(cln_b3):
+    step_cell(
+        "3. Verify Namespace Removal",
+        f"kubectl get namespace {LAB_NAMESPACE} || true",
+        cln_b3,
+        cln_b3.value > 0,
+    )
+    return
+
+
+if __name__ == "__main__":
+    app.run()
diff --git a/arena/notebooks/cks_kubernetes/configmap.yaml b/arena/notebooks/cks_kubernetes/configmap.yaml
new file mode 100644
index 0000000..1cb820e
--- /dev/null
+++ b/arena/notebooks/cks_kubernetes/configmap.yaml
@@ -0,0 +1,9 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: arena-k8s-config
+  labels:
+    app.kubernetes.io/part-of: arena-k8s-lab
+data:
+  MODE: learning
+  OWNER: arena
diff --git a/arena/notebooks/cks_kubernetes/deployment.yaml b/arena/notebooks/cks_kubernetes/deployment.yaml
new file mode 100644
index 0000000..5d71df4
--- /dev/null
+++ b/arena/notebooks/cks_kubernetes/deployment.yaml
@@ -0,0 +1,22 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: arena-k8s-web
+  labels:
+    app.kubernetes.io/part-of: arena-k8s-lab
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: arena-k8s-web
+  template:
+    metadata:
+      labels:
+        app: arena-k8s-web
+        app.kubernetes.io/part-of: arena-k8s-lab
+    spec:
+      containers:
+        - name: web
+          image: nginx:1.27
+          ports:
+            - containerPort: 80
diff --git a/arena/notebooks/cks_kubernetes/job.yaml b/arena/notebooks/cks_kubernetes/job.yaml
new file mode 100644
index 0000000..4e7bb27
--- /dev/null
+++ b/arena/notebooks/cks_kubernetes/job.yaml
@@ -0,0 +1,18 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: arena-k8s-job
+  labels:
+    app.kubernetes.io/part-of: arena-k8s-lab
+spec:
+  backoffLimit: 2
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/part-of: arena-k8s-lab
+    spec:
+      restartPolicy: Never
+      containers:
+        - name: worker
+          image: busybox:1.36
+          command: ["/bin/sh", "-c", "echo hello-from-cks; sleep 3; echo completed"]
diff --git a/arena/notebooks/cks_kubernetes/service.yaml b/arena/notebooks/cks_kubernetes/service.yaml
new file mode 100644
index 0000000..7c27f7d
--- /dev/null
+++ b/arena/notebooks/cks_kubernetes/service.yaml
@@ -0,0 +1,13 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: arena-k8s-web
+  labels:
+    app.kubernetes.io/part-of: arena-k8s-lab
+spec:
+  type: ClusterIP
+  selector:
+    app: arena-k8s-web
+  ports:
+    - port: 80
+      targetPort: 80

From 33e6e783d804a0132f1a8155374295fce67f376d Mon Sep 17 00:00:00 2001
From: David Flanagan <dflanagan@coreweave.com>
Date: Wed, 13 May 2026 16:31:12 +0100
Subject: [PATCH 2/4] chore: refactor to shared shell function

---
 arena/notebooks/caios_warp.py                 |   2 +-
 arena/notebooks/cks_kubernetes.py             |  90 +++--------
 .../notebooks/lib/remote_execution_helpers.py | 145 ++++++++++++++----
 3 files changed, 141 insertions(+), 96 deletions(-)

diff --git a/arena/notebooks/caios_warp.py b/arena/notebooks/caios_warp.py
index 33cb384..bfa53cd 100644
--- a/arena/notebooks/caios_warp.py
+++ b/arena/notebooks/caios_warp.py
@@ -38,8 +38,8 @@ def _():
     _elements = [
         banner(),
         about(
-            "Warp",
             """This notebook provides a walkthrough for benchmarking CoreWeave AI Object Storage (CAIOS) and LOTA inside a CoreWeave cluster using the Minio Warp tool.<br>
+            "Warp",
                _If you are running this notebook in edit mode, make sure you start by running all cells in the bottom right._
             """,
         ),
diff --git a/arena/notebooks/cks_kubernetes.py b/arena/notebooks/cks_kubernetes.py
index 69d40a0..142a854 100644
--- a/arena/notebooks/cks_kubernetes.py
+++ b/arena/notebooks/cks_kubernetes.py
@@ -14,10 +14,10 @@
 
 with app.setup:
     import os
-    import subprocess
 
     import marimo as mo
     from lib.k8s import K8s, KubernetesConfigError
+    from lib.remote_execution_helpers import shell
     from lib.ui import about, banner, security_disclaimer, table_of_contents
 
     MANIFEST_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "cks_kubernetes")
@@ -40,57 +40,6 @@ def truncate(text: str, max_chars: int = 4000) -> str:
             return text
         return f"{text[:max_chars]}\n...output truncated..."
 
-    def run_command(command: str, timeout_seconds: int = 180) -> dict:
-        """Run a shell command and return structured result."""
-        try:
-            result = subprocess.run(command, shell=True, capture_output=True, text=True, timeout=timeout_seconds)
-            return {
-                "command": command,
-                "returncode": result.returncode,
-                "stdout": result.stdout.strip(),
-                "stderr": result.stderr.strip(),
-                "ok": result.returncode == 0,
-            }
-        except subprocess.TimeoutExpired as err:
-            stdout = str(err.stdout or "").strip() if err.stdout else ""
-            stderr = str(err.stderr or "").strip() if err.stderr else ""
-            return {
-                "command": command,
-                "returncode": 124,
-                "stdout": stdout,
-                "stderr": f"Command timed out after {timeout_seconds}s. {stderr}".strip(),
-                "ok": False,
-            }
-
-    def apply_manifest(manifest: str, namespace: str | None = None, timeout_seconds: int = 180) -> dict:
-        """Apply a YAML manifest by piping it to `kubectl apply -f -` via stdin."""
-        ns_flag = f"-n {namespace} " if namespace else ""
-        command = f"kubectl {ns_flag}apply -f -"
-        try:
-            result = subprocess.run(
-                command,
-                shell=True,
-                input=manifest,
-                capture_output=True,
-                text=True,
-                timeout=timeout_seconds,
-            )
-            return {
-                "command": f"{command}   # manifest piped via stdin",
-                "returncode": result.returncode,
-                "stdout": result.stdout.strip(),
-                "stderr": result.stderr.strip(),
-                "ok": result.returncode == 0,
-            }
-        except subprocess.TimeoutExpired:
-            return {
-                "command": command,
-                "returncode": 124,
-                "stdout": "",
-                "stderr": f"Apply timed out after {timeout_seconds}s.",
-                "ok": False,
-            }
-
     def write_incluster_kubeconfig(kubeconfig_path: str) -> tuple[bool, str]:
         """Write a kubeconfig that references the mounted service-account token via `tokenFile:`.
 
@@ -136,29 +85,29 @@ def ensure_kubectl_access() -> tuple[bool, str]:
           1. An existing `kubectl config current-context` (local or already-configured) — leave env alone.
           2. Generated in-cluster kubeconfig at `KUBECTL_KUBECONFIG_PATH` — only then set `KUBECONFIG`.
         """
-        kubectl_check = run_command("kubectl version --client")
-        if not bool(kubectl_check["ok"]):
+        kubectl_check = shell("kubectl version --client", quiet=True)
+        if not kubectl_check.ok:
             return (
                 False,
                 "`kubectl` is not available. Install it in this environment and rerun all cells.",
             )
 
-        local_context = run_command("kubectl config current-context")
-        if bool(local_context["ok"]):
+        local_context = shell("kubectl config current-context", quiet=True)
+        if local_context.ok:
             # Probe Python-side connectivity via the shared K8s helper. Failure is non-fatal — the
             # lab's shell-kubectl commands will still work.
             try:
                 K8s().validate_config()
             except KubernetesConfigError:
                 pass
-            return True, f"Using local kubectl context `{local_context['stdout']}`."
+            return True, f"Using local kubectl context `{local_context.strip()}`."
 
         incluster_ok, incluster_message = write_incluster_kubeconfig(KUBECTL_KUBECONFIG_PATH)
         if incluster_ok:
             os.environ["KUBECONFIG"] = KUBECTL_KUBECONFIG_PATH
             return True, f"Using in-cluster service-account context via `{KUBECTL_KUBECONFIG_PATH}`."
 
-        details = str(local_context["stderr"]) if local_context["stderr"] else incluster_message
+        details = local_context.stderr.strip() or incluster_message
         return (
             False,
             "kubectl is installed but no cluster context is usable. "
@@ -170,12 +119,14 @@ def step_cell(title: str, cmd: str, btn, clicked: bool, explanation: str = "") -
         """Render an imperative step: optional explanation, title + button, command, and result if clicked."""
         _output = None
         if clicked:
-            _r = run_command(cmd)
-            _status = "SUCCESS" if _r["ok"] else "FAILED"
-            _stdout = truncate(str(_r["stdout"])) if _r["stdout"] else "(no output)"
+            _r = shell(cmd, quiet=True, timeout=180)
+            _status = "SUCCESS" if _r.ok else "FAILED"
+            _stdout_text = _r.strip()
+            _stdout = truncate(_stdout_text) if _stdout_text else "(no output)"
             _stderr_block = ""
-            if _r["stderr"] and not _r["ok"]:
-                _stderr_block = f"\n**stderr:**\n```\n{truncate(str(_r['stderr']))}\n```"
+            _stderr_text = _r.stderr.strip()
+            if _stderr_text and not _r.ok:
+                _stderr_block = f"\n**stderr:**\n```\n{truncate(_stderr_text)}\n```"
             _output = mo.md(f"`{_status}`\n```\n{_stdout}\n```{_stderr_block}")
         parts = [
             mo.hstack([mo.md(f"**{title}**"), btn], justify="space-between", align="center"),
@@ -202,12 +153,15 @@ def manifest_step_cell(
         """
         _output = None
         if clicked:
-            _r = apply_manifest(editor.value, namespace)
-            _status = "SUCCESS" if _r["ok"] else "FAILED"
-            _stdout = truncate(str(_r["stdout"])) if _r["stdout"] else "(no output)"
+            ns_flag = f"-n {namespace} " if namespace else ""
+            _r = shell(f"kubectl {ns_flag}apply -f -", quiet=True, timeout=180, input=editor.value)
+            _status = "SUCCESS" if _r.ok else "FAILED"
+            _stdout_text = _r.strip()
+            _stdout = truncate(_stdout_text) if _stdout_text else "(no output)"
             _stderr_block = ""
-            if _r["stderr"] and not _r["ok"]:
-                _stderr_block = f"\n**stderr:**\n```\n{truncate(str(_r['stderr']))}\n```"
+            _stderr_text = _r.stderr.strip()
+            if _stderr_text and not _r.ok:
+                _stderr_block = f"\n**stderr:**\n```\n{truncate(_stderr_text)}\n```"
             _output = mo.md(f"`{_status}`\n```\n{_stdout}\n```{_stderr_block}")
         ns_label = f"  *(namespace: `{namespace}`)*" if namespace else ""
         parts = [
diff --git a/arena/notebooks/lib/remote_execution_helpers.py b/arena/notebooks/lib/remote_execution_helpers.py
index 3147dc0..67bfd65 100644
--- a/arena/notebooks/lib/remote_execution_helpers.py
+++ b/arena/notebooks/lib/remote_execution_helpers.py
@@ -205,9 +205,49 @@ def ssh(command: str, verbose: bool = True, stream: bool = False) -> str:
         return result.stdout
 
 
-def shell(command: str | list, quiet: bool = False, check: bool = False, stream: bool = False) -> str:  # noqa: C901
+class ShellResult(str):
+    """String-like result from shell()/bash() that also carries structured fields.
+
+    Subclasses str so existing callers that treat the return value as a string
+    keep working. New callers can read .returncode, .stderr, .ok, .command for
+    structured access.
+    """
+
+    returncode: int
+    stderr: str
+    command: str
+    ok: bool
+
+    def __new__(
+        cls,
+        stdout: str,
+        *,
+        returncode: int = 0,
+        stderr: str = "",
+        command: str = "",
+        ok: bool = True,
+    ) -> "ShellResult":
+        instance = super().__new__(cls, stdout)
+        instance.returncode = returncode
+        instance.stderr = stderr
+        instance.command = command
+        instance.ok = ok
+        return instance
+
+
+def shell(  # noqa: C901
+    command: str | list,
+    quiet: bool = False,
+    check: bool = False,
+    stream: bool = False,
+    timeout: int | None = None,
+    input: str | None = None,
+) -> ShellResult:
     """Run a local shell command with clean output.
 
+    Returns a ShellResult — a str subclass equal to the captured stdout, with
+    .returncode, .stderr, .ok, .command for callers that need structured access.
+
     Args:
         command: Command string or list of arguments
         quiet: If True, suppress status messages (only show command output)
@@ -215,15 +255,18 @@ def shell(command: str | list, quiet: bool = False, check: bool = False, stream:
         stream: If True, stream stdout/stderr in real-time (for long-running commands).
                 Note: In Marimo notebooks, streaming may still buffer until
                 cell completion due to Marimo's output capture mechanism.
-
-    Returns:
-        The stdout output as a string (empty if streaming)
+        timeout: Optional timeout in seconds (non-stream mode only). On timeout
+                 the result has returncode=124 and ok=False.
+        input: Optional string piped to the command's stdin (non-stream mode only).
 
     Example:
-        shell("aws s3 ls")
-        shell("aws configure set s3.addressing_style virtual")
+        result = shell("aws s3 ls")
+        if not result.ok:
+            print(result.stderr)
+
         shell(["kubectl", "get", "pods"])
-        shell("s5cmd cp ...", stream=True)  # Stream output in real-time
+        shell("kubectl apply -f -", input=manifest_yaml)
+        shell("s5cmd cp ...", stream=True)
     """
     if isinstance(command, str):
         cmd_display = command
@@ -261,44 +304,92 @@ def shell(command: str | list, quiet: bool = False, check: bool = False, stream:
                 output_lines.append(line)
 
         proc.wait()
+        stdout = "".join(output_lines)
 
         if proc.returncode == 0:
             if not quiet:
                 sys.stdout.write("✓ Done (exit: 0)\n")
                 sys.stdout.flush()
         else:
-            sys.stdout.write(f"✗ Failed (exit: {proc.returncode})\n")
-            sys.stdout.flush()
+            if not quiet:
+                sys.stdout.write(f"✗ Failed (exit: {proc.returncode})\n")
+                sys.stdout.flush()
             if check:
                 raise subprocess.CalledProcessError(proc.returncode, command)
 
-        return "".join(output_lines)
+        return ShellResult(
+            stdout,
+            returncode=proc.returncode,
+            stderr="",
+            command=cmd_display,
+            ok=proc.returncode == 0,
+        )
 
-    else:
-        # Capture output (original behavior)
+    try:
         result = subprocess.run(
             command,
             shell=shell_mode,
             capture_output=True,
             text=True,
+            timeout=timeout,
+            input=input,
         )
-
-        if result.returncode == 0:
-            if not quiet:
-                print(f"✓ {cmd_display}")
-            if result.stdout.strip():
-                print(result.stdout.rstrip())
-            return result.stdout
-        else:
+    except subprocess.TimeoutExpired as err:
+        out = err.stdout or ""
+        err_text = err.stderr or ""
+        if isinstance(out, bytes):
+            out = out.decode("utf-8", errors="replace")
+        if isinstance(err_text, bytes):
+            err_text = err_text.decode("utf-8", errors="replace")
+        stderr_msg = f"Command timed out after {timeout}s. {err_text.strip()}".strip()
+        if not quiet:
             print(f"✗ {cmd_display}")
-            if result.stderr.strip():
-                print(f"  Error: {result.stderr.strip()}")
-            if check:
-                raise subprocess.CalledProcessError(result.returncode, command)
-            return ""
+            print(f"  Error: {stderr_msg}")
+        if check:
+            raise
+        return ShellResult(
+            out,
+            returncode=124,
+            stderr=stderr_msg,
+            command=cmd_display,
+            ok=False,
+        )
 
+    if result.returncode == 0:
+        if not quiet:
+            print(f"✓ {cmd_display}")
+        if result.stdout.strip():
+            print(result.stdout.rstrip())
+        return ShellResult(
+            result.stdout,
+            returncode=0,
+            stderr=result.stderr,
+            command=cmd_display,
+            ok=True,
+        )
+
+    if not quiet:
+        print(f"✗ {cmd_display}")
+        if result.stderr.strip():
+            print(f"  Error: {result.stderr.strip()}")
+    if check:
+        raise subprocess.CalledProcessError(result.returncode, command)
+    return ShellResult(
+        result.stdout,
+        returncode=result.returncode,
+        stderr=result.stderr,
+        command=cmd_display,
+        ok=False,
+    )
 
-def bash(command: str, quiet: bool = False, stream: bool = False) -> str:
+
+def bash(
+    command: str,
+    quiet: bool = False,
+    stream: bool = False,
+    timeout: int | None = None,
+    input: str | None = None,
+) -> ShellResult:
     """Alias for shell() - run a bash command.
 
     Example:
@@ -306,7 +397,7 @@ def bash(command: str, quiet: bool = False, stream: bool = False) -> str:
         bash("aws s3 ls")
         bash("s5cmd cp ...", stream=True)
     """
-    return shell(command, quiet=quiet, stream=stream)
+    return shell(command, quiet=quiet, stream=stream, timeout=timeout, input=input)
 
 
 if __name__ == "__main__":

From a959205ed19cf4176e41a8f5f1aace8b2be8c520 Mon Sep 17 00:00:00 2001
From: David Flanagan <dflanagan@coreweave.com>
Date: Wed, 13 May 2026 16:39:40 +0100
Subject: [PATCH 3/4] chore: refactor kubernetes access helpers

---
 arena/notebooks/caios_warp.py                 |  2 +-
 arena/notebooks/cks_kubernetes.py             | 84 +------------------
 arena/notebooks/lib/k8s.py                    | 84 +++++++++++++++++++
 .../notebooks/lib/remote_execution_helpers.py |  5 ++
 4 files changed, 93 insertions(+), 82 deletions(-)

diff --git a/arena/notebooks/caios_warp.py b/arena/notebooks/caios_warp.py
index bfa53cd..33cb384 100644
--- a/arena/notebooks/caios_warp.py
+++ b/arena/notebooks/caios_warp.py
@@ -38,8 +38,8 @@ def _():
     _elements = [
         banner(),
         about(
-            """This notebook provides a walkthrough for benchmarking CoreWeave AI Object Storage (CAIOS) and LOTA inside a CoreWeave cluster using the Minio Warp tool.<br>
             "Warp",
+            """This notebook provides a walkthrough for benchmarking CoreWeave AI Object Storage (CAIOS) and LOTA inside a CoreWeave cluster using the Minio Warp tool.<br>
                _If you are running this notebook in edit mode, make sure you start by running all cells in the bottom right._
             """,
         ),
diff --git a/arena/notebooks/cks_kubernetes.py b/arena/notebooks/cks_kubernetes.py
index 142a854..1e2116a 100644
--- a/arena/notebooks/cks_kubernetes.py
+++ b/arena/notebooks/cks_kubernetes.py
@@ -16,7 +16,7 @@
     import os
 
     import marimo as mo
-    from lib.k8s import K8s, KubernetesConfigError
+    from lib.k8s import ensure_kubectl_access
     from lib.remote_execution_helpers import shell
     from lib.ui import about, banner, security_disclaimer, table_of_contents
 
@@ -31,90 +31,12 @@
     SERVICE_NAME = "arena-k8s-web"
     JOB_NAME = "arena-k8s-job"
 
-    SERVICE_ACCOUNT_TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token"
-    SERVICE_ACCOUNT_CA_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
-    KUBECTL_KUBECONFIG_PATH = "/tmp/arena-incluster-kubectl.yaml"
-
     def truncate(text: str, max_chars: int = 4000) -> str:
+        """Limit command output to a readable size for notebook display."""
         if len(text) <= max_chars:
             return text
         return f"{text[:max_chars]}\n...output truncated..."
 
-    def write_incluster_kubeconfig(kubeconfig_path: str) -> tuple[bool, str]:
-        """Write a kubeconfig that references the mounted service-account token via `tokenFile:`.
-
-        Using `tokenFile:` (rather than inlining `token:`) keeps the bearer token in the
-        kernel-protected `/var/run/secrets/...` mount and picks up token rotation automatically.
-        """
-        host = os.getenv("KUBERNETES_SERVICE_HOST", "")
-        port = os.getenv("KUBERNETES_SERVICE_PORT_HTTPS", "") or os.getenv("KUBERNETES_SERVICE_PORT", "443")
-
-        if not host:
-            return False, "KUBERNETES_SERVICE_HOST is not set."
-        if not os.path.exists(SERVICE_ACCOUNT_TOKEN_PATH):
-            return False, f"Service account token not found at {SERVICE_ACCOUNT_TOKEN_PATH}."
-        if not os.path.exists(SERVICE_ACCOUNT_CA_PATH):
-            return False, f"Service account CA cert not found at {SERVICE_ACCOUNT_CA_PATH}."
-
-        kubeconfig = f"""apiVersion: v1
-kind: Config
-clusters:
-- name: in-cluster
-  cluster:
-    server: https://{host}:{port}
-    certificate-authority: {SERVICE_ACCOUNT_CA_PATH}
-users:
-- name: service-account
-  user:
-    tokenFile: {SERVICE_ACCOUNT_TOKEN_PATH}
-contexts:
-- name: in-cluster
-  context:
-    cluster: in-cluster
-    user: service-account
-current-context: in-cluster
-"""
-        with open(kubeconfig_path, "w", encoding="utf-8") as file:
-            file.write(kubeconfig)
-        return True, kubeconfig_path
-
-    def ensure_kubectl_access() -> tuple[bool, str]:
-        """Make sure kubectl has a usable context, without disturbing the user's existing KUBECONFIG.
-
-        Order of preference:
-          1. An existing `kubectl config current-context` (local or already-configured) — leave env alone.
-          2. Generated in-cluster kubeconfig at `KUBECTL_KUBECONFIG_PATH` — only then set `KUBECONFIG`.
-        """
-        kubectl_check = shell("kubectl version --client", quiet=True)
-        if not kubectl_check.ok:
-            return (
-                False,
-                "`kubectl` is not available. Install it in this environment and rerun all cells.",
-            )
-
-        local_context = shell("kubectl config current-context", quiet=True)
-        if local_context.ok:
-            # Probe Python-side connectivity via the shared K8s helper. Failure is non-fatal — the
-            # lab's shell-kubectl commands will still work.
-            try:
-                K8s().validate_config()
-            except KubernetesConfigError:
-                pass
-            return True, f"Using local kubectl context `{local_context.strip()}`."
-
-        incluster_ok, incluster_message = write_incluster_kubeconfig(KUBECTL_KUBECONFIG_PATH)
-        if incluster_ok:
-            os.environ["KUBECONFIG"] = KUBECTL_KUBECONFIG_PATH
-            return True, f"Using in-cluster service-account context via `{KUBECTL_KUBECONFIG_PATH}`."
-
-        details = local_context.stderr.strip() or incluster_message
-        return (
-            False,
-            "kubectl is installed but no cluster context is usable. "
-            "If running locally, set a context with `kubectl config use-context <name>`. "
-            f"Details: `{details}`",
-        )
-
     def step_cell(title: str, cmd: str, btn, clicked: bool, explanation: str = "") -> mo.Html:
         """Render an imperative step: optional explanation, title + button, command, and result if clicked."""
         _output = None
@@ -184,7 +106,7 @@ def load_manifest(filename: str) -> str:
         with open(os.path.join(MANIFEST_DIR, filename), "r", encoding="utf-8") as file:
             return file.read()
 
-    RUN_BUTTON = dict(value=0, on_click=lambda v: v + 1, label="Run")
+    RUN_BUTTON = {"value": 0, "on_click": lambda v: v + 1, "label": "Run"}
 
     # ----------------------------------------------------------------
     # Manifests (the source of truth for each lab object).
diff --git a/arena/notebooks/lib/k8s.py b/arena/notebooks/lib/k8s.py
index a75e683..55e46b7 100644
--- a/arena/notebooks/lib/k8s.py
+++ b/arena/notebooks/lib/k8s.py
@@ -7,9 +7,15 @@
 from kubernetes.client.rest import ApiException
 from ruamel.yaml import YAML
 
+from .remote_execution_helpers import shell
+
 # We don't have a good way to get this off node labels currently
 AVAILABILITY_ZONE = os.getenv("AVAILABILITY_ZONE", "A")
 
+SERVICE_ACCOUNT_TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token"
+SERVICE_ACCOUNT_CA_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
+KUBECTL_INCLUSTER_KUBECONFIG_PATH = "/tmp/arena-incluster-kubectl.yaml"
+
 
 class KubernetesError(Exception):
     """Base exception for k8s helper errors."""
@@ -543,6 +549,84 @@ def delete_by_label(self, namespace: str, label_selector: str) -> dict[str, list
         return results
 
 
+def _write_incluster_kubeconfig(kubeconfig_path: str) -> None:
+    """Write a kubeconfig that references the mounted service-account token via `tokenFile:`.
+
+    Using `tokenFile:` (rather than inlining `token:`) keeps the bearer token in the
+    kernel-protected `/var/run/secrets/...` mount and picks up token rotation automatically.
+
+    Assumes `config.load_incluster_config()` has already validated the mount exists.
+    """
+    host = os.environ["KUBERNETES_SERVICE_HOST"]
+    port = os.getenv("KUBERNETES_SERVICE_PORT_HTTPS") or os.getenv("KUBERNETES_SERVICE_PORT", "443")
+    kubeconfig = f"""apiVersion: v1
+kind: Config
+clusters:
+- name: in-cluster
+  cluster:
+    server: https://{host}:{port}
+    certificate-authority: {SERVICE_ACCOUNT_CA_PATH}
+users:
+- name: service-account
+  user:
+    tokenFile: {SERVICE_ACCOUNT_TOKEN_PATH}
+contexts:
+- name: in-cluster
+  context:
+    cluster: in-cluster
+    user: service-account
+current-context: in-cluster
+"""
+    with open(kubeconfig_path, "w", encoding="utf-8") as file:
+        file.write(kubeconfig)
+
+
+def ensure_kubectl_access() -> tuple[bool, str]:
+    """Make sure `kubectl` has a usable context, without disturbing the user's existing KUBECONFIG.
+
+    Order of preference:
+      1. An existing `kubectl config current-context` (local or already-configured) — leave env alone.
+      2. An in-cluster service-account identity — write a bridge kubeconfig at
+         `KUBECTL_INCLUSTER_KUBECONFIG_PATH` and set `KUBECONFIG` so kubectl picks it up.
+    """
+    if not shell("kubectl version --client", quiet=True).ok:
+        return (
+            False,
+            "`kubectl` is not available. Install it in this environment and rerun all cells.",
+        )
+
+    local_context = shell("kubectl config current-context", quiet=True)
+    cluster_probe = None
+    if local_context.ok:
+        cluster_probe = shell("kubectl cluster-info", quiet=True, timeout=15)
+        if cluster_probe.ok:
+            # Non-fatal Python-side connectivity probe — kubectl will still work even if this fails.
+            try:
+                K8s().validate_config()
+            except KubernetesConfigError:
+                pass
+            return True, f"Using local kubectl context `{local_context.strip()}`."
+
+    # No local kubectl context — try in-cluster. `load_incluster_config()` is the canonical
+    # detection: it succeeds only when the SA token + CA + service env vars are all in place.
+    try:
+        config.load_incluster_config()
+    except config.ConfigException as e:
+        details = (
+            cluster_probe.stderr.strip() if cluster_probe is not None else local_context.stderr.strip()
+        ) or str(e)
+        return (
+            False,
+            "kubectl is installed but no cluster context is usable. "
+            "If running locally, set a context with `kubectl config use-context <name>`. "
+            f"Details: `{details}`",
+        )
+
+    _write_incluster_kubeconfig(KUBECTL_INCLUSTER_KUBECONFIG_PATH)
+    os.environ["KUBECONFIG"] = KUBECTL_INCLUSTER_KUBECONFIG_PATH
+    return True, f"Using in-cluster service-account context via `{KUBECTL_INCLUSTER_KUBECONFIG_PATH}`."
+
+
 def kubeconfig_input() -> tuple[mo.Html | None, mo.ui.form | None]:
     """Create a form for a user to input their kubeconfig path.
 
diff --git a/arena/notebooks/lib/remote_execution_helpers.py b/arena/notebooks/lib/remote_execution_helpers.py
index 67bfd65..44a4c50 100644
--- a/arena/notebooks/lib/remote_execution_helpers.py
+++ b/arena/notebooks/lib/remote_execution_helpers.py
@@ -227,6 +227,7 @@ def __new__(
         command: str = "",
         ok: bool = True,
     ) -> "ShellResult":
+        """Create a string result with subprocess metadata attached."""
         instance = super().__new__(cls, stdout)
         instance.returncode = returncode
         instance.stderr = stderr
@@ -234,6 +235,10 @@ def __new__(
         instance.ok = ok
         return instance
 
+    def __bool__(self) -> bool:
+        """Treat failed commands as false while preserving string truthiness for stdout."""
+        return self.ok and len(self) > 0
+
 
 def shell(  # noqa: C901
     command: str | list,

From 1865265ae51c4d6c5b64dbdb21c87d49e5631faf Mon Sep 17 00:00:00 2001
From: David Flanagan <dflanagan@coreweave.com>
Date: Tue, 2 Jun 2026 14:42:53 +0100
Subject: [PATCH 4/4] chore(k8s): selection of fixes

---
 arena/notebooks/cks_kubernetes.py             | 84 ++++++++-----------
 arena/notebooks/lib/k8s.py                    |  1 +
 .../notebooks/lib/remote_execution_helpers.py |  4 +-
 3 files changed, 40 insertions(+), 49 deletions(-)

diff --git a/arena/notebooks/cks_kubernetes.py b/arena/notebooks/cks_kubernetes.py
index 1e2116a..6e8e23d 100644
--- a/arena/notebooks/cks_kubernetes.py
+++ b/arena/notebooks/cks_kubernetes.py
@@ -62,21 +62,22 @@ def step_cell(title: str, cmd: str, btn, clicked: bool, explanation: str = "") -
 
     def manifest_step_cell(
         title: str,
-        editor,
+        form,
         namespace: str | None,
-        btn,
-        clicked: bool,
         explanation: str = "",
     ) -> mo.Html:
-        """Render a declarative step: title + button, optional explanation, editable YAML, result on click.
+        """Render a declarative step: title, optional explanation, YAML form, result on submit.
 
-        `editor` is a `mo.ui.code_editor` whose current `.value` is applied to the cluster
-        when the Run button has been clicked. Editing the YAML and re-clicking re-applies.
+        `form` is a `mo.ui.code_editor(...).form()` — `form.value` is None until the user
+        clicks the form's submit button, then becomes the current editor contents. Each
+        subsequent submit re-applies the (possibly edited) manifest; edits made without
+        clicking submit do not trigger a re-apply.
         """
         _output = None
-        if clicked:
+        manifest = form.value
+        if manifest is not None:
             ns_flag = f"-n {namespace} " if namespace else ""
-            _r = shell(f"kubectl {ns_flag}apply -f -", quiet=True, timeout=180, input=editor.value)
+            _r = shell(f"kubectl {ns_flag}apply -f -", quiet=True, timeout=180, input=manifest)
             _status = "SUCCESS" if _r.ok else "FAILED"
             _stdout_text = _r.strip()
             _stdout = truncate(_stdout_text) if _stdout_text else "(no output)"
@@ -86,17 +87,10 @@ def manifest_step_cell(
                 _stderr_block = f"\n**stderr:**\n```\n{truncate(_stderr_text)}\n```"
             _output = mo.md(f"`{_status}`\n```\n{_stdout}\n```{_stderr_block}")
         ns_label = f"  *(namespace: `{namespace}`)*" if namespace else ""
-        parts = [
-            mo.hstack(
-                [mo.md(f"**{title}**{ns_label}"), btn],
-                justify="space-between",
-                align="center",
-            ),
-        ]
+        parts = [mo.md(f"**{title}**{ns_label}")]
         if explanation:
             parts.append(mo.md(explanation))
-        parts.append(editor)
-        parts.append(mo.md("_Edit the manifest above if you like, then click **Run** to_ `kubectl apply -f -` _it._"))
+        parts.append(form)
         if _output is not None:
             parts.append(_output)
         return mo.vstack(parts)
@@ -267,26 +261,25 @@ def _(kubectl_ready: bool):
 @app.cell(hide_code=True)
 def _(kubectl_ready: bool):
     mo.stop(not kubectl_ready)
-    cm_editor = mo.ui.code_editor(value=CONFIGMAP_MANIFEST, language="yaml", min_height=200, debounce=True)
-    cm_b1 = mo.ui.button(**RUN_BUTTON)
+    cm_form = mo.ui.code_editor(value=CONFIGMAP_MANIFEST, language="yaml", min_height=200).form(
+        submit_button_label="Apply", bordered=False
+    )
     cm_b2 = mo.ui.button(**RUN_BUTTON)
     cm_b3 = mo.ui.button(**RUN_BUTTON)
     cm_b4 = mo.ui.button(**RUN_BUTTON)
     cm_b5 = mo.ui.button(**RUN_BUTTON)
-    return cm_editor, cm_b1, cm_b2, cm_b3, cm_b4, cm_b5
+    return cm_form, cm_b2, cm_b3, cm_b4, cm_b5
 
 
 @app.cell(hide_code=True)
-def _(cm_editor, cm_b1):
+def _(cm_form):
     manifest_step_cell(
         "1. Apply ConfigMap Manifest",
-        cm_editor,
+        cm_form,
         LAB_NAMESPACE,
-        cm_b1,
-        cm_b1.value > 0,
         explanation=(
             "Two top-level fields matter: `metadata` (identity — name, namespace, labels) "
-            "and `data` (the key/value payload). Edit the YAML and re-click **Run** to see how "
+            "and `data` (the key/value payload). Edit the YAML and click **Apply** to see how "
             "`kubectl apply` reconciles your change."
         ),
     )
@@ -365,8 +358,9 @@ def _(kubectl_ready: bool):
 @app.cell(hide_code=True)
 def _(kubectl_ready: bool):
     mo.stop(not kubectl_ready)
-    dep_editor = mo.ui.code_editor(value=DEPLOYMENT_MANIFEST, language="yaml", min_height=320, debounce=True)
-    dep_b1 = mo.ui.button(**RUN_BUTTON)
+    dep_form = mo.ui.code_editor(value=DEPLOYMENT_MANIFEST, language="yaml", min_height=320).form(
+        submit_button_label="Apply", bordered=False
+    )
     dep_b2 = mo.ui.button(**RUN_BUTTON)
     dep_b3 = mo.ui.button(**RUN_BUTTON)
     dep_b4 = mo.ui.button(**RUN_BUTTON)
@@ -374,17 +368,15 @@ def _(kubectl_ready: bool):
     dep_b6 = mo.ui.button(**RUN_BUTTON)
     dep_b7 = mo.ui.button(**RUN_BUTTON)
     dep_b8 = mo.ui.button(**RUN_BUTTON)
-    return dep_editor, dep_b1, dep_b2, dep_b3, dep_b4, dep_b5, dep_b6, dep_b7, dep_b8
+    return dep_form, dep_b2, dep_b3, dep_b4, dep_b5, dep_b6, dep_b7, dep_b8
 
 
 @app.cell(hide_code=True)
-def _(dep_editor, dep_b1):
+def _(dep_form):
     manifest_step_cell(
         "1. Apply Deployment Manifest",
-        dep_editor,
+        dep_form,
         LAB_NAMESPACE,
-        dep_b1,
-        dep_b1.value > 0,
         explanation=(
             "Three nested layers: the Deployment's `selector.matchLabels` finds Pods whose labels match. "
             "`template.metadata.labels` *stamps* those labels onto every Pod it creates. "
@@ -515,26 +507,25 @@ def _(kubectl_ready: bool):
 @app.cell(hide_code=True)
 def _(kubectl_ready: bool):
     mo.stop(not kubectl_ready)
-    svc_editor = mo.ui.code_editor(value=SERVICE_MANIFEST, language="yaml", min_height=220, debounce=True)
-    svc_b1 = mo.ui.button(**RUN_BUTTON)
+    svc_form = mo.ui.code_editor(value=SERVICE_MANIFEST, language="yaml", min_height=220).form(
+        submit_button_label="Apply", bordered=False
+    )
     svc_b2 = mo.ui.button(**RUN_BUTTON)
     svc_b3 = mo.ui.button(**RUN_BUTTON)
     svc_b4 = mo.ui.button(**RUN_BUTTON)
     svc_b5 = mo.ui.button(**RUN_BUTTON)
-    return svc_editor, svc_b1, svc_b2, svc_b3, svc_b4, svc_b5
+    return svc_form, svc_b2, svc_b3, svc_b4, svc_b5
 
 
 @app.cell(hide_code=True)
-def _(svc_editor, svc_b1):
+def _(svc_form):
     manifest_step_cell(
         "1. Apply Service Manifest",
-        svc_editor,
+        svc_form,
         LAB_NAMESPACE,
-        svc_b1,
-        svc_b1.value > 0,
         explanation=(
             f"`spec.selector: app: {DEPLOYMENT_NAME}` is the only link between this Service and the Deployment's Pods — "
-            "no foreign key, just label matching. Try changing the selector to a label no Pod has and re-run; "
+            "no foreign key, just label matching. Try changing the selector to a label no Pod has and click **Apply**; "
             "the Endpoints in step 3 will go empty."
         ),
     )
@@ -608,12 +599,13 @@ def _(kubectl_ready: bool):
 @app.cell(hide_code=True)
 def _(kubectl_ready: bool):
     mo.stop(not kubectl_ready)
-    job_editor = mo.ui.code_editor(value=JOB_MANIFEST, language="yaml", min_height=280, debounce=True)
+    job_form = mo.ui.code_editor(value=JOB_MANIFEST, language="yaml", min_height=280).form(
+        submit_button_label="Apply", bordered=False
+    )
     job_b1 = mo.ui.button(**RUN_BUTTON)
-    job_b2 = mo.ui.button(**RUN_BUTTON)
     job_b3 = mo.ui.button(**RUN_BUTTON)
     job_b4 = mo.ui.button(**RUN_BUTTON)
-    return job_editor, job_b1, job_b2, job_b3, job_b4
+    return job_form, job_b1, job_b3, job_b4
 
 
 @app.cell(hide_code=True)
@@ -629,13 +621,11 @@ def _(job_b1):
 
 
 @app.cell(hide_code=True)
-def _(job_editor, job_b2):
+def _(job_form):
     manifest_step_cell(
         "2. Apply Job Manifest",
-        job_editor,
+        job_form,
         LAB_NAMESPACE,
-        job_b2,
-        job_b2.value > 0,
         explanation=(
             "`restartPolicy: Never` is required for Jobs (or `OnFailure`) — it tells the kubelet not to restart "
             "a Pod that exits. `backoffLimit: 2` caps retries if the Pod fails."
diff --git a/arena/notebooks/lib/k8s.py b/arena/notebooks/lib/k8s.py
index 55e46b7..267bb72 100644
--- a/arena/notebooks/lib/k8s.py
+++ b/arena/notebooks/lib/k8s.py
@@ -579,6 +579,7 @@ def _write_incluster_kubeconfig(kubeconfig_path: str) -> None:
 """
     with open(kubeconfig_path, "w", encoding="utf-8") as file:
         file.write(kubeconfig)
+    os.chmod(kubeconfig_path, 0o600)
 
 
 def ensure_kubectl_access() -> tuple[bool, str]:
diff --git a/arena/notebooks/lib/remote_execution_helpers.py b/arena/notebooks/lib/remote_execution_helpers.py
index 44a4c50..253fc8f 100644
--- a/arena/notebooks/lib/remote_execution_helpers.py
+++ b/arena/notebooks/lib/remote_execution_helpers.py
@@ -236,8 +236,8 @@ def __new__(
         return instance
 
     def __bool__(self) -> bool:
-        """Treat failed commands as false while preserving string truthiness for stdout."""
-        return self.ok and len(self) > 0
+        """True iff the command exited successfully (returncode 0)."""
+        return self.ok
 
 
 def shell(  # noqa: C901