From 7bf5dc388e720d55cfd06e85b8663e04675bc52c Mon Sep 17 00:00:00 2001 From: David Flanagan Date: Wed, 8 Apr 2026 15:40:45 +0100 Subject: [PATCH 1/4] feat(arena): introduction to kubernetes notebook --- arena/notebooks/cks_kubernetes.py | 850 ++++++++++++++++++ arena/notebooks/cks_kubernetes/configmap.yaml | 9 + .../notebooks/cks_kubernetes/deployment.yaml | 22 + arena/notebooks/cks_kubernetes/job.yaml | 18 + arena/notebooks/cks_kubernetes/service.yaml | 13 + 5 files changed, 912 insertions(+) create mode 100644 arena/notebooks/cks_kubernetes.py create mode 100644 arena/notebooks/cks_kubernetes/configmap.yaml create mode 100644 arena/notebooks/cks_kubernetes/deployment.yaml create mode 100644 arena/notebooks/cks_kubernetes/job.yaml create mode 100644 arena/notebooks/cks_kubernetes/service.yaml diff --git a/arena/notebooks/cks_kubernetes.py b/arena/notebooks/cks_kubernetes.py new file mode 100644 index 0000000..69d40a0 --- /dev/null +++ b/arena/notebooks/cks_kubernetes.py @@ -0,0 +1,850 @@ +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "kubernetes==35.0.0", +# "marimo>=0.23.6", +# "ruamel-yaml>=0.19.1", +# ] +# /// + +import marimo + +__generated_with = "0.23.6" +app = marimo.App(width="medium", app_title="CoreWeave ARENA") + +with app.setup: + import os + import subprocess + + import marimo as mo + from lib.k8s import K8s, KubernetesConfigError + from lib.ui import about, banner, security_disclaimer, table_of_contents + + MANIFEST_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "cks_kubernetes") + + LAB_NAMESPACE = "arena-k8s-lab" + LAB_LABEL_KEY = "app.kubernetes.io/part-of" + LAB_LABEL_VALUE = "arena-k8s-lab" + + CONFIGMAP_NAME = "arena-k8s-config" + DEPLOYMENT_NAME = "arena-k8s-web" + SERVICE_NAME = "arena-k8s-web" + JOB_NAME = "arena-k8s-job" + + SERVICE_ACCOUNT_TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token" + SERVICE_ACCOUNT_CA_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + KUBECTL_KUBECONFIG_PATH = "/tmp/arena-incluster-kubectl.yaml" + + def truncate(text: str, max_chars: int = 4000) -> str: + if len(text) <= max_chars: + return text + return f"{text[:max_chars]}\n...output truncated..." + + def run_command(command: str, timeout_seconds: int = 180) -> dict: + """Run a shell command and return structured result.""" + try: + result = subprocess.run(command, shell=True, capture_output=True, text=True, timeout=timeout_seconds) + return { + "command": command, + "returncode": result.returncode, + "stdout": result.stdout.strip(), + "stderr": result.stderr.strip(), + "ok": result.returncode == 0, + } + except subprocess.TimeoutExpired as err: + stdout = str(err.stdout or "").strip() if err.stdout else "" + stderr = str(err.stderr or "").strip() if err.stderr else "" + return { + "command": command, + "returncode": 124, + "stdout": stdout, + "stderr": f"Command timed out after {timeout_seconds}s. {stderr}".strip(), + "ok": False, + } + + def apply_manifest(manifest: str, namespace: str | None = None, timeout_seconds: int = 180) -> dict: + """Apply a YAML manifest by piping it to `kubectl apply -f -` via stdin.""" + ns_flag = f"-n {namespace} " if namespace else "" + command = f"kubectl {ns_flag}apply -f -" + try: + result = subprocess.run( + command, + shell=True, + input=manifest, + capture_output=True, + text=True, + timeout=timeout_seconds, + ) + return { + "command": f"{command} # manifest piped via stdin", + "returncode": result.returncode, + "stdout": result.stdout.strip(), + "stderr": result.stderr.strip(), + "ok": result.returncode == 0, + } + except subprocess.TimeoutExpired: + return { + "command": command, + "returncode": 124, + "stdout": "", + "stderr": f"Apply timed out after {timeout_seconds}s.", + "ok": False, + } + + def write_incluster_kubeconfig(kubeconfig_path: str) -> tuple[bool, str]: + """Write a kubeconfig that references the mounted service-account token via `tokenFile:`. + + Using `tokenFile:` (rather than inlining `token:`) keeps the bearer token in the + kernel-protected `/var/run/secrets/...` mount and picks up token rotation automatically. + """ + host = os.getenv("KUBERNETES_SERVICE_HOST", "") + port = os.getenv("KUBERNETES_SERVICE_PORT_HTTPS", "") or os.getenv("KUBERNETES_SERVICE_PORT", "443") + + if not host: + return False, "KUBERNETES_SERVICE_HOST is not set." + if not os.path.exists(SERVICE_ACCOUNT_TOKEN_PATH): + return False, f"Service account token not found at {SERVICE_ACCOUNT_TOKEN_PATH}." + if not os.path.exists(SERVICE_ACCOUNT_CA_PATH): + return False, f"Service account CA cert not found at {SERVICE_ACCOUNT_CA_PATH}." + + kubeconfig = f"""apiVersion: v1 +kind: Config +clusters: +- name: in-cluster + cluster: + server: https://{host}:{port} + certificate-authority: {SERVICE_ACCOUNT_CA_PATH} +users: +- name: service-account + user: + tokenFile: {SERVICE_ACCOUNT_TOKEN_PATH} +contexts: +- name: in-cluster + context: + cluster: in-cluster + user: service-account +current-context: in-cluster +""" + with open(kubeconfig_path, "w", encoding="utf-8") as file: + file.write(kubeconfig) + return True, kubeconfig_path + + def ensure_kubectl_access() -> tuple[bool, str]: + """Make sure kubectl has a usable context, without disturbing the user's existing KUBECONFIG. + + Order of preference: + 1. An existing `kubectl config current-context` (local or already-configured) — leave env alone. + 2. Generated in-cluster kubeconfig at `KUBECTL_KUBECONFIG_PATH` — only then set `KUBECONFIG`. + """ + kubectl_check = run_command("kubectl version --client") + if not bool(kubectl_check["ok"]): + return ( + False, + "`kubectl` is not available. Install it in this environment and rerun all cells.", + ) + + local_context = run_command("kubectl config current-context") + if bool(local_context["ok"]): + # Probe Python-side connectivity via the shared K8s helper. Failure is non-fatal — the + # lab's shell-kubectl commands will still work. + try: + K8s().validate_config() + except KubernetesConfigError: + pass + return True, f"Using local kubectl context `{local_context['stdout']}`." + + incluster_ok, incluster_message = write_incluster_kubeconfig(KUBECTL_KUBECONFIG_PATH) + if incluster_ok: + os.environ["KUBECONFIG"] = KUBECTL_KUBECONFIG_PATH + return True, f"Using in-cluster service-account context via `{KUBECTL_KUBECONFIG_PATH}`." + + details = str(local_context["stderr"]) if local_context["stderr"] else incluster_message + return ( + False, + "kubectl is installed but no cluster context is usable. " + "If running locally, set a context with `kubectl config use-context `. " + f"Details: `{details}`", + ) + + def step_cell(title: str, cmd: str, btn, clicked: bool, explanation: str = "") -> mo.Html: + """Render an imperative step: optional explanation, title + button, command, and result if clicked.""" + _output = None + if clicked: + _r = run_command(cmd) + _status = "SUCCESS" if _r["ok"] else "FAILED" + _stdout = truncate(str(_r["stdout"])) if _r["stdout"] else "(no output)" + _stderr_block = "" + if _r["stderr"] and not _r["ok"]: + _stderr_block = f"\n**stderr:**\n```\n{truncate(str(_r['stderr']))}\n```" + _output = mo.md(f"`{_status}`\n```\n{_stdout}\n```{_stderr_block}") + parts = [ + mo.hstack([mo.md(f"**{title}**"), btn], justify="space-between", align="center"), + ] + if explanation: + parts.append(mo.md(explanation)) + parts.append(mo.md(f"```\n{cmd}\n```")) + if _output is not None: + parts.append(_output) + return mo.vstack(parts) + + def manifest_step_cell( + title: str, + editor, + namespace: str | None, + btn, + clicked: bool, + explanation: str = "", + ) -> mo.Html: + """Render a declarative step: title + button, optional explanation, editable YAML, result on click. + + `editor` is a `mo.ui.code_editor` whose current `.value` is applied to the cluster + when the Run button has been clicked. Editing the YAML and re-clicking re-applies. + """ + _output = None + if clicked: + _r = apply_manifest(editor.value, namespace) + _status = "SUCCESS" if _r["ok"] else "FAILED" + _stdout = truncate(str(_r["stdout"])) if _r["stdout"] else "(no output)" + _stderr_block = "" + if _r["stderr"] and not _r["ok"]: + _stderr_block = f"\n**stderr:**\n```\n{truncate(str(_r['stderr']))}\n```" + _output = mo.md(f"`{_status}`\n```\n{_stdout}\n```{_stderr_block}") + ns_label = f" *(namespace: `{namespace}`)*" if namespace else "" + parts = [ + mo.hstack( + [mo.md(f"**{title}**{ns_label}"), btn], + justify="space-between", + align="center", + ), + ] + if explanation: + parts.append(mo.md(explanation)) + parts.append(editor) + parts.append(mo.md("_Edit the manifest above if you like, then click **Run** to_ `kubectl apply -f -` _it._")) + if _output is not None: + parts.append(_output) + return mo.vstack(parts) + + def load_manifest(filename: str) -> str: + """Load a YAML manifest from the `cks_kubernetes/` directory next to this notebook.""" + with open(os.path.join(MANIFEST_DIR, filename), "r", encoding="utf-8") as file: + return file.read() + + RUN_BUTTON = dict(value=0, on_click=lambda v: v + 1, label="Run") + + # ---------------------------------------------------------------- + # Manifests (the source of truth for each lab object). + # Stored as real YAML files in `cks_kubernetes/` so they're easy to edit locally. + # ---------------------------------------------------------------- + + CONFIGMAP_MANIFEST = load_manifest("configmap.yaml") + DEPLOYMENT_MANIFEST = load_manifest("deployment.yaml") + SERVICE_MANIFEST = load_manifest("service.yaml") + JOB_MANIFEST = load_manifest("job.yaml") + + +# ============================================================ +# Header +# ============================================================ + + +@app.cell(hide_code=True) +def _(): + mo.vstack( + [ + banner(), + about( + "Kubernetes", + """This notebook is a hands-on introduction to Kubernetes on CoreWeave Kubernetes Service (CKS).
+ Each section presents the actual YAML manifest, applies it declaratively, and then observes what the controllers do.
+ _If you are running this notebook in edit mode, start by running all cells in the bottom right._ + """, + ), + table_of_contents( + [ + {"title": "Cluster Access", "description": "Verify kubectl connectivity."}, + {"title": "Environment and Namespace Setup", "description": "Create the lab namespace."}, + {"title": "ConfigMap Lab", "description": "Apply, read, patch, and delete a ConfigMap."}, + {"title": "Deployment Lab", "description": "Apply, scale, watch self-healing, and roll an update."}, + {"title": "Service Lab", "description": "Expose a deployment and inspect endpoints."}, + {"title": "Job Lab", "description": "Run a batch job to completion and inspect logs."}, + {"title": "Cleanup", "description": "Delete lab resources and namespace."}, + ] + ), + security_disclaimer(), + ] + ) + return + + +@app.cell(hide_code=True) +def _(): + kubectl_ready, access_message = ensure_kubectl_access() + mo.md(f""" +--- +## Cluster Access + +**kubectl status:** `{"READY" if kubectl_ready else "NOT READY"}` + +{access_message} +""") + return (kubectl_ready,) + + +# ============================================================ +# Environment and Namespace Setup +# ============================================================ + + +@app.cell(hide_code=True) +def _(kubectl_ready: bool): + mo.stop(not kubectl_ready) + mo.md( + "---\n## Environment and Namespace Setup\n\n" + "Create and verify a dedicated lab namespace. Every object below will live inside it, " + "so deleting the namespace at the end cleans everything up at once." + ) + return + + +@app.cell(hide_code=True) +def _(kubectl_ready: bool): + mo.stop(not kubectl_ready) + env_b1 = mo.ui.button(**RUN_BUTTON) + env_b2 = mo.ui.button(**RUN_BUTTON) + env_b3 = mo.ui.button(**RUN_BUTTON) + env_b4 = mo.ui.button(**RUN_BUTTON) + return env_b1, env_b2, env_b3, env_b4 + + +@app.cell(hide_code=True) +def _(env_b1): + step_cell( + "1. Check API Server", + "kubectl cluster-info", + env_b1, + env_b1.value > 0, + explanation=( + "`cluster-info` hits the Kubernetes API server and shows its address. " + "The API server is the only component you talk to directly — everything else (controllers, kubelet, scheduler) " + "reads and writes through it." + ), + ) + return + + +@app.cell(hide_code=True) +def _(env_b2): + step_cell( + "2. List Namespaces", + "kubectl get namespace", + env_b2, + env_b2.value > 0, + explanation="Namespaces partition cluster objects. You'll see system namespaces (`kube-system`, `kube-public`) alongside any others.", + ) + return + + +@app.cell(hide_code=True) +def _(env_b3): + step_cell( + "3. Create Lab Namespace", + f"kubectl create namespace {LAB_NAMESPACE} --dry-run=client -o yaml | kubectl apply -f -", + env_b3, + env_b3.value > 0, + explanation=( + f"Idempotently create `{LAB_NAMESPACE}`. The `--dry-run=client -o yaml | apply` pattern produces a manifest " + "client-side and applies it — re-running won't error if the namespace already exists." + ), + ) + return + + +@app.cell(hide_code=True) +def _(env_b4): + step_cell( + "4. Verify Namespace Labels", + f"kubectl get namespace {LAB_NAMESPACE} --show-labels", + env_b4, + env_b4.value > 0, + ) + return + + +# ============================================================ +# ConfigMap Lab +# ============================================================ + + +@app.cell(hide_code=True) +def _(kubectl_ready: bool): + mo.stop(not kubectl_ready) + mo.md( + "---\n## ConfigMap Lab\n\n" + "A ConfigMap stores non-secret key/value config. Below is the actual manifest — " + "the API object Kubernetes stores. `kubectl create configmap ...` is just a shortcut " + "that builds this YAML for you." + ) + return + + +@app.cell(hide_code=True) +def _(kubectl_ready: bool): + mo.stop(not kubectl_ready) + cm_editor = mo.ui.code_editor(value=CONFIGMAP_MANIFEST, language="yaml", min_height=200, debounce=True) + cm_b1 = mo.ui.button(**RUN_BUTTON) + cm_b2 = mo.ui.button(**RUN_BUTTON) + cm_b3 = mo.ui.button(**RUN_BUTTON) + cm_b4 = mo.ui.button(**RUN_BUTTON) + cm_b5 = mo.ui.button(**RUN_BUTTON) + return cm_editor, cm_b1, cm_b2, cm_b3, cm_b4, cm_b5 + + +@app.cell(hide_code=True) +def _(cm_editor, cm_b1): + manifest_step_cell( + "1. Apply ConfigMap Manifest", + cm_editor, + LAB_NAMESPACE, + cm_b1, + cm_b1.value > 0, + explanation=( + "Two top-level fields matter: `metadata` (identity — name, namespace, labels) " + "and `data` (the key/value payload). Edit the YAML and re-click **Run** to see how " + "`kubectl apply` reconciles your change." + ), + ) + return + + +@app.cell(hide_code=True) +def _(cm_b2): + step_cell( + "2. Read ConfigMap", + f"kubectl -n {LAB_NAMESPACE} get configmap {CONFIGMAP_NAME} -o yaml", + cm_b2, + cm_b2.value > 0, + explanation=( + "Notice the server added fields you didn't write — `creationTimestamp`, `resourceVersion`, `uid`. " + "These are managed by the API server, not you." + ), + ) + return + + +@app.cell(hide_code=True) +def _(cm_b3): + step_cell( + "3. Patch ConfigMap Data (imperative drift)", + f"""kubectl -n {LAB_NAMESPACE} patch configmap {CONFIGMAP_NAME} --type merge -p '{{"data":{{"MODE":"advanced","TOPIC":"crud"}}}}'""", + cm_b3, + cm_b3.value > 0, + explanation=( + "`patch` mutates the live object in-place. After this, the cluster state diverges from the YAML in step 1 — " + "re-applying the manifest would revert `MODE` and remove `TOPIC`. This is the declarative-vs-imperative tension." + ), + ) + return + + +@app.cell(hide_code=True) +def _(cm_b4): + step_cell( + "4. Verify Patched ConfigMap", + f"kubectl -n {LAB_NAMESPACE} get configmap {CONFIGMAP_NAME} -o yaml", + cm_b4, + cm_b4.value > 0, + ) + return + + +@app.cell(hide_code=True) +def _(cm_b5): + step_cell( + "5. Delete ConfigMap", + f"kubectl -n {LAB_NAMESPACE} delete configmap {CONFIGMAP_NAME} --ignore-not-found=true", + cm_b5, + cm_b5.value > 0, + ) + return + + +# ============================================================ +# Deployment Lab +# ============================================================ + + +@app.cell(hide_code=True) +def _(kubectl_ready: bool): + mo.stop(not kubectl_ready) + mo.md( + "---\n## Deployment Lab\n\n" + 'A Deployment is a controller. You give it a desired state — _"I want N pods running this image"_ — ' + "and it creates a ReplicaSet, which in turn creates Pods. The controller continuously reconciles " + "actual cluster state toward the spec. We'll watch that happen explicitly in step 4." + ) + return + + +@app.cell(hide_code=True) +def _(kubectl_ready: bool): + mo.stop(not kubectl_ready) + dep_editor = mo.ui.code_editor(value=DEPLOYMENT_MANIFEST, language="yaml", min_height=320, debounce=True) + dep_b1 = mo.ui.button(**RUN_BUTTON) + dep_b2 = mo.ui.button(**RUN_BUTTON) + dep_b3 = mo.ui.button(**RUN_BUTTON) + dep_b4 = mo.ui.button(**RUN_BUTTON) + dep_b5 = mo.ui.button(**RUN_BUTTON) + dep_b6 = mo.ui.button(**RUN_BUTTON) + dep_b7 = mo.ui.button(**RUN_BUTTON) + dep_b8 = mo.ui.button(**RUN_BUTTON) + return dep_editor, dep_b1, dep_b2, dep_b3, dep_b4, dep_b5, dep_b6, dep_b7, dep_b8 + + +@app.cell(hide_code=True) +def _(dep_editor, dep_b1): + manifest_step_cell( + "1. Apply Deployment Manifest", + dep_editor, + LAB_NAMESPACE, + dep_b1, + dep_b1.value > 0, + explanation=( + "Three nested layers: the Deployment's `selector.matchLabels` finds Pods whose labels match. " + "`template.metadata.labels` *stamps* those labels onto every Pod it creates. " + "If the selector and template labels disagree, the Deployment refuses to start — try it." + ), + ) + return + + +@app.cell(hide_code=True) +def _(dep_b2): + step_cell( + "2. Scale to 2 Replicas", + f"kubectl -n {LAB_NAMESPACE} scale deployment {DEPLOYMENT_NAME} --replicas=2", + dep_b2, + dep_b2.value > 0, + explanation=( + "Scaling changes the `spec.replicas` field on the Deployment. The controller notices the new desired count " + "and creates one more Pod to match. (`scale` is imperative — equivalent to patching the field directly.)" + ), + ) + return + + +@app.cell(hide_code=True) +def _(dep_b3): + step_cell( + "3. Wait for Rollout", + f"kubectl -n {LAB_NAMESPACE} rollout status deployment/{DEPLOYMENT_NAME} --timeout=120s", + dep_b3, + dep_b3.value > 0, + explanation="Block until the Deployment reports that all desired Pods are ready.", + ) + return + + +@app.cell(hide_code=True) +def _(dep_b4): + reconcile_cmd = ( + f"echo '=== pods BEFORE delete ==='; " + f"kubectl -n {LAB_NAMESPACE} get pods -l app={DEPLOYMENT_NAME} -o wide; " + f"POD=$(kubectl -n {LAB_NAMESPACE} get pods -l app={DEPLOYMENT_NAME} " + f"-o jsonpath='{{.items[0].metadata.name}}'); " + f'echo; echo "=== deleting pod: $POD ==="; ' + f'kubectl -n {LAB_NAMESPACE} delete pod "$POD" --wait=false; ' + f"echo; echo '=== waiting 6s for the ReplicaSet controller to react ==='; sleep 6; " + f"echo; echo '=== pods AFTER delete ==='; " + f"kubectl -n {LAB_NAMESPACE} get pods -l app={DEPLOYMENT_NAME} -o wide" + ) + step_cell( + "4. Watch Self-Healing (delete a pod, observe replacement)", + reconcile_cmd, + dep_b4, + dep_b4.value > 0, + explanation=( + "**This is the core idea of Kubernetes.** You deleted a Pod — but the Deployment's " + "ReplicaSet still wants 2 Pods running. The controller detects the mismatch and creates a new Pod " + "with a fresh name. Compare the `NAME` column before and after: one of the names will be new." + ), + ) + return + + +@app.cell(hide_code=True) +def _(dep_b5): + step_cell( + "5. Update Deployment Image", + f"kubectl -n {LAB_NAMESPACE} set image deployment/{DEPLOYMENT_NAME} '*=nginx:1.27.5'", + dep_b5, + dep_b5.value > 0, + explanation=( + "Changes the image in `spec.template.spec.containers[*]`. The Deployment then creates a new ReplicaSet " + "for the new image and scales the old one down — a rolling update, again driven by the controller." + ), + ) + return + + +@app.cell(hide_code=True) +def _(dep_b6): + step_cell( + "6. Wait for Updated Rollout", + f"kubectl -n {LAB_NAMESPACE} rollout status deployment/{DEPLOYMENT_NAME} --timeout=120s", + dep_b6, + dep_b6.value > 0, + ) + return + + +@app.cell(hide_code=True) +def _(dep_b7): + step_cell( + "7. Read Deployment", + f"kubectl -n {LAB_NAMESPACE} get deployment {DEPLOYMENT_NAME}", + dep_b7, + dep_b7.value > 0, + ) + return + + +@app.cell(hide_code=True) +def _(dep_b8): + step_cell( + "8. Read Deployment Pods", + f"kubectl -n {LAB_NAMESPACE} get pods -l app={DEPLOYMENT_NAME}", + dep_b8, + dep_b8.value > 0, + ) + return + + +# ============================================================ +# Service Lab +# ============================================================ + + +@app.cell(hide_code=True) +def _(kubectl_ready: bool): + mo.stop(not kubectl_ready) + mo.md( + "---\n## Service Lab\n\n" + "A Service gives Pods a stable virtual IP and DNS name. Its `selector` is what wires it to Pods — " + "the endpoints object is regenerated whenever matching Pods come and go." + ) + return + + +@app.cell(hide_code=True) +def _(kubectl_ready: bool): + mo.stop(not kubectl_ready) + svc_editor = mo.ui.code_editor(value=SERVICE_MANIFEST, language="yaml", min_height=220, debounce=True) + svc_b1 = mo.ui.button(**RUN_BUTTON) + svc_b2 = mo.ui.button(**RUN_BUTTON) + svc_b3 = mo.ui.button(**RUN_BUTTON) + svc_b4 = mo.ui.button(**RUN_BUTTON) + svc_b5 = mo.ui.button(**RUN_BUTTON) + return svc_editor, svc_b1, svc_b2, svc_b3, svc_b4, svc_b5 + + +@app.cell(hide_code=True) +def _(svc_editor, svc_b1): + manifest_step_cell( + "1. Apply Service Manifest", + svc_editor, + LAB_NAMESPACE, + svc_b1, + svc_b1.value > 0, + explanation=( + f"`spec.selector: app: {DEPLOYMENT_NAME}` is the only link between this Service and the Deployment's Pods — " + "no foreign key, just label matching. Try changing the selector to a label no Pod has and re-run; " + "the Endpoints in step 3 will go empty." + ), + ) + return + + +@app.cell(hide_code=True) +def _(svc_b2): + step_cell( + "2. Read Service", + f"kubectl -n {LAB_NAMESPACE} get service {SERVICE_NAME} -o wide", + svc_b2, + svc_b2.value > 0, + ) + return + + +@app.cell(hide_code=True) +def _(svc_b3): + step_cell( + "3. Read Endpoints", + f"kubectl -n {LAB_NAMESPACE} get endpoints {SERVICE_NAME}", + svc_b3, + svc_b3.value > 0, + explanation=( + "Endpoints are the actual Pod IPs the Service routes to. They were populated automatically " + "from the selector — and they update as Pods come and go (try deleting a Pod and re-running this)." + ), + ) + return + + +@app.cell(hide_code=True) +def _(svc_b4): + step_cell( + "4. Patch Service Session Affinity", + f"""kubectl -n {LAB_NAMESPACE} patch service {SERVICE_NAME} --type merge -p '{{"spec":{{"sessionAffinity":"ClientIP"}}}}'""", + svc_b4, + svc_b4.value > 0, + ) + return + + +@app.cell(hide_code=True) +def _(svc_b5): + step_cell( + "5. Read Service YAML", + f"kubectl -n {LAB_NAMESPACE} get service {SERVICE_NAME} -o yaml", + svc_b5, + svc_b5.value > 0, + ) + return + + +# ============================================================ +# Job Lab +# ============================================================ + + +@app.cell(hide_code=True) +def _(kubectl_ready: bool): + mo.stop(not kubectl_ready) + mo.md( + "---\n## Job Lab\n\n" + "A Job runs Pods to **completion** — the opposite of a Deployment, which restarts them forever. " + "When a Job's Pod exits 0, the Job is done and the Pod stays around for log inspection." + ) + return + + +@app.cell(hide_code=True) +def _(kubectl_ready: bool): + mo.stop(not kubectl_ready) + job_editor = mo.ui.code_editor(value=JOB_MANIFEST, language="yaml", min_height=280, debounce=True) + job_b1 = mo.ui.button(**RUN_BUTTON) + job_b2 = mo.ui.button(**RUN_BUTTON) + job_b3 = mo.ui.button(**RUN_BUTTON) + job_b4 = mo.ui.button(**RUN_BUTTON) + return job_editor, job_b1, job_b2, job_b3, job_b4 + + +@app.cell(hide_code=True) +def _(job_b1): + step_cell( + "1. Delete Old Job (if any)", + f"kubectl -n {LAB_NAMESPACE} delete job {JOB_NAME} --ignore-not-found=true", + job_b1, + job_b1.value > 0, + explanation="Jobs are immutable once `spec.template` is set, so we delete any previous run before re-applying.", + ) + return + + +@app.cell(hide_code=True) +def _(job_editor, job_b2): + manifest_step_cell( + "2. Apply Job Manifest", + job_editor, + LAB_NAMESPACE, + job_b2, + job_b2.value > 0, + explanation=( + "`restartPolicy: Never` is required for Jobs (or `OnFailure`) — it tells the kubelet not to restart " + "a Pod that exits. `backoffLimit: 2` caps retries if the Pod fails." + ), + ) + return + + +@app.cell(hide_code=True) +def _(job_b3): + step_cell( + "3. Wait for Completion", + f"kubectl -n {LAB_NAMESPACE} wait --for=condition=complete --timeout=120s job/{JOB_NAME}", + job_b3, + job_b3.value > 0, + ) + return + + +@app.cell(hide_code=True) +def _(job_b4): + step_cell( + "4. Read Job Logs", + f"kubectl -n {LAB_NAMESPACE} logs job/{JOB_NAME}", + job_b4, + job_b4.value > 0, + explanation="The completed Pod is preserved so you can still read its logs after the Job is done.", + ) + return + + +# ============================================================ +# Cleanup +# ============================================================ + + +@app.cell(hide_code=True) +def _(kubectl_ready: bool): + mo.stop(not kubectl_ready) + mo.md("---\n## Cleanup\n\nDelete the lab namespace — every object inside it is cascaded away.") + return + + +@app.cell(hide_code=True) +def _(kubectl_ready: bool): + mo.stop(not kubectl_ready) + cln_b1 = mo.ui.button(**RUN_BUTTON) + cln_b2 = mo.ui.button(**RUN_BUTTON) + cln_b3 = mo.ui.button(**RUN_BUTTON) + return cln_b1, cln_b2, cln_b3 + + +@app.cell(hide_code=True) +def _(cln_b1): + step_cell( + "1. List Remaining Namespace Resources", + f"kubectl -n {LAB_NAMESPACE} get all || true", + cln_b1, + cln_b1.value > 0, + ) + return + + +@app.cell(hide_code=True) +def _(cln_b2): + step_cell( + "2. Delete Lab Namespace", + f"kubectl delete namespace {LAB_NAMESPACE} --ignore-not-found=true", + cln_b2, + cln_b2.value > 0, + explanation="Deleting the namespace cascades to every object inside it — Pods, ReplicaSets, Services, Endpoints, ConfigMaps, Jobs.", + ) + return + + +@app.cell(hide_code=True) +def _(cln_b3): + step_cell( + "3. Verify Namespace Removal", + f"kubectl get namespace {LAB_NAMESPACE} || true", + cln_b3, + cln_b3.value > 0, + ) + return + + +if __name__ == "__main__": + app.run() diff --git a/arena/notebooks/cks_kubernetes/configmap.yaml b/arena/notebooks/cks_kubernetes/configmap.yaml new file mode 100644 index 0000000..1cb820e --- /dev/null +++ b/arena/notebooks/cks_kubernetes/configmap.yaml @@ -0,0 +1,9 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: arena-k8s-config + labels: + app.kubernetes.io/part-of: arena-k8s-lab +data: + MODE: learning + OWNER: arena diff --git a/arena/notebooks/cks_kubernetes/deployment.yaml b/arena/notebooks/cks_kubernetes/deployment.yaml new file mode 100644 index 0000000..5d71df4 --- /dev/null +++ b/arena/notebooks/cks_kubernetes/deployment.yaml @@ -0,0 +1,22 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: arena-k8s-web + labels: + app.kubernetes.io/part-of: arena-k8s-lab +spec: + replicas: 1 + selector: + matchLabels: + app: arena-k8s-web + template: + metadata: + labels: + app: arena-k8s-web + app.kubernetes.io/part-of: arena-k8s-lab + spec: + containers: + - name: web + image: nginx:1.27 + ports: + - containerPort: 80 diff --git a/arena/notebooks/cks_kubernetes/job.yaml b/arena/notebooks/cks_kubernetes/job.yaml new file mode 100644 index 0000000..4e7bb27 --- /dev/null +++ b/arena/notebooks/cks_kubernetes/job.yaml @@ -0,0 +1,18 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: arena-k8s-job + labels: + app.kubernetes.io/part-of: arena-k8s-lab +spec: + backoffLimit: 2 + template: + metadata: + labels: + app.kubernetes.io/part-of: arena-k8s-lab + spec: + restartPolicy: Never + containers: + - name: worker + image: busybox:1.36 + command: ["/bin/sh", "-c", "echo hello-from-cks; sleep 3; echo completed"] diff --git a/arena/notebooks/cks_kubernetes/service.yaml b/arena/notebooks/cks_kubernetes/service.yaml new file mode 100644 index 0000000..7c27f7d --- /dev/null +++ b/arena/notebooks/cks_kubernetes/service.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: Service +metadata: + name: arena-k8s-web + labels: + app.kubernetes.io/part-of: arena-k8s-lab +spec: + type: ClusterIP + selector: + app: arena-k8s-web + ports: + - port: 80 + targetPort: 80 From 33e6e783d804a0132f1a8155374295fce67f376d Mon Sep 17 00:00:00 2001 From: David Flanagan Date: Wed, 13 May 2026 16:31:12 +0100 Subject: [PATCH 2/4] chore: refactor to shared shell function --- arena/notebooks/caios_warp.py | 2 +- arena/notebooks/cks_kubernetes.py | 90 +++-------- .../notebooks/lib/remote_execution_helpers.py | 145 ++++++++++++++---- 3 files changed, 141 insertions(+), 96 deletions(-) diff --git a/arena/notebooks/caios_warp.py b/arena/notebooks/caios_warp.py index 33cb384..bfa53cd 100644 --- a/arena/notebooks/caios_warp.py +++ b/arena/notebooks/caios_warp.py @@ -38,8 +38,8 @@ def _(): _elements = [ banner(), about( - "Warp", """This notebook provides a walkthrough for benchmarking CoreWeave AI Object Storage (CAIOS) and LOTA inside a CoreWeave cluster using the Minio Warp tool.
+ "Warp", _If you are running this notebook in edit mode, make sure you start by running all cells in the bottom right._ """, ), diff --git a/arena/notebooks/cks_kubernetes.py b/arena/notebooks/cks_kubernetes.py index 69d40a0..142a854 100644 --- a/arena/notebooks/cks_kubernetes.py +++ b/arena/notebooks/cks_kubernetes.py @@ -14,10 +14,10 @@ with app.setup: import os - import subprocess import marimo as mo from lib.k8s import K8s, KubernetesConfigError + from lib.remote_execution_helpers import shell from lib.ui import about, banner, security_disclaimer, table_of_contents MANIFEST_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "cks_kubernetes") @@ -40,57 +40,6 @@ def truncate(text: str, max_chars: int = 4000) -> str: return text return f"{text[:max_chars]}\n...output truncated..." - def run_command(command: str, timeout_seconds: int = 180) -> dict: - """Run a shell command and return structured result.""" - try: - result = subprocess.run(command, shell=True, capture_output=True, text=True, timeout=timeout_seconds) - return { - "command": command, - "returncode": result.returncode, - "stdout": result.stdout.strip(), - "stderr": result.stderr.strip(), - "ok": result.returncode == 0, - } - except subprocess.TimeoutExpired as err: - stdout = str(err.stdout or "").strip() if err.stdout else "" - stderr = str(err.stderr or "").strip() if err.stderr else "" - return { - "command": command, - "returncode": 124, - "stdout": stdout, - "stderr": f"Command timed out after {timeout_seconds}s. {stderr}".strip(), - "ok": False, - } - - def apply_manifest(manifest: str, namespace: str | None = None, timeout_seconds: int = 180) -> dict: - """Apply a YAML manifest by piping it to `kubectl apply -f -` via stdin.""" - ns_flag = f"-n {namespace} " if namespace else "" - command = f"kubectl {ns_flag}apply -f -" - try: - result = subprocess.run( - command, - shell=True, - input=manifest, - capture_output=True, - text=True, - timeout=timeout_seconds, - ) - return { - "command": f"{command} # manifest piped via stdin", - "returncode": result.returncode, - "stdout": result.stdout.strip(), - "stderr": result.stderr.strip(), - "ok": result.returncode == 0, - } - except subprocess.TimeoutExpired: - return { - "command": command, - "returncode": 124, - "stdout": "", - "stderr": f"Apply timed out after {timeout_seconds}s.", - "ok": False, - } - def write_incluster_kubeconfig(kubeconfig_path: str) -> tuple[bool, str]: """Write a kubeconfig that references the mounted service-account token via `tokenFile:`. @@ -136,29 +85,29 @@ def ensure_kubectl_access() -> tuple[bool, str]: 1. An existing `kubectl config current-context` (local or already-configured) — leave env alone. 2. Generated in-cluster kubeconfig at `KUBECTL_KUBECONFIG_PATH` — only then set `KUBECONFIG`. """ - kubectl_check = run_command("kubectl version --client") - if not bool(kubectl_check["ok"]): + kubectl_check = shell("kubectl version --client", quiet=True) + if not kubectl_check.ok: return ( False, "`kubectl` is not available. Install it in this environment and rerun all cells.", ) - local_context = run_command("kubectl config current-context") - if bool(local_context["ok"]): + local_context = shell("kubectl config current-context", quiet=True) + if local_context.ok: # Probe Python-side connectivity via the shared K8s helper. Failure is non-fatal — the # lab's shell-kubectl commands will still work. try: K8s().validate_config() except KubernetesConfigError: pass - return True, f"Using local kubectl context `{local_context['stdout']}`." + return True, f"Using local kubectl context `{local_context.strip()}`." incluster_ok, incluster_message = write_incluster_kubeconfig(KUBECTL_KUBECONFIG_PATH) if incluster_ok: os.environ["KUBECONFIG"] = KUBECTL_KUBECONFIG_PATH return True, f"Using in-cluster service-account context via `{KUBECTL_KUBECONFIG_PATH}`." - details = str(local_context["stderr"]) if local_context["stderr"] else incluster_message + details = local_context.stderr.strip() or incluster_message return ( False, "kubectl is installed but no cluster context is usable. " @@ -170,12 +119,14 @@ def step_cell(title: str, cmd: str, btn, clicked: bool, explanation: str = "") - """Render an imperative step: optional explanation, title + button, command, and result if clicked.""" _output = None if clicked: - _r = run_command(cmd) - _status = "SUCCESS" if _r["ok"] else "FAILED" - _stdout = truncate(str(_r["stdout"])) if _r["stdout"] else "(no output)" + _r = shell(cmd, quiet=True, timeout=180) + _status = "SUCCESS" if _r.ok else "FAILED" + _stdout_text = _r.strip() + _stdout = truncate(_stdout_text) if _stdout_text else "(no output)" _stderr_block = "" - if _r["stderr"] and not _r["ok"]: - _stderr_block = f"\n**stderr:**\n```\n{truncate(str(_r['stderr']))}\n```" + _stderr_text = _r.stderr.strip() + if _stderr_text and not _r.ok: + _stderr_block = f"\n**stderr:**\n```\n{truncate(_stderr_text)}\n```" _output = mo.md(f"`{_status}`\n```\n{_stdout}\n```{_stderr_block}") parts = [ mo.hstack([mo.md(f"**{title}**"), btn], justify="space-between", align="center"), @@ -202,12 +153,15 @@ def manifest_step_cell( """ _output = None if clicked: - _r = apply_manifest(editor.value, namespace) - _status = "SUCCESS" if _r["ok"] else "FAILED" - _stdout = truncate(str(_r["stdout"])) if _r["stdout"] else "(no output)" + ns_flag = f"-n {namespace} " if namespace else "" + _r = shell(f"kubectl {ns_flag}apply -f -", quiet=True, timeout=180, input=editor.value) + _status = "SUCCESS" if _r.ok else "FAILED" + _stdout_text = _r.strip() + _stdout = truncate(_stdout_text) if _stdout_text else "(no output)" _stderr_block = "" - if _r["stderr"] and not _r["ok"]: - _stderr_block = f"\n**stderr:**\n```\n{truncate(str(_r['stderr']))}\n```" + _stderr_text = _r.stderr.strip() + if _stderr_text and not _r.ok: + _stderr_block = f"\n**stderr:**\n```\n{truncate(_stderr_text)}\n```" _output = mo.md(f"`{_status}`\n```\n{_stdout}\n```{_stderr_block}") ns_label = f" *(namespace: `{namespace}`)*" if namespace else "" parts = [ diff --git a/arena/notebooks/lib/remote_execution_helpers.py b/arena/notebooks/lib/remote_execution_helpers.py index 3147dc0..67bfd65 100644 --- a/arena/notebooks/lib/remote_execution_helpers.py +++ b/arena/notebooks/lib/remote_execution_helpers.py @@ -205,9 +205,49 @@ def ssh(command: str, verbose: bool = True, stream: bool = False) -> str: return result.stdout -def shell(command: str | list, quiet: bool = False, check: bool = False, stream: bool = False) -> str: # noqa: C901 +class ShellResult(str): + """String-like result from shell()/bash() that also carries structured fields. + + Subclasses str so existing callers that treat the return value as a string + keep working. New callers can read .returncode, .stderr, .ok, .command for + structured access. + """ + + returncode: int + stderr: str + command: str + ok: bool + + def __new__( + cls, + stdout: str, + *, + returncode: int = 0, + stderr: str = "", + command: str = "", + ok: bool = True, + ) -> "ShellResult": + instance = super().__new__(cls, stdout) + instance.returncode = returncode + instance.stderr = stderr + instance.command = command + instance.ok = ok + return instance + + +def shell( # noqa: C901 + command: str | list, + quiet: bool = False, + check: bool = False, + stream: bool = False, + timeout: int | None = None, + input: str | None = None, +) -> ShellResult: """Run a local shell command with clean output. + Returns a ShellResult — a str subclass equal to the captured stdout, with + .returncode, .stderr, .ok, .command for callers that need structured access. + Args: command: Command string or list of arguments quiet: If True, suppress status messages (only show command output) @@ -215,15 +255,18 @@ def shell(command: str | list, quiet: bool = False, check: bool = False, stream: stream: If True, stream stdout/stderr in real-time (for long-running commands). Note: In Marimo notebooks, streaming may still buffer until cell completion due to Marimo's output capture mechanism. - - Returns: - The stdout output as a string (empty if streaming) + timeout: Optional timeout in seconds (non-stream mode only). On timeout + the result has returncode=124 and ok=False. + input: Optional string piped to the command's stdin (non-stream mode only). Example: - shell("aws s3 ls") - shell("aws configure set s3.addressing_style virtual") + result = shell("aws s3 ls") + if not result.ok: + print(result.stderr) + shell(["kubectl", "get", "pods"]) - shell("s5cmd cp ...", stream=True) # Stream output in real-time + shell("kubectl apply -f -", input=manifest_yaml) + shell("s5cmd cp ...", stream=True) """ if isinstance(command, str): cmd_display = command @@ -261,44 +304,92 @@ def shell(command: str | list, quiet: bool = False, check: bool = False, stream: output_lines.append(line) proc.wait() + stdout = "".join(output_lines) if proc.returncode == 0: if not quiet: sys.stdout.write("✓ Done (exit: 0)\n") sys.stdout.flush() else: - sys.stdout.write(f"✗ Failed (exit: {proc.returncode})\n") - sys.stdout.flush() + if not quiet: + sys.stdout.write(f"✗ Failed (exit: {proc.returncode})\n") + sys.stdout.flush() if check: raise subprocess.CalledProcessError(proc.returncode, command) - return "".join(output_lines) + return ShellResult( + stdout, + returncode=proc.returncode, + stderr="", + command=cmd_display, + ok=proc.returncode == 0, + ) - else: - # Capture output (original behavior) + try: result = subprocess.run( command, shell=shell_mode, capture_output=True, text=True, + timeout=timeout, + input=input, ) - - if result.returncode == 0: - if not quiet: - print(f"✓ {cmd_display}") - if result.stdout.strip(): - print(result.stdout.rstrip()) - return result.stdout - else: + except subprocess.TimeoutExpired as err: + out = err.stdout or "" + err_text = err.stderr or "" + if isinstance(out, bytes): + out = out.decode("utf-8", errors="replace") + if isinstance(err_text, bytes): + err_text = err_text.decode("utf-8", errors="replace") + stderr_msg = f"Command timed out after {timeout}s. {err_text.strip()}".strip() + if not quiet: print(f"✗ {cmd_display}") - if result.stderr.strip(): - print(f" Error: {result.stderr.strip()}") - if check: - raise subprocess.CalledProcessError(result.returncode, command) - return "" + print(f" Error: {stderr_msg}") + if check: + raise + return ShellResult( + out, + returncode=124, + stderr=stderr_msg, + command=cmd_display, + ok=False, + ) + if result.returncode == 0: + if not quiet: + print(f"✓ {cmd_display}") + if result.stdout.strip(): + print(result.stdout.rstrip()) + return ShellResult( + result.stdout, + returncode=0, + stderr=result.stderr, + command=cmd_display, + ok=True, + ) + + if not quiet: + print(f"✗ {cmd_display}") + if result.stderr.strip(): + print(f" Error: {result.stderr.strip()}") + if check: + raise subprocess.CalledProcessError(result.returncode, command) + return ShellResult( + result.stdout, + returncode=result.returncode, + stderr=result.stderr, + command=cmd_display, + ok=False, + ) -def bash(command: str, quiet: bool = False, stream: bool = False) -> str: + +def bash( + command: str, + quiet: bool = False, + stream: bool = False, + timeout: int | None = None, + input: str | None = None, +) -> ShellResult: """Alias for shell() - run a bash command. Example: @@ -306,7 +397,7 @@ def bash(command: str, quiet: bool = False, stream: bool = False) -> str: bash("aws s3 ls") bash("s5cmd cp ...", stream=True) """ - return shell(command, quiet=quiet, stream=stream) + return shell(command, quiet=quiet, stream=stream, timeout=timeout, input=input) if __name__ == "__main__": From a959205ed19cf4176e41a8f5f1aace8b2be8c520 Mon Sep 17 00:00:00 2001 From: David Flanagan Date: Wed, 13 May 2026 16:39:40 +0100 Subject: [PATCH 3/4] chore: refactor kubernetes access helpers --- arena/notebooks/caios_warp.py | 2 +- arena/notebooks/cks_kubernetes.py | 84 +------------------ arena/notebooks/lib/k8s.py | 84 +++++++++++++++++++ .../notebooks/lib/remote_execution_helpers.py | 5 ++ 4 files changed, 93 insertions(+), 82 deletions(-) diff --git a/arena/notebooks/caios_warp.py b/arena/notebooks/caios_warp.py index bfa53cd..33cb384 100644 --- a/arena/notebooks/caios_warp.py +++ b/arena/notebooks/caios_warp.py @@ -38,8 +38,8 @@ def _(): _elements = [ banner(), about( - """This notebook provides a walkthrough for benchmarking CoreWeave AI Object Storage (CAIOS) and LOTA inside a CoreWeave cluster using the Minio Warp tool.
"Warp", + """This notebook provides a walkthrough for benchmarking CoreWeave AI Object Storage (CAIOS) and LOTA inside a CoreWeave cluster using the Minio Warp tool.
_If you are running this notebook in edit mode, make sure you start by running all cells in the bottom right._ """, ), diff --git a/arena/notebooks/cks_kubernetes.py b/arena/notebooks/cks_kubernetes.py index 142a854..1e2116a 100644 --- a/arena/notebooks/cks_kubernetes.py +++ b/arena/notebooks/cks_kubernetes.py @@ -16,7 +16,7 @@ import os import marimo as mo - from lib.k8s import K8s, KubernetesConfigError + from lib.k8s import ensure_kubectl_access from lib.remote_execution_helpers import shell from lib.ui import about, banner, security_disclaimer, table_of_contents @@ -31,90 +31,12 @@ SERVICE_NAME = "arena-k8s-web" JOB_NAME = "arena-k8s-job" - SERVICE_ACCOUNT_TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token" - SERVICE_ACCOUNT_CA_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" - KUBECTL_KUBECONFIG_PATH = "/tmp/arena-incluster-kubectl.yaml" - def truncate(text: str, max_chars: int = 4000) -> str: + """Limit command output to a readable size for notebook display.""" if len(text) <= max_chars: return text return f"{text[:max_chars]}\n...output truncated..." - def write_incluster_kubeconfig(kubeconfig_path: str) -> tuple[bool, str]: - """Write a kubeconfig that references the mounted service-account token via `tokenFile:`. - - Using `tokenFile:` (rather than inlining `token:`) keeps the bearer token in the - kernel-protected `/var/run/secrets/...` mount and picks up token rotation automatically. - """ - host = os.getenv("KUBERNETES_SERVICE_HOST", "") - port = os.getenv("KUBERNETES_SERVICE_PORT_HTTPS", "") or os.getenv("KUBERNETES_SERVICE_PORT", "443") - - if not host: - return False, "KUBERNETES_SERVICE_HOST is not set." - if not os.path.exists(SERVICE_ACCOUNT_TOKEN_PATH): - return False, f"Service account token not found at {SERVICE_ACCOUNT_TOKEN_PATH}." - if not os.path.exists(SERVICE_ACCOUNT_CA_PATH): - return False, f"Service account CA cert not found at {SERVICE_ACCOUNT_CA_PATH}." - - kubeconfig = f"""apiVersion: v1 -kind: Config -clusters: -- name: in-cluster - cluster: - server: https://{host}:{port} - certificate-authority: {SERVICE_ACCOUNT_CA_PATH} -users: -- name: service-account - user: - tokenFile: {SERVICE_ACCOUNT_TOKEN_PATH} -contexts: -- name: in-cluster - context: - cluster: in-cluster - user: service-account -current-context: in-cluster -""" - with open(kubeconfig_path, "w", encoding="utf-8") as file: - file.write(kubeconfig) - return True, kubeconfig_path - - def ensure_kubectl_access() -> tuple[bool, str]: - """Make sure kubectl has a usable context, without disturbing the user's existing KUBECONFIG. - - Order of preference: - 1. An existing `kubectl config current-context` (local or already-configured) — leave env alone. - 2. Generated in-cluster kubeconfig at `KUBECTL_KUBECONFIG_PATH` — only then set `KUBECONFIG`. - """ - kubectl_check = shell("kubectl version --client", quiet=True) - if not kubectl_check.ok: - return ( - False, - "`kubectl` is not available. Install it in this environment and rerun all cells.", - ) - - local_context = shell("kubectl config current-context", quiet=True) - if local_context.ok: - # Probe Python-side connectivity via the shared K8s helper. Failure is non-fatal — the - # lab's shell-kubectl commands will still work. - try: - K8s().validate_config() - except KubernetesConfigError: - pass - return True, f"Using local kubectl context `{local_context.strip()}`." - - incluster_ok, incluster_message = write_incluster_kubeconfig(KUBECTL_KUBECONFIG_PATH) - if incluster_ok: - os.environ["KUBECONFIG"] = KUBECTL_KUBECONFIG_PATH - return True, f"Using in-cluster service-account context via `{KUBECTL_KUBECONFIG_PATH}`." - - details = local_context.stderr.strip() or incluster_message - return ( - False, - "kubectl is installed but no cluster context is usable. " - "If running locally, set a context with `kubectl config use-context `. " - f"Details: `{details}`", - ) - def step_cell(title: str, cmd: str, btn, clicked: bool, explanation: str = "") -> mo.Html: """Render an imperative step: optional explanation, title + button, command, and result if clicked.""" _output = None @@ -184,7 +106,7 @@ def load_manifest(filename: str) -> str: with open(os.path.join(MANIFEST_DIR, filename), "r", encoding="utf-8") as file: return file.read() - RUN_BUTTON = dict(value=0, on_click=lambda v: v + 1, label="Run") + RUN_BUTTON = {"value": 0, "on_click": lambda v: v + 1, "label": "Run"} # ---------------------------------------------------------------- # Manifests (the source of truth for each lab object). diff --git a/arena/notebooks/lib/k8s.py b/arena/notebooks/lib/k8s.py index a75e683..55e46b7 100644 --- a/arena/notebooks/lib/k8s.py +++ b/arena/notebooks/lib/k8s.py @@ -7,9 +7,15 @@ from kubernetes.client.rest import ApiException from ruamel.yaml import YAML +from .remote_execution_helpers import shell + # We don't have a good way to get this off node labels currently AVAILABILITY_ZONE = os.getenv("AVAILABILITY_ZONE", "A") +SERVICE_ACCOUNT_TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token" +SERVICE_ACCOUNT_CA_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" +KUBECTL_INCLUSTER_KUBECONFIG_PATH = "/tmp/arena-incluster-kubectl.yaml" + class KubernetesError(Exception): """Base exception for k8s helper errors.""" @@ -543,6 +549,84 @@ def delete_by_label(self, namespace: str, label_selector: str) -> dict[str, list return results +def _write_incluster_kubeconfig(kubeconfig_path: str) -> None: + """Write a kubeconfig that references the mounted service-account token via `tokenFile:`. + + Using `tokenFile:` (rather than inlining `token:`) keeps the bearer token in the + kernel-protected `/var/run/secrets/...` mount and picks up token rotation automatically. + + Assumes `config.load_incluster_config()` has already validated the mount exists. + """ + host = os.environ["KUBERNETES_SERVICE_HOST"] + port = os.getenv("KUBERNETES_SERVICE_PORT_HTTPS") or os.getenv("KUBERNETES_SERVICE_PORT", "443") + kubeconfig = f"""apiVersion: v1 +kind: Config +clusters: +- name: in-cluster + cluster: + server: https://{host}:{port} + certificate-authority: {SERVICE_ACCOUNT_CA_PATH} +users: +- name: service-account + user: + tokenFile: {SERVICE_ACCOUNT_TOKEN_PATH} +contexts: +- name: in-cluster + context: + cluster: in-cluster + user: service-account +current-context: in-cluster +""" + with open(kubeconfig_path, "w", encoding="utf-8") as file: + file.write(kubeconfig) + + +def ensure_kubectl_access() -> tuple[bool, str]: + """Make sure `kubectl` has a usable context, without disturbing the user's existing KUBECONFIG. + + Order of preference: + 1. An existing `kubectl config current-context` (local or already-configured) — leave env alone. + 2. An in-cluster service-account identity — write a bridge kubeconfig at + `KUBECTL_INCLUSTER_KUBECONFIG_PATH` and set `KUBECONFIG` so kubectl picks it up. + """ + if not shell("kubectl version --client", quiet=True).ok: + return ( + False, + "`kubectl` is not available. Install it in this environment and rerun all cells.", + ) + + local_context = shell("kubectl config current-context", quiet=True) + cluster_probe = None + if local_context.ok: + cluster_probe = shell("kubectl cluster-info", quiet=True, timeout=15) + if cluster_probe.ok: + # Non-fatal Python-side connectivity probe — kubectl will still work even if this fails. + try: + K8s().validate_config() + except KubernetesConfigError: + pass + return True, f"Using local kubectl context `{local_context.strip()}`." + + # No local kubectl context — try in-cluster. `load_incluster_config()` is the canonical + # detection: it succeeds only when the SA token + CA + service env vars are all in place. + try: + config.load_incluster_config() + except config.ConfigException as e: + details = ( + cluster_probe.stderr.strip() if cluster_probe is not None else local_context.stderr.strip() + ) or str(e) + return ( + False, + "kubectl is installed but no cluster context is usable. " + "If running locally, set a context with `kubectl config use-context `. " + f"Details: `{details}`", + ) + + _write_incluster_kubeconfig(KUBECTL_INCLUSTER_KUBECONFIG_PATH) + os.environ["KUBECONFIG"] = KUBECTL_INCLUSTER_KUBECONFIG_PATH + return True, f"Using in-cluster service-account context via `{KUBECTL_INCLUSTER_KUBECONFIG_PATH}`." + + def kubeconfig_input() -> tuple[mo.Html | None, mo.ui.form | None]: """Create a form for a user to input their kubeconfig path. diff --git a/arena/notebooks/lib/remote_execution_helpers.py b/arena/notebooks/lib/remote_execution_helpers.py index 67bfd65..44a4c50 100644 --- a/arena/notebooks/lib/remote_execution_helpers.py +++ b/arena/notebooks/lib/remote_execution_helpers.py @@ -227,6 +227,7 @@ def __new__( command: str = "", ok: bool = True, ) -> "ShellResult": + """Create a string result with subprocess metadata attached.""" instance = super().__new__(cls, stdout) instance.returncode = returncode instance.stderr = stderr @@ -234,6 +235,10 @@ def __new__( instance.ok = ok return instance + def __bool__(self) -> bool: + """Treat failed commands as false while preserving string truthiness for stdout.""" + return self.ok and len(self) > 0 + def shell( # noqa: C901 command: str | list, From 1865265ae51c4d6c5b64dbdb21c87d49e5631faf Mon Sep 17 00:00:00 2001 From: David Flanagan Date: Tue, 2 Jun 2026 14:42:53 +0100 Subject: [PATCH 4/4] chore(k8s): selection of fixes --- arena/notebooks/cks_kubernetes.py | 84 ++++++++----------- arena/notebooks/lib/k8s.py | 1 + .../notebooks/lib/remote_execution_helpers.py | 4 +- 3 files changed, 40 insertions(+), 49 deletions(-) diff --git a/arena/notebooks/cks_kubernetes.py b/arena/notebooks/cks_kubernetes.py index 1e2116a..6e8e23d 100644 --- a/arena/notebooks/cks_kubernetes.py +++ b/arena/notebooks/cks_kubernetes.py @@ -62,21 +62,22 @@ def step_cell(title: str, cmd: str, btn, clicked: bool, explanation: str = "") - def manifest_step_cell( title: str, - editor, + form, namespace: str | None, - btn, - clicked: bool, explanation: str = "", ) -> mo.Html: - """Render a declarative step: title + button, optional explanation, editable YAML, result on click. + """Render a declarative step: title, optional explanation, YAML form, result on submit. - `editor` is a `mo.ui.code_editor` whose current `.value` is applied to the cluster - when the Run button has been clicked. Editing the YAML and re-clicking re-applies. + `form` is a `mo.ui.code_editor(...).form()` — `form.value` is None until the user + clicks the form's submit button, then becomes the current editor contents. Each + subsequent submit re-applies the (possibly edited) manifest; edits made without + clicking submit do not trigger a re-apply. """ _output = None - if clicked: + manifest = form.value + if manifest is not None: ns_flag = f"-n {namespace} " if namespace else "" - _r = shell(f"kubectl {ns_flag}apply -f -", quiet=True, timeout=180, input=editor.value) + _r = shell(f"kubectl {ns_flag}apply -f -", quiet=True, timeout=180, input=manifest) _status = "SUCCESS" if _r.ok else "FAILED" _stdout_text = _r.strip() _stdout = truncate(_stdout_text) if _stdout_text else "(no output)" @@ -86,17 +87,10 @@ def manifest_step_cell( _stderr_block = f"\n**stderr:**\n```\n{truncate(_stderr_text)}\n```" _output = mo.md(f"`{_status}`\n```\n{_stdout}\n```{_stderr_block}") ns_label = f" *(namespace: `{namespace}`)*" if namespace else "" - parts = [ - mo.hstack( - [mo.md(f"**{title}**{ns_label}"), btn], - justify="space-between", - align="center", - ), - ] + parts = [mo.md(f"**{title}**{ns_label}")] if explanation: parts.append(mo.md(explanation)) - parts.append(editor) - parts.append(mo.md("_Edit the manifest above if you like, then click **Run** to_ `kubectl apply -f -` _it._")) + parts.append(form) if _output is not None: parts.append(_output) return mo.vstack(parts) @@ -267,26 +261,25 @@ def _(kubectl_ready: bool): @app.cell(hide_code=True) def _(kubectl_ready: bool): mo.stop(not kubectl_ready) - cm_editor = mo.ui.code_editor(value=CONFIGMAP_MANIFEST, language="yaml", min_height=200, debounce=True) - cm_b1 = mo.ui.button(**RUN_BUTTON) + cm_form = mo.ui.code_editor(value=CONFIGMAP_MANIFEST, language="yaml", min_height=200).form( + submit_button_label="Apply", bordered=False + ) cm_b2 = mo.ui.button(**RUN_BUTTON) cm_b3 = mo.ui.button(**RUN_BUTTON) cm_b4 = mo.ui.button(**RUN_BUTTON) cm_b5 = mo.ui.button(**RUN_BUTTON) - return cm_editor, cm_b1, cm_b2, cm_b3, cm_b4, cm_b5 + return cm_form, cm_b2, cm_b3, cm_b4, cm_b5 @app.cell(hide_code=True) -def _(cm_editor, cm_b1): +def _(cm_form): manifest_step_cell( "1. Apply ConfigMap Manifest", - cm_editor, + cm_form, LAB_NAMESPACE, - cm_b1, - cm_b1.value > 0, explanation=( "Two top-level fields matter: `metadata` (identity — name, namespace, labels) " - "and `data` (the key/value payload). Edit the YAML and re-click **Run** to see how " + "and `data` (the key/value payload). Edit the YAML and click **Apply** to see how " "`kubectl apply` reconciles your change." ), ) @@ -365,8 +358,9 @@ def _(kubectl_ready: bool): @app.cell(hide_code=True) def _(kubectl_ready: bool): mo.stop(not kubectl_ready) - dep_editor = mo.ui.code_editor(value=DEPLOYMENT_MANIFEST, language="yaml", min_height=320, debounce=True) - dep_b1 = mo.ui.button(**RUN_BUTTON) + dep_form = mo.ui.code_editor(value=DEPLOYMENT_MANIFEST, language="yaml", min_height=320).form( + submit_button_label="Apply", bordered=False + ) dep_b2 = mo.ui.button(**RUN_BUTTON) dep_b3 = mo.ui.button(**RUN_BUTTON) dep_b4 = mo.ui.button(**RUN_BUTTON) @@ -374,17 +368,15 @@ def _(kubectl_ready: bool): dep_b6 = mo.ui.button(**RUN_BUTTON) dep_b7 = mo.ui.button(**RUN_BUTTON) dep_b8 = mo.ui.button(**RUN_BUTTON) - return dep_editor, dep_b1, dep_b2, dep_b3, dep_b4, dep_b5, dep_b6, dep_b7, dep_b8 + return dep_form, dep_b2, dep_b3, dep_b4, dep_b5, dep_b6, dep_b7, dep_b8 @app.cell(hide_code=True) -def _(dep_editor, dep_b1): +def _(dep_form): manifest_step_cell( "1. Apply Deployment Manifest", - dep_editor, + dep_form, LAB_NAMESPACE, - dep_b1, - dep_b1.value > 0, explanation=( "Three nested layers: the Deployment's `selector.matchLabels` finds Pods whose labels match. " "`template.metadata.labels` *stamps* those labels onto every Pod it creates. " @@ -515,26 +507,25 @@ def _(kubectl_ready: bool): @app.cell(hide_code=True) def _(kubectl_ready: bool): mo.stop(not kubectl_ready) - svc_editor = mo.ui.code_editor(value=SERVICE_MANIFEST, language="yaml", min_height=220, debounce=True) - svc_b1 = mo.ui.button(**RUN_BUTTON) + svc_form = mo.ui.code_editor(value=SERVICE_MANIFEST, language="yaml", min_height=220).form( + submit_button_label="Apply", bordered=False + ) svc_b2 = mo.ui.button(**RUN_BUTTON) svc_b3 = mo.ui.button(**RUN_BUTTON) svc_b4 = mo.ui.button(**RUN_BUTTON) svc_b5 = mo.ui.button(**RUN_BUTTON) - return svc_editor, svc_b1, svc_b2, svc_b3, svc_b4, svc_b5 + return svc_form, svc_b2, svc_b3, svc_b4, svc_b5 @app.cell(hide_code=True) -def _(svc_editor, svc_b1): +def _(svc_form): manifest_step_cell( "1. Apply Service Manifest", - svc_editor, + svc_form, LAB_NAMESPACE, - svc_b1, - svc_b1.value > 0, explanation=( f"`spec.selector: app: {DEPLOYMENT_NAME}` is the only link between this Service and the Deployment's Pods — " - "no foreign key, just label matching. Try changing the selector to a label no Pod has and re-run; " + "no foreign key, just label matching. Try changing the selector to a label no Pod has and click **Apply**; " "the Endpoints in step 3 will go empty." ), ) @@ -608,12 +599,13 @@ def _(kubectl_ready: bool): @app.cell(hide_code=True) def _(kubectl_ready: bool): mo.stop(not kubectl_ready) - job_editor = mo.ui.code_editor(value=JOB_MANIFEST, language="yaml", min_height=280, debounce=True) + job_form = mo.ui.code_editor(value=JOB_MANIFEST, language="yaml", min_height=280).form( + submit_button_label="Apply", bordered=False + ) job_b1 = mo.ui.button(**RUN_BUTTON) - job_b2 = mo.ui.button(**RUN_BUTTON) job_b3 = mo.ui.button(**RUN_BUTTON) job_b4 = mo.ui.button(**RUN_BUTTON) - return job_editor, job_b1, job_b2, job_b3, job_b4 + return job_form, job_b1, job_b3, job_b4 @app.cell(hide_code=True) @@ -629,13 +621,11 @@ def _(job_b1): @app.cell(hide_code=True) -def _(job_editor, job_b2): +def _(job_form): manifest_step_cell( "2. Apply Job Manifest", - job_editor, + job_form, LAB_NAMESPACE, - job_b2, - job_b2.value > 0, explanation=( "`restartPolicy: Never` is required for Jobs (or `OnFailure`) — it tells the kubelet not to restart " "a Pod that exits. `backoffLimit: 2` caps retries if the Pod fails." diff --git a/arena/notebooks/lib/k8s.py b/arena/notebooks/lib/k8s.py index 55e46b7..267bb72 100644 --- a/arena/notebooks/lib/k8s.py +++ b/arena/notebooks/lib/k8s.py @@ -579,6 +579,7 @@ def _write_incluster_kubeconfig(kubeconfig_path: str) -> None: """ with open(kubeconfig_path, "w", encoding="utf-8") as file: file.write(kubeconfig) + os.chmod(kubeconfig_path, 0o600) def ensure_kubectl_access() -> tuple[bool, str]: diff --git a/arena/notebooks/lib/remote_execution_helpers.py b/arena/notebooks/lib/remote_execution_helpers.py index 44a4c50..253fc8f 100644 --- a/arena/notebooks/lib/remote_execution_helpers.py +++ b/arena/notebooks/lib/remote_execution_helpers.py @@ -236,8 +236,8 @@ def __new__( return instance def __bool__(self) -> bool: - """Treat failed commands as false while preserving string truthiness for stdout.""" - return self.ok and len(self) > 0 + """True iff the command exited successfully (returncode 0).""" + return self.ok def shell( # noqa: C901