From 2e818f3375fb9bef09147748f8384c465b561c22 Mon Sep 17 00:00:00 2001 From: Radu Malliu <26001154+radu-malliu@users.noreply.github.com> Date: Fri, 3 Apr 2026 12:19:07 -0700 Subject: [PATCH 1/3] add instance type to nccl test mapping; add instance type selector --- .../notebooks/sunk_cluster_walkthrough.py | 103 ++++++++++++------ 1 file changed, 68 insertions(+), 35 deletions(-) rename arena-staging/arena_sunk_cluster_walkthrough_benchmarks.py => arena/notebooks/sunk_cluster_walkthrough.py (61%) diff --git a/arena-staging/arena_sunk_cluster_walkthrough_benchmarks.py b/arena/notebooks/sunk_cluster_walkthrough.py similarity index 61% rename from arena-staging/arena_sunk_cluster_walkthrough_benchmarks.py rename to arena/notebooks/sunk_cluster_walkthrough.py index 188c1da..ae5fd49 100644 --- a/arena-staging/arena_sunk_cluster_walkthrough_benchmarks.py +++ b/arena/notebooks/sunk_cluster_walkthrough.py @@ -1,3 +1,16 @@ +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "boto3==1.42.45", +# "k8s==0.28.0", +# "kubernetes==35.0.0", +# "marimo>=0.20.2", +# "mypy-boto3-s3>=1.42.37", +# "ruamel-yaml>=0.19.1", +# "typing-extensions>=4.15.0" +# ] +# /// + import marimo __generated_with = "0.19.7" @@ -17,31 +30,22 @@ def _(): # setup cell runs before anything else, recommend putting values in here import marimo as mo - """ - Import SSH helpers from arena library. - - Configuration via environment variables: - CW_ARENA_SSH_KEY_PATH: Path to SSH private key (default: /root/.ssh/id_rsa) - CW_ARENA_SSH_HOST: SSH host (e.g., user+tenant@sunk.tenant.coreweave.app) - """ from lib.remote_execution_helpers import shell - - return mo, shell - - -@app.cell(hide_code=True) -def _(mo): - mo.md(r""" - # CoreWeave AI Labs: SUNK Cluster - - /// admonition | About This Notebook - type: info - - This notebook provides a walkthrough for inspecting and benchmarking your SUNK (Slurm on Kubernetes) cluster. - /// - """) - return - + from lib.k8s import K8s + + TEST_NAMES = { + "gb300-4x": "nccl-test-distributed-gb200-nvl72-enroot.slurm", + "gb300-4x-e": "nccl-test-distributed-gb300-roce-nvl72-enroot.slurm", + "gb200-4x": "nccl-test-distributed-gb200-nvl72-enroot.slurm", + "b300-8x": "nccl-test-distributed-h100-64.slurm", + "b200-8x": "nccl-test-distributed-h100-64.slurm", + "gd-8xh200ib-i128":"nccl-test-distributed-h100-64.slurm", + "gd-8xh100ib-i128":"nccl-test-distributed-h100-64.slurm", + "gd-8xa100-i128": "nccl-test-distributed-a100-64.slurm", + # L40S, L40, GH200, RTX Pro 6000 — no matching nccl test script +} + + return mo, shell, K8s, TEST_NAMES @app.cell(hide_code=True) def _(mo): @@ -84,21 +88,21 @@ def _(mo): @app.cell def _(shell): # Slurm cluster info: - print(shell("sinfo")) + shell("sinfo") return @app.cell def _(shell): # Slurm user info: - print(shell("sacctmgr show users")) + shell("sacctmgr show users") return @app.cell def _(shell): # Slurm accounting info: - print(shell("sacctmgr show associations format=User,Account,Partition,QOS")) + shell("sacctmgr show associations format=User,Account,Partition,QOS") return @@ -113,7 +117,7 @@ def _(mo): @app.cell def _(shell): # Node info - print(shell("scontrol show nodes")) + shell("scontrol show nodes") return @@ -127,7 +131,7 @@ def _(mo): @app.cell def _(shell): - print(shell("scontrol show partition")) + shell("scontrol show partition") return @@ -172,6 +176,24 @@ def _(mo, num_nodes): return +@app.cell(hide_code=True) +def _(mo, K8s): + k8s = K8s() + nodes = k8s.nodes + gpu_nodes = nodes.get("gpu") or {} + gpu_keys = list(gpu_nodes.keys()) + if not gpu_keys: + gpu_keys = ["(no GPU nodes detected)"] + default_node_type = gpu_keys[0] + node_type_dropdown = mo.ui.dropdown( + options=gpu_keys, value=default_node_type, label="Node type" + ) + _gpu_type_ui = mo.md( + f"**GPU node types (from cluster API)**\n\n{node_type_dropdown}" + ) + _gpu_type_ui + return (node_type_dropdown,) + @app.cell(hide_code=True) def _(mo): submit_btn = mo.ui.run_button(label="Submit NCCL Test Job") @@ -180,11 +202,22 @@ def _(mo): @app.cell -def _(num_nodes, shell, submit_btn): +def _( + mo, + num_nodes, + shell, + submit_btn, + node_type_dropdown: mo.ui.dropdown, + TEST_NAMES, +): if submit_btn.value: - cmd = f"cd /mnt/data/arena/benchmarks/nccl/nccl-tests/slurm && sbatch -N {num_nodes.value} nccl-test-distributed-h100-64.slurm" - print(f"Running: sbatch -N {num_nodes.value} ...") - print(shell(cmd)) + if node_type_dropdown.value not in TEST_NAMES: + print(f"No NCCL slurm script mapped for node type {node_type_dropdown.value!r}.") + else: + script = TEST_NAMES[node_type_dropdown.value] + cmd = f"cd /mnt/data/arena/benchmarks/nccl/nccl-tests/slurm && sbatch -N {num_nodes.value} {script}" + print(f"Running: sbatch -N {num_nodes.value} ...") + shell(cmd) else: print("Click 'Submit NCCL Test Job' to run the benchmark") return @@ -192,13 +225,13 @@ def _(num_nodes, shell, submit_btn): @app.cell def _(shell): - print(shell("squeue")) + shell("squeue") return @app.cell def _(shell): - print(shell("ls /mnt/data/arena/benchmarks/nccl/nccl-tests/slurm/*.out")) + shell("ls /mnt/data/arena/benchmarks/nccl/nccl-tests/slurm/*.out") return From af7fdf46947024526e9760813eeab3e4ed381f11 Mon Sep 17 00:00:00 2001 From: Radu Malliu <26001154+radu-malliu@users.noreply.github.com> Date: Fri, 3 Apr 2026 14:04:25 -0700 Subject: [PATCH 2/3] fix: remove dependencies --- arena/notebooks/sunk_cluster_walkthrough.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/arena/notebooks/sunk_cluster_walkthrough.py b/arena/notebooks/sunk_cluster_walkthrough.py index ae5fd49..6878fc9 100644 --- a/arena/notebooks/sunk_cluster_walkthrough.py +++ b/arena/notebooks/sunk_cluster_walkthrough.py @@ -1,11 +1,9 @@ # /// script # requires-python = ">=3.12" # dependencies = [ -# "boto3==1.42.45", # "k8s==0.28.0", # "kubernetes==35.0.0", # "marimo>=0.20.2", -# "mypy-boto3-s3>=1.42.37", # "ruamel-yaml>=0.19.1", # "typing-extensions>=4.15.0" # ] From b393e92967aa7d862d17603d7cec94ff324c4ff2 Mon Sep 17 00:00:00 2001 From: Radu Malliu <26001154+radu-malliu@users.noreply.github.com> Date: Mon, 6 Apr 2026 04:35:37 -0700 Subject: [PATCH 3/3] fix: use ui components --- arena/notebooks/sunk_cluster_walkthrough.py | 48 +++++++++++---------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/arena/notebooks/sunk_cluster_walkthrough.py b/arena/notebooks/sunk_cluster_walkthrough.py index 6878fc9..1d31f45 100644 --- a/arena/notebooks/sunk_cluster_walkthrough.py +++ b/arena/notebooks/sunk_cluster_walkthrough.py @@ -14,23 +14,15 @@ __generated_with = "0.19.7" app = marimo.App(width="medium", app_title="CoreWeave ARENA") - -@app.cell(hide_code=True) -def _(mo): - mo.md(r""" - ![CoreWeave ARENA Banner](public/banner.jpg) - """) - return - - -@app.cell(hide_code=True) -def _(): - # setup cell runs before anything else, recommend putting values in here +with app.setup: import marimo as mo - from lib.remote_execution_helpers import shell from lib.k8s import K8s + from lib.ui import about, banner, security_disclaimer, table_of_contents + +@app.cell(hide_code=True) +def _(): TEST_NAMES = { "gb300-4x": "nccl-test-distributed-gb200-nvl72-enroot.slurm", "gb300-4x-e": "nccl-test-distributed-gb300-roce-nvl72-enroot.slurm", @@ -42,19 +34,29 @@ def _(): "gd-8xa100-i128": "nccl-test-distributed-a100-64.slurm", # L40S, L40, GH200, RTX Pro 6000 — no matching nccl test script } + return TEST_NAMES - return mo, shell, K8s, TEST_NAMES @app.cell(hide_code=True) -def _(mo): - mo.md(r""" - /// details | Table of Contents - - - **Cluster Inspection** - View nodes, partitions, and user info - - **NCCL Benchmarks** - Run distributed GPU communication tests - - **Job Observability** - Grafana dashboards and monitoring - /// - """) +def _(): + _elements = [ + banner(), + about( + "SUNK Cluster Walkthrough", + """This notebook provides a walkthrough for inspecting and benchmarking your SUNK (Slurm on Kubernetes) cluster.
+ _If you are running this notebook in edit mode, make sure you start by running all cells in the bottom right._ + """, + ), + table_of_contents( + [ + {"title": "Cluster Inspection", "description": "View nodes, partitions, and user info"}, + {"title": "NCCL Benchmarks", "description": "Run distributed GPU communication tests"}, + {"title": "Job Observability", "description": "Grafana dashboards and monitoring"}, + ] + ), + security_disclaimer(), + ] + mo.vstack(_elements) return