varad-more
diff --git a/‎NEXT_STEPS.md‎
Lines changed: 1 addition & 1 deletion b/‎NEXT_STEPS.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎PENDING_RUNS.md‎
Lines changed: 19 additions & 3 deletions b/‎PENDING_RUNS.md‎
Lines changed: 19 additions & 3 deletions
diff --git a/‎analysis/generate_final_benchmark_report.py‎
Lines changed: 16 additions & 9 deletions b/‎analysis/generate_final_benchmark_report.py‎
Lines changed: 16 additions & 9 deletions
diff --git a/‎docker-compose.gemma9b-vllm-a10g.yml‎
Lines changed: 29 additions & 0 deletions b/‎docker-compose.gemma9b-vllm-a10g.yml‎
Lines changed: 29 additions & 0 deletions
@@ -155,7 +155,7 @@ Once the pipeline is proven with Qwen 1.5B, repeat with the models from the prio
 - `mistralai/Mistral-7B-Instruct-v0.3`
 - `google/gemma-2-9b-it`
 
-Use `matrix` with `--iterations 2` and `--cooldown-seconds 300` for stable results.
+Use `matrix` with `--iterations 2` and `--cooldown-seconds 120` for the next run.
 
 Store each model in its own result directory to avoid mixed-model comparisons in the dashboard and HTML report. For example:
 
 
@@ -2,9 +2,25 @@
 
 ## What's missing
 
-Only **Gemma 9B on vLLM** (all 5 scenarios, 2 iterations).
+**Nothing pending right now.**
 
-Everything else is complete (7 models, both engines where supported).
+Gemma 9B on vLLM was the last missing benchmark leg, and it has now completed.
+Everything else is complete (7 models, both engines where supported, except the known Phi-3 mini SGLang exclusion).
+
+## Automation scripts
+
+Use these repo-root scripts to re-run and verify the Gemma 9B vLLM benchmark if it ever needs to be reproduced:
+
+```bash
+./pending_run_gemma9b_vllm.sh
+./pending_run_gemma9b_vllm_verify.sh
+```
+
+Known-good compose override for a single NVIDIA A10G (24 GB):
+
+```bash
+docker compose -f docker-compose.yml -f docker-compose.gemma9b-vllm-a10g.yml --profile vllm up -d vllm
+```
 
 ## Debugging the 404 on /v1/completions
 
@@ -82,7 +98,7 @@ python run_experiment.py matrix \
   --model google/gemma-2-9b-it \
   --output-dir results/gemma-2-9b-it \
   --iterations 2 \
-  --cooldown-seconds 300
+  --cooldown-seconds 120
 ```
 
 ### Step 4: Verify output
 
@@ -6,19 +6,22 @@
 from collections.abc import Iterable
 from pathlib import Path
 
-REPORT_DATE = "2026-03-22"
+REPORT_DATE = "2026-03-28"
 RESULTS_DIR = Path("results")
 OUTPUT_DIR = Path("reports")
 FIGURES_DIR = OUTPUT_DIR / "figures"
 
 TARGET_MODELS = [
-    {"id": "google/gemma-2-2b-it", "name": "Gemma 2B", "size_b": 2},
-    {"id": "microsoft/Phi-3-mini-4k-instruct", "name": "Phi-3 mini", "size_b": 3},
-    {"id": "Qwen/Qwen2.5-7B-Instruct", "name": "Qwen 7B", "size_b": 7},
-    {"id": "mistralai/Mistral-7B-Instruct-v0.3", "name": "Mistral 7B", "size_b": 7},
-    {"id": "google/gemma-2-9b-it", "name": "Gemma 9B", "size_b": 9},
+    {"id": "google/gemma-2-2b-it", "dir": "gemma-2-2b-it", "name": "Gemma 2B", "size_b": 2},
+    {"id": "meta-llama/Llama-3.2-3B-Instruct", "dir": "llama-3.2-3b-instruct", "name": "Llama 3.2 3B", "size_b": 3},
+    {"id": "microsoft/Phi-3-mini-4k-instruct", "dir": "phi-3-mini-4k-instruct", "name": "Phi-3 mini", "size_b": 4},
+    {"id": "Qwen/Qwen2.5-7B-Instruct", "dir": "qwen2.5-7b-instruct", "name": "Qwen 7B", "size_b": 7},
+    {"id": "mistralai/Mistral-7B-Instruct-v0.3", "dir": "mistral-7b-instruct-v0.3", "name": "Mistral 7B", "size_b": 7},
+    {"id": "meta-llama/Llama-3.1-8B-Instruct", "dir": "llama-3.1-8b-instruct", "name": "Llama 3.1 8B", "size_b": 8},
+    {"id": "google/gemma-2-9b-it", "dir": "gemma-2-9b-it", "name": "Gemma 9B", "size_b": 9},
 ]
 TARGET_MODEL_MAP = {entry["id"]: entry for entry in TARGET_MODELS}
+DIR_NAME_TO_MODEL_ID = {entry["dir"]: entry["id"] for entry in TARGET_MODELS}
 MODEL_ORDER = [entry["name"] for entry in TARGET_MODELS]
 SCENARIO_ORDER = ["single_request_latency", "throughput_ramp"]
 ENGINE_ORDER = ["vLLM", "SGLang"]
@@ -30,7 +33,7 @@
         "SGLang could not be included on this setup because the FlashInfer/CUDA graph path failed on unsupported `head_dim=96`.",
     ],
     "google/gemma-2-9b-it": [
-        "vLLM required tuned launch settings on the single A10G: `context=4096` and `gpu_memory_utilization=0.92`.",
+        "vLLM required tuned launch settings on the single A10G: `max_model_len=2048`, `gpu_memory_utilization=0.95`, `--disable-frontend-multiprocessing`, and `--enforce-eager`.",
     ],
 }
 
@@ -141,6 +144,7 @@ def _extract_model_id(
         data.get("model"),
         data.get("model_id"),
         model_map_from_logs.get(path.name),
+        DIR_NAME_TO_MODEL_ID.get(path.parent.name),
     ]
     for candidate in candidates:
         if candidate in TARGET_MODEL_MAP:
@@ -224,7 +228,7 @@ def load_latest_rows() -> list[dict]:
     model_map_from_logs = _load_result_model_map_from_logs()
     snapshot_hints = _load_snapshot_hints()
 
-    for path in sorted(RESULTS_DIR.glob("*Client_*.json")):
+    for path in sorted(RESULTS_DIR.rglob("*Client_*.json")):
         try:
             data = json.loads(path.read_text())
         except Exception:
@@ -517,6 +521,7 @@ def build_markdown(rows: list[dict]) -> str:
     best_rps = best_by(rows, "requests_per_sec", scenario="throughput_ramp")
     takeaways = generate_takeaways(rows)
     notes = sorted({note for model_id, notes in MODEL_NOTES.items() for note in notes})
+    models_included = ", ".join(MODEL_ORDER)
 
     return f"""# Final Multi-Model Benchmark Report ({REPORT_DATE})
 
@@ -536,7 +541,7 @@ def build_markdown(rows: list[dict]) -> str:
 - Instance: **AWS g5.2xlarge**
 - GPU: **NVIDIA A10G, 24 GB VRAM**
 - Execution policy: **one engine at a time**
-- Models included: Gemma 2B, Phi-3 mini, Qwen 7B, Mistral 7B, Gemma 9B
+- Models included: {models_included}
 
 ## Important notes
 
@@ -599,6 +604,7 @@ def build_html(rows: list[dict]) -> str:
     best_tps = best_by(rows, "tokens_per_sec", scenario="throughput_ramp")
     best_rps = best_by(rows, "requests_per_sec", scenario="throughput_ramp")
     notes = sorted({note for model_id, notes in MODEL_NOTES.items() for note in notes})
+    models_included = ", ".join(MODEL_ORDER)
 
     def render_table(table_rows: list[dict]) -> str:
         body = "".join(
@@ -649,6 +655,7 @@ def render_table(table_rows: list[dict]) -> str:
       <li>AWS <strong>g5.2xlarge</strong></li>
       <li>NVIDIA <strong>A10G 24 GB</strong></li>
       <li>Sequential engine execution on a single GPU</li>
+      <li>Models included: <strong>{html.escape(models_included)}</strong></li>
     </ul>
   </div>
 
 
@@ -0,0 +1,29 @@
+# Known-good override for running google/gemma-2-9b-it on a single NVIDIA A10G 24 GB
+# with vLLM on this repo's benchmark stack.
+#
+# Usage:
+#   docker compose -f docker-compose.yml -f docker-compose.gemma9b-vllm-a10g.yml --profile vllm up -d vllm
+#
+# Notes:
+# - Lower max context to 2048 to leave KV-cache headroom.
+# - Push GPU memory utilization to 0.95.
+# - Disable frontend multiprocessing and enforce eager mode to reduce startup/runtime memory pressure.
+
+services:
+  vllm:
+    command:
+      - "--model"
+      - "google/gemma-2-9b-it"
+      - "--host"
+      - "0.0.0.0"
+      - "--port"
+      - "8000"
+      - "--enable-prefix-caching"
+      - "--max-model-len"
+      - "2048"
+      - "--gpu-memory-utilization"
+      - "0.95"
+      - "--served-model-name"
+      - "google/gemma-2-9b-it"
+      - "--disable-frontend-multiprocessing"
+      - "--enforce-eager"