Skip to content

Commit 3d03587

Browse files
committed
data: add completed gemma 9b vllm benchmark artifacts
1 parent 6b97516 commit 3d03587

28 files changed

Lines changed: 52768 additions & 30 deletions

NEXT_STEPS.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ Once the pipeline is proven with Qwen 1.5B, repeat with the models from the prio
155155
- `mistralai/Mistral-7B-Instruct-v0.3`
156156
- `google/gemma-2-9b-it`
157157

158-
Use `matrix` with `--iterations 2` and `--cooldown-seconds 300` for stable results.
158+
Use `matrix` with `--iterations 2` and `--cooldown-seconds 120` for the next run.
159159

160160
Store each model in its own result directory to avoid mixed-model comparisons in the dashboard and HTML report. For example:
161161

PENDING_RUNS.md

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,25 @@
22

33
## What's missing
44

5-
Only **Gemma 9B on vLLM** (all 5 scenarios, 2 iterations).
5+
**Nothing pending right now.**
66

7-
Everything else is complete (7 models, both engines where supported).
7+
Gemma 9B on vLLM was the last missing benchmark leg, and it has now completed.
8+
Everything else is complete: 7 models, on both engines where supported (the only gap is the known Phi-3 mini SGLang exclusion).
9+
10+
## Automation scripts
11+
12+
Use these repo-root scripts to re-run and verify the Gemma 9B vLLM benchmark leg if it ever needs to be reproduced:
13+
14+
```bash
15+
./pending_run_gemma9b_vllm.sh
16+
./pending_run_gemma9b_vllm_verify.sh
17+
```
18+
19+
Known-good compose override for this machine:
20+
21+
```bash
22+
docker compose -f docker-compose.yml -f docker-compose.gemma9b-vllm-a10g.yml --profile vllm up -d vllm
23+
```
824

925
## Debugging the 404 on /v1/completions
1026

@@ -82,7 +98,7 @@ python run_experiment.py matrix \
8298
--model google/gemma-2-9b-it \
8399
--output-dir results/gemma-2-9b-it \
84100
--iterations 2 \
85-
--cooldown-seconds 300
101+
--cooldown-seconds 120
86102
```
87103

88104
### Step 4: Verify output

analysis/generate_final_benchmark_report.py

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,19 +6,22 @@
66
from collections.abc import Iterable
77
from pathlib import Path
88

9-
REPORT_DATE = "2026-03-22"
9+
REPORT_DATE = "2026-03-28"
1010
RESULTS_DIR = Path("results")
1111
OUTPUT_DIR = Path("reports")
1212
FIGURES_DIR = OUTPUT_DIR / "figures"
1313

1414
TARGET_MODELS = [
15-
{"id": "google/gemma-2-2b-it", "name": "Gemma 2B", "size_b": 2},
16-
{"id": "microsoft/Phi-3-mini-4k-instruct", "name": "Phi-3 mini", "size_b": 3},
17-
{"id": "Qwen/Qwen2.5-7B-Instruct", "name": "Qwen 7B", "size_b": 7},
18-
{"id": "mistralai/Mistral-7B-Instruct-v0.3", "name": "Mistral 7B", "size_b": 7},
19-
{"id": "google/gemma-2-9b-it", "name": "Gemma 9B", "size_b": 9},
15+
{"id": "google/gemma-2-2b-it", "dir": "gemma-2-2b-it", "name": "Gemma 2B", "size_b": 2},
16+
{"id": "meta-llama/Llama-3.2-3B-Instruct", "dir": "llama-3.2-3b-instruct", "name": "Llama 3.2 3B", "size_b": 3},
17+
{"id": "microsoft/Phi-3-mini-4k-instruct", "dir": "phi-3-mini-4k-instruct", "name": "Phi-3 mini", "size_b": 4},
18+
{"id": "Qwen/Qwen2.5-7B-Instruct", "dir": "qwen2.5-7b-instruct", "name": "Qwen 7B", "size_b": 7},
19+
{"id": "mistralai/Mistral-7B-Instruct-v0.3", "dir": "mistral-7b-instruct-v0.3", "name": "Mistral 7B", "size_b": 7},
20+
{"id": "meta-llama/Llama-3.1-8B-Instruct", "dir": "llama-3.1-8b-instruct", "name": "Llama 3.1 8B", "size_b": 8},
21+
{"id": "google/gemma-2-9b-it", "dir": "gemma-2-9b-it", "name": "Gemma 9B", "size_b": 9},
2022
]
2123
TARGET_MODEL_MAP = {entry["id"]: entry for entry in TARGET_MODELS}
24+
DIR_NAME_TO_MODEL_ID = {entry["dir"]: entry["id"] for entry in TARGET_MODELS}
2225
MODEL_ORDER = [entry["name"] for entry in TARGET_MODELS]
2326
SCENARIO_ORDER = ["single_request_latency", "throughput_ramp"]
2427
ENGINE_ORDER = ["vLLM", "SGLang"]
@@ -30,7 +33,7 @@
3033
"SGLang could not be included on this setup because the FlashInfer/CUDA graph path failed on unsupported `head_dim=96`.",
3134
],
3235
"google/gemma-2-9b-it": [
33-
"vLLM required tuned launch settings on the single A10G: `context=4096` and `gpu_memory_utilization=0.92`.",
36+
"vLLM required tuned launch settings on the single A10G: `max_model_len=2048`, `gpu_memory_utilization=0.95`, `--disable-frontend-multiprocessing`, and `--enforce-eager`.",
3437
],
3538
}
3639

@@ -141,6 +144,7 @@ def _extract_model_id(
141144
data.get("model"),
142145
data.get("model_id"),
143146
model_map_from_logs.get(path.name),
147+
DIR_NAME_TO_MODEL_ID.get(path.parent.name),
144148
]
145149
for candidate in candidates:
146150
if candidate in TARGET_MODEL_MAP:
@@ -224,7 +228,7 @@ def load_latest_rows() -> list[dict]:
224228
model_map_from_logs = _load_result_model_map_from_logs()
225229
snapshot_hints = _load_snapshot_hints()
226230

227-
for path in sorted(RESULTS_DIR.glob("*Client_*.json")):
231+
for path in sorted(RESULTS_DIR.rglob("*Client_*.json")):
228232
try:
229233
data = json.loads(path.read_text())
230234
except Exception:
@@ -517,6 +521,7 @@ def build_markdown(rows: list[dict]) -> str:
517521
best_rps = best_by(rows, "requests_per_sec", scenario="throughput_ramp")
518522
takeaways = generate_takeaways(rows)
519523
notes = sorted({note for model_id, notes in MODEL_NOTES.items() for note in notes})
524+
models_included = ", ".join(MODEL_ORDER)
520525

521526
return f"""# Final Multi-Model Benchmark Report ({REPORT_DATE})
522527
@@ -536,7 +541,7 @@ def build_markdown(rows: list[dict]) -> str:
536541
- Instance: **AWS g5.2xlarge**
537542
- GPU: **NVIDIA A10G, 24 GB VRAM**
538543
- Execution policy: **one engine at a time**
539-
- Models included: Gemma 2B, Phi-3 mini, Qwen 7B, Mistral 7B, Gemma 9B
544+
- Models included: {models_included}
540545
541546
## Important notes
542547
@@ -599,6 +604,7 @@ def build_html(rows: list[dict]) -> str:
599604
best_tps = best_by(rows, "tokens_per_sec", scenario="throughput_ramp")
600605
best_rps = best_by(rows, "requests_per_sec", scenario="throughput_ramp")
601606
notes = sorted({note for model_id, notes in MODEL_NOTES.items() for note in notes})
607+
models_included = ", ".join(MODEL_ORDER)
602608

603609
def render_table(table_rows: list[dict]) -> str:
604610
body = "".join(
@@ -649,6 +655,7 @@ def render_table(table_rows: list[dict]) -> str:
649655
<li>AWS <strong>g5.2xlarge</strong></li>
650656
<li>NVIDIA <strong>A10G 24 GB</strong></li>
651657
<li>Sequential engine execution on a single GPU</li>
658+
<li>Models included: <strong>{html.escape(models_included)}</strong></li>
652659
</ul>
653660
</div>
654661
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# Known-good override for running google/gemma-2-9b-it on a single NVIDIA A10G 24 GB
2+
# with vLLM on this repo's benchmark stack.
3+
#
4+
# Usage:
5+
# docker compose -f docker-compose.yml -f docker-compose.gemma9b-vllm-a10g.yml --profile vllm up -d vllm
6+
#
7+
# Notes:
8+
# - Lower max context to 2048 to leave KV-cache headroom.
9+
# - Push GPU memory utilization to 0.95.
10+
# - Disable frontend multiprocessing and enforce eager mode to reduce startup/runtime memory pressure.
11+
12+
services:
13+
vllm:
14+
command:
15+
- "--model"
16+
- "google/gemma-2-9b-it"
17+
- "--host"
18+
- "0.0.0.0"
19+
- "--port"
20+
- "8000"
21+
- "--enable-prefix-caching"
22+
- "--max-model-len"
23+
- "2048"
24+
- "--gpu-memory-utilization"
25+
- "0.95"
26+
- "--served-model-name"
27+
- "google/gemma-2-9b-it"
28+
- "--disable-frontend-multiprocessing"
29+
- "--enforce-eager"

0 commit comments

Comments
 (0)