66from collections .abc import Iterable
77from pathlib import Path
88
# Date interpolated into the generated report title.
REPORT_DATE = "2026-03-28"

# Benchmark result files (`*Client_*.json`) are searched for recursively
# under RESULTS_DIR; rendered reports and figures go under OUTPUT_DIR.
RESULTS_DIR = Path("results")
OUTPUT_DIR = Path("reports")
FIGURES_DIR = OUTPUT_DIR / "figures"

# Models covered by the report, listed in ascending `size_b` order.
# Keys of each entry:
#   id     - full model identifier, used as the key of TARGET_MODEL_MAP
#   dir    - name of the directory holding this model's result files; lets a
#            result be attributed to a model from its parent directory name
#   name   - short display name used in the report
#   size_b - model size (presumably billions of parameters - confirm)
TARGET_MODELS = [
    {
        "id": "google/gemma-2-2b-it",
        "dir": "gemma-2-2b-it",
        "name": "Gemma 2B",
        "size_b": 2,
    },
    {
        "id": "meta-llama/Llama-3.2-3B-Instruct",
        "dir": "llama-3.2-3b-instruct",
        "name": "Llama 3.2 3B",
        "size_b": 3,
    },
    {
        "id": "microsoft/Phi-3-mini-4k-instruct",
        "dir": "phi-3-mini-4k-instruct",
        "name": "Phi-3 mini",
        "size_b": 4,
    },
    {
        "id": "Qwen/Qwen2.5-7B-Instruct",
        "dir": "qwen2.5-7b-instruct",
        "name": "Qwen 7B",
        "size_b": 7,
    },
    {
        "id": "mistralai/Mistral-7B-Instruct-v0.3",
        "dir": "mistral-7b-instruct-v0.3",
        "name": "Mistral 7B",
        "size_b": 7,
    },
    {
        "id": "meta-llama/Llama-3.1-8B-Instruct",
        "dir": "llama-3.1-8b-instruct",
        "name": "Llama 3.1 8B",
        "size_b": 8,
    },
    {
        "id": "google/gemma-2-9b-it",
        "dir": "gemma-2-9b-it",
        "name": "Gemma 9B",
        "size_b": 9,
    },
]

# Lookup tables derived from TARGET_MODELS so the list stays the single
# source of truth: model id -> full entry, and result directory -> model id.
TARGET_MODEL_MAP = {entry["id"]: entry for entry in TARGET_MODELS}
DIR_NAME_TO_MODEL_ID = {entry["dir"]: entry["id"] for entry in TARGET_MODELS}

# Display orderings; MODEL_ORDER also feeds the "Models included" line of
# the reports.
MODEL_ORDER = [entry["name"] for entry in TARGET_MODELS]
SCENARIO_ORDER = ["single_request_latency", "throughput_ramp"]
ENGINE_ORDER = ["vLLM", "SGLang"]
3033 "SGLang could not be included on this setup because the FlashInfer/CUDA graph path failed on unsupported `head_dim=96`." ,
3134 ],
3235 "google/gemma-2-9b-it" : [
33- "vLLM required tuned launch settings on the single A10G: `context=4096` and `gpu_memory_utilization=0.92 `." ,
36+ "vLLM required tuned launch settings on the single A10G: `max_model_len=2048`, `gpu_memory_utilization=0.95`, `--disable-frontend-multiprocessing`, and `--enforce-eager `." ,
3437 ],
3538}
3639
@@ -141,6 +144,7 @@ def _extract_model_id(
141144 data .get ("model" ),
142145 data .get ("model_id" ),
143146 model_map_from_logs .get (path .name ),
147+ DIR_NAME_TO_MODEL_ID .get (path .parent .name ),
144148 ]
145149 for candidate in candidates :
146150 if candidate in TARGET_MODEL_MAP :
@@ -224,7 +228,7 @@ def load_latest_rows() -> list[dict]:
224228 model_map_from_logs = _load_result_model_map_from_logs ()
225229 snapshot_hints = _load_snapshot_hints ()
226230
227- for path in sorted (RESULTS_DIR .glob ("*Client_*.json" )):
231+ for path in sorted (RESULTS_DIR .rglob ("*Client_*.json" )):
228232 try :
229233 data = json .loads (path .read_text ())
230234 except Exception :
@@ -517,6 +521,7 @@ def build_markdown(rows: list[dict]) -> str:
517521 best_rps = best_by (rows , "requests_per_sec" , scenario = "throughput_ramp" )
518522 takeaways = generate_takeaways (rows )
519523 notes = sorted ({note for model_id , notes in MODEL_NOTES .items () for note in notes })
524+ models_included = ", " .join (MODEL_ORDER )
520525
521526 return f"""# Final Multi-Model Benchmark Report ({ REPORT_DATE } )
522527
@@ -536,7 +541,7 @@ def build_markdown(rows: list[dict]) -> str:
536541- Instance: **AWS g5.2xlarge**
537542- GPU: **NVIDIA A10G, 24 GB VRAM**
538543- Execution policy: **one engine at a time**
539- - Models included: Gemma 2B, Phi-3 mini, Qwen 7B, Mistral 7B, Gemma 9B
544+ - Models included: { models_included }
540545
541546## Important notes
542547
@@ -599,6 +604,7 @@ def build_html(rows: list[dict]) -> str:
599604 best_tps = best_by (rows , "tokens_per_sec" , scenario = "throughput_ramp" )
600605 best_rps = best_by (rows , "requests_per_sec" , scenario = "throughput_ramp" )
601606 notes = sorted ({note for model_id , notes in MODEL_NOTES .items () for note in notes })
607+ models_included = ", " .join (MODEL_ORDER )
602608
603609 def render_table (table_rows : list [dict ]) -> str :
604610 body = "" .join (
@@ -649,6 +655,7 @@ def render_table(table_rows: list[dict]) -> str:
649655 <li>AWS <strong>g5.2xlarge</strong></li>
650656 <li>NVIDIA <strong>A10G 24 GB</strong></li>
651657 <li>Sequential engine execution on a single GPU</li>
658+ <li>Models included: <strong>{ html .escape (models_included )} </strong></li>
652659 </ul>
653660 </div>
654661
0 commit comments