SemiAnalysisAI · haic0 · Apr 13, 2026 · Apr 21, 2026 · Apr 21, 2026 · Apr 23, 2026
@@ -1312,6 +1312,266 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
           - "DECODE_NODES=1"
           - "DECODE_MTP_SIZE=2"
 
+minimaxm2.5-fp8-mi300x-vllm-disagg:
+  image: ghcr.io/simondanielsson/vllm/vllm-openai-rocm:fix-moriio-hangs-high-concurrency
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: mi300x-disagg
+  precision: fp8
+  framework: vllm-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - spec-decoding: "none"
+        conc-list: [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"  # NOTE(review): no READ_MODE=1 here, unlike the 8k/1k decode entries - confirm intended
+
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # Top of curve (conc 256-2048): 2P1D = 2 prefill workers + 1 decode worker
+      - spec-decoding: "none"
+        conc-list: [256, 512, 1024, 2048]
+        prefill:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=2"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+
+      # Bottom of curve (conc 8-128): 1P2D = 1 prefill worker + 2 decode workers
+      - spec-decoding: "none"
+        conc-list: [8, 16, 32, 64, 128]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+
+minimaxm2.5-fp8-mi325x-vllm-disagg:
+  image: ghcr.io/simondanielsson/vllm/vllm-openai-rocm:fix-moriio-hangs-high-concurrency
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: mi325x-disagg
+  precision: fp8
+  framework: vllm-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - spec-decoding: "none"
+        conc-list: [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"  # NOTE(review): no READ_MODE=1 here, unlike the 8k/1k decode entries - confirm intended
+
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # Top of curve (conc 256-2048): 2P1D = 2 prefill workers + 1 decode worker
+      - spec-decoding: "none"
+        conc-list: [256, 512, 1024, 2048]
+        prefill:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=2"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+
+      # Bottom of curve (conc 8-128): 1P2D = 1 prefill worker + 2 decode workers
+      - spec-decoding: "none"
+        conc-list: [8, 16, 32, 64, 128]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+
+kimik2.5-fp4-mi355x-vllm-disagg:
+  image: vllm/vllm-openai-rocm:nightly-bf610c2f56764e1b30bc6065f4ceace3d6e59036
+  model: amd/Kimi-K2.5-MXFP4
+  model-prefix: kimik2.5
+  runner: mi355x-disagg
+  precision: fp4
+  framework: vllm-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total
+      - spec-decoding: "none"
+        conc-list: [8, 16, 32, 64, 128, 256, 512]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - spec-decoding: "none"
+        conc-list: [8, 16, 32, 64, 128, 256, 512]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+
+minimaxm2.5-fp8-mi355x-vllm-disagg:
+  image: vllm/vllm-openai-rocm:nightly-bf610c2f56764e1b30bc6065f4ceace3d6e59036
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: mi355x-disagg
+  precision: fp8
+  framework: vllm-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total.
+      # Prefill must also use EP=8: MiniMax M2.5 expert intermediate_size=1536, and
+      # TP8 would shard it to 192, which is not divisible by the FP8 block_n=128.
+      - spec-decoding: "none"
+        conc-list: [8, 16, 32, 64, 128, 256, 512]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - spec-decoding: "none"
+        conc-list: [8, 16, 32, 64, 128, 256, 512]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+
 dsr1-fp4-mi355x-sglang-disagg:
   image: lmsysorg/sglang-rocm:v0.5.10.post1-rocm720-mi35x-20260501
   model: amd/DeepSeek-R1-0528-MXFP4-v2

diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml
@@ -271,7 +271,9 @@ jobs:
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
           name: multinode_server_logs_${{ env.RESULT_FILENAME }}
-          path: multinode_server_logs.tar.gz
+          path: |
+            multinode_server_logs.tar.gz
+            benchmark_artifacts/
           if-no-files-found: ignore
 
       - name: Upload agentic aggregated result

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
@@ -204,10 +204,11 @@ run_benchmark_serving() {
     local result_filename=""
     local result_dir=""
     local workspace_dir=""
+    local tokenizer=""  # optional --tokenizer value; empty means "not supplied"
     local use_chat_template=false
     local dsv4=false
     local trust_remote_code=false
     local server_pid=""
 
     while [[ $# -gt 0 ]]; do
         case $1 in
@@ -268,6 +269,10 @@ run_benchmark_serving() {
                 use_chat_template=true
                 shift
                 ;;
+            --tokenizer)
+                tokenizer="$2"
+                shift 2
+                ;;
             --trust-remote-code)
                 trust_remote_code=true
                 shift
@@ -383,6 +388,11 @@ run_benchmark_serving() {
         benchmark_cmd+=(--trust-remote-code)
     fi
 
+    # Forward --tokenizer to the benchmark command only when the caller supplied one.
+    if [[ -n "$tokenizer" ]]; then
+        benchmark_cmd+=(--tokenizer "$tokenizer")
+    fi
+
     # Run benchmark with optional server monitoring
     set -x
     if [[ -n "$server_pid" ]]; then