From f36537242d1c392856229a0157830d9abae7c191 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Mon, 13 Apr 2026 03:00:45 +0000 Subject: [PATCH 01/98] remove vllm disagg for dpsr1 and dpv3 Signed-off-by: Theresa Shan --- .github/configs/amd-master.yaml | 108 ++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 692725bc1..a8480f4b9 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1145,6 +1145,114 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=2" +kimik2.5-fp4-mi355x-vllm-disagg: + image: vllm/vllm-openai-rocm:v0.18.0 + model: amd/Kimi-K2.5-MXFP4 + model-prefix: kimik2.5 + runner: mi355x-disagg + precision: fp4 + framework: vllm-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + +minimaxm2.5-fp8-mi355x-vllm-disagg: + image: vllm/vllm-openai-rocm:v0.18.0 + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + runner: mi355x-disagg + precision: fp8 + framework: vllm-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # 1P2D: 1 prefill node 
(co-located with proxy) + 2 decode nodes = 3 nodes total + # Prefill also needs EP=8: MiniMax M2.5 expert intermediate_size=1536, + # TP8 shards to 192 which is not divisible by FP8 block_n=128. + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + dsr1-fp4-mi355x-sglang-disagg: image: lmsysorg/sglang-rocm:v0.5.10.post1-rocm720-mi35x-20260501 model: amd/DeepSeek-R1-0528-MXFP4-v2 From 08f4c5b38516084f43ac4d9d6ea7d38fec8f1b51 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 21 Apr 2026 06:40:27 +0000 Subject: [PATCH 02/98] consolidate amd_utils for sglang and vllm Signed-off-by: Theresa Shan --- benchmarks/multi_node/amd_utils/bench.sh | 72 +- benchmarks/multi_node/amd_utils/env.sh | 231 +++-- benchmarks/multi_node/amd_utils/job.slurm | 468 ++++----- .../multi_node/amd_utils/models_vllm.yaml | 42 + .../multi_node/amd_utils/moriio_proxy.py | 327 +++++++ .../amd_utils/patches/minimax_m2.py | 672 +++++++++++++ benchmarks/multi_node/amd_utils/server.sh | 66 +- .../multi_node/amd_utils/server_sglang.sh | 624 ++++++++++++ .../multi_node/amd_utils/server_vllm.sh | 490 ++++++++++ benchmarks/multi_node/amd_utils/setup_deps.sh | 908 ++++++++++++++++++ benchmarks/multi_node/amd_utils/start_etcd.sh | 47 + benchmarks/multi_node/amd_utils/submit.sh | 112 ++- benchmarks/multi_node/amd_utils/sync.py | 5 +- .../dsr1_fp4_mi355x_sglang-disagg.sh | 3 +- 
.../dsr1_fp8_mi355x_sglang-disagg.sh | 3 +- .../kimik2.5_fp4_mi355x_vllm-disagg.sh | 80 ++ .../minimaxm2.5_fp8_mi355x_vllm-disagg.sh | 78 ++ 17 files changed, 3800 insertions(+), 428 deletions(-) create mode 100644 benchmarks/multi_node/amd_utils/models_vllm.yaml create mode 100644 benchmarks/multi_node/amd_utils/moriio_proxy.py create mode 100644 benchmarks/multi_node/amd_utils/patches/minimax_m2.py create mode 100755 benchmarks/multi_node/amd_utils/server_sglang.sh create mode 100755 benchmarks/multi_node/amd_utils/server_vllm.sh create mode 100644 benchmarks/multi_node/amd_utils/setup_deps.sh create mode 100755 benchmarks/multi_node/amd_utils/start_etcd.sh create mode 100755 benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh create mode 100644 benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh index ac996c5a9..87f3b1e8a 100755 --- a/benchmarks/multi_node/amd_utils/bench.sh +++ b/benchmarks/multi_node/amd_utils/bench.sh @@ -1,4 +1,17 @@ #!/bin/bash +# Dual-Engine Disaggregated Benchmark Runner +# +# ENGINE=sglang (default): SGLang benchmark +# ENGINE=vllm: vLLM benchmark +# +# Produces JSON result files via benchmark_serving.py so that the CI pipeline +# can collect and process results. 
+# +# Usage: bash bench.sh \ +# \ +# + +ENGINE="${ENGINE:-sglang}" n_prefill=$1 n_decode=$2 @@ -6,58 +19,81 @@ prefill_gpus=$3 decode_gpus=$4 model_path=$5 model_name=$6 -MODEL_PATH="${model_path}/${model_name}" +MODEL_PATH="${MODEL_PATH:-${model_path}/${model_name}}" log_path=$7 chosen_isl=${8:-1024} chosen_osl=${9:-1024} concurrency_list=${10:-"512x1"} -chosen_req_rate=${11:-1} +if [[ "$ENGINE" == "vllm" ]]; then + chosen_req_rate=${11:-inf} +else + chosen_req_rate=${11:-1} +fi random_range_ratio=${12:-0.8} num_prompts_multiplier=${13:-10} IFS='x' read -r -a chosen_concurrencies <<< "$concurrency_list" -echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}" - -head_node="localhost" -head_port="30000" +ROUTER_PORT="${ROUTER_PORT:-30000}" +echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}" -profile_folder="${log_path}/sglang_isl_${chosen_isl}_osl_${chosen_osl}" -mkdir -p $profile_folder +profile_folder="${log_path}/${ENGINE}_isl_${chosen_isl}_osl_${chosen_osl}" +mkdir -p "$profile_folder" source "$(dirname "$0")/../../benchmark_lib.sh" -# Repo root inside the container (3 levels up from this script's directory) REPO_ROOT="$(cd "$(dirname "$0")/../../.." 
&& pwd)" -for max_concurrency in ${chosen_concurrencies[@]}; do +for max_concurrency in "${chosen_concurrencies[@]}"; do export_file="${profile_folder}/concurrency_${max_concurrency}_req_rate_${chosen_req_rate}_gpus_$((prefill_gpus+decode_gpus))_ctx_${prefill_gpus}_gen_${decode_gpus}" + num_prompts=$(( max_concurrency * num_prompts_multiplier )) + if [[ "$num_prompts" -lt 16 ]]; then + num_prompts=16 + fi + echo "profile_folder: $profile_folder" echo "max_concurrency: $max_concurrency" echo "chosen_req_rate: $chosen_req_rate" echo "MODEL_PATH: $MODEL_PATH" - echo "head_port: $head_port" + echo "ROUTER_PORT: $ROUTER_PORT" echo "chosen_isl: $chosen_isl" echo "chosen_osl: $chosen_osl" + echo "num_prompts: $num_prompts" echo "export_file: $export_file" + # Engine-specific extra flags + extra_flags="" + if [[ "$ENGINE" == "vllm" ]]; then + extra_flags="--trust-remote-code" + else + if [ "$IS_MTP" = "true" ]; then + extra_flags="--use-chat-template" + fi + fi + run_benchmark_serving \ --bench-serving-dir "$REPO_ROOT" \ - --model ${MODEL_PATH} \ - --port ${head_port} \ + --model "$MODEL_PATH" \ + --port "$ROUTER_PORT" \ --backend openai \ - --input-len ${chosen_isl} \ - --output-len ${chosen_osl} \ - --random-range-ratio ${random_range_ratio} \ - --num-prompts $(( $max_concurrency * $num_prompts_multiplier )) \ + --input-len "$chosen_isl" \ + --output-len "$chosen_osl" \ + --random-range-ratio "$random_range_ratio" \ + --num-prompts "$num_prompts" \ --max-concurrency "$max_concurrency" \ --result-filename "$export_file" \ --result-dir /workspace/ \ - $( [ "$IS_MTP" = "true" ] && echo "--use-chat-template" ) + $extra_flags echo "-----------------------------------------" + + # vLLM: cooldown between rounds for idle KV block reaper + if [[ "$ENGINE" == "vllm" ]]; then + echo "[BENCH] Cooldown: waiting 10s for idle KV block reaper..." 
+ sleep 10 + fi done diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index d0b99eddc..c5a438541 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -1,141 +1,198 @@ #!/bin/bash -# SGLang/MoRI environment setup for multi-node disaggregated serving. +# Dual-engine environment setup for multi-node disaggregated serving. +# +# ENGINE=sglang (default): SGLang/MoRI environment +# ENGINE=vllm: vLLM/Nixl environment # # REQUIRED ENVIRONMENT VARIABLES: # IBDEVICES - RDMA/InfiniBand device names (e.g., ionic_0,ionic_1,... or mlx5_0,mlx5_1,...) -# This must be set by the runner script (runners/launch_mi355x-amds.sh) -# -# OPTIONAL ENVIRONMENT VARIABLES: -# MORI_RDMA_TC - RDMA traffic class (e.g., 96, 104). Set by runner if cluster uses QoS. - +# Set by runner or auto-detected from hostname. set -x + +ENGINE="${ENGINE:-sglang}" export PYTHONDONTWRITEBYTECODE=1 -# IBDEVICES configuration +# ============================================================================= +# Shared: IBDEVICES detection +# ============================================================================= + # Prefer IBDEVICES set by runner (runners/launch_mi355x-amds.sh) # Fall back to hostname detection if not set (for direct script execution) if [[ -z "$IBDEVICES" ]]; then - NODENAME=$(hostname -s) - if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then - export IBDEVICES=ionic_0,ionic_1,ionic_2,ionic_3,ionic_4,ionic_5,ionic_6,ionic_7 - elif [[ $NODENAME == mia1* ]]; then - export IBDEVICES=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7 + DETECTED=$(ibv_devinfo 2>/dev/null | grep "hca_id:" | awk '{print $2}' | paste -sd',') + if [[ -n "$DETECTED" ]]; then + export IBDEVICES="$DETECTED" else - echo "ERROR: Unable to detect cluster from hostname $NODENAME and IBDEVICES not set" >&2 - exit 1 + echo "WARNING: Unable to detect RDMA devices. Set IBDEVICES explicitly." 
>&2 fi - echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES from hostname $NODENAME" + echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES from hostname $(hostname -s)" else echo "[INFO] Using IBDEVICES=$IBDEVICES (set by runner or environment)" fi export IBDEVICES -# Auto-detect default network interface (portable across clusters) +# Shared: Auto-detect default network interface (portable across clusters) export GLOO_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1) export NCCL_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1) +set +x -export NCCL_IB_HCA=$IBDEVICES +export NCCL_IB_HCA=${NCCL_IB_HCA:-$IBDEVICES} -export SGLANG_USE_AITER=1 +# ============================================================================= +# Engine-specific environment +# ============================================================================= -export SGLANG_MORI_DISPATCH_DTYPE=auto -export SGLANG_MORI_FP8_COMB=true -export SGLANG_MORI_QP_PER_TRANSFER=4 -export SGLANG_MORI_NUM_WORKERS=4 -export MORI_IO_SQ_BACKOFF_TIMEOUT_US=50000 +if [[ "$ENGINE" == "vllm" ]]; then + # ========================================================================= + # vLLM/Nixl-specific environment + # ========================================================================= + set -x -export MORI_IO_QP_MAX_SEND_WR=16384 -export MORI_IO_QP_MAX_CQE=32768 -export MORI_IO_QP_MAX_SGE=4 + # UCX_NET_DEVICES: Use the first benic interface for UCX TCP transport + if [[ -z "$UCX_NET_DEVICES" ]]; then + UCX_NET_DEV=$(ip -o link show 2>/dev/null | awk -F': ' '/benic1p1/{print $2}' | head -1) + if [[ -n "$UCX_NET_DEV" ]]; then + export UCX_NET_DEVICES="$UCX_NET_DEV" + else + FIRST_IB=$(echo "$IBDEVICES" | cut -d',' -f1) + if [[ -n "$FIRST_IB" ]]; then + export UCX_NET_DEVICES="${FIRST_IB}:1" + fi + fi + echo "[INFO] Auto-set UCX_NET_DEVICES=$UCX_NET_DEVICES" + else + echo "[INFO] Using UCX_NET_DEVICES=$UCX_NET_DEVICES (set by environment)" + fi -export MORI_IO_TC_DISABLE=0 
+ # RoCEv2: use IPv4-mapped GID (index 1) for inter-node RDMA routing + export UCX_IB_GID_INDEX=${UCX_IB_GID_INDEX:-1} -export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=3600 -export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=3600 + # QoS/DSCP configuration for lossless RoCEv2 fabric. + if [[ -n "$UCX_IB_TRAFFIC_CLASS" ]]; then + echo "[INFO] Using UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS (set by environment)" + elif command -v nicctl &> /dev/null; then + ND_PRIO=$(nicctl show qos 2>/dev/null | awk '/PFC no-drop priorities/ {print $NF; exit}') + ND_DSCP=$(nicctl show qos 2>/dev/null | awk -v p="$ND_PRIO" ' +$1 == "DSCP" && $2 == ":" && $NF == p { + print $3; exit +}') + if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then + export UCX_IB_TRAFFIC_CLASS=$(( 4 * ND_DSCP )) + export UCX_IB_SL=$ND_PRIO + echo "[INFO] Detected QoS from nicctl: UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS, UCX_IB_SL=$UCX_IB_SL" + else + echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection." + NODENAME=$(hostname -s) + if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then + export UCX_IB_TRAFFIC_CLASS=96 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + elif [[ $NODENAME == mia1* ]]; then + export UCX_IB_TRAFFIC_CLASS=104 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + fi + fi + else + NODENAME=$(hostname -s) + if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then + export UCX_IB_TRAFFIC_CLASS=96 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + elif [[ $NODENAME == mia1* ]]; then + export UCX_IB_TRAFFIC_CLASS=104 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + else + echo "[INFO] No nicctl and unable to detect from hostname. Skipping QoS configuration." 
+ fi + fi -# Disable allocating memory in one pass -export MORI_SHMEM_MODE=ISOLATION + set +x + echo "[INFO] IBDEVICES=$IBDEVICES UCX_NET_DEVICES=$UCX_NET_DEVICES NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME UCX_IB_GID_INDEX=$UCX_IB_GID_INDEX UCX_IB_TRAFFIC_CLASS=${UCX_IB_TRAFFIC_CLASS:-unset}" -# Enable spec v2 -export SGLANG_ENABLE_SPEC_V2=1 -export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1 +else + # ========================================================================= + # SGLang/MoRI-specific environment + # ========================================================================= -export SGLANG_LOG_MS=true -export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32 + export SGLANG_USE_AITER=1 + export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=1200 + export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=1200 -export MORI_MAX_DISPATCH_TOKENS_PREFILL=8192 -export MORI_MAX_DISPATCH_TOKENS_DECODE=512 + # Disable allocating memory in one pass + export MORI_SHMEM_MODE=ISOLATION + export SGLANG_MORI_FP8_DISP=True -export MORI_MOE_MAX_INPUT_TOKENS_PREFILL=32768 -export MORI_MOE_MAX_INPUT_TOKENS_DECODE=2703 + if [[ "$MODEL_NAME" == *mxfp4* ]]; then + export SGLANG_MORI_FP8_DISP=False + fi + + export SGLANG_MORI_FP4_DISP=False + export SGLANG_MORI_FP8_COMB=False -# set MTP size=1 when EP16 -export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2)) + # Per-role dispatch token limits (prefill uses higher throughput, decode uses lower) + export MORI_MAX_DISPATCH_TOKENS_PREFILL=16384 + if [[ "$MODEL_NAME" == *mxfp4* ]]; then + export MORI_MAX_DISPATCH_TOKENS_PREFILL=12288 + fi + export MORI_MAX_DISPATCH_TOKENS_DECODE=160 -export MORI_EP_LAUNCH_CONFIG_MODE=AUTO + # set MTP size=1 when EP16 + export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2)) + export MORI_EP_LAUNCH_CONFIG_MODE=AUTO + export MORI_IO_QP_MAX_SEND_WR=16384 + export MORI_IO_QP_MAX_CQE=32768 + export MORI_IO_QP_MAX_SGE=4 -export MORI_APP_LOG_LEVEL=INFO 
+ export MORI_APP_LOG_LEVEL=INFO -# Router logging control: -# 0 (default) keeps noisy per-request access logs out of stdout while still logging to file. -# 1 mirrors router logs to stdout via tee (useful for live debugging). -export SGLANG_ROUTER_STDOUT_LOGS="${SGLANG_ROUTER_STDOUT_LOGS:-0}" + # Router logging control + export SGLANG_ROUTER_STDOUT_LOGS="${SGLANG_ROUTER_STDOUT_LOGS:-0}" -# QoS/DSCP configuration -# Priority order: 1) Set by runner, 2) Detect via nicctl, 3) Detect from hostname -if [[ -n "$MORI_RDMA_TC" ]]; then - echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC (set by runner or environment)" -elif command -v nicctl &> /dev/null; then - ND_PRIO=$(nicctl show qos 2>/dev/null | awk '/PFC no-drop priorities/ {print $NF; exit}') - ND_DSCP=$(nicctl show qos 2>/dev/null| awk -v p="$ND_PRIO" ' + # QoS/DSCP configuration + if [[ -n "$MORI_RDMA_TC" ]]; then + echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC (set by runner or environment)" + elif command -v nicctl &> /dev/null; then + ND_PRIO=$(nicctl show qos 2>/dev/null | awk '/PFC no-drop priorities/ {print $NF; exit}') + ND_DSCP=$(nicctl show qos 2>/dev/null| awk -v p="$ND_PRIO" ' $1 == "DSCP" && $2 == ":" && $NF == p { print $3; exit }') - if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then - TC=$(( 4 * ND_DSCP )) - export MORI_RDMA_SL=$ND_PRIO - export MORI_IO_SL=$ND_PRIO - export MORI_RDMA_TC=$TC - export MORI_IO_TC=$TC - echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL, MORI_IO_TC=$MORI_IO_TC, MORI_IO_SL=$MORI_IO_SL" + if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then + TC=$(( 4 * ND_DSCP )) + export MORI_RDMA_SL=$ND_PRIO + export MORI_RDMA_TC=$TC + echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL" + else + echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection." 
+ # Fall back to hostname-based detection + NODENAME=$(hostname -s) + if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then + export MORI_RDMA_TC=96 + echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" + elif [[ $NODENAME == mia1* ]]; then + export MORI_RDMA_TC=104 + echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" + else + echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration." + fi + fi else - echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection." - # Fall back to hostname-based detection + # nicctl not available, try hostname-based detection NODENAME=$(hostname -s) if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then export MORI_RDMA_TC=96 - export MORI_IO_TC=96 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" elif [[ $NODENAME == mia1* ]]; then export MORI_RDMA_TC=104 - export MORI_IO_TC=104 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" else - echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration." + echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration." + echo " This is normal for clusters without QoS or outside Docker containers." fi fi -else - # nicctl not available, try hostname-based detection - NODENAME=$(hostname -s) - if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then - export MORI_RDMA_TC=96 - export MORI_IO_TC=96 - echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" - elif [[ $NODENAME == mia1* ]]; then - export MORI_RDMA_TC=104 - export MORI_IO_TC=104 - echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" - else - echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration." 
- echo " This is normal for clusters without QoS or outside Docker containers." - fi -fi - -# FIXME: WA for latest upstream 0305 image -export PYTHONPATH=/sgl-workspace/aiter:${PYTHONPATH} + # FIXME: WA for latest upstream 0305 image + export PYTHONPATH=/sgl-workspace/aiter:${PYTHONPATH} -set +x +fi diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 824605c46..56fefb0ed 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -1,265 +1,260 @@ #!/bin/bash -#SBATCH --job-name=1p2d_bench-serving # Specify a custom string for your slurm batch job -#SBATCH -N 3 # CHECK this to be right in batch jobs -#SBATCH -n 3 # CHECK this to be right in batch jobs +#SBATCH --job-name=disagg-bench +#SBATCH -N 3 # Overridden by submit.sh -N flag +#SBATCH -n 3 # Overridden by submit.sh -n flag #SBATCH --ntasks-per-node=1 #SBATCH --spread-job -#SBATCH --gres=gpu:8 # Request 8 GPUs and 8 NICs (use --gres if specific GPU resources are needed) -#SBATCH --time=24:00:00 # Set a time limit for the job (HH:MM:SS) +#SBATCH --gres=gpu:8 +#SBATCH --time=24:00:00 # --output and --error are set by submit.sh via BENCHMARK_LOGS_DIR +ENGINE="${ENGINE:-sglang}" -# ------------------------ -# Print current time in UTC and PST formats -# ------------------------ echo "=== Job Start Time ===" echo "UTC Time: $(TZ=UTC date '+%Y-%m-%d %H:%M:%S %Z')" echo "PST Time: $(TZ=America/Los_Angeles date '+%Y-%m-%d %H:%M:%S %Z')" +echo "ENGINE: $ENGINE" echo "=======================" echo "" # ============================================================================= -# Model validation from models.yaml (replaces hardcoded VALID_MODELS array) +# Model Validation # ============================================================================= -# DI_REPO_DIR is set below from $(pwd); use the submit-time working directory -# because sbatch copies this script to /var/spool/slurmd/ at runtime. 
-MODELS_YAML="$(pwd)/models.yaml" + +# Use $(pwd) not BASH_SOURCE — sbatch copies the script to /var/spool/slurmd/ +# at runtime, but the CWD remains the submit-time directory (amd_utils/). +if [[ "$ENGINE" == "vllm" ]]; then + MODELS_YAML="$(pwd)/models_vllm.yaml" +else + MODELS_YAML="$(pwd)/models.yaml" +fi if [[ ! -f "$MODELS_YAML" ]]; then - echo "Error: models.yaml not found at $MODELS_YAML" + echo "Error: models YAML not found at $MODELS_YAML" exit 1 fi -# Validate MODEL_NAME exists as a top-level key in models.yaml +if [[ -z "${DOCKER_IMAGE_NAME:-}" ]]; then + echo "Error: DOCKER_IMAGE_NAME is not set." + exit 1 +fi + +MODEL_NAME="${MODEL_NAME:-None}" if ! grep -q "^${MODEL_NAME}:" "$MODELS_YAML"; then - echo "Error: Model '$MODEL_NAME' not found in models.yaml" + echo "Error: Model '$MODEL_NAME' not found in $MODELS_YAML" echo "Available models:" grep -E '^[A-Za-z]' "$MODELS_YAML" | sed 's/:.*$//' | sed 's/^/ - /' exit 1 fi echo "Model found: $MODEL_NAME" -# All models use server.sh as the entrypoint RUN_FILE="server.sh" echo "Runfile set: $RUN_FILE" -if [[ -z "${DOCKER_IMAGE_NAME:-}" ]]; then - echo "Error: DOCKER_IMAGE_NAME is not set." - exit 1 -fi - -# DI_REPO_DIR points to the repo root so Docker can access both benchmarks/ and utils/. +# DI_REPO_DIR points to the repo root. # $(pwd) is amd_utils/ (the sbatch submit dir); go up 3 levels to reach the repo root. export DI_REPO_DIR=$(cd "$(pwd)/../../.." 
&& pwd) -xP="${xP:-1}" #-> Number of Prefill Workers -yD="${yD:-1}" #-> Number of Decode Workers +xP="${xP:-1}" +yD="${yD:-1}" -# Parallelism Configuration with defaults -PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}" -PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-true}" -PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-true}" -DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" -DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-true}" -DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-true}" -DECODE_MTP_SIZE=${DECODE_MTP_SIZE:-0} # 0 for disabling MTP - -# Benchmark Configuration with defaults +# Benchmark configuration BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}" BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}" BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}" BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" +BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" GPUS_PER_NODE="${GPUS_PER_NODE:-8}" -MODEL_NAME="${MODEL_NAME:-None}" +# Engine-specific defaults +PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-false}" +PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-false}" +DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-false}" +DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-false}" +PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}" +DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" +DECODE_MTP_SIZE=${DECODE_MTP_SIZE:-0} + +# ============================================================================= +# Docker privilege detection +# ============================================================================= +# Detect on the batch host. Per-node detection happens inside srun below. 
+if docker ps &>/dev/null; then + DOCKER_CMD="docker" +else + DOCKER_CMD="sudo docker" +fi +export DOCKER_CMD + +# ============================================================================= +# Model Path Resolution +# ============================================================================= # MODEL_DIR detection: prefer env var, fall back to hostname detection if [[ -z "$MODEL_DIR" ]]; then NODENAME=$(hostname -s) if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then MODEL_DIR="/nfsdata" - echo "[INFO] Auto-detected MODEL_DIR=$MODEL_DIR from hostname $NODENAME" elif [[ $NODENAME == mia1* ]]; then MODEL_DIR="/it-share/data" - echo "[INFO] Auto-detected MODEL_DIR=$MODEL_DIR from hostname $NODENAME" else - MODEL_DIR="/nfsdata" # Default fallback - echo "[INFO] Using default MODEL_DIR=$MODEL_DIR (hostname $NODENAME not recognized)" + MODEL_DIR="/nfsdata" fi + echo "[INFO] Auto-detected MODEL_DIR=$MODEL_DIR from hostname $(hostname -s)" fi export MODEL_DIR -# ------------------------ -# Model path validation and selection across all nodes -# ------------------------ -echo "Looking for model: $MODEL_NAME" -echo "Checking model availability across all allocated nodes..." 
- -# Get all allocated nodes -ALL_NODES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -TOTAL_NODES=$(echo "$ALL_NODES" | wc -l) - -echo "Total allocated nodes: $TOTAL_NODES" -echo "Nodes: $(echo "$ALL_NODES" | tr '\n' ' ')" - -# Function to check model path on all nodes -check_model_path() { - local path=$1 - local check_name=$2 - - echo "Checking $check_name: $path" +if [[ "$ENGINE" == "vllm" ]]; then + # vLLM: Extract hf_dir from models.yaml, search multiple paths, resolve HF cache snapshots + DISK_DIR_NAME=$(awk '/^'"$MODEL_NAME"':/{found=1; next} + found && /^[^ ]/{exit} + found && /hf_dir:/{gsub(/[" ]/, "", $2); print $2; exit}' "$MODELS_YAML") + DISK_DIR_NAME="${DISK_DIR_NAME:-$MODEL_NAME}" + echo "Looking for model: $MODEL_NAME (disk dir: $DISK_DIR_NAME)" + + resolve_hf_cache_path() { + local base_path=$1 + if [[ -d "${base_path}/snapshots" ]]; then + local snapshot=$(ls -1 "${base_path}/snapshots" 2>/dev/null | head -1) + if [[ -n "$snapshot" ]]; then + echo "${base_path}/snapshots/${snapshot}" + return 0 + fi + fi + echo "$base_path" + return 1 + } + + MODEL_PATH="" + SEARCH_PATHS=( + "${MODEL_DIR}/${DISK_DIR_NAME}" + "${MODEL_DIR}/${MODEL_NAME}" + "/nfsdata/hf_hub_cache-0/${DISK_DIR_NAME}" + "/nfsdata/hf_hub_cache-0/${MODEL_NAME}" + ) + + for search_path in "${SEARCH_PATHS[@]}"; do + if [[ -d "$search_path" ]]; then + RESOLVED=$(resolve_hf_cache_path "$search_path") + MODEL_PATH="$RESOLVED" + echo "Found MODEL_PATH: $MODEL_PATH" + break + fi + done - # Run check on all nodes in parallel - srun --nodes=$SLURM_NNODES --ntasks=$SLURM_NNODES /bin/bash -c " - if [ -d '$path' ]; then - echo \"\$(hostname): ✓ Found $path\" - exit 0 + if [[ -z "$MODEL_PATH" ]]; then + echo "FATAL: Model '$MODEL_NAME' not found. 
Searched:" + for p in "${SEARCH_PATHS[@]}"; do echo " - $p"; done + exit 1 + fi + echo "Final MODEL_PATH: $MODEL_PATH" +else + # SGLang: Validate model path across all allocated nodes + echo "Looking for model: $MODEL_NAME" + echo "Checking model availability across all allocated nodes..." + + ALL_NODES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") + TOTAL_NODES=$(echo "$ALL_NODES" | wc -l) + echo "Total allocated nodes: $TOTAL_NODES" + echo "Nodes: $(echo "$ALL_NODES" | tr '\n' ' ')" + + check_model_path() { + local path=$1 + local check_name=$2 + echo "Checking $check_name: $path" + srun --nodes=$SLURM_NNODES --ntasks=$SLURM_NNODES /bin/bash -c " + if [ -d '$path' ]; then + echo \"\$(hostname): Found $path\" + exit 0 + else + echo \"\$(hostname): Missing $path\" + exit 1 + fi + " + local exit_code=$? + if [ $exit_code -eq 0 ]; then + echo "$check_name available on ALL nodes" + return 0 else - echo \"\$(hostname): ✗ Missing $path\" - exit 1 + echo "$check_name NOT available on all nodes" + return 1 fi - " + } - # Check if all nodes succeeded (exit code 0) - local exit_code=$? 
- if [ $exit_code -eq 0 ]; then - echo "✓ $check_name available on ALL nodes" - return 0 + if check_model_path "$MODEL_DIR/$MODEL_NAME" "$MODEL_DIR"; then + MODEL_PATH="$MODEL_DIR/$MODEL_NAME" + echo "Selected MODEL_PATH: $MODEL_PATH (available on all nodes)" else - echo "✗ $check_name NOT available on all nodes" - return 1 + echo "FATAL ERROR: Model '$MODEL_NAME' not found on ALL allocated nodes in:" + echo " - $MODEL_DIR/$MODEL_NAME" + exit 1 fi -} - -# Check model weights exist on "$MODEL_DIR/$MODEL_NAME" -if check_model_path "$MODEL_DIR/$MODEL_NAME" "$MODEL_DIR"; then - MODEL_PATH="$MODEL_DIR/$MODEL_NAME" - echo "" - echo "✓ Selected MODEL_PATH: $MODEL_PATH (available on all nodes)" -else - echo "" - echo "✗ FATAL ERROR: Model '$MODEL_NAME' not found on ALL allocated nodes in the following:" - echo " - $MODEL_DIR/$MODEL_NAME" - echo "" - echo "Model must be accessible from all nodes for distributed execution." - echo "Please ensure the model is available on all allocated nodes." - exit 1 + echo "Final MODEL_PATH: $MODEL_PATH" fi -echo "Final MODEL_PATH: $MODEL_PATH" -echo "" - -NUM_NODES="${NUM_NODES}" +# ============================================================================= +# Node Selection +# ============================================================================= -# ------------------------ -# Extract first NUM_NODES from SLURM allocation and update SLURM variables -# ------------------------ -echo "Original SLURM allocation:" -echo "SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "SLURM_NNODES: $SLURM_NNODES" -echo "SLURM_NTASKS: $SLURM_NTASKS" +NUM_NODES=$((xP + yD)) +echo "NUM_NODES: $NUM_NODES (xP=$xP + yD=$yD)" -# Get the full nodelist and extract first NUM_NODES FULL_NODELIST=$(scontrol show hostnames "$SLURM_JOB_NODELIST") SELECTED_NODES=$(echo "$FULL_NODELIST" | head -n $NUM_NODES) SELECTED_NODELIST_STR=$(echo "$SELECTED_NODES" | tr '\n' ',' | sed 's/,$//') -# Create new nodelist in SLURM format -# This is a simplified approach - for 
complex ranges, you might need more sophisticated parsing -NEW_SLURM_NODELIST=$(echo "$SELECTED_NODES" | paste -sd, | sed 's/,/,/g') - # Update SLURM environment variables export SLURM_NNODES=$NUM_NODES export SLURM_NTASKS=$NUM_NODES export SLURM_JOB_NUM_NODES=$NUM_NODES export SLURM_NPROCS=$NUM_NODES -export SLURM_JOB_NODELIST="$NEW_SLURM_NODELIST" -export SLURM_NODELIST="$NEW_SLURM_NODELIST" - -# Keep other SLURM variables as they were or set defaults +export SLURM_JOB_NODELIST="$SELECTED_NODELIST_STR" +export SLURM_NODELIST="$SELECTED_NODELIST_STR" export SLURM_TASKS_PER_NODE="1(x$NUM_NODES)" -export SLURM_SUBMIT_DIR="${SLURM_SUBMIT_DIR:-$HOME}" -export SLURM_CLUSTER_NAME="${SLURM_CLUSTER_NAME}" # Let SLURM set this automatically -export SLURM_JOB_CPUS_PER_NODE="${SLURM_JOB_CPUS_PER_NODE}" -export SLURM_JOB_PARTITION="${SLURM_JOB_PARTITION}" # Should be set by sbatch/runner -export SLURM_JOBID="${SLURM_JOBID:-$SLURM_JOB_ID}" -export SLURM_JOB_QOS="${SLURM_JOB_QOS}" # Should be set by sbatch/runner if needed -export SLURM_JOB_ACCOUNT="${SLURM_JOB_ACCOUNT}" # Should be set by sbatch/runner export SLURM_NTASKS_PER_NODE=1 -export SLURM_SUBMIT_HOST="${SLURM_SUBMIT_HOST}" -export SLURM_JOB_ID="${SLURM_JOB_ID}" -# SLURM_CONF is auto-set by SLURM, no need to override -export SLURM_JOB_NAME="${SLURM_JOB_NAME:-1p1d_bench-serving}" echo "" -echo "Updated SLURM Environment Variables:" -echo "SLURM_JOB_ID: $SLURM_JOB_ID" -echo "SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "SLURM_NNODES: $SLURM_NNODES" -echo "SLURM_NTASKS: $SLURM_NTASKS" -echo "SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "SLURM_JOB_CPUS_PER_NODE: $SLURM_JOB_CPUS_PER_NODE" -echo "SLURM_JOB_PARTITION: $SLURM_JOB_PARTITION" -echo "SLURM_JOB_NUM_NODES: $SLURM_JOB_NUM_NODES" -echo "SLURM_JOBID: $SLURM_JOBID" -echo "SLURM_JOB_QOS: $SLURM_JOB_QOS" -echo "SLURM_NODELIST: $SLURM_NODELIST" -echo "SLURM_JOB_ACCOUNT: $SLURM_JOB_ACCOUNT" -echo "SLURM_NPROCS: $SLURM_NPROCS" -echo "SLURM_SUBMIT_HOST: 
$SLURM_SUBMIT_HOST" -echo "SLURM_CONF: $SLURM_CONF" -echo "SLURM_JOB_NAME: $SLURM_JOB_NAME" -echo "SLURM_NTASKS_PER_NODE: $SLURM_NTASKS_PER_NODE" -echo "SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" -echo "SLURM_CLUSTER_NAME: $SLURM_CLUSTER_NAME" -echo "ulimit: $(ulimit -a)" -echo "" -echo "Selected nodes for execution:" -echo "$SELECTED_NODES" -echo "" +echo "Selected nodes: $SELECTED_NODELIST_STR" + +# ============================================================================= +# IP Resolution +# ============================================================================= -# Node information USER_NAME=$(whoami) MASTER_NODE=$(echo "$SELECTED_NODES" | head -n 1) NODE0_ADDR=$(srun --nodes=1 --ntasks=1 --time=00:20:00 --nodelist="$MASTER_NODE" bash -c 'ip route get 1.1.1.1') NODE0_ADDR=$(echo "$NODE0_ADDR" | awk '/src/ {print $7}') IPS=() - -GW_NIC=$(ip route | awk '/^default/ {print $5; exit}') for NODE in $SELECTED_NODES; do IP=$(srun --nodes=1 --ntasks=1 --time=00:20:00 --nodelist="$NODE" bash -c 'ip route get 1.1.1.1') IP=$(echo "$IP" | awk '/src/ {print $7}') IPS+=("$IP") done -echo "Selected node IPs: ${IPS[*]}" | sed 's/ /,/g' +echo "Node IPs: ${IPS[*]}" DOCKER_MOUNT_PATH="/workspace" -SGLANG_WS_PATH="${DOCKER_MOUNT_PATH}/benchmarks/multi_node/amd_utils" -timestamp=$(date +"%Y-%m-%d_%H-%M-%S") +WS_PATH="${DOCKER_MOUNT_PATH}/benchmarks/multi_node/amd_utils" NNODES=$NUM_NODES -echo "MASTER_NODE is ${MASTER_NODE}" -echo "NODE0_ADDR is ${NODE0_ADDR}" -echo "NNODES is ${NNODES}" -echo "REPO Directory is ${DI_REPO_DIR}" -echo "USER_NAME is ${USER_NAME}" - -# Get the RDMA priority and DSCP value from the NIC -if ! command -v nicctl >/dev/null 2>&1; then - echo "Error: nicctl command not found. Please ensure nicctl is installed and available." 
>&2 - exit 1 -fi +echo "MASTER_NODE: ${MASTER_NODE}" +echo "NODE0_ADDR: ${NODE0_ADDR}" +echo "NNODES: ${NNODES}" +echo "REPO DIR: ${DI_REPO_DIR}" +echo "USER: ${USER_NAME}" # Reduce log spam export TQDM_MININTERVAL=20 +# Translate the host-resolved MODEL_PATH to the Docker mount namespace +DOCKER_MODEL_PATH="${MODEL_PATH/#$MODEL_DIR//models}" + export DI_REPO_DIR=$DI_REPO_DIR -export SGLANG_WS_PATH=$SGLANG_WS_PATH +export WS_PATH=$WS_PATH export NNODES=$NNODES export NODE0_ADDR=$NODE0_ADDR export MODEL_PATH=$MODEL_PATH @@ -269,21 +264,16 @@ export yD=$yD export MODEL_NAME=$MODEL_NAME export USER_NAME=$USER_NAME export IPADDRS="$(echo "${IPS[*]}" | sed 's/ /,/g')" -export PREFILL_TP_SIZE=$PREFILL_TP_SIZE -export PREFILL_ENABLE_EP=$PREFILL_ENABLE_EP -export PREFILL_ENABLE_DP=$PREFILL_ENABLE_DP -export DECODE_TP_SIZE=$DECODE_TP_SIZE -export DECODE_ENABLE_EP=$DECODE_ENABLE_EP -export DECODE_ENABLE_DP=$DECODE_ENABLE_DP -export DECODE_MTP_SIZE=$DECODE_MTP_SIZE export GPUS_PER_NODE=$GPUS_PER_NODE export BENCH_INPUT_LEN=$BENCH_INPUT_LEN export BENCH_OUTPUT_LEN=$BENCH_OUTPUT_LEN export BENCH_RANDOM_RANGE_RATIO=$BENCH_RANDOM_RANGE_RATIO export BENCH_NUM_PROMPTS_MULTIPLIER=$BENCH_NUM_PROMPTS_MULTIPLIER export BENCH_MAX_CONCURRENCY=$BENCH_MAX_CONCURRENCY +export BENCH_REQUEST_RATE=$BENCH_REQUEST_RATE export DRY_RUN="${DRY_RUN:-0}" export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" +export ENGINE=$ENGINE # Eval-related env vars (threaded from submit.sh) export RUN_EVAL="${RUN_EVAL:-false}" @@ -298,38 +288,101 @@ export SPEC_DECODING="${SPEC_DECODING:-}" export IS_MULTINODE="${IS_MULTINODE:-false}" SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_') -export DOCKER_CONT_NAME="container_sbatch_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" -export RUN_FILE_FULL="$SGLANG_WS_PATH/${RUN_FILE}" - +export DOCKER_CONT_NAME="container_${ENGINE}_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" +export RUN_FILE_FULL="$WS_PATH/${RUN_FILE}" -# Use only the 
selected nodes for srun execution SELECTED_NODELIST_SRUN=$(echo "$SELECTED_NODES" | paste -sd,) - cleanup() { - echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning stale logs folder..." - # clean up the logs folder - sudo rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true - + echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning up..." + rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true echo "[${SLURM_JOB_ID}] cleanup done." } trap cleanup INT TERM HUP - -# Force NFS cache refresh on all nodes before running Docker to avoid stale file handle errors +# Force NFS cache refresh on all nodes echo "Refreshing NFS caches on all nodes..." srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c ' sync - # Force re-stat of the mounted directory to refresh NFS handles ls -la '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils > /dev/null 2>&1 stat '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils/server.sh > /dev/null 2>&1 cat '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils/server.sh > /dev/null 2>&1 - # Drop caches if we have permission (optional, requires root) echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null 2>&1 || true echo "NFS cache refreshed on $(hostname)" ' +# ============================================================================= +# Build engine-specific Docker environment variables +# ============================================================================= + +# Common env vars (always passed) +DOCKER_ENV_COMMON=( + -e SLURM_JOB_ID=\$SLURM_JOB_ID + -e SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST + -e NNODES=\$NNODES + -e NODE_RANK=\$SLURM_PROCID + -e NODE0_ADDR=\$NODE0_ADDR + -e MODEL_DIR=/models + -e MODEL_NAME=\$MODEL_NAME + -e GPUS_PER_NODE=\$GPUS_PER_NODE + -e xP=\$xP + -e yD=\$yD + -e IPADDRS=\$IPADDRS + -e BENCH_INPUT_LEN=\$BENCH_INPUT_LEN + -e BENCH_OUTPUT_LEN=\$BENCH_OUTPUT_LEN + -e BENCH_RANDOM_RANGE_RATIO=\$BENCH_RANDOM_RANGE_RATIO + -e BENCH_NUM_PROMPTS_MULTIPLIER=\$BENCH_NUM_PROMPTS_MULTIPLIER + -e 
BENCH_MAX_CONCURRENCY=\$BENCH_MAX_CONCURRENCY + -e TQDM_MININTERVAL=\$TQDM_MININTERVAL + -e DRY_RUN=\$DRY_RUN + -e BENCHMARK_LOGS_DIR=/benchmark_logs + -e ENGINE=\$ENGINE + -e WS_PATH=${WS_PATH} + -e RUN_EVAL=\$RUN_EVAL + -e EVAL_ONLY=\$EVAL_ONLY + -e EVAL_CONC=\$EVAL_CONC + -e FRAMEWORK=\$FRAMEWORK + -e PRECISION=\$PRECISION + -e MODEL_PREFIX=\$MODEL_PREFIX + -e RUNNER_TYPE=\$RUNNER_TYPE + -e RESULT_FILENAME=\$RESULT_FILENAME + -e SPEC_DECODING=\$SPEC_DECODING + -e PREFILL_TP_SIZE=\$PREFILL_TP_SIZE + -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP + -e PREFILL_ENABLE_DP=\$PREFILL_ENABLE_DP + -e DECODE_TP_SIZE=\$DECODE_TP_SIZE + -e DECODE_ENABLE_EP=\$DECODE_ENABLE_EP + -e DECODE_ENABLE_DP=\$DECODE_ENABLE_DP + -e DECODE_MTP_SIZE=\$DECODE_MTP_SIZE +) + +# Engine-specific env vars +if [[ "$ENGINE" == "vllm" ]]; then + DOCKER_ENV_ENGINE=( + -e VLLM_WS_PATH=${WS_PATH} + -e MODEL_PATH=$DOCKER_MODEL_PATH + -e UCX_TLS=tcp,self,shm,rocm_ipc,rocm_copy,cma + -e UCX_SOCKADDR_TLS_PRIORITY=tcp + -e UCX_MEMTYPE_CACHE=y + -e UCX_RNDV_SCHEME=get_zcopy + -e UCX_RNDV_THRESH=4k + -e UCX_ROCM_IPC_MIN_ZCOPY=0 + -e UCX_LOG_LEVEL=warn + -e HSA_ENABLE_SDMA=1 + -e PROXY_STREAM_IDLE_TIMEOUT=\${PROXY_STREAM_IDLE_TIMEOUT:-300} + -e VLLM_MORIIO_CONNECTOR_READ_MODE=\${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} + -e PYTHONPYCACHEPREFIX=/tmp/pycache + ) +else + DOCKER_ENV_ENGINE=( + -e SGLANG_WS_PATH=${WS_PATH} + ) +fi + +# Engine-specific container filter for pre-clean +CONT_FILTER="name=^container_${ENGINE}_" + srun \ --nodelist="$SELECTED_NODELIST_SRUN" \ --kill-on-bad-exit=1 \ @@ -341,10 +394,10 @@ set -euo pipefail echo \"Rank \$SLURM_PROCID on \$(hostname)\" # Pre-clean (idempotent) -sudo docker ps -aq --filter \"name=^container_sbatch_\" | xargs -r sudo docker rm -f || true -sudo docker ps -aq | xargs -r sudo docker stop || true +\$DOCKER_CMD ps -aq --filter \"$CONT_FILTER\" | xargs -r \$_DCMD rm -f || true +\$DOCKER_CMD ps -aq | xargs -r \$_DCMD stop || true -exec sudo docker run --rm \ +exec 
\$DOCKER_CMD run --rm \ --init \ --stop-timeout 10 \ --device /dev/dri \ @@ -367,51 +420,18 @@ exec sudo docker run --rm \ --cap-add SYS_PTRACE \ --security-opt seccomp=unconfined \ --privileged \ + -v /sys:/sys \ + $(command -v nicctl >/dev/null 2>&1 && echo "-v $(which nicctl):/usr/sbin/nicctl") \ -v ${MODEL_DIR}:/models \ -v \$HOME/.ssh:/root/.ssh \ - -v $(which nicctl):/usr/sbin/nicctl \ --shm-size 128G \ -v /tmp:/run_logs \ -v ${BENCHMARK_LOGS_DIR}:/benchmark_logs \ -v ${DI_REPO_DIR}:${DOCKER_MOUNT_PATH} \ - -e SLURM_JOB_ID=\$SLURM_JOB_ID \ - -e SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST \ - -e NNODES=\$NNODES \ - -e NODE_RANK=\$SLURM_PROCID \ - -e NODE0_ADDR=\$NODE0_ADDR \ - -e MODEL_DIR=/models \ - -e SGLANG_WS_PATH=${SGLANG_WS_PATH} \ - -e GPUS_PER_NODE=\$GPUS_PER_NODE \ - -e xP=\$xP \ - -e yD=\$yD \ - -e MODEL_NAME=\$MODEL_NAME \ - -e IPADDRS=\$IPADDRS \ - -e PREFILL_TP_SIZE=\$PREFILL_TP_SIZE \ - -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP \ - -e PREFILL_ENABLE_DP=\$PREFILL_ENABLE_DP \ - -e DECODE_TP_SIZE=\$DECODE_TP_SIZE \ - -e DECODE_ENABLE_EP=\$DECODE_ENABLE_EP \ - -e DECODE_ENABLE_DP=\$DECODE_ENABLE_DP \ - -e DECODE_MTP_SIZE=\$DECODE_MTP_SIZE \ - -e BENCH_INPUT_LEN=\$BENCH_INPUT_LEN \ - -e BENCH_OUTPUT_LEN=\$BENCH_OUTPUT_LEN \ - -e BENCH_RANDOM_RANGE_RATIO=\$BENCH_RANDOM_RANGE_RATIO \ - -e BENCH_NUM_PROMPTS_MULTIPLIER=\$BENCH_NUM_PROMPTS_MULTIPLIER \ - -e BENCH_MAX_CONCURRENCY=\$BENCH_MAX_CONCURRENCY \ - -e TQDM_MININTERVAL=\$TQDM_MININTERVAL \ - -e DRY_RUN=\$DRY_RUN \ - -e BENCHMARK_LOGS_DIR=/benchmark_logs \ - -e RUN_EVAL=\$RUN_EVAL \ - -e EVAL_ONLY=\$EVAL_ONLY \ - -e EVAL_CONC=\$EVAL_CONC \ - -e FRAMEWORK=\$FRAMEWORK \ - -e PRECISION=\$PRECISION \ - -e MODEL_PREFIX=\$MODEL_PREFIX \ - -e RUNNER_TYPE=\$RUNNER_TYPE \ - -e RESULT_FILENAME=\$RESULT_FILENAME \ - -e SPEC_DECODING=\$SPEC_DECODING \ - -e IS_MULTINODE=\$IS_MULTINODE \ + ${DOCKER_ENV_COMMON[*]} \ + ${DOCKER_ENV_ENGINE[*]} \ --name \"$DOCKER_CONT_NAME\" \ + --entrypoint \"\" \ \"$DOCKER_IMAGE_NAME\" 
bash -lc ' set -o pipefail mkdir -p /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"' @@ -425,4 +445,4 @@ if [[ \$DOCKER_EXIT_CODE -ne 0 ]]; then fi " -srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c 'sudo docker rm -f $DOCKER_CONT_NAME 2>/dev/null || true' +srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c '$DOCKER_CMD rm -f '"$DOCKER_CONT_NAME"' 2>/dev/null || true' diff --git a/benchmarks/multi_node/amd_utils/models_vllm.yaml b/benchmarks/multi_node/amd_utils/models_vllm.yaml new file mode 100644 index 000000000..c68bb46e3 --- /dev/null +++ b/benchmarks/multi_node/amd_utils/models_vllm.yaml @@ -0,0 +1,42 @@ +# Model-specific vLLM server configurations for disaggregated inference. +# +# Each top-level key is a MODEL_NAME value (must match the model identifier +# used in amd-master.yaml and the directory/HF-cache name under MODEL_DIR). +# +# To add a new model: add a new top-level entry following the same schema. +# No script changes are required. +# +# Schema: +# <MODEL_NAME>: +# prefill_flags: str # vLLM CLI flags for prefill workers +# decode_flags: str # vLLM CLI flags for decode workers +# env: str # Space-separated KEY=VALUE pairs exported before vllm serve +# hf_dir: str # (optional) On-disk directory name if it differs from the key +# # e.g. 
HF cache layout: models--amd--Kimi-K2.5-MXFP4 + +Llama-3.1-405B-Instruct-FP8-KV: + prefill_flags: "--tensor-parallel-size 8 --kv-cache-dtype fp8" + decode_flags: "--tensor-parallel-size 8 --kv-cache-dtype fp8" + env: "VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1" + +amd-Llama-3.3-70B-Instruct-FP8-KV: + prefill_flags: "--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8" + decode_flags: "--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8" + env: "VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1" + +Kimi-K2.5-MXFP4: + prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data" + decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=3600" + hf_dir: "models--amd--Kimi-K2.5-MXFP4" + +MiniMax-M2.5: + prefill_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32" + decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching 
--gpu-memory-utilization 0.95 --block-size 32" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_ENGINE_READY_TIMEOUT_S=3600" + hf_dir: "models--MiniMaxAI--MiniMax-M2.5" + +gpt-oss-120b: + prefill_flags: "--tensor-parallel-size 8" + decode_flags: "--tensor-parallel-size 8" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0 ROCM_TRITON_MOE_PRESHUFFLE_SCALES=0" diff --git a/benchmarks/multi_node/amd_utils/moriio_proxy.py b/benchmarks/multi_node/amd_utils/moriio_proxy.py new file mode 100644 index 000000000..7d1e8454b --- /dev/null +++ b/benchmarks/multi_node/amd_utils/moriio_proxy.py @@ -0,0 +1,327 @@ +#!/usr/bin/env python3 +# MoRI-IO proxy server for vLLM PD disaggregation. +# +# Based on vLLM's examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py +# with the following adaptations for production multi-node use: +# - Ports configurable via PROXY_HTTP_PORT / PROXY_PING_PORT env vars +# - /health endpoint for sync.py barrier readiness checks +# - Uses stdlib `re` instead of `regex` to avoid extra dep +# +# The proxy performs two roles that vllm-router cannot: +# 1. ZMQ service discovery — prefill/decode workers register their RDMA ports +# 2. 
Request enrichment — injects remote endpoint info into kv_transfer_params
+
+import asyncio
+import copy
+import logging
+import os
+import re
+import socket
+import threading
+import time
+import uuid
+
+import aiohttp
+import msgpack
+import zmq
+from quart import Quart, make_response, request
+
+logger = logging.getLogger("moriio_proxy")
+logger.setLevel(logging.DEBUG)
+handler = logging.StreamHandler()
+handler.setFormatter(logging.Formatter(
+    "%(asctime)s %(levelname)s [%(name)s] %(message)s"))
+logger.addHandler(handler)
+
+# Registered workers, appended to by the ZMQ discovery thread and read by the
+# HTTP handlers. Guard every access with _list_lock.
+prefill_instances: list[dict] = []
+decode_instances: list[dict] = []
+# Monotonic request counter used for round-robin instance selection.
+request_nums = 0
+app = Quart(__name__)
+
+# Seconds a decode stream may stay silent before the proxy aborts it.
+STREAM_IDLE_TIMEOUT = int(os.environ.get("PROXY_STREAM_IDLE_TIMEOUT", "300"))
+
+IP_PORT_PATTERN = re.compile(r"//(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d+)")
+
+# Transfer mode announced by the first registered worker; every later worker
+# must announce the same mode.
+TRANSFER_TYPE = None
+
+# Total timeout (seconds) for upstream prefill/decode HTTP calls. The value is
+# kept from the original code; it is large enough to act as "no deadline".
+UPSTREAM_TIMEOUT_S = 6 * 6000 * 6000
+
+# Strong references to in-flight prefill tasks. asyncio only keeps weak
+# references to tasks, so an un-awaited task could otherwise be garbage
+# collected before it finishes.
+_background_tasks: set = set()
+
+
+def _append_whole_dict_unique(target_list, data_dict):
+    """Append ``data_dict`` to ``target_list`` unless an equal entry exists.
+
+    Equality ignores the ``"index"`` key. The first successful registration
+    fixes the global ``TRANSFER_TYPE``.
+
+    Returns:
+        True if the entry was appended, False if it was a duplicate.
+
+    Raises:
+        ValueError: if the entry announces a transfer mode different from the
+            one already established. The entry is NOT appended in that case.
+    """
+    global TRANSFER_TYPE
+
+    new_filtered = {k: v for k, v in data_dict.items() if k != "index"}
+    for existed in target_list:
+        existed_filtered = {k: v for k, v in existed.items() if k != "index"}
+        if existed_filtered == new_filtered:
+            return False
+
+    # Validate before mutating any state so a rejected worker is never routed to.
+    transfer_mode = data_dict.get("transfer_mode", "unknown")
+    if TRANSFER_TYPE is not None and transfer_mode != TRANSFER_TYPE:
+        raise ValueError(f"mismatched transfer mode {TRANSFER_TYPE} vs {transfer_mode}")
+
+    logger.info("Registered instance: role=%s addr=%s hs_port=%s notify=%s dp=%s tp=%s",
+                data_dict.get("role"), data_dict.get("request_address"),
+                data_dict.get("handshake_port"), data_dict.get("notify_port"),
+                data_dict.get("dp_size"), data_dict.get("tp_size"))
+    target_list.append(data_dict)
+
+    if TRANSFER_TYPE is None:
+        TRANSFER_TYPE = transfer_mode
+        logger.info("Transfer mode set to: %s", TRANSFER_TYPE)
+
+    return True
+
+
+_list_lock = threading.RLock()
+
+
+def _listen_for_register(hostname, port):
+    """Blocking loop that receives worker registrations over a ZMQ ROUTER socket.
+
+    Messages are msgpack maps. ``type == "register"`` with ``role`` "P" or "D"
+    adds the worker to the prefill or decode list; every other message type
+    (including "HELLO") is ignored. A bad message is logged and skipped so that
+    one faulty worker cannot terminate service discovery.
+    """
+    context = zmq.Context()
+    router_socket = context.socket(zmq.ROUTER)
+    router_socket.bind(f"tcp://{hostname}:{port}")
+    poller = zmq.Poller()
+    poller.register(router_socket, zmq.POLLIN)
+
+    while True:
+        socks = dict(poller.poll())
+        if router_socket not in socks:
+            continue
+        try:
+            # Expected framing is [sender identity, payload].
+            remote_addr, msg = router_socket.recv_multipart()
+            data = msgpack.loads(msg)
+            if data["type"] != "register":
+                continue
+            if "request_address" not in data:
+                logger.warning("Ignoring registration without request_address: %r", data)
+                continue
+            if data["role"] == "P":
+                target = prefill_instances
+            elif data["role"] == "D":
+                target = decode_instances
+            else:
+                continue
+            # De-duplication is done on the whole record by the helper.
+            with _list_lock:
+                _append_whole_dict_unique(target, data)
+        except Exception:
+            logger.exception("Ignoring malformed or rejected registration message")
+
+
+def start_service_discovery(hostname, port):
+    """Start the registration listener in a daemon thread and return the thread.
+
+    Raises:
+        ValueError: if ``port`` is 0.
+    """
+    if not hostname:
+        hostname = socket.gethostname()
+    if port == 0:
+        raise ValueError("Port cannot be 0")
+
+    _listener_thread = threading.Thread(
+        target=_listen_for_register, args=(hostname, port), daemon=True
+    )
+    _listener_thread.start()
+    logger.info("Service discovery listening on %s:%s", hostname, port)
+    return _listener_thread
+
+
+def _upstream_headers(request_id):
+    """Headers for a request forwarded to a worker.
+
+    The Authorization header is only sent when OPENAI_API_KEY is set, instead of
+    forwarding the literal string "Bearer None".
+    """
+    headers = {"X-Request-Id": request_id}
+    api_key = os.environ.get("OPENAI_API_KEY")
+    if api_key:
+        headers["Authorization"] = f"Bearer {api_key}"
+    return headers
+
+
+def _extract_ip_port(url):
+    """Return ``(ip, port)`` as strings parsed from ``scheme://a.b.c.d:port``.
+
+    Raises:
+        ValueError: if ``url`` does not contain a dotted-quad host and a port.
+    """
+    match = IP_PORT_PATTERN.search(url)
+    if not match:
+        raise ValueError(f"Invalid URL format: {url}")
+    return match.groups()
+
+
+def _track_background_task(task, request_id):
+    """Keep ``task`` alive until it finishes and log its failure, if any."""
+    _background_tasks.add(task)
+
+    def _on_done(finished):
+        _background_tasks.discard(finished)
+        if finished.cancelled():
+            return
+        # Retrieving the exception also silences asyncio's
+        # "Task exception was never retrieved" warning.
+        exc = finished.exception()
+        if exc is not None:
+            logger.error("Prefill request %s failed: %r", request_id, exc)
+
+    task.add_done_callback(_on_done)
+
+
+async def send_request_to_prefill(
+    endpoint, req_data, request_id, d_endpoint, dip, dport, selected_prefill_dp_rank
+):
+    """POST the prefill half of a request and return the worker's JSON reply.
+
+    The request is forced to non-streaming with a one-token budget, and its
+    ``kv_transfer_params`` are extended with the decode worker's handshake and
+    notify ports plus its host/port. ``req_data`` must already contain a
+    ``kv_transfer_params`` dict; the caller's objects are not modified.
+
+    Raises:
+        RuntimeError: if the prefill worker answers with a non-200 status.
+    """
+    # Copy the two levels that are written to, so the caller's dict is untouched.
+    req_data_copy = dict(req_data)
+    req_data_copy["kv_transfer_params"] = dict(req_data["kv_transfer_params"])
+
+    req_data_copy["kv_transfer_params"].update(
+        {
+            "do_remote_decode": True,
+            "do_remote_prefill": False,
+            "remote_handshake_port": d_endpoint["handshake_port"],
+            "remote_notify_port": d_endpoint["notify_port"],
+            "remote_engine_id": None,
+            "remote_block_ids": None,
+            "remote_host": dip,
+            "remote_port": dport,
+        }
+    )
+    req_data_copy["stream"] = False
+    req_data_copy["max_tokens"] = 1
+    if "max_completion_tokens" in req_data_copy:
+        req_data_copy["max_completion_tokens"] = 1
+    if "stream_options" in req_data_copy:
+        del req_data_copy["stream_options"]
+    async with aiohttp.ClientSession(
+        timeout=aiohttp.ClientTimeout(total=UPSTREAM_TIMEOUT_S)
+    ) as session:
+        headers = _upstream_headers(request_id)
+        if selected_prefill_dp_rank is not None:
+            headers["X-data-parallel-rank"] = str(selected_prefill_dp_rank)
+        async with session.post(
+            url=endpoint, json=req_data_copy, headers=headers
+        ) as response:
+            if response.status == 200:
+                return await response.json()
+            else:
+                raise RuntimeError(
+                    f"Prefill response status={response.status}"
+                )
+
+
+async def start_decode_request(endpoint, req_data, request_id):
+    """POST the decode half of a request and return ``(session, response)``.
+
+    On success the caller owns both objects and must release/close them
+    (``stream_decode_response`` does so). If the POST itself fails, the session
+    is closed here before the error propagates.
+    """
+    session = aiohttp.ClientSession(
+        timeout=aiohttp.ClientTimeout(total=UPSTREAM_TIMEOUT_S)
+    )
+    try:
+        response = await session.post(
+            url=endpoint, json=req_data, headers=_upstream_headers(request_id)
+        )
+    except BaseException:
+        await session.close()
+        raise
+    return session, response
+
+
+async def stream_decode_response(session, response, request_id):
+    """Yield the decode response body in chunks of up to 1024 bytes.
+
+    The stream is abandoned (with an error log) when no chunk arrives within
+    STREAM_IDLE_TIMEOUT seconds. The response and session are always released.
+
+    Raises:
+        RuntimeError: if ``response`` does not have status 200.
+    """
+    try:
+        if response.status == 200:
+            chunk_iter = response.content.iter_chunked(1024).__aiter__()
+            while True:
+                try:
+                    chunk_bytes = await asyncio.wait_for(
+                        chunk_iter.__anext__(), timeout=STREAM_IDLE_TIMEOUT,
+                    )
+                    yield chunk_bytes
+                except StopAsyncIteration:
+                    break
+                except asyncio.TimeoutError:
+                    logger.error(
+                        "Decode stream %s idle for %ds, aborting",
+                        request_id, STREAM_IDLE_TIMEOUT,
+                    )
+                    break
+        else:
+            raise RuntimeError(
+                f"Decode response status={response.status}"
+            )
+    finally:
+        await response.release()
+        await session.close()
+
+
+@app.route("/health", methods=["GET"])
+async def health_check():
+    """Report liveness plus the number of registered prefill/decode workers."""
+    with _list_lock:
+        p_count = len(prefill_instances)
+        d_count = len(decode_instances)
+    return await make_response(
+        ({"status": "ok", "prefill_instances": p_count, "decode_instances": d_count}, 200)
+    )
+
+
+@app.route("/v1/completions", methods=["POST"])
+@app.route("/v1/chat/completions", methods=["POST"])
+async def handle_request():
+    """Split one completion request into a prefill call and a decode call.
+
+    A prefill and a decode worker are chosen round-robin. The prefill worker
+    receives a one-token copy of the request; the decode worker receives the
+    original request with ``kv_transfer_params`` pointing at the prefill worker,
+    and its response is streamed back to the client.
+
+    Responses: 400 for a body that is not a JSON object, 503 when no worker of
+    either role is registered, the upstream status when the decode worker
+    rejects the request, and 500 for any other failure.
+    """
+    global request_nums
+    try:
+        req_data = await request.get_json()
+        if not isinstance(req_data, dict):
+            return await make_response(
+                ("Bad Request: a JSON object body is required.", 400)
+            )
+        request_id = str(uuid.uuid4())
+
+        # Read the counter and both endpoints in one critical section so the
+        # selection is consistent with what the discovery thread has published.
+        with _list_lock:
+            request_nums += 1
+            seq = request_nums
+            prefill_instance_endpoint = (
+                prefill_instances[seq % len(prefill_instances)]
+                if prefill_instances
+                else None
+            )
+            decode_instance_endpoint = (
+                decode_instances[seq % len(decode_instances)]
+                if decode_instances
+                else None
+            )
+
+        if prefill_instance_endpoint is None or decode_instance_endpoint is None:
+            return await make_response(
+                ("Service Unavailable: No prefill or decode instances registered.", 503)
+            )
+
+        selected_prefill_dp_rank = None
+        if prefill_instance_endpoint["dp_size"] > 1:
+            selected_prefill_dp_rank = seq % prefill_instance_endpoint["dp_size"]
+
+        dip, dport = _extract_ip_port(decode_instance_endpoint["request_address"])
+        ip, port = _extract_ip_port(prefill_instance_endpoint["request_address"])
+
+        req_data_to_prefill = copy.deepcopy(req_data)
+        req_data_to_prefill["kv_transfer_params"] = {
+            "transfer_id": request_id,
+            "remote_dp_size": decode_instance_endpoint["dp_size"],
+            "remote_tp_size": decode_instance_endpoint["tp_size"],
+        }
+
+        send_prefill_task = asyncio.create_task(
+            send_request_to_prefill(
+                prefill_instance_endpoint["request_address"],
+                req_data_to_prefill,
+                request_id,
+                decode_instance_endpoint,
+                dip,
+                dport,
+                selected_prefill_dp_rank,
+            )
+        )
+        # Outside READ mode the task is never awaited below, so it needs a strong
+        # reference and a place for its failure to be reported.
+        _track_background_task(send_prefill_task, request_id)
+
+        # The prefill side is given a budget of one token, so the decode budget
+        # is reduced by one. Requests that carry no integer max_tokens are
+        # forwarded unchanged instead of failing with KeyError.
+        if isinstance(req_data.get("max_tokens"), int):
+            req_data["max_tokens"] -= 1
+
+        req_data["kv_transfer_params"] = {
+            "transfer_id": request_id,
+            "do_remote_decode": False,
+            "do_remote_prefill": True,
+            "remote_handshake_port": prefill_instance_endpoint["handshake_port"],
+            "remote_notify_port": prefill_instance_endpoint["notify_port"],
+            "remote_engine_id": None,
+            "remote_block_ids": None,
+            "remote_host": ip,
+            "remote_port": port,
+        }
+        if TRANSFER_TYPE == "READ":
+            # In READ mode the decode request needs the engine id and block ids
+            # returned by prefill, so prefill must complete first.
+            prefill_response = await send_prefill_task
+            req_data["kv_transfer_params"]["remote_engine_id"] = prefill_response[
+                "kv_transfer_params"
+            ]["remote_engine_id"]
+            req_data["kv_transfer_params"]["remote_block_ids"] = prefill_response[
+                "kv_transfer_params"
+            ]["remote_block_ids"]
+
+        req_data["kv_transfer_params"]["remote_dp_size"] = prefill_instance_endpoint[
+            "dp_size"
+        ]
+        req_data["kv_transfer_params"]["remote_tp_size"] = prefill_instance_endpoint[
+            "tp_size"
+        ]
+
+        if selected_prefill_dp_rank is not None:
+            req_data["kv_transfer_params"]["remote_dp_rank"] = selected_prefill_dp_rank
+
+        session, decode_response = await start_decode_request(
+            decode_instance_endpoint["request_address"], req_data, request_id
+        )
+
+        if decode_response.status != 200:
+            # Surface the upstream failure as an HTTP error now rather than as
+            # an exception raised in the middle of a streamed response.
+            status = decode_response.status
+            try:
+                detail = await decode_response.text()
+            finally:
+                await decode_response.release()
+                await session.close()
+            logger.error("Decode request %s rejected with status=%s", request_id, status)
+            return await make_response((detail, status))
+
+        stream_generator = stream_decode_response(session, decode_response, request_id)
+        response = await make_response(stream_generator)
+        return response
+    except Exception as e:
+        logger.exception("Error handling request: %s", e)
+        return await make_response((f"Internal Server Error: {e!s}", 500))
+
+
+if __name__ == "__main__":
+    http_port = int(os.environ.get("PROXY_HTTP_PORT", "30000"))
+    ping_port = int(os.environ.get("PROXY_PING_PORT", "36367"))
+
+    t = start_service_discovery("0.0.0.0", ping_port)
+    app.debug = False
+    app.config["BODY_TIMEOUT"] = 360000
+    app.config["RESPONSE_TIMEOUT"] = 360000
+
+    logger.info("MoRI-IO proxy starting: HTTP=%d, ZMQ=%d", http_port, ping_port)
+    app.run(host="0.0.0.0", port=http_port)
+    t.join()
diff --git a/benchmarks/multi_node/amd_utils/patches/minimax_m2.py b/benchmarks/multi_node/amd_utils/patches/minimax_m2.py
new file mode 100644
index 000000000..8290276fb
--- /dev/null
+++ b/benchmarks/multi_node/amd_utils/patches/minimax_m2.py
@@ -0,0 +1,672 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Copyright 2025 The MiniMax AI team.
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only MiniMaxM2/M2.5 model.""" + +from collections.abc import Iterable +from typing import Any + +import torch +from torch import nn +from transformers import PretrainedConfig + +from vllm._aiter_ops import rocm_aiter_ops +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config +from vllm.distributed import ( + get_ep_group, + get_pp_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_gather, +) +from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention +from vllm.model_executor.layers.fused_moe import FusedMoE, GateLinear +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import ( + QKVParallelLinear, + RowParallelLinear, +) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.mamba.linear_attn import MiniMaxText01RMSNormTP +from vllm.model_executor.layers.quantization import QuantizationConfig +from 
vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from vllm.model_executor.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from vllm.model_executor.models.utils import sequence_parallel_chunk
+from vllm.sequence import IntermediateTensors
+
+from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP
+from .utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
+
+logger = init_logger(__name__)
+
+
+class MiniMaxM2MoE(nn.Module):
+    """MoE layer for MiniMax M2/M2.5 with EP/WideEP/Mori support.
+
+    Follows the DeepSeek V2 MoE pattern: GateLinear + FusedMoE with
+    expert parallelism, EPLB, and sequence parallel awareness.
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        """Build the router gate and the fused expert layer.
+
+        Args:
+            config: HF model config; reads num_local_experts,
+                num_experts_per_tok, hidden_size, intermediate_size and the
+                optional routed_scaling_factor, use_routing_bias, scoring_func.
+            quant_config: quantization config forwarded to FusedMoE.
+            prefix: parameter-name prefix of this layer.
+        """
+        super().__init__()
+        vllm_config = get_current_vllm_config()
+        parallel_config = vllm_config.parallel_config
+
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = get_tensor_model_parallel_rank()
+
+        self.ep_group = get_ep_group().device_group
+        self.ep_rank = get_ep_group().rank_in_group
+        self.ep_size = self.ep_group.size()
+
+        self.n_routed_experts: int = config.num_local_experts
+        # This layer defines no shared experts.
+        self.n_shared_experts: int = 0
+
+        self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe
+        self.routed_scaling_factor = getattr(config, "routed_scaling_factor", 1.0)
+        self.is_rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled()
+
+        # Expert bookkeeping for EPLB: physical = logical + redundant experts,
+        # divided evenly (integer division) across the EP group.
+        eplb_config = parallel_config.eplb_config
+        self.enable_eplb = parallel_config.enable_eplb
+        self.n_redundant_experts = eplb_config.num_redundant_experts
+        self.n_logical_experts = self.n_routed_experts
+        self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts
+        self.n_local_physical_experts = self.n_physical_experts // self.ep_size
+
+        # Optional per-expert routing bias, stored in float32 and loaded through
+        # ebias_weight_loader.
+        self.use_routing_bias = getattr(config, "use_routing_bias", False)
+        if self.use_routing_bias:
+            self.e_score_correction_bias = nn.Parameter(
+                torch.empty(config.num_local_experts, dtype=torch.float32)
+            )
+            self.e_score_correction_bias.weight_loader = (
+                MiniMaxM2MoE.ebias_weight_loader
+            )
+        else:
+            self.e_score_correction_bias = None
+
+        self.gate = GateLinear(
+            config.hidden_size,
+            config.num_local_experts,
+            out_dtype=torch.float32,
+            prefix=f"{prefix}.gate",
+        )
+
+        # routed_scaling_factor is handed to FusedMoE only when the ROCm AITER
+        # fused-MoE path is enabled; otherwise FusedMoE gets 1.0 and forward()
+        # applies the factor itself.
+        self.experts = FusedMoE(
+            num_experts=config.num_local_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            reduce_results=False,
+            renormalize=True,
+            scoring_func=getattr(config, "scoring_func", "softmax"),
+            e_score_correction_bias=self.e_score_correction_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.experts",
+            enable_eplb=self.enable_eplb,
+            num_redundant_experts=self.n_redundant_experts,
+            is_sequence_parallel=self.is_sequence_parallel,
+            router_logits_dtype=torch.float32,
+            gate=self.gate,
+            routed_scaling_factor=1.0
+            if not self.is_rocm_aiter_moe_enabled
+            else self.routed_scaling_factor,
+        )
+
+    @staticmethod
+    def ebias_weight_loader(param: nn.Parameter, loaded_weight: torch.Tensor) -> None:
+        """Copy ``loaded_weight`` into ``param`` as float32; shapes must match."""
+        assert param.size() == loaded_weight.size()
+        param.data.copy_(loaded_weight.to(torch.float32))
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Route tokens through the experts.
+
+        ``hidden_states`` is unpacked as a 2-D ``(num_tokens, hidden_dim)``
+        tensor and a tensor of the same shape is returned.
+        """
+        num_tokens, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+
+        if self.is_sequence_parallel:
+            hidden_states = sequence_parallel_chunk(hidden_states)
+
+        if self.experts.is_internal_router:
+            # NOTE(review): hidden_states is passed in the router_logits slot;
+            # presumably FusedMoE then computes the logits itself from the
+            # `gate` given at construction. Confirm against FusedMoE.
+            final_hidden_states = self.experts(
+                hidden_states=hidden_states, router_logits=hidden_states
+            )
+        else:
+            router_logits, _ = self.gate(hidden_states)
+            final_hidden_states = self.experts(
+                hidden_states=hidden_states, router_logits=router_logits
+            )
+
+        # Apply the scaling factor here only when it was not given to FusedMoE
+        # (non-AITER path). float16 inputs are left unscaled by this branch.
+        if hidden_states.dtype != torch.float16:
+            if not self.is_rocm_aiter_moe_enabled:
+                final_hidden_states = final_hidden_states * self.routed_scaling_factor
+
+        if self.is_sequence_parallel:
+            # Undo the chunking: gather along dim 0 and trim back to the
+            # original token count.
+            final_hidden_states = tensor_model_parallel_all_gather(
+                final_hidden_states, 0
+            )
+            final_hidden_states = final_hidden_states[:num_tokens]
+        elif self.tp_size > 1:
+            final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel(
+                final_hidden_states
+            )
+
+        return final_hidden_states.view(num_tokens, hidden_dim)
+
+
+class MiniMaxM2Attention(nn.Module):
+    """Self-attention block: fused QKV projection, RMS norm on Q and K,
+    rotary embedding, attention, and output projection."""
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        rotary_dim: int,
+        rope_parameters: dict[str, Any] | None = None,
+        attn_window_size: int | None = None,
+        max_position_embeddings: int = 8192,
+        head_dim: int | None = None,
+        rms_norm_eps: float = 1e-06,
+        qkv_bias: bool = False,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        """Create the projections, norms, rotary embedding and attention op.
+
+        ``num_heads`` must be divisible by the tensor-parallel size. If
+        ``head_dim`` is None or 0 it falls back to ``hidden_size // num_heads``.
+        """
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = head_dim or (hidden_size // self.total_num_heads)
+        # Per-rank widths of the Q and K/V slices of the fused QKV output.
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=qkv_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        # Fill in partial_rotary_factor from rotary_dim when the config did not
+        # provide it. This writes into the dict object passed by the caller.
+        if (
+            rope_parameters is not None
+            and "partial_rotary_factor" not in rope_parameters
+        ):
+            rope_parameters["partial_rotary_factor"] = rotary_dim / self.head_dim
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            max_position=max_position_embeddings,
+            rope_parameters=rope_parameters,
+        )
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            per_layer_sliding_window=attn_window_size,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
+
+        # The norms are constructed with the total (not per-rank) head counts.
+        # NOTE(review): presumably MiniMaxText01RMSNormTP shards internally by
+        # TP rank. Confirm against its implementation.
+        self.q_norm = MiniMaxText01RMSNormTP(
+            self.head_dim * self.total_num_heads, eps=rms_norm_eps
+        )
+        self.k_norm = MiniMaxText01RMSNormTP(
+            self.head_dim * self.total_num_kv_heads, eps=rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        """Run attention over ``hidden_states`` and return the projected output."""
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        # Q and K are normalized before the rotary embedding is applied.
+        q, k = MiniMaxText01RMSNormTP.forward_qk(
+            self.q_norm, self.k_norm, q.contiguous(), k.contiguous()
+        )
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class MiniMaxM2DecoderLayer(nn.Module):
+    """One decoder layer: pre-norm self-attention followed by a pre-norm MoE
+    block, with the residual stream carried alongside the hidden states."""
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        prefix: str,
+        model_config: ModelConfig,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+    ) -> None:
+        """Build the attention, MoE and layer-norm modules for this layer.
+
+        ``prefix`` must end in ``.<layer index>``; the index is parsed from it.
+        """
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        if hasattr(config, "max_model_len") and isinstance(config.max_model_len, int):
+            # NOTE(review): this reads config.max_position_embeddings directly,
+            # unlike the getattr-with-default on the line above.
+            max_position_embeddings = max(
+                config.max_position_embeddings, config.max_model_len
+            )
+        # DecoderLayers are created with `make_layers` which passes the prefix
+        # with the layer's index.
+        layer_idx = int(prefix.split(sep=".")[-1])
+
+        self.layer_idx = layer_idx
+        self.self_attn = MiniMaxM2Attention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            rotary_dim=config.rotary_dim,
+            rope_parameters=config.rope_parameters,
+            max_position_embeddings=max_position_embeddings,
+            rms_norm_eps=config.rms_norm_eps,
+            qkv_bias=getattr(config, "attention_bias", False),
+            head_dim=getattr(config, "head_dim", None),
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+        )
+
+        # NOTE(review): the attribute is `block_sparse_moe` but the prefix
+        # passed down ends in `.mlp`. Confirm this is the intended name.
+        self.block_sparse_moe = MiniMaxM2MoE(
+            config=config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor | None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Apply attention then MoE; returns ``(hidden_states, residual)``.
+
+        When ``residual`` is None, the incoming ``hidden_states`` become the
+        residual.
+        """
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+
+        hidden_states = self.block_sparse_moe(hidden_states)
+
+        return hidden_states, residual
+
+
+@support_torch_compile
+class MiniMaxM2Model(nn.Module):
+    fall_back_to_pt_during_load = False
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        model_config = vllm_config.model_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+        self.config = config
+
+        self.vocab_size = config.vocab_size
+
+        if get_pp_group().is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=None,
+                prefix=f"{prefix}.embed_tokens",
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        self.start_layer, self.end_layer, self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda prefix: MiniMaxM2DecoderLayer(
+                config,
+                prefix,
+                model_config=model_config,
+                cache_config=cache_config,
+                quant_config=quant_config,
+            ),
+            prefix=f"{prefix}.layers",
+        )
+
+        if get_pp_group().is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer()
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        if get_pp_group().is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds
+            else:
+                hidden_states = self.embed_input_ids(input_ids)
+            residual = None
+        else:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+            residual = 
intermediate_tensors["residual"] + + for layer in self.layers[self.start_layer : self.end_layer]: + hidden_states, residual = layer(positions, hidden_states, residual) + + if not get_pp_group().is_last_rank: + return IntermediateTensors( + {"hidden_states": hidden_states, "residual": residual} + ) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + return FusedMoE.make_expert_params_mapping( + self, + ckpt_gate_proj_name="w1", + ckpt_down_proj_name="w2", + ckpt_up_proj_name="w3", + num_experts=self.config.num_local_experts, + num_redundant_experts=0, + ) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + expert_params_mapping = self.get_expert_mapping() + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + + spec_layer = get_spec_layer_idx_from_weight_name(self.config, name) + if spec_layer is not None: + continue # skip spec decode layers for main model + + for param_name, weight_name, shard_id in stacked_params_mapping: + # Skip non-stacked layers and experts (experts handled below). + if weight_name not in name: + continue + # We have mlp.experts[0].gate_proj in the checkpoint. + # Since we handle the experts below in expert_params_mapping, + # we need to skip here BEFORE we update the name, otherwise + # name will be updated to mlp.experts[0].gate_up_proj, which + # will then be updated below in expert_params_mapping + # for mlp.experts[0].gate_gate_up_proj, which breaks load. + if ("mlp.experts." 
in name) and name not in params_dict: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader( + param, + loaded_weight, + name, + shard_id=shard_id, + expert_id=expert_id, + ) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = getattr( + param, "weight_loader", default_weight_loader + ) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +class MiniMaxM2MixtureOfExperts(MixtureOfExperts): + """EPLB protocol implementation for MiniMax M2/M2.5.""" + + moe_mlp_layers: list[MiniMaxM2MoE] + + def extract_moe_parameters(self, example_moe: MiniMaxM2MoE | None): + if example_moe is None: + self.num_moe_layers = 0 + self.num_expert_groups = 0 + self.num_logical_experts = 0 + self.num_physical_experts = 0 + self.num_local_physical_experts = 0 + self.num_routed_experts = 0 + self.num_shared_experts = 0 + self.num_redundant_experts = 0 + logger.warning("MiniMax M2: No MoE layer found in model.layers.") + else: + self.num_logical_experts = example_moe.n_logical_experts + self.num_physical_experts = 
example_moe.n_physical_experts + self.num_local_physical_experts = example_moe.n_local_physical_experts + self.num_routed_experts = example_moe.n_routed_experts + self.num_shared_experts = example_moe.n_shared_experts + self.num_redundant_experts = example_moe.n_redundant_experts + + def update_physical_experts_metadata( + self, + num_physical_experts: int, + num_local_physical_experts: int, + ) -> None: + assert self.num_local_physical_experts == num_local_physical_experts + self.num_physical_experts = num_physical_experts + self.num_local_physical_experts = num_local_physical_experts + self.num_redundant_experts = num_physical_experts - self.num_logical_experts + for moe in self.moe_mlp_layers: + moe.n_local_physical_experts = num_local_physical_experts + moe.n_physical_experts = num_physical_experts + moe.n_redundant_experts = self.num_redundant_experts + moe.experts.update_expert_map() + + +class MiniMaxM2ForCausalLM( + nn.Module, SupportsLoRA, SupportsPP, MiniMaxM2MixtureOfExperts +): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.config = config + self.quant_config = quant_config + if hasattr(vllm_config.model_config, "max_model_len"): + self.config.max_model_len = vllm_config.model_config.max_model_len + self.model = MiniMaxM2Model( + vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") + ) + if get_pp_group().is_last_rank: + self.lm_head = ParallelLMHead( + config.vocab_size, config.hidden_size, quant_config=None + ) + else: + self.lm_head = PPMissingLayer() + self.logits_processor = LogitsProcessor(config.vocab_size) + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors + ) + + self.num_moe_layers = config.num_hidden_layers + self._set_moe_parameters() + + def _set_moe_parameters(self): + 
self.expert_weights: list = [] + self.num_expert_groups = 1 + self.moe_layers: list = [] + self.moe_mlp_layers: list[MiniMaxM2MoE] = [] + example_moe = None + for layer in self.model.layers: + if isinstance(layer, PPMissingLayer): + continue + assert isinstance(layer, MiniMaxM2DecoderLayer) + if isinstance(layer.block_sparse_moe, MiniMaxM2MoE): + example_moe = layer.block_sparse_moe + self.moe_mlp_layers.append(layer.block_sparse_moe) + self.moe_layers.append(layer.block_sparse_moe.experts) + self.extract_moe_parameters(example_moe) + + def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.embed_input_ids(input_ids) + + def forward( + self, + input_ids: torch.Tensor | None, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + **kwargs, + ) -> torch.Tensor | IntermediateTensors: + hidden_states = self.model( + input_ids, positions, intermediate_tensors, inputs_embeds + ) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + ) -> torch.Tensor | None: + logits = self.logits_processor(self.lm_head, hidden_states) + return logits + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self) + return loader.load_weights(weights) + + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + return self.model.get_expert_mapping() + + +def get_spec_layer_idx_from_weight_name( + config: PretrainedConfig, weight_name: str +) -> int | None: + if hasattr(config, "num_mtp_modules") and (config.num_mtp_modules > 0): + layer_idx = config.num_hidden_layers + for i in range(config.num_mtp_modules): + if weight_name.startswith(f"model.layers.{layer_idx + i}."): + return layer_idx + i + return None diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index bbe8de6aa..3c92422be 100755 --- 
a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -1,63 +1,23 @@ #!/bin/bash -# SGLang Disaggregated Server Launcher with Model-Specific Configurations +# Dual-Engine Disaggregated Server Dispatcher # ============================================================================= - -# ============================================================================= -# Environment Configuration +# Dispatches to the engine-specific server launcher based on ENGINE env var. +# ENGINE=sglang (default) -> server_sglang.sh (SGLang + MoRI) +# ENGINE=vllm -> server_vllm.sh (vLLM + Nixl/MoRI-IO) # ============================================================================= -NODE0_ADDR="${NODE0_ADDR:-localhost}" -NODE_RANK="${NODE_RANK:-0}" -MODEL_DIR="${MODEL_DIR:-}" -MODEL_NAME="${MODEL_NAME:-}" - -xP="${xP:-1}" #-> Number of Prefill Workers -yD="${yD:-1}" #-> Number of Decode Workers - -IPADDRS="${IPADDRS:-localhost}" -HEADNODE_PORT="${HEADNODE_PORT:-20000}" -# Parallelism Configuration -PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}" -PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-true}" -PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-true}" -DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" -DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-true}" -DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-true}" -DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-0}" - -# Benchmark Configuration -BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}" -BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}" -BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}" -BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" -BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" -BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" - -# Dry Run for debugging purpose -DRY_RUN="${DRY_RUN:-0}" - -# GPU count (expandable for different hardware) -GPUS_PER_NODE="${GPUS_PER_NODE:-8}" - - -# ============================================================================= -# Dependencies and Environment Setup -# 
============================================================================= -source $SGLANG_WS_PATH/env.sh +ENGINE="${ENGINE:-sglang}" +WS_PATH="${WS_PATH:-${SGLANG_WS_PATH:-${VLLM_WS_PATH:-$(dirname "${BASH_SOURCE[0]}")}}}" +export WS_PATH ENGINE -host_ip=$(ip route get 1.1.1.1 | awk '/src/ {print $7}') -host_name=$(hostname) +echo "[DISPATCHER] ENGINE=$ENGINE WS_PATH=$WS_PATH" -# MORI_RDMA_TC configuration (optional) -# If set by runner, use it for RDMA traffic class configuration -# If not set, RDMA operations will proceed without QoS/traffic class settings -if [[ -n "${MORI_RDMA_TC}" ]]; then - echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC for RDMA traffic class configuration" - echo "[INFO] Host '$host_name' configured with MORI_RDMA_TC=$MORI_RDMA_TC" +if [[ "$ENGINE" == "vllm" ]]; then + source "$WS_PATH/server_vllm.sh" else - echo "[INFO] MORI_RDMA_TC not set. Skipping RDMA traffic class configuration." - echo "[INFO] This is normal for clusters without QoS requirements." + source "$WS_PATH/server_sglang.sh" fi +<<<<<<< HEAD # ============================================================================= # Model-Specific Configuration from YAML @@ -759,3 +719,5 @@ fi echo "Script completed successfully" exit 0 +======= +>>>>>>> 766ba4ee (consolidate amd_utils for sglang and vllm) diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh new file mode 100755 index 000000000..53ca29cc5 --- /dev/null +++ b/benchmarks/multi_node/amd_utils/server_sglang.sh @@ -0,0 +1,624 @@ +#!/bin/bash +# SGLang Disaggregated Server Launcher with Model-Specific Configurations +# ============================================================================= + +# ============================================================================= +# Environment Configuration +# ============================================================================= + +NODE0_ADDR="${NODE0_ADDR:-localhost}" +NODE_RANK="${NODE_RANK:-0}" 
+MODEL_DIR="${MODEL_DIR:-}" +MODEL_NAME="${MODEL_NAME:-}" + +xP="${xP:-1}" #-> Number of Prefill Workers +yD="${yD:-1}" #-> Number of Decode Workers + +IPADDRS="${IPADDRS:-localhost}" +HEADNODE_PORT="${HEADNODE_PORT:-20000}" +# Parallelism Configuration +PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}" +PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-true}" +PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-true}" +DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" +DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-true}" +DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-true}" +DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-0}" + +# Benchmark Configuration +BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}" +BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}" +BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}" +BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" +BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" +BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" + +# Dry Run for debugging purpose +DRY_RUN="${DRY_RUN:-0}" + +# GPU count (expandable for different hardware) +GPUS_PER_NODE="${GPUS_PER_NODE:-8}" + + +# ============================================================================= +# Dependencies and Environment Setup +# ============================================================================= +source $WS_PATH/env.sh + +host_ip=$(ip route get 1.1.1.1 | awk '/src/ {print $7}') +host_name=$(hostname) + +# MORI_RDMA_TC configuration (optional) +# If set by runner, use it for RDMA traffic class configuration +# If not set, RDMA operations will proceed without QoS/traffic class settings +if [[ -n "${MORI_RDMA_TC}" ]]; then + echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC for RDMA traffic class configuration" + echo "[INFO] Host '$host_name' configured with MORI_RDMA_TC=$MORI_RDMA_TC" +else + echo "[INFO] MORI_RDMA_TC not set. Skipping RDMA traffic class configuration." + echo "[INFO] This is normal for clusters without QoS requirements." 
+fi + +# ============================================================================= +# Model-Specific Configuration from YAML +# ============================================================================= +MODELS_YAML="${WS_PATH}/models.yaml" + +if [[ ! -f "$MODELS_YAML" ]]; then + echo "ERROR: models.yaml not found at $MODELS_YAML" + exit 1 +fi + +# Load model config via inline Python (PyYAML is available in SGLang containers) +# Formula evaluation (e.g. "SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK * TP * xP") +# is done here in Python to avoid bash glob-expanding the * characters. +eval "$(python3 -c " +import yaml, sys, os + +config_path = '${MODELS_YAML}' +model_name = '${MODEL_NAME}' + +with open(config_path) as f: + models = yaml.safe_load(f) + +if model_name not in models: + print(f'echo \"ERROR: Model {model_name} not in models.yaml\"; exit 1') + sys.exit(0) + +m = models[model_name] + +def eval_formula(val): + \"\"\"Evaluate chunked_prefill_size: if string, resolve variable names from env and compute.\"\"\" + if isinstance(val, (int, float)): + return int(val) + s = str(val) + # Build a namespace from env vars (convert numeric values to int) + ns = {} + for k, v in os.environ.items(): + try: + ns[k] = int(v) + except (ValueError, TypeError): + pass + try: + return int(eval(s, {'__builtins__': {}}, ns)) + except Exception as e: + print(f'echo \"WARNING: Cannot evaluate formula: {s} ({e})\"', file=sys.stderr) + return val + +def parse_range(cuda_range, default_start, default_end): + if '-' in str(cuda_range): + s, e = str(cuda_range).split('-') + return s, e + return str(default_start), str(default_end) + +# Output shell variables +print(f'MODEL_BASE_FLAGS=\"{m.get(\"base_flags\", \"\")}\"') +print(f'MODEL_MTP_FLAGS=\"{m.get(\"mtp_flags\", \"\")}\"') +print(f'MODEL_DP_FLAGS=\"{m.get(\"dp_flags\", \"\")}\"') + +prefill = m.get('prefill', {}) +decode = m.get('decode', {}) + +print(f'PREFILL_MEM_FRACTION_STATIC=\"{prefill.get(\"mem_fraction_static\", 
0.8)}\"') +print(f'PREFILL_DISABLE_RADIX_CACHE=\"{prefill.get(\"disable_radix_cache\", True)}\"') + +dp = prefill.get('dp', {}) +no_dp = prefill.get('no_dp', {}) +print(f'PREFILL_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 24)}\"') +print(f'PREFILL_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"') +print(f'PREFILL_CUDA_GRAPH_BS_DP=\"{dp.get(\"cuda_graph_bs\", \"1 2 3\")}\"') +print(f'PREFILL_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"') +print(f'PREFILL_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"') +s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128) +print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"') +print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"') + +print(f'DECODE_MEM_FRACTION_STATIC=\"{decode.get(\"mem_fraction_static\", 0.85)}\"') +print(f'DECODE_PREFILL_ROUND_ROBIN_BALANCE=\"{decode.get(\"prefill_round_robin_balance\", True)}\"') + +dp = decode.get('dp', {}) +ep_only = decode.get('ep_only', {}) +no_dp = decode.get('no_dp', {}) + +# Decode DP config +print(f'DECODE_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 4096)}\"') +print(f'DECODE_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"') +s, e = parse_range(dp.get('cuda_graph_bs_range', '1-160'), 1, 160) +print(f'DECODE_CUDA_GRAPH_BS_DP_START=\"{s}\"') +print(f'DECODE_CUDA_GRAPH_BS_DP_END=\"{e}\"') + +# Decode EP-only config (EP enabled but DP disabled) +print(f'DECODE_MAX_RUNNING_REQUESTS_EP_ONLY=\"{ep_only.get(\"max_running_requests\", 256)}\"') +print(f'DECODE_CHUNKED_PREFILL_SIZE_EP_ONLY=\"{eval_formula(ep_only.get(\"chunked_prefill_size\", 262144))}\"') +s, e = parse_range(ep_only.get('cuda_graph_bs_range', '1-256'), 1, 256) +print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_START=\"{s}\"') +print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_END=\"{e}\"') + +# Decode no-DP config 
+print(f'DECODE_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"') +print(f'DECODE_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"') +s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128) +print(f'DECODE_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"') +print(f'DECODE_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"') +")" + +echo "Loaded model configuration for: $MODEL_NAME" + +# Compute DP-dependent prefill parameters +if [[ "$PREFILL_ENABLE_DP" == "true" ]]; then + prefill_cuda_graph_bs=($PREFILL_CUDA_GRAPH_BS_DP) + prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_DP + prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_DP +else + prefill_cuda_graph_bs=($(seq $PREFILL_CUDA_GRAPH_BS_NO_DP_START $PREFILL_CUDA_GRAPH_BS_NO_DP_END)) + prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_NO_DP + prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_NO_DP +fi + +# Compute DP-dependent decode parameters (3-way: DP > EP-only > no_dp) +if [[ "$DECODE_ENABLE_DP" == "true" ]]; then + decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_DP_START $DECODE_CUDA_GRAPH_BS_DP_END)) + decode_max_running_requests=$((DECODE_CUDA_GRAPH_BS_DP_END * DECODE_TP_SIZE)) +elif [[ "$DECODE_ENABLE_EP" == "true" ]]; then + decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_EP_ONLY_START $DECODE_CUDA_GRAPH_BS_EP_ONLY_END)) + decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_EP_ONLY +else + decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_NO_DP_START $DECODE_CUDA_GRAPH_BS_NO_DP_END)) + decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP +fi + +# Use Decode configuration to configure different TP/DP size between P and D +PREFILL_DECODE_DIFFERENT_TP="" +if [[ "$PREFILL_ENABLE_DP" != "$DECODE_ENABLE_DP" ]]; then + if [[ "$DECODE_ENABLE_DP" == "true" ]]; then + PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp ${DECODE_TP_SIZE}" + else + 
PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp 1" + fi +fi + +# Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS) +PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} ${PREFILL_DECODE_DIFFERENT_TP}" +if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then + PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache" +fi + +DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]}" +if [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "True" ]] || [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "true" ]]; then + DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS --prefill-round-robin-balance" +fi + +if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then + MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) +fi + +# ============================================================================= +# Cluster Topology Configuration +# ============================================================================= +IFS=',' read -ra IP_ARRAY <<< "$IPADDRS" + +# Ceiling division by GPUS_PER_NODE for nodes-per-worker (rounds up for any GPUS_PER_NODE, not only 8) +PREFILL_NODES_PER_WORKER=$(((PREFILL_TP_SIZE + GPUS_PER_NODE - 1) / GPUS_PER_NODE)) +DECODE_NODES_PER_WORKER=$(((DECODE_TP_SIZE + GPUS_PER_NODE - 1) / GPUS_PER_NODE)) +NODE_OFFSET=$((PREFILL_NODES_PER_WORKER * xP)) + +# Build prefill arguments dynamically based on xP +PREFILL_HEADNODE_URLS=() +PREFILL_ARGS="" +for i in $(seq 0 $((xP - 1))); do + prefill_idx=$((i * PREFILL_NODES_PER_WORKER)) + PREFILL_HEADNODE_URLS[$i]="${IP_ARRAY[$prefill_idx]}:${HEADNODE_PORT}" + PREFILL_ARGS="$PREFILL_ARGS --prefill http://${IP_ARRAY[$prefill_idx]}:8000" +done + +# Build 
decode arguments dynamically based on yD +DECODE_HEADNODE_URLS=() +DECODE_ARGS="" +for i in $(seq 0 $((yD - 1))); do + decode_idx=$((i * DECODE_NODES_PER_WORKER + NODE_OFFSET)) + DECODE_HEADNODE_URLS[$i]="${IP_ARRAY[$decode_idx]}:${HEADNODE_PORT}" + DECODE_ARGS="$DECODE_ARGS --decode http://${IP_ARRAY[$decode_idx]}:8000" +done + +echo "Prefill worker headnode list: ${PREFILL_HEADNODE_URLS[@]}" +echo "Decode worker headnode list: ${DECODE_HEADNODE_URLS[@]}" + +# ============================================================================= +# Configuration Builder Functions +# ============================================================================= + +build_server_config() { + local mode="$1" + local model_name="$2" + local tp_size="$3" + local enable_ep="$4" + local enable_dp="$5" + local decode_mtp_size="$6" + + # Calculate EP and DP sizes based on enable flags + local ep_size=1 + local dp_size=1 + + if [[ "$enable_ep" == "true" ]]; then + ep_size=$tp_size + fi + + if [[ "$enable_dp" == "true" ]]; then + dp_size=$tp_size + fi + + # Build parallelism arguments + local parallel_args="--tp-size ${tp_size}" + + if [[ "$enable_ep" == "true" ]]; then + parallel_args="$parallel_args --ep-size ${ep_size}" + fi + + if [[ "$enable_dp" == "true" ]]; then + parallel_args="$parallel_args --dp-size ${dp_size}" + fi + + # Get model-specific configuration from YAML-loaded variables + local base_config="$MODEL_BASE_FLAGS" + local mtp_config="" + local dp_config="" + local specific_config="" + + # MTP config (only if MTP is enabled and mode is decode) + if [ "$decode_mtp_size" -gt 0 ]; then + mtp_config="${MODEL_MTP_FLAGS} --speculative-num-steps ${decode_mtp_size} --speculative-num-draft-tokens $((decode_mtp_size + 1))" + fi + + # DP config (only if DP is enabled) + if [[ "$enable_dp" == "true" ]]; then + dp_config="$MODEL_DP_FLAGS" + fi + + # Mode-specific config + if [[ "$mode" == "prefill" ]]; then + specific_config="$PREFILL_MODE_FLAGS" + elif [[ "$mode" == "decode" ]]; 
then + specific_config="$DECODE_MODE_FLAGS" + fi + + # Combine: parallel args + base config + mtp config (decode only) + dp config + specific config + local full_config="$parallel_args" + if [[ -n "$base_config" ]]; then + full_config="$full_config $base_config" + fi + if [[ -n "$mtp_config" ]] && [[ "$mode" == "decode" ]]; then + full_config="$full_config $mtp_config" + fi + if [[ -n "$dp_config" ]]; then + full_config="$full_config $dp_config" + fi + if [[ -n "$specific_config" ]]; then + full_config="$full_config $specific_config" + fi + + echo "$full_config" +} + +# Build complete server configurations +PREFILL_SERVER_CONFIG=$(build_server_config "prefill" "$MODEL_NAME" "$PREFILL_TP_SIZE" "$PREFILL_ENABLE_EP" "$PREFILL_ENABLE_DP" "$DECODE_MTP_SIZE") +DECODE_SERVER_CONFIG=$(build_server_config "decode" "$MODEL_NAME" "$DECODE_TP_SIZE" "$DECODE_ENABLE_EP" "$DECODE_ENABLE_DP" "$DECODE_MTP_SIZE") + +if [[ -n "$MODEL_NAME" ]]; then + echo "Using model-specific configuration for: $MODEL_NAME" +fi + +# ============================================================================= +# Container Synchronization +# ============================================================================= + +echo "Waiting at the container creation barrier on $host_name" +python3 $WS_PATH/sync.py barrier \ + --local-ip ${host_ip} \ + --local-port 5000 \ + --enable-port \ + --node-ips ${IPADDRS} \ + --node-ports 5000 \ + --wait-for-all-ports \ + --timeout 300 + + +# ============================================================================= +# Node Role Assignment and Server Launch +# ============================================================================= + +if [ "$NODE_RANK" -eq 0 ]; then + echo "NODE INFO =======================================" + echo "================================================" + echo "Node List : ${SLURM_JOB_NODELIST}" + echo "Node IPs : ${IPADDRS}" + echo "Model Name : ${MODEL_NAME:-'Not specified'}" + echo 
"================================================" + + echo "CLUSTER INFO ====================================" + echo "================================================" + echo "${host_name}:${host_ip} is Proxy Node and Prefill Node" + echo "Using prefill config: $PREFILL_SERVER_CONFIG" + echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}" + echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}" + echo "Prefill servers ($((PREFILL_TP_SIZE/GPUS_PER_NODE)) nodes): ${PREFILL_ARGS}" + echo "Decode servers ($((DECODE_TP_SIZE/GPUS_PER_NODE)) nodes): ${DECODE_ARGS}" + echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK: ${MORI_MAX_DISPATCH_TOKENS_PREFILL}" + echo "Decode env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE}" + echo "================================================" + + # start the head prefill server + PREFILL_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ + --model-path $MODEL_DIR/$MODEL_NAME \ + --disaggregation-mode prefill \ + --disaggregation-ib-device ${IBDEVICES} \ + --host 0.0.0.0 \ + --port 8000 \ + --trust-remote-code \ + ${PREFILL_SERVER_CONFIG} \ + --log-level-http warning" + + if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then + PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[0]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank 0" + fi + + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $PREFILL_CMD" + else + set -x + eval "$PREFILL_CMD" \ + 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log & + set +x + prefill0_pid=$! + fi + + + echo "Waiting for all prefill and decode servers to be up . . ." 
+ + + BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + --node-ips ${IPADDRS} \ + --node-ports 8000 \ + --wait-for-all-ports \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BARRIER_CMD" + else + eval "$BARRIER_CMD" + fi + echo "Congratulations!!! All prefill and decode servers are up . . ." + + ROUTER_CMD="python -m sglang_router.launch_router \ + --pd-disaggregation \ + --port 30000 \ + --policy random \ + --prefill-policy random \ + --decode-policy random \ + ${PREFILL_ARGS} \ + ${DECODE_ARGS}" + + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $ROUTER_CMD" + else + ROUTER_LOG_FILE="/tmp/slurm_job-${SLURM_JOB_ID}_proxy_${host_name}.log" + set -x + if [[ "${SGLANG_ROUTER_STDOUT_LOGS:-0}" == "1" ]]; then + eval "$ROUTER_CMD" 2>&1 | tee "$ROUTER_LOG_FILE" & + else + eval "$ROUTER_CMD" >"$ROUTER_LOG_FILE" 2>&1 & + fi + set +x + proxy_pid=$! + + # Wait for router to be ready via health endpoint + HEALTH_BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports 30000 \ + --wait-for-all-health \ + --health-endpoint /readiness \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $HEALTH_BARRIER_CMD" + else + eval "$HEALTH_BARRIER_CMD" + fi + + echo "Router is ready for benchmarking" + fi + + + echo "Ready for benchmarking on ${host_name}:${host_ip}" + + echo "Benchmarking on ${host_name}:${host_ip}" + cd $WS_PATH + + # Export IS_MTP based on whether MTP is enabled + if [ "$DECODE_MTP_SIZE" -gt 0 ]; then + export IS_MTP=true + else + export IS_MTP=false + fi + + # n_prefill n_decode prefill_gpus decode_gpus model_dir model_name log_path isl osl concurrency_list (escaped quotes keep it one argument through eval) req_rate random_range_ratio num_prompts_multiplier + BENCH_CMD="bash $WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \ + $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \ + ${BENCH_OUTPUT_LEN} \"${BENCH_MAX_CONCURRENCY}\" ${BENCH_REQUEST_RATE} \ + ${BENCH_RANDOM_RANGE_RATIO} 
${BENCH_NUM_PROMPTS_MULTIPLIER}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BENCH_CMD" + else + set -x + eval "$BENCH_CMD" + set +x + fi + + # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host) + LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs" + mkdir -p "$LOGS_OUTPUT" + + if [[ "$DRY_RUN" -eq 0 ]]; then + cp -r /run_logs/slurm_job-${SLURM_JOB_ID} "$LOGS_OUTPUT/" + echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}" + fi + + echo "Killing the proxy server and prefill server" + + if [[ "$DRY_RUN" -eq 0 ]]; then + kill $proxy_pid + kill $prefill0_pid + fi + +elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then + echo "${host_name}:${host_ip} is Prefill Node (Model: ${MODEL_NAME:-'default'})" + echo "Using prefill config: $PREFILL_SERVER_CONFIG" + echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}" + + PREFILL_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ + --model-path $MODEL_DIR/${MODEL_NAME} \ + --disaggregation-mode prefill \ + --disaggregation-ib-device ${IBDEVICES} \ + --host 0.0.0.0 \ + --port 8000 \ + --trust-remote-code \ + ${PREFILL_SERVER_CONFIG} \ + --log-level-http warning" + + if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then + rank=$((NODE_RANK % PREFILL_NODES_PER_WORKER)) + prefill_idx=$((NODE_RANK / PREFILL_NODES_PER_WORKER)) + PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[$prefill_idx]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank $rank" + fi + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $PREFILL_CMD" + else + set -x + eval "$PREFILL_CMD" \ + 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log & + set +x + prefill_pid=$! + fi + + echo "Waiting for proxy server to be up..." 
+ BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports 30000 \ + --wait-for-all-ports \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BARRIER_CMD" + else + eval "$BARRIER_CMD" + fi + + echo "Waiting until proxy server closes..." + WAIT_CMD="python3 $WS_PATH/sync.py wait \ + --remote-ip ${NODE0_ADDR} \ + --remote-port 30000" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $WAIT_CMD" + else + eval "$WAIT_CMD" + fi + + echo "Killing the rank $NODE_RANK prefill server" + + if [[ "$DRY_RUN" -eq 0 ]]; then + kill $prefill_pid + fi + +else + RANK=$((NODE_RANK - xP * PREFILL_NODES_PER_WORKER)) + echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME:-'default'})" + echo "Using decode config: $DECODE_SERVER_CONFIG" + echo "Decode node rank: $RANK" + echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}" + + DECODE_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \ + --model-path ${MODEL_DIR}/${MODEL_NAME} \ + --disaggregation-mode decode \ + --disaggregation-ib-device ${IBDEVICES} \ + --host 0.0.0.0 \ + --port 8000 \ + --trust-remote-code \ + ${DECODE_SERVER_CONFIG} \ + --log-level-http warning" + + if [ "$DECODE_NODES_PER_WORKER" -gt 1 ]; then + rank=$((RANK % DECODE_NODES_PER_WORKER)) + decode_idx=$((RANK / DECODE_NODES_PER_WORKER)) + DECODE_CMD="$DECODE_CMD --dist-init-addr ${DECODE_HEADNODE_URLS[$decode_idx]} --nnodes ${DECODE_NODES_PER_WORKER} --node-rank $rank" + fi + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $DECODE_CMD" + else + set -x + eval "$DECODE_CMD" \ + 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log & + + set +x + decode_pid=$! + fi + + + echo "Waiting for proxy server to be up..." 
+ BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports 30000 \ + --wait-for-all-ports \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BARRIER_CMD" + else + eval "$BARRIER_CMD" + fi + + + echo "Waiting until proxy server closes..." + WAIT_CMD="python3 $WS_PATH/sync.py wait \ + --remote-ip ${NODE0_ADDR} \ + --remote-port 30000" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $WAIT_CMD" + else + eval "$WAIT_CMD" + fi + + echo "Killing the rank $RANK decode server" + if [[ "$DRY_RUN" -eq 0 ]]; then + kill $decode_pid + fi + +fi + +echo "Script completed successfully" +exit 0 diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh new file mode 100755 index 000000000..a10e45d6d --- /dev/null +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -0,0 +1,490 @@ +#!/bin/bash +# vLLM Disaggregated Server Launcher with Model-Specific Configurations +# ============================================================================= +# +# Node role assignment (by NODE_RANK): +# 0 -> Proxy/Router + first Prefill node (kv_producer) +# 1..xP-1 -> Additional Prefill nodes (kv_producer) +# xP..xP+yD-1 -> Decode nodes (kv_consumer) +# +# Total nodes = xP + yD (router co-located with first prefill, like SGLang). 
+
+# =============================================================================
+# Dependency Setup (idempotent; required when using base vLLM image)
+# =============================================================================
+source "$(dirname "${BASH_SOURCE[0]}")/setup_deps.sh"
+
+# =============================================================================
+# Environment Configuration
+# =============================================================================
+
+NODE0_ADDR="${NODE0_ADDR:-localhost}"
+NODE_RANK="${NODE_RANK:-0}"
+MODEL_DIR="${MODEL_DIR:-}"
+MODEL_NAME="${MODEL_NAME:-}"
+
+xP="${xP:-1}"
+yD="${yD:-1}"
+
+IPADDRS="${IPADDRS:-localhost}"
+
+# Benchmark Configuration
+BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}"
+BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}"
+BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}"
+BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}"
+BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}"
+BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}"
+
+DRY_RUN="${DRY_RUN:-0}"
+GPUS_PER_NODE="${GPUS_PER_NODE:-8}"
+
+ROUTER_PORT="${ROUTER_PORT:-30000}"
+SERVER_PORT="${SERVER_PORT:-2584}"
+ENGINE_ID="${ENGINE_ID:-${MODEL_NAME}-pd-run}"
+
+# Prefer MODEL_PATH from job.slurm (handles HF cache snapshot resolution)
+MODEL_PATH="${MODEL_PATH:-${MODEL_DIR}/${MODEL_NAME}}"
+
+# =============================================================================
+# Dependencies and Environment Setup
+# =============================================================================
+source "$WS_PATH/env.sh"
+
+# Management IP: the source address the kernel picks for outbound traffic.
+# Look up the token following "src" rather than a fixed field index, because
+# the field position shifts when the route has no "via <gateway>" hop.
+host_ip=$(ip route get 1.1.1.1 2>/dev/null \
+    | awk '{for (i = 1; i < NF; i++) if ($i == "src") {print $(i + 1); exit}}')
+# RDMA IP for Nixl KV transfer (prefer 192.168.x.x subnet if available)
+rdma_ip=$(hostname -I | tr ' ' '\n' | grep '^192\.168\.' | head -1)
+rdma_ip="${rdma_ip:-$host_ip}"
+host_name=$(hostname)
+
+echo "[INFO] Management IP (barriers/proxy): $host_ip"
+echo "[INFO] RDMA IP (Nixl KV transfer):     $rdma_ip"
+
+# =============================================================================
+# RDMA / Nixl Workarounds
+# =============================================================================
+
+# setup_rdma_env: apply two best-effort workarounds before any server starts.
+#   1. Install a /24 route for the RDMA subnet through the interface that owns
+#      $rdma_ip (only when $rdma_ip is a 192.168.x.y address).
+#   2. Patch the installed rixl._api module so that the UCX backend is created
+#      with ucx_error_handling_mode="none".
+# Reads the globals rdma_ip and IBDEVICES; takes no arguments. Failures are
+# logged and never abort the script.
+setup_rdma_env() {
+    # Pensando ionic (RoCEv2) point-to-point /31 route fix.
+    # Each benic interface has a /31 to the TOR switch. Without explicit routes,
+    # traffic to other nodes' RDMA IPs falls through to the management network.
+    if [[ "$rdma_ip" =~ ^192\.168\.([0-9]+)\.([0-9]+)$ ]]; then
+        local rdma_subnet="${BASH_REMATCH[1]}"
+        local rdma_host="${BASH_REMATCH[2]}"
+        # NOTE(review): "| 1" selects the odd address of the /31 pair, which
+        # presumes the switch side is always odd; confirm against the fabric
+        # addressing plan (an odd host address would yield itself as gateway).
+        local rdma_gw="192.168.${rdma_subnet}.$(( rdma_host | 1 ))"
+        local rdma_iface
+        # Exact address comparison: a regex match ("$4 ~ ip") treats dots as
+        # wildcards and also accepts prefixes, so 192.168.1.1 would match an
+        # interface holding 192.168.1.10/31.
+        rdma_iface=$(ip -o addr show \
+            | awk -v ip="$rdma_ip" '$4 == ip || index($4, ip "/") == 1 {print $2}' \
+            | head -1)
+        if [[ -n "$rdma_iface" ]]; then
+            ip route replace "192.168.${rdma_subnet}.0/24" via "$rdma_gw" dev "$rdma_iface" 2>/dev/null && \
+                echo "[RDMA-ROUTE] Added 192.168.${rdma_subnet}.0/24 via $rdma_gw dev $rdma_iface" || \
+                echo "[RDMA-ROUTE] Route add failed for 192.168.${rdma_subnet}.0/24"
+        fi
+    fi
+
+    # Patch Nixl UCX backend: set ucx_error_handling_mode=none.
+    # Required for ALL NIC types under high concurrency (C512+). Without this,
+    # UCX's default UCP_ERR_HANDLING_MODE_PEER triggers transport-level error
+    # recovery on ibv_post_send failures, preventing RIXL RDMA READ retries from
+    # recovering gracefully. This causes the prefill KV cache to fill to 100%
+    # and deadlock the pipeline. On ionic NICs this was already applied (rdmacm
+    # incompatibility); on mlx5 NICs it was incorrectly skipped.
+    local nixl_api
+    nixl_api=$(python3 -c "import rixl._api; print(rixl._api.__file__)" 2>/dev/null)
+    if [[ -n "$nixl_api" ]]; then
+        if ! grep -q 'ucx_error_handling_mode' "$nixl_api"; then
+            # Insert the assignment immediately before the create_backend call,
+            # reusing that line's own indentation (capture group 1) so the
+            # injected Python statement is always aligned with its block.
+            sed -i -E \
+                's/^([[:space:]]*)(.*self\.create_backend\(bknd, init\).*)$/\1init["ucx_error_handling_mode"] = "none"\n\1\2/' \
+                "$nixl_api"
+            # Report success only if the file really changed.
+            if grep -q 'ucx_error_handling_mode' "$nixl_api"; then
+                echo "[PATCH] Added ucx_error_handling_mode=none to $nixl_api (IBDEVICES=${IBDEVICES:-unset})"
+            else
+                echo "[PATCH] WARN: create_backend call not found in $nixl_api; ucx_error_handling_mode not patched" >&2
+            fi
+        else
+            echo "[PATCH] ucx_error_handling_mode already set in $nixl_api"
+        fi
+    fi
+}
+
+setup_rdma_env
+
+if [[ -z "$UCX_NET_DEVICES" ]]; then
+    echo "Error: UCX_NET_DEVICES is empty after env.sh detection" >&2
+    exit 1
+fi
+
+# =============================================================================
+# Model-Specific Configuration from YAML
+# =============================================================================
+MODELS_YAML="${WS_PATH}/models_vllm.yaml"
+
+if [[ ! -f "$MODELS_YAML" ]]; then
+    echo "ERROR: models_vllm.yaml not found at $MODELS_YAML" >&2
+    exit 1
+fi
+
+if [[ -z "$MODEL_NAME" ]]; then
+    echo "ERROR: MODEL_NAME is not set" >&2; exit 1
+fi
+
+# Translate the model's YAML entry into four shell assignments
+# (PREFILL_SERVER_CONFIG, DECODE_SERVER_CONFIG, MODEL_ENVS, DECODE_MODEL_ENVS).
+# The YAML path and model name are passed as argv instead of being spliced into
+# the Python source, so a quote in either value cannot break or alter the
+# program. A Python failure aborts here instead of leaving the variables unset.
+if ! model_cfg="$(python3 - "$MODELS_YAML" "$MODEL_NAME" <<'PY'
+import sys
+
+import yaml
+
+yaml_path, model_name = sys.argv[1], sys.argv[2]
+
+with open(yaml_path) as f:
+    models = yaml.safe_load(f) or {}
+
+if model_name not in models:
+    # The message goes to stderr; only the shell statement goes to stdout.
+    print(f'ERROR: Model {model_name} not in {yaml_path}', file=sys.stderr)
+    print('exit 1')
+    sys.exit(0)
+
+m = models[model_name]
+
+
+def bash_escape(s):
+    """Escape a value for safe embedding in a bash double-quoted assignment."""
+    return s.replace('\\', '\\\\').replace('"', '\\"').replace('$', '\\$').replace('`', '\\`')
+
+
+def setting(key, default):
+    """Return the escaped value of `key`, using `default` if absent or null."""
+    value = m.get(key)
+    return bash_escape(str(default if value is None else value))
+
+
+pf = setting('prefill_flags', '--tensor-parallel-size 8')
+df = setting('decode_flags', '--tensor-parallel-size 8')
+ev = setting('env', '')
+dev = setting('decode_env', '')
+print(f'PREFILL_SERVER_CONFIG="{pf}"')
+print(f'DECODE_SERVER_CONFIG="{df}"')
+print(f'MODEL_ENVS="{ev}"')
+print(f'DECODE_MODEL_ENVS="{dev}"')
+PY
+)"; then
+    echo "ERROR: failed to load model configuration from $MODELS_YAML" >&2
+    exit 1
+fi
+eval "$model_cfg"
+
+echo "Loaded model configuration for: $MODEL_NAME"
+
+# Apply tensor-parallel size and EP/DP flags from submit pipeline.
+if [[ -n "${PREFILL_TP_SIZE:-}" ]]; then + if echo "$PREFILL_SERVER_CONFIG" | grep -q -- '--tensor-parallel-size'; then + PREFILL_SERVER_CONFIG=$(echo "$PREFILL_SERVER_CONFIG" | sed -E "s/--tensor-parallel-size[[:space:]]+[0-9]+/--tensor-parallel-size ${PREFILL_TP_SIZE}/g") + else + PREFILL_SERVER_CONFIG+=" --tensor-parallel-size ${PREFILL_TP_SIZE}" + fi +fi +if [[ -n "${DECODE_TP_SIZE:-}" ]]; then + if echo "$DECODE_SERVER_CONFIG" | grep -q -- '--tensor-parallel-size'; then + DECODE_SERVER_CONFIG=$(echo "$DECODE_SERVER_CONFIG" | sed -E "s/--tensor-parallel-size[[:space:]]+[0-9]+/--tensor-parallel-size ${DECODE_TP_SIZE}/g") + else + DECODE_SERVER_CONFIG+=" --tensor-parallel-size ${DECODE_TP_SIZE}" + fi +fi +if [[ "${PREFILL_ENABLE_EP:-false}" == "true" ]] && ! echo "$PREFILL_SERVER_CONFIG" | grep -q -- '--enable-expert-parallel'; then + PREFILL_SERVER_CONFIG+=" --enable-expert-parallel" +fi +if [[ "${PREFILL_ENABLE_DP:-false}" == "true" ]] && ! echo "$PREFILL_SERVER_CONFIG" | grep -q -- '--enable-dp-attention'; then + PREFILL_SERVER_CONFIG+=" --enable-dp-attention" +fi +if [[ "${DECODE_ENABLE_EP:-false}" == "true" ]] && ! echo "$DECODE_SERVER_CONFIG" | grep -q -- '--enable-expert-parallel'; then + DECODE_SERVER_CONFIG+=" --enable-expert-parallel" +fi +if [[ "${DECODE_ENABLE_DP:-false}" == "true" ]] && ! 
echo "$DECODE_SERVER_CONFIG" | grep -q -- '--enable-dp-attention'; then + DECODE_SERVER_CONFIG+=" --enable-dp-attention" +fi + +echo "PREFILL_SERVER_CONFIG (after TP/EP/DP): $PREFILL_SERVER_CONFIG" +echo "DECODE_SERVER_CONFIG (after TP/EP/DP): $DECODE_SERVER_CONFIG" + +# ============================================================================= +# Container Synchronization +# ============================================================================= + +echo "Waiting at the container creation barrier on $host_name" +python3 $WS_PATH/sync.py barrier \ + --local-ip ${host_ip} \ + --local-port 5000 \ + --enable-port \ + --node-ips ${IPADDRS} \ + --node-ports 5000 \ + --wait-for-all-ports \ + --timeout 600 + +# ============================================================================= +# ETCD Server Setup +# ============================================================================= + +echo "Proceeding to start etcd server on $host_name" +bash ${WS_PATH}/start_etcd.sh > /dev/null 2>&1 & +etcd_pid=$! + +echo "Waiting at etcd server barrier on $host_name" +python3 $WS_PATH/sync.py barrier \ + --node-ips ${IPADDRS} \ + --node-ports 2379 \ + --wait-for-all-ports \ + --timeout 300 + +echo "All etcd servers are up : $host_name" +sleep 3 + +echo "etcd endpoint health==================" +etcdctl endpoint health 2>&1 || /usr/local/bin/etcd/etcdctl endpoint health 2>&1 || true +echo "======================================" + +python3 $WS_PATH/sync.py barrier \ + --node-ips ${IPADDRS} \ + --node-ports 2379 \ + --wait-for-all-ports \ + --timeout 300 + +# ============================================================================= +# Cluster Topology Configuration +# ============================================================================= +IFS=',' read -ra IP_ARRAY <<< "$IPADDRS" + +PREFILL_ARGS="" +DECODE_ARGS="" + +for ((i=0; i "$PROXY_LOG_FILE" 2>&1 & + set +x + proxy_pid=$! 
+ sleep 3 + fi + + PREFILL_CMD="vllm serve ${MODEL_PATH} \ + --port $SERVER_PORT \ + --trust-remote-code \ + --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ + ${PREFILL_SERVER_CONFIG}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $PREFILL_CMD" + else + PREFILL_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log" + set -x + eval "$PREFILL_CMD" > "$PREFILL_LOG_FILE" 2>&1 & + set +x + prefill_pid=$! + fi + + echo "Waiting for all prefill and decode servers to be up . . ." + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: skipping barrier (wait-for-all-ports)" + else + python3 $WS_PATH/sync.py barrier \ + --node-ips ${IPADDRS} \ + --node-ports $SERVER_PORT \ + --wait-for-all-ports \ + --timeout 1800 + fi + + echo "Congratulations!!! All prefill and decode servers are up . . ." 
+ + # Wait for proxy /health to confirm it is accepting requests + HEALTH_BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports ${ROUTER_PORT} \ + --wait-for-all-health \ + --health-endpoint /health \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $HEALTH_BARRIER_CMD" + else + eval "$HEALTH_BARRIER_CMD" + echo "MoRI-IO proxy is ready for benchmarking" + fi + + echo "Ready for benchmarking on ${host_name}:${host_ip}" + echo "Benchmarking on ${host_name}:${host_ip}" + cd $WS_PATH + + export ROUTER_PORT=$ROUTER_PORT + BENCH_CMD="bash $WS_PATH/bench.sh ${xP} ${yD} $((GPUS_PER_NODE*xP)) $((GPUS_PER_NODE*yD)) \ + $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \ + ${BENCH_OUTPUT_LEN} \"${BENCH_MAX_CONCURRENCY}\" ${BENCH_REQUEST_RATE} \ + ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BENCH_CMD" + else + set -x + eval "$BENCH_CMD" + set +x + fi + + # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host) + LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs" + mkdir -p "$LOGS_OUTPUT" + + if [[ "$DRY_RUN" -eq 0 ]]; then + cp -r /run_logs/slurm_job-${SLURM_JOB_ID} "$LOGS_OUTPUT/" + echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}" + fi + + echo "Killing the proxy server and prefill server" + if [[ "$DRY_RUN" -eq 0 ]]; then + [[ -n "${proxy_pid:-}" ]] && kill $proxy_pid 2>/dev/null || true + [[ -n "${prefill_pid:-}" ]] && kill $prefill_pid 2>/dev/null || true + sleep 2 + # Fallback: ensure no orphaned processes keep ports open + pkill -f moriio_proxy 2>/dev/null || true + pkill -f "vllm serve" 2>/dev/null || true + fi + +elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then + echo "${host_name}:${host_ip} is Additional Prefill Node (Model: ${MODEL_NAME})" + echo "Using prefill config: $PREFILL_SERVER_CONFIG" + + setup_vllm_env + + PREFILL_CMD="vllm serve ${MODEL_PATH} \ + --port 
$SERVER_PORT \ + --trust-remote-code \ + --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ + ${PREFILL_SERVER_CONFIG}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $PREFILL_CMD" + else + PREFILL_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log" + set -x + eval "$PREFILL_CMD" > "$PREFILL_LOG_FILE" 2>&1 & + set +x + prefill_pid=$! + fi + + echo "Waiting for proxy server to be up..." + BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports ${ROUTER_PORT} \ + --wait-for-all-ports \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BARRIER_CMD" + else + eval "$BARRIER_CMD" + fi + + echo "Waiting until proxy server closes..." + WAIT_CMD="python3 $WS_PATH/sync.py wait \ + --remote-ip ${NODE0_ADDR} \ + --remote-port ${ROUTER_PORT}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $WAIT_CMD" + else + eval "$WAIT_CMD" + fi + + echo "Killing the prefill server" + [[ "$DRY_RUN" -eq 0 ]] && kill $prefill_pid 2>/dev/null || true + +else + echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME})" + echo "Using decode config: $DECODE_SERVER_CONFIG" + + setup_vllm_env + + for env_pair in ${DECODE_MODEL_ENVS}; do + export "$env_pair" + echo "[DECODE_ENV] $env_pair" + done + + DECODE_CMD="vllm serve ${MODEL_PATH} \ + --port $SERVER_PORT \ + --trust-remote-code \ + --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_consumer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ + ${DECODE_SERVER_CONFIG}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $DECODE_CMD" + else + DECODE_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log" + set -x + eval 
"$DECODE_CMD" > "$DECODE_LOG_FILE" 2>&1 & + set +x + decode_pid=$! + fi + + echo "Waiting for proxy server to be up..." + BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports ${ROUTER_PORT} \ + --wait-for-all-ports \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BARRIER_CMD" + else + eval "$BARRIER_CMD" + fi + + echo "Waiting until proxy server closes..." + WAIT_CMD="python3 $WS_PATH/sync.py wait \ + --remote-ip ${NODE0_ADDR} \ + --remote-port ${ROUTER_PORT}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $WAIT_CMD" + else + eval "$WAIT_CMD" + fi + + echo "Killing the decode server" + [[ "$DRY_RUN" -eq 0 ]] && kill $decode_pid 2>/dev/null || true +fi + +echo "Killing the etcd server" +kill $etcd_pid 2>/dev/null || true +pkill -f etcd 2>/dev/null || true + +echo "Script completed successfully" +exit 0 diff --git a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh new file mode 100644 index 000000000..8c7a9f07a --- /dev/null +++ b/benchmarks/multi_node/amd_utils/setup_deps.sh @@ -0,0 +1,908 @@ +#!/bin/bash +# ============================================================================= +# setup_deps.sh — Install missing vLLM disagg dependencies at container start. +# +# Base image: vllm/vllm-openai-rocm:v0.18.0 +# Sourced by server.sh so PATH / LD_LIBRARY_PATH exports persist. +# Idempotent: each component is skipped if already present. +# +# Build steps run in subshells to avoid CWD pollution between installers. 
+# =============================================================================
+
+ROCM_PATH="${ROCM_PATH:-/opt/rocm}"
+UCX_HOME="${UCX_HOME:-/usr/local/ucx}"
+RIXL_HOME="${RIXL_HOME:-/usr/local/rixl}"
+
+_SETUP_START=$(date +%s)
+_SETUP_INSTALLED=()
+
+# git_clone_retry URL DEST [MAX_TRIES=3] [DELAY_SECONDS=10]
+# Clone URL into DEST, retrying on failure. A partially created DEST is removed
+# after each failed attempt. Returns 0 on success, 1 once every attempt failed.
+# git's own error output is left visible so a failure can be diagnosed, and no
+# delay is spent after the final attempt.
+git_clone_retry() {
+    local url="$1" dest="$2" max_tries="${3:-3}" delay="${4:-10}" try
+    for (( try = 1; try <= max_tries; try++ )); do
+        if git clone --quiet "$url" "$dest"; then return 0; fi
+        rm -rf "$dest"
+        if (( try < max_tries )); then
+            echo "[SETUP] git clone attempt $try/$max_tries failed for $url, retrying in ${delay}s..."
+            sleep "$delay"
+        else
+            echo "[SETUP] git clone attempt $try/$max_tries failed for $url"
+        fi
+    done
+    echo "[SETUP] git clone failed after $max_tries attempts: $url"
+    return 1
+}
+
+# ---------------------------------------------------------------------------
+# 1. UCX (ROCm fork — required for GPU-direct RDMA via Nixl)
+# ---------------------------------------------------------------------------
+# Builds ROCm/ucx at commit da3fac2a into $UCX_HOME unless ucx_info is already
+# there. Exits the (sourcing) shell with status 1 if the binary is still
+# missing after the build.
+install_ucx() {
+    if [[ -x "${UCX_HOME}/bin/ucx_info" ]]; then
+        echo "[SETUP] UCX already present at ${UCX_HOME}"
+        return 0
+    fi
+
+    echo "[SETUP] Installing UCX build dependencies..."
+    apt-get update -q -y && apt-get install -q -y \
+        autoconf automake libtool pkg-config \
+        librdmacm-dev rdmacm-utils libibverbs-dev ibverbs-utils ibverbs-providers \
+        infiniband-diags perftest ethtool rdma-core strace \
+        && rm -rf /var/lib/apt/lists/*
+
+    echo "[SETUP] Building UCX from source (ROCm/ucx @ da3fac2a)..."
+    (
+        # One command per line: `set -e` does not fire for a failing command
+        # that is followed by `&&`, so chained steps would let the build carry
+        # on in the wrong directory after a failed clone or autogen.
+        set -e
+        mkdir -p /usr/local/src
+        cd /usr/local/src
+        git_clone_retry https://github.com/ROCm/ucx.git ucx
+        cd ucx
+        git checkout da3fac2a
+        ./autogen.sh
+        mkdir -p build
+        cd build
+        ../configure \
+            --prefix="${UCX_HOME}" \
+            --enable-shared --disable-static \
+            --disable-doxygen-doc --enable-optimizations \
+            --enable-devel-headers --enable-mt \
+            --with-rocm="${ROCM_PATH}" --with-verbs --with-dm
+        make -j"$(nproc)"
+        make install
+    )
+    rm -rf /usr/local/src/ucx
+
+    if [[ ! -x "${UCX_HOME}/bin/ucx_info" ]]; then
+        echo "[SETUP] ERROR: UCX build failed"; exit 1
+    fi
+    _SETUP_INSTALLED+=("UCX")
+}
+
+# ---------------------------------------------------------------------------
+# 2. RIXL (ROCm fork of NIXL — KV cache transfer for disaggregated vLLM)
+# ---------------------------------------------------------------------------
+# Builds ROCm/RIXL at commit f33a5599 against $UCX_HOME and $ROCM_PATH and
+# installs its Python package, unless `import rixl` already works. Exits with
+# status 1 if the import still fails afterwards.
+install_rixl() {
+    if python3 -c "import rixl" 2>/dev/null; then
+        echo "[SETUP] RIXL Python bindings already present"
+        return 0
+    fi
+
+    echo "[SETUP] Installing RIXL build dependencies..."
+    apt-get update -q -y && apt-get install -q -y \
+        libgrpc-dev libgrpc++-dev libprotobuf-dev protobuf-compiler-grpc \
+        libcpprest-dev libaio-dev \
+        && rm -rf /var/lib/apt/lists/*
+    pip3 install --quiet meson "pybind11[global]"
+
+    echo "[SETUP] Building RIXL from source (ROCm/RIXL @ f33a5599)..."
+    (
+        # Steps are kept on separate lines so `set -e` stops at the first
+        # failure (see install_ucx).
+        set -e
+        git_clone_retry https://github.com/ROCm/RIXL.git /opt/rixl
+        cd /opt/rixl
+        git checkout f33a5599
+        meson setup build --prefix="${RIXL_HOME}" \
+            -Ducx_path="${UCX_HOME}" \
+            -Drocm_path="${ROCM_PATH}"
+        cd build
+        ninja
+        ninja install
+        cd /opt/rixl
+        pip install --quiet \
+            --config-settings=setup-args="-Drocm_path=${ROCM_PATH}" \
+            --config-settings=setup-args="-Ducx_path=${UCX_HOME}" .
+    )
+    rm -rf /opt/rixl
+
+    if ! python3 -c "import rixl" 2>/dev/null; then
+        echo "[SETUP] ERROR: RIXL build failed"; exit 1
+    fi
+    _SETUP_INSTALLED+=("RIXL")
+}
+
+# ---------------------------------------------------------------------------
+# 3. etcd (distributed KV store for vLLM disagg service discovery)
+# ---------------------------------------------------------------------------
+install_etcd() {
+    if [[ -x /usr/local/bin/etcd/etcd ]]; then
+        echo "[SETUP] etcd already present"
+        return 0
+    fi
+
+    local version="v3.6.0-rc.5"
+    echo "[SETUP] Downloading etcd ${version}..."
+    # Fail fast, like the other installers: previously a failed download or
+    # extraction still recorded etcd as installed, and the problem surfaced
+    # only later as an etcd barrier timeout.
+    if ! wget -q "https://github.com/etcd-io/etcd/releases/download/${version}/etcd-${version}-linux-amd64.tar.gz" \
+        -O /tmp/etcd.tar.gz; then
+        rm -f /tmp/etcd.tar.gz
+        echo "[SETUP] ERROR: etcd download failed"; exit 1
+    fi
+    mkdir -p /usr/local/bin/etcd
+    if ! tar -xf /tmp/etcd.tar.gz -C /usr/local/bin/etcd --strip-components=1; then
+        rm -f /tmp/etcd.tar.gz
+        echo "[SETUP] ERROR: etcd archive extraction failed"; exit 1
+    fi
+    rm -f /tmp/etcd.tar.gz
+    if [[ ! -x /usr/local/bin/etcd/etcd ]]; then
+        echo "[SETUP] ERROR: etcd install failed"; exit 1
+    fi
+    _SETUP_INSTALLED+=("etcd")
+}
+
+# ---------------------------------------------------------------------------
+# 4. libionic1 (Pensando ionic RDMA verbs provider for RoCEv2 KV transfer)
+#    Harmless on non-Pensando nodes (shared lib is simply unused).
+# ---------------------------------------------------------------------------
+# Best-effort: a failed download or dpkg run is reported as a warning and never
+# aborts setup, but the package is recorded as installed only when dpkg
+# actually succeeded.
+install_libionic() {
+    if dpkg -l libionic1 2>/dev/null | grep -q '^ii'; then
+        echo "[SETUP] libionic1 already installed"
+        return 0
+    fi
+
+    echo "[SETUP] Downloading and installing libionic1..."
+    if ! wget -q "https://repo.radeon.com/amdainic/pensando/ubuntu/1.117.5/pool/main/r/rdma-core/libionic1_54.0-149.g3304be71_amd64.deb" \
+        -O /tmp/libionic1.deb; then
+        rm -f /tmp/libionic1.deb
+        echo "[SETUP] WARN: libionic1 download failed (non-fatal)"
+        return 0
+    fi
+    if dpkg -i /tmp/libionic1.deb; then
+        _SETUP_INSTALLED+=("libionic1")
+    else
+        echo "[SETUP] WARN: libionic1 install failed (non-fatal)"
+    fi
+    rm -f /tmp/libionic1.deb
+}
+
+# ---------------------------------------------------------------------------
+# 5. MoRI-IO proxy deps (Python packages for the MoRI-IO-aware proxy server)
+#    The proxy replaces vllm-router: it handles both HTTP routing AND the
+#    MoRI-IO ZMQ registration/request-enrichment protocol.
+#    Only needed on NODE_RANK=0 (proxy node).
+# ---------------------------------------------------------------------------
+install_mori_proxy_deps() {
+    if python3 -c "import quart, aiohttp, msgpack, zmq" 2>/dev/null; then
+        echo "[SETUP] MoRI-IO proxy Python deps already present"
+        return 0
+    fi
+
+    echo "[SETUP] Installing MoRI-IO proxy Python deps..."
+    # v0.18.0 ships aiohttp, pyzmq, blinker(distutils); only quart and msgpack
+    # are missing.  --ignore-installed blinker avoids pip's distutils uninstall
+    # error when quart pulls a newer blinker version.
+    pip install --quiet --ignore-installed blinker
+    pip install --quiet quart msgpack
+
+    if ! python3 -c "import quart, aiohttp, msgpack, zmq" 2>/dev/null; then
+        echo "[SETUP] ERROR: MoRI-IO proxy deps install failed"; exit 1
+    fi
+    _SETUP_INSTALLED+=("mori-proxy-deps")
+}
+
+# ---------------------------------------------------------------------------
+# 6. MoRI (Modular RDMA Interface — EP dispatch/combine kernels for MoE)
+#    Required for --all2all-backend mori (Expert Parallelism via RDMA).
+#    GPU kernels are JIT-compiled on first use; no hipcc needed at install.
+#
+#    v0.18.0 ships MoRI 0.1.dev185+g2d02c6a98, but it STILL has the PCI
+#    topology bug (TopoSystemPci::Load assertion failure on Broadcom
+#    PEX890xx switches). Always rebuild from our target commit b645fc8
+#    which includes the dsp2dev subordinate-range fix.
+# ---------------------------------------------------------------------------
+install_mori() {
+    local MORI_TARGET_COMMIT="b645fc8"
+    # The marker is checked at the same location it is written to (the
+    # interpreter's purelib directory). The earlier check globbed a hard-coded
+    # /usr/local/lib/python3.*/dist-packages path, which misses the marker
+    # whenever purelib resolves elsewhere and forces a rebuild on every start.
+    local mori_site_dir
+    mori_site_dir=$(python3 -c "import sysconfig; print(sysconfig.get_paths()['purelib'])" 2>/dev/null)
+    local MORI_MARKER="${mori_site_dir}/.mori_commit_${MORI_TARGET_COMMIT}"
+
+    if [[ -n "$mori_site_dir" && -f "$MORI_MARKER" ]]; then
+        echo "[SETUP] MoRI @ $MORI_TARGET_COMMIT already installed (marker found)"
+        return 0
+    fi
+
+    echo "[SETUP] Installing MoRI build dependencies..."
+    apt-get update -q -y && apt-get install -q -y \
+        libopenmpi-dev openmpi-bin libpci-dev \
+        && rm -rf /var/lib/apt/lists/*
+
+    echo "[SETUP] Building MoRI from source (ROCm/mori @ $MORI_TARGET_COMMIT)..."
+    echo "[SETUP]   (overriding image-provided version to fix PCI topology bug)"
+    (
+        # Separate lines so `set -e` aborts at the first failing step; a
+        # failure followed by `&&` would not trigger it.
+        set -e
+        git_clone_retry https://github.com/ROCm/mori.git /opt/mori
+        cd /opt/mori
+        git checkout "$MORI_TARGET_COMMIT"
+        pip install --quiet --force-reinstall .
+    )
+    rm -rf /opt/mori
+
+    if ! python3 -c "import mori" 2>/dev/null; then
+        echo "[SETUP] ERROR: MoRI build failed"; exit 1
+    fi
+    if [[ -n "$mori_site_dir" ]]; then
+        touch "$MORI_MARKER"
+    fi
+    _SETUP_INSTALLED+=("MoRI@$MORI_TARGET_COMMIT")
+}
+
+# ---------------------------------------------------------------------------
+# 6b. amd-quark (MXFP4 quantization support for Kimi-K2.5-MXFP4 and similar)
+#     Required due to ROCm vLLM missing the quark dependency:
+#     https://github.com/vllm-project/vllm/issues/35633
+# ---------------------------------------------------------------------------
+install_amd_quark() {
+    if python3 -c "import quark" 2>/dev/null; then
+        echo "[SETUP] amd-quark already present"
+        return 0
+    fi
+
+    echo "[SETUP] Installing amd-quark for MXFP4 quantization support..."
+    pip install --quiet amd-quark
+
+    if ! python3 -c "import quark" 2>/dev/null; then
+        echo "[SETUP] WARN: amd-quark install failed (non-fatal for non-MXFP4 models)"
+        return 0
+    fi
+    _SETUP_INSTALLED+=("amd-quark")
+}
+
+# ---------------------------------------------------------------------------
+# 7. Patch vLLM MoRI-EP + FP8 incompatibility (present in v0.17.1 & v0.18.0)
+#    vLLM asserts MoRI requires AITER fused_moe, but AITER's FP8 kernel
+#    uses defer_input_quant=True which MoRI's prepare/finalize rejects.
+#    Patch: remove both the AITER requirement assertion and the
+#    defer_input_quant NotImplementedError so non-AITER kernels work.
+# ---------------------------------------------------------------------------
+patch_mori_fp8_compat() {
+    python3 -c '
+import re, os, sys
+patched = []
+
+# 1. Patch layer.py: remove multi-line AITER assertion for MoRI
+try:
+    import vllm.model_executor.layers.fused_moe.layer as lm
+    f = lm.__file__
+    src = open(f).read()
+    if "Mori needs to be used with aiter" in src:
+        new = re.sub(
+            r"assert self\.rocm_aiter_fmoe_enabled,\s*\([^)]*Mori needs[^)]*\)",
+            "pass # [PATCHED] AITER requirement removed for MoRI-EP + FP8",
+            src, flags=re.DOTALL)
+        if new != src:
+            open(f, "w").write(new)
+            patched.append("layer.py")
+except Exception as e:
+    print(f"[SETUP] WARN patch layer.py: {e}", file=sys.stderr)
+
+# 2. Patch mori_prepare_finalize.py: remove defer_input_quant restriction
+try:
+    import vllm.model_executor.layers.fused_moe.mori_prepare_finalize as mm
+    f = mm.__file__
+    src = open(f).read()
+    if "defer_input_quant" in src:
+        new = re.sub(
+            r"raise NotImplementedError\([^)]*defer_input_quant[^)]*\)",
+            "pass # [PATCHED] defer_input_quant check removed for MoRI-EP + FP8",
+            src)
+        if new != src:
+            open(f, "w").write(new)
+            patched.append("mori_prepare_finalize.py")
+except Exception as e:
+    print(f"[SETUP] WARN patch mori_pf: {e}", file=sys.stderr)
+
+if patched:
+    print(f"[SETUP] Patched: {chr(44).join(patched)}")
+else:
+    print("[SETUP] No MoRI-FP8 patches needed")
+'
+    _SETUP_INSTALLED+=("MoRI-FP8-patch")
+}
+
+# ---------------------------------------------------------------------------
+# 8. Patch vLLM MoRI-IO save_kv_layer busy-spin (C128 tail-batch deadlock)
+#    In WRITE mode, save_kv_layer spins forever waiting for the handshake
+#    callback to set write_ready_flags. This blocks the model worker thread,
+#    preventing it from responding to EngineCore shm_broadcast, causing a
+#    TimeoutError cascade and crash.
+#    Patch: add time.sleep(0.001) and a 30s timeout to yield CPU and prevent
+#    the model worker from deadlocking.
+# --------------------------------------------------------------------------- +patch_moriio_save_kv_timeout() { + python3 -c ' +import os, sys + +try: + import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_connector as mc + f = mc.__file__ + src = open(f).read() + + # Already patched? + if "[PATCHED] save_kv_layer timeout" in src: + print("[SETUP] save_kv_layer timeout patch already applied") + sys.exit(0) + + old = """ while True: + if ( + self._ready_requests.empty() + and remote_engine_id not in self.write_ready_flags + ): + continue""" + + if old not in src: + print("[SETUP] WARN: save_kv_layer busy-spin pattern not found, skipping patch") + sys.exit(0) + + new = """ # [PATCHED] save_kv_layer — null guard + timeout + sleep + if remote_engine_id is None: + return + import time as _time, os as _os + _wait_start = _time.monotonic() + _SAVE_KV_TIMEOUT = float(_os.environ.get("VLLM_MORIIO_HANDSHAKE_TIMEOUT", "30")) + while True: + if ( + self._ready_requests.empty() + and remote_engine_id not in self.write_ready_flags + ): + _elapsed = _time.monotonic() - _wait_start + if _elapsed > _SAVE_KV_TIMEOUT: + import logging as _logging + _logging.getLogger("vllm.moriio").warning( + "[HANGFIX] save_kv_layer: timeout (%.1fs) waiting for " + "write_ready_flags[%s], breaking to unblock model " + "worker", _elapsed, remote_engine_id) + break + _time.sleep(0.001) + continue""" + + new_src = src.replace(old, new) + if new_src == src: + print("[SETUP] WARN: replacement had no effect") + sys.exit(0) + + open(f, "w").write(new_src) + print("[SETUP] Patched save_kv_layer: null guard + timeout + sleep") +except Exception as e: + print(f"[SETUP] WARN patch save_kv_layer: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("MoRIIO-save-kv-timeout-patch") +} + +# --------------------------------------------------------------------------- +# 9. 
Patch MoRIIO waiting_for_transfer_complete with bounded timeout +# The original status.Wait() blocks forever if an RDMA completion never +# arrives (e.g., NIC queue saturation at C256). This replaces the unbounded +# wait with a polling loop using status.Succeeded() + configurable timeout. +# Also adds error handling to the write worker loop so a single failed +# transfer doesn't kill the background thread. +# --------------------------------------------------------------------------- +patch_moriio_transfer_timeout() { + python3 -c ' +import os, sys, textwrap + +try: + import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_engine as me + f = me.__file__ + src = open(f).read() + + if "[PATCHED] transfer completion timeout" in src: + print("[SETUP] transfer completion timeout patch already applied") + sys.exit(0) + + # --- Patch 1: Replace waiting_for_transfer_complete with polling + timeout --- + old_wait = """ def waiting_for_transfer_complete(self): + if not self.transfer_status: + return + + transfers_to_wait = [] + with self.lock: + transfers_to_wait = self.transfer_status[:] + self.transfer_status.clear() + + for status in transfers_to_wait: + try: + status.Wait() + if not status.Succeeded(): + logger.error( + "Transfer failed: %s, Code: %s", status.Message(), status.Code() + ) + raise TransferError("MoRIIO transfer failed!") + except Exception as e: + logger.error("Transfer %s failed: %s", status, e) + raise""" + + new_wait = """ def waiting_for_transfer_complete(self): + # [PATCHED] transfer completion timeout — bounded polling loop + import time as _time, os as _os + if not self.transfer_status: + return + + _timeout = float(_os.environ.get("VLLM_MORIIO_TRANSFER_TIMEOUT", "120")) + + transfers_to_wait = [] + with self.lock: + transfers_to_wait = self.transfer_status[:] + self.transfer_status.clear() + + _start = _time.monotonic() + remaining = list(transfers_to_wait) + _polls = 0 + _completed = 0 + + while remaining: + _elapsed = _time.monotonic() 
- _start + if _elapsed > _timeout: + logger.error( + "[HANGFIX] transfer_timeout elapsed=%.1fs " + "pending=%d/%d completed=%d polls=%d " + "action=raise_transfer_error", + _elapsed, len(remaining), len(transfers_to_wait), + _completed, _polls, + ) + raise TransferError( + f"RDMA transfer timeout after {_elapsed:.1f}s, " + f"{len(remaining)}/{len(transfers_to_wait)} pending" + ) + + still_waiting = [] + for status in remaining: + try: + if status.Succeeded(): + _completed += 1 + continue + still_waiting.append(status) + except Exception as e: + logger.error( + "[HANGFIX] transfer_poll_error error=%s", e) + raise TransferError( + f"Transfer failed during poll: {e}" + ) from e + + remaining = still_waiting + if remaining: + _time.sleep(0.005) + _polls += 1 + if _polls % 2000 == 0: + logger.warning( + "[HANGFIX] transfer_wait pending=%d " + "completed=%d elapsed=%.1fs timeout=%.0fs", + len(remaining), _completed, + _time.monotonic() - _start, _timeout, + )""" + + if old_wait not in src: + print("[SETUP] WARN: waiting_for_transfer_complete pattern not found") + sys.exit(0) + + new_src = src.replace(old_wait, new_wait) + + # --- Patch 2: Add error handling + cleanup to _write_worker_loop --- + old_loop = """ self._execute_write_task(task)""" + + new_loop = """ try: + self._execute_write_task(task) + except Exception as _e: + logger.error( + "[HANGFIX] req=%s write_task_failed error=%s " + "action=cleanup_and_mark_done", + task.request_id, _e, + ) + try: + _wr = self.worker.moriio_wrapper + with _wr.lock: + _wr.done_req_ids.append(task.request_id) + _wr.done_remote_allocate_req_dict.pop( + task.request_id, None + ) + except Exception: + pass""" + + if old_loop in new_src: + new_src = new_src.replace(old_loop, new_loop, 1) + else: + print("[SETUP] WARN: _write_worker_loop pattern not found for error handling") + + # --- Patch 3: Add deferred task timeout to _process_deferred_tasks --- + old_deferred = """ def _process_deferred_tasks(self) -> None: + \"\"\"Process tasks 
that were previously deferred.\"\"\" + if not self._deferred_tasks: + return + + still_deferred: list[WriteTask] = [] + for task in self._deferred_tasks: + if self._is_remote_ready(task): + self._execute_write_task(task) + else: + still_deferred.append(task) + + self._deferred_tasks = still_deferred""" + + new_deferred = """ def _process_deferred_tasks(self) -> None: + \"\"\"Process tasks that were previously deferred.\"\"\" + # [PATCHED] deferred task timeout — prune stale tasks + import time as _time, os as _os + if not self._deferred_tasks: + return + + _DEFER_TIMEOUT = float( + _os.environ.get("VLLM_MORIIO_DEFER_TIMEOUT", "60")) + + still_deferred: list[WriteTask] = [] + for task in self._deferred_tasks: + _age = _time.monotonic() - getattr(task, "_defer_ts", _time.monotonic()) + if _age > _DEFER_TIMEOUT: + logger.error( + "[HANGFIX] req=%s deferred_task_expired age=%.1fs " + "action=drop_and_mark_done", + task.request_id, _age, + ) + try: + _wr = self.worker.moriio_wrapper + with _wr.lock: + _wr.done_req_ids.append(task.request_id) + _wr.done_remote_allocate_req_dict.pop( + task.request_id, None) + except Exception: + pass + continue + if self._is_remote_ready(task): + try: + self._execute_write_task(task) + except Exception as _e: + logger.error( + "[HANGFIX] req=%s deferred_write_failed error=%s", + task.request_id, _e, + ) + try: + _wr = self.worker.moriio_wrapper + with _wr.lock: + _wr.done_req_ids.append(task.request_id) + _wr.done_remote_allocate_req_dict.pop( + task.request_id, None) + except Exception: + pass + else: + still_deferred.append(task) + + self._deferred_tasks = still_deferred""" + + if old_deferred in new_src: + new_src = new_src.replace(old_deferred, new_deferred, 1) + else: + print("[SETUP] WARN: _process_deferred_tasks pattern not found") + + # --- Patch 4: Stamp defer time when task is deferred --- + old_defer_add = """ self._deferred_tasks.append(task)""" + new_defer_add = """ import time as _time2 + if not hasattr(task, "_defer_ts"): 
+ task._defer_ts = _time2.monotonic() + self._deferred_tasks.append(task)""" + if old_defer_add in new_src: + new_src = new_src.replace(old_defer_add, new_defer_add, 1) + else: + print("[SETUP] WARN: deferred task timestamp patch target not found") + + open(f, "w").write(new_src) + print("[SETUP] Patched: transfer timeout + writer error handling") + +except Exception as e: + print(f"[SETUP] WARN patch transfer_timeout: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("MoRIIO-transfer-timeout-patch") +} + +# --------------------------------------------------------------------------- +# 10. Patch MoRIIO start_load_kv busy-spin (same pattern as save_kv_layer) +# The READ-mode spin loop in start_load_kv has the same unbounded-spin +# issue as save_kv_layer. Add timeout + sleep + null guard. +# --------------------------------------------------------------------------- +patch_moriio_load_kv_timeout() { + python3 -c ' +import os, sys + +try: + import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_connector as mc + f = mc.__file__ + src = open(f).read() + + if "[PATCHED] start_load_kv timeout" in src: + print("[SETUP] start_load_kv timeout patch already applied") + sys.exit(0) + + old = """ while True: + if ( + self._ready_requests.empty() + and remote_engine_id not in self.load_ready_flag + and wait_handshake_readd_req + ): + continue""" + + if old not in src: + print("[SETUP] WARN: start_load_kv busy-spin pattern not found, skipping") + sys.exit(0) + + new = """ # [PATCHED] start_load_kv timeout — prevent model worker deadlock + if remote_engine_id is None and not wait_handshake_readd_req: + self._reqs_to_send.update(metadata.reqs_to_send) + return + import time as _time, os as _os + _wait_start = _time.monotonic() + _LOAD_KV_TIMEOUT = float(_os.environ.get("VLLM_MORIIO_HANDSHAKE_TIMEOUT", "30")) + while True: + if ( + self._ready_requests.empty() + and remote_engine_id not in self.load_ready_flag + and wait_handshake_readd_req + ): + if _time.monotonic() - 
_wait_start > _LOAD_KV_TIMEOUT: + import logging as _logging + _logging.getLogger("vllm.moriio").warning( + "[HANGFIX] start_load_kv: timeout (%.1fs) waiting for " + "load_ready_flag[%s]", _time.monotonic() - _wait_start, + remote_engine_id) + break + _time.sleep(0.001) + continue""" + + new_src = src.replace(old, new) + if new_src == src: + print("[SETUP] WARN: start_load_kv replacement had no effect") + sys.exit(0) + + open(f, "w").write(new_src) + print("[SETUP] Patched start_load_kv busy-spin with timeout + sleep") +except Exception as e: + print(f"[SETUP] WARN patch start_load_kv: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("MoRIIO-load-kv-timeout-patch") +} + +# --------------------------------------------------------------------------- +# 11. Fix READ-mode scheduler assertion in _update_from_kv_xfer_finished +# vLLM asserts that a request in finished_recving must be either +# WAITING_FOR_REMOTE_KVS or finished. In READ mode the request can +# transition to RUNNING before the aggregated recv notification arrives, +# crashing the engine with AssertionError. 
+# (present in v0.17.1 & v0.18.0) +# --------------------------------------------------------------------------- +patch_scheduler_read_mode_fix() { + python3 -c ' +import os, sys + +try: + import vllm.v1.core.sched.scheduler as smod + f = smod.__file__ + src = open(f).read() + + if "[PATCHED] read-mode recv assertion" in src: + print("[SETUP] scheduler read-mode assertion fix already applied") + sys.exit(0) + + old_recv = """ for req_id in kv_connector_output.finished_recving or (): + logger.debug("Finished recving KV transfer for request %s", req_id) + assert req_id in self.requests + req = self.requests[req_id] + if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS: + self.finished_recving_kv_req_ids.add(req_id) + else: + assert RequestStatus.is_finished(req.status) + self._free_blocks(self.requests[req_id])""" + + new_recv = """ # [PATCHED] read-mode recv assertion — handle intermediate states + for req_id in kv_connector_output.finished_recving or (): + logger.debug("Finished recving KV transfer for request %s", req_id) + if req_id not in self.requests: + logger.debug("Request %s already removed, skipping recv", req_id) + continue + req = self.requests[req_id] + if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS: + self.finished_recving_kv_req_ids.add(req_id) + elif RequestStatus.is_finished(req.status): + self._free_blocks(self.requests[req_id]) + else: + logger.debug( + "Request %s recv finished but status=%s (not " + "WAITING_FOR_REMOTE_KVS or finished), skipping " + "block free — will be freed on request completion", + req_id, req.status.name)""" + + if old_recv not in src: + print("[SETUP] WARN: scheduler finished_recving pattern not found, skipping") + sys.exit(0) + + new_src = src.replace(old_recv, new_recv, 1) + + old_send = """ for req_id in kv_connector_output.finished_sending or (): + logger.debug("Finished sending KV transfer for request %s", req_id) + assert req_id in self.requests + self._free_blocks(self.requests[req_id])""" + + new_send = 
""" for req_id in kv_connector_output.finished_sending or (): + logger.debug("Finished sending KV transfer for request %s", req_id) + if req_id not in self.requests: + logger.debug("Request %s already removed, skipping send", req_id) + continue + self._free_blocks(self.requests[req_id])""" + + if old_send in new_src: + new_src = new_src.replace(old_send, new_send, 1) + else: + print("[SETUP] WARN: scheduler finished_sending pattern not found") + + open(f, "w").write(new_src) + print("[SETUP] Patched: scheduler _update_from_kv_xfer_finished read-mode fix") + +except Exception as e: + print(f"[SETUP] WARN patch scheduler read-mode: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("scheduler-read-mode-fix") +} + +# --------------------------------------------------------------------------- +# 12. Idle KV block reaper for disaggregated prefill (READ mode) +# The RIXL notification path can lose `finished_sending` signals under +# high concurrency with ibv_post_send failures. This leaves KV blocks +# permanently allocated on the prefill engine even after the decode has +# finished reading. Over multiple benchmark rounds, leaked blocks +# accumulate and eventually saturate the prefill KV cache. +# +# Fix: instrument the scheduler's `schedule()` method to detect idle +# periods (0 running, 0 waiting for >5s) and force-free blocks for +# any remaining requests whose status is finished. +# --------------------------------------------------------------------------- +patch_prefill_idle_kv_reaper() { + python3 -c ' +import os, sys + +try: + import vllm.v1.core.sched.scheduler as smod + f = smod.__file__ + src = open(f).read() + + if "[PATCHED] idle-kv-reaper" in src: + print("[SETUP] idle KV block reaper already applied") + sys.exit(0) + + # Find the _update_from_kv_xfer_finished method end and add reaper logic + # We inject into the method that processes KV transfer completions. 
+ marker = "[PATCHED] read-mode recv assertion" + if marker not in src: + print("[SETUP] WARN: scheduler read-mode patch not found, skipping reaper") + sys.exit(0) + + # Add reaper state initialization to __init__ + old_init_marker = "self.finished_recving_kv_req_ids" + if old_init_marker not in src: + print("[SETUP] WARN: finished_recving_kv_req_ids not found in scheduler") + sys.exit(0) + + # Find the first occurrence to insert reaper state + init_pos = src.find(old_init_marker) + # Find the line containing it + line_end = src.find("\n", init_pos) + init_line = src[init_pos:line_end] + + # Add reaper state after this line + reaper_init = init_line + """ + # [PATCHED] idle-kv-reaper state + self._idle_kv_reaper_ts = 0.0 + self._idle_kv_reaper_active = False""" + + src = src.replace(init_line, reaper_init, 1) + + # Now add the reaper logic at the end of _update_from_kv_xfer_finished + # Find the finished_sending handler we patched + send_handler = """ for req_id in kv_connector_output.finished_sending or (): + logger.debug("Finished sending KV transfer for request %s", req_id) + if req_id not in self.requests: + logger.debug("Request %s already removed, skipping send", req_id) + continue + self._free_blocks(self.requests[req_id])""" + + reaper_logic = send_handler + """ + + # [PATCHED] idle-kv-reaper — force-free leaked prefill KV blocks + import time as _time + _REAPER_IDLE_SECS = 5.0 + _num_running = sum(1 for r in self.requests.values() + if r.status == RequestStatus.RUNNING) + _should_reap = (_num_running == 0) + + if _should_reap: + if not self._idle_kv_reaper_active: + self._idle_kv_reaper_active = True + self._idle_kv_reaper_ts = _time.monotonic() + elif _time.monotonic() - self._idle_kv_reaper_ts > _REAPER_IDLE_SECS: + _reaped = 0 + _reap_ids = [] + for _rid, _req in list(self.requests.items()): + if RequestStatus.is_finished(_req.status): + _reap_ids.append(_rid) + for _rid in _reap_ids: + try: + _req = self.requests[_rid] + self._free_blocks(_req) + 
_reaped += 1 + except Exception as _e: + logger.debug("[KV-REAPER] free_blocks failed for %s: %s", _rid, _e) + if _reaped > 0: + logger.warning( + "[KV-REAPER] Force-freed blocks for %d finished " + "requests after %.1fs idle", + _reaped, _time.monotonic() - self._idle_kv_reaper_ts) + self._idle_kv_reaper_ts = _time.monotonic() + else: + self._idle_kv_reaper_active = False""" + + if send_handler in src: + src = src.replace(send_handler, reaper_logic, 1) + else: + print("[SETUP] WARN: send handler not found for reaper injection") + sys.exit(0) + + open(f, "w").write(src) + print("[SETUP] Patched: idle KV block reaper for prefill") + +except Exception as e: + print(f"[SETUP] WARN patch idle-kv-reaper: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("idle-kv-reaper") +} + +# --------------------------------------------------------------------------- +# 13. Patch MiniMax M2.5 WideEP + MoRI + EPLB support +# Replaces the upstream minimax_m2.py with our patched version that adds +# GateLinear, EP group integration, sequence parallelism, and the +# MixtureOfExperts EPLB protocol. Idempotent: skips if already patched. +# --------------------------------------------------------------------------- +patch_minimax_m2_wideep_mori() { + local patch_file="${WS_PATH:-${VLLM_WS_PATH:-$(dirname "${BASH_SOURCE[0]}")}}/patches/minimax_m2.py" + if [[ ! -f "$patch_file" ]]; then + # Also check the Docker-baked location + patch_file="/opt/vllm_disagg/patches/minimax_m2.py" + fi + if [[ ! 
-f "$patch_file" ]]; then + echo "[SETUP] minimax_m2.py patch not found, skipping (WideEP/MoRI not patched)" + return 0 + fi + + python3 -c ' +import os, sys, shutil + +try: + import vllm.model_executor.models.minimax_m2 as mmod + target = mmod.__file__ + src = sys.argv[1] + + with open(target) as f: + if "get_ep_group" in f.read(): + print("[SETUP] minimax_m2.py already has WideEP+MoRI support") + sys.exit(0) + + shutil.copy2(src, target) + print(f"[SETUP] Patched minimax_m2.py: {src} -> {target}") + +except Exception as e: + print(f"[SETUP] WARN patch minimax_m2: {e}", file=sys.stderr) +' "$patch_file" + _SETUP_INSTALLED+=("minimax-m2-wideep-mori") +} + +# ============================================================================= +# Run installers +# ============================================================================= + +install_ucx +install_rixl +install_etcd +install_libionic +install_mori +install_amd_quark +install_mori_proxy_deps +patch_mori_fp8_compat +patch_moriio_save_kv_timeout +patch_moriio_transfer_timeout +patch_moriio_load_kv_timeout +patch_scheduler_read_mode_fix +patch_prefill_idle_kv_reaper +patch_minimax_m2_wideep_mori + +# ============================================================================= +# Export paths (persists for server.sh since this file is sourced) +# ============================================================================= + +export ROCM_PATH="${ROCM_PATH}" +export UCX_HOME="${UCX_HOME}" +export RIXL_HOME="${RIXL_HOME}" +export PATH="${UCX_HOME}/bin:/usr/local/bin/etcd:/root/.cargo/bin:${PATH}" +export LD_LIBRARY_PATH="${UCX_HOME}/lib:${RIXL_HOME}/lib:${RIXL_HOME}/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}" + +_SETUP_END=$(date +%s) +if [[ ${#_SETUP_INSTALLED[@]} -eq 0 ]]; then + echo "[SETUP] All dependencies already present ($(( _SETUP_END - _SETUP_START ))s wallclock)" # elapsed seconds, same computation as the else branch +else + echo "[SETUP] Installed: ${_SETUP_INSTALLED[*]} in $(( _SETUP_END - _SETUP_START ))s" +fi diff --git a/benchmarks/multi_node/amd_utils/start_etcd.sh
b/benchmarks/multi_node/amd_utils/start_etcd.sh new file mode 100755 index 000000000..46bbd2964 --- /dev/null +++ b/benchmarks/multi_node/amd_utils/start_etcd.sh @@ -0,0 +1,47 @@ +#!/bin/bash +set -x + +IPADDRS="${IPADDRS:-localhost}" + +# Use management network IP (matching what the Slurm script resolved) +host_ip=$(ip route get 1.1.1.1 2>/dev/null | sed -n 's/.*src \([^ ]*\).*/\1/p') +if [[ -z "$host_ip" ]]; then + host_ip=$(hostname -I | awk '{print $1}') +fi + +IFS=',' read -ra ADDR <<< "$IPADDRS" + +# Determine node name based on position in the IPADDRS list +index=0 +for ip in "${ADDR[@]}"; do + if [[ "$ip" == "$host_ip" ]]; then + break + fi + index=$((index + 1)) +done +node_name="etcd-$((index+1))" + +# Build initial cluster string +initial_cluster="" +for i in "${!ADDR[@]}"; do + peer_name="etcd-$((i+1))" + initial_cluster+="$peer_name=http://${ADDR[i]}:2380" + if [[ $i -lt $((${#ADDR[@]} - 1)) ]]; then + initial_cluster+="," + fi +done + +mkdir -p /var/lib/etcd +rm -rf /var/lib/etcd/* + +/usr/local/bin/etcd/etcd \ + --name "$node_name" \ + --data-dir /var/lib/etcd \ + --initial-advertise-peer-urls http://$host_ip:2380 \ + --listen-peer-urls http://0.0.0.0:2380 \ + --listen-client-urls http://0.0.0.0:2379 \ + --advertise-client-urls http://$host_ip:2379 \ + --initial-cluster-token etcd-cluster-1 \ + --initial-cluster "$initial_cluster" \ + --initial-cluster-state new \ + 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/etcd_NODE${NODE_RANK}.log diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh index d2c49bc9e..a77462fc5 100755 --- a/benchmarks/multi_node/amd_utils/submit.sh +++ b/benchmarks/multi_node/amd_utils/submit.sh @@ -2,37 +2,51 @@ # # Cluster Configuration Template for Multi-Node Disaggregated Serving # -# This script submits a multi-node SGLang disaggregated benchmark job to SLURM. +# This script submits a multi-node disaggregated benchmark job to SLURM. 
# It must be configured for your specific cluster before use. +# +# ENGINE=sglang (default): SGLang disaggregated serving +# ENGINE=vllm: vLLM disaggregated serving +# +# Router is co-located with the first prefill node (same for both engines), +# so NUM_NODES = PREFILL_NODES + DECODE_NODES. usage() { cat << 'USAGE' -This script aims to provide a one-liner call to the submit_job_script.py, -so that the deployment process can be further simplified. - -To use this script, fill in the following script and run it under your `slurm_jobs` directory: -======== begin script area ======== -# REQUIRED: Cluster-specific configuration -export SLURM_ACCOUNT= # Your SLURM account name -export SLURM_PARTITION= # SLURM partition to submit to -export TIME_LIMIT= # Job time limit (e.g., "08:00:00") - -# REQUIRED: Model and container paths -export MODEL_PATH= # Path to model directory (e.g., /mnt/models, /nfsdata) -export CONTAINER_IMAGE= # Path to container squash file - -# REQUIRED: Hardware configuration -export GPUS_PER_NODE= # GPUs per node (e.g., 8 for MI355X, 4 for MI325X) - -# OPTIONAL: RDMA/Network configuration (set in runners/launch_mi355x-amds.sh for AMD) -# export IBDEVICES= # RDMA device names (e.g., ionic_0,ionic_1,... or mlx5_0,mlx5_1,...) 
-# export MORI_RDMA_TC= # RDMA traffic class (e.g., 96, 104) - -bash submit.sh \ -$PREFILL_NODES $PREFILL_WORKERS $DECODE_NODES $DECODE_WORKERS \ -$ADDITIONAL_FRONTENDS \ -$ISL $OSL $CONCURRENCIES $REQUEST_RATE -======== end script area ======== +Usage: + bash submit.sh <PREFILL_NODES> <PREFILL_WORKERS> <DECODE_NODES> <DECODE_WORKERS> \ + <ISL> <OSL> <CONCURRENCIES> <REQUEST_RATE> \ + <PREFILL_ENABLE_EP> <PREFILL_ENABLE_DP> \ + <DECODE_ENABLE_EP> <DECODE_ENABLE_DP> \ + <PREFILL_TP> <DECODE_TP> <RANDOM_RANGE_RATIO> \ + [NODE_LIST] + +Arguments: + PREFILL_NODES Number of prefill nodes + PREFILL_WORKERS Number of prefill workers (usually 1) + DECODE_NODES Number of decode nodes + DECODE_WORKERS Number of decode workers (usually 1) + ISL Input sequence length + OSL Output sequence length + CONCURRENCIES Concurrency levels, delimited by 'x' (e.g., "8x16x32") + REQUEST_RATE Request rate ("inf" for max throughput) + PREFILL_ENABLE_EP true/false or 1/0 (expert parallelism on prefill) + PREFILL_ENABLE_DP true/false or 1/0 (data-parallel attention on prefill) + DECODE_ENABLE_EP true/false or 1/0 (expert parallelism on decode) + DECODE_ENABLE_DP true/false or 1/0 (data-parallel attention on decode) + PREFILL_TP Tensor parallel size per prefill node + DECODE_TP Tensor parallel size per decode node + RANDOM_RANGE_RATIO Random range ratio for benchmark client + NODE_LIST Optional: comma-separated hostnames (must match NUM_NODES) + +Required environment variables: + SLURM_ACCOUNT SLURM account name + SLURM_PARTITION SLURM partition + TIME_LIMIT Job time limit (e.g., "08:00:00") + MODEL_PATH Path to model directory (e.g., /nfsdata) + MODEL_NAME Model name directory + CONTAINER_IMAGE Docker image name (e.g., vllm_disagg_pd:latest) + RUNNER_NAME Runner identifier (for job name) USAGE } @@ -53,6 +67,7 @@ check_env MODEL_PATH check_env MODEL_NAME check_env CONTAINER_IMAGE check_env RUNNER_NAME +check_env FRAMEWORK # GPUS_PER_NODE defaults to 8 (MI355X). Set to 4 for MI325X if needed.
GPUS_PER_NODE="${GPUS_PER_NODE:-8}" @@ -66,31 +81,32 @@ ISL=$5 OSL=$6 CONCURRENCIES=$7 REQUEST_RATE=$8 -PREFILL_ENABLE_EP=${9:-1} -PREFILL_ENABLE_DP=${10:-1} -DECODE_ENABLE_EP=${11:-1} -DECODE_ENABLE_DP=${12:-1} +PREFILL_ENABLE_EP=${9:-true} +PREFILL_ENABLE_DP=${10:-true} +DECODE_ENABLE_EP=${11:-true} +DECODE_ENABLE_DP=${12:-true} PREFILL_TP=${13:-8} DECODE_TP=${14:-8} -RANDOM_RANGE_RATIO=${15} +RANDOM_RANGE_RATIO=${15:-0.8} NODE_LIST=${16} - NUM_NODES=$((PREFILL_NODES + DECODE_NODES)) profiler_args="${ISL} ${OSL} ${CONCURRENCIES} ${REQUEST_RATE}" # Export variables for the SLURM job +export ENGINE="${FRAMEWORK:-sglang}" export MODEL_DIR=$MODEL_PATH export DOCKER_IMAGE_NAME=$CONTAINER_IMAGE export PROFILER_ARGS=$profiler_args - - +# Engine-specific xP/yD semantics and TP exports +if [[ "$ENGINE" == "vllm" ]]; then + export PROXY_STREAM_IDLE_TIMEOUT=${PROXY_STREAM_IDLE_TIMEOUT:-300} + export VLLM_MORIIO_CONNECTOR_READ_MODE=${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} +fi +# xP = prefill workers, yD = decode workers (may span multiple nodes) export xP=$PREFILL_WORKERS export yD=$DECODE_WORKERS -export NUM_NODES=$NUM_NODES -export GPUS_PER_NODE=$GPUS_PER_NODE -export MODEL_NAME=$MODEL_NAME export PREFILL_TP_SIZE=$(( $PREFILL_NODES * $PREFILL_TP / $PREFILL_WORKERS )) export PREFILL_ENABLE_EP=${PREFILL_ENABLE_EP} export PREFILL_ENABLE_DP=${PREFILL_ENABLE_DP} @@ -98,12 +114,16 @@ export DECODE_TP_SIZE=$(( $DECODE_NODES * $DECODE_TP / $DECODE_WORKERS )) export DECODE_ENABLE_EP=${DECODE_ENABLE_EP} export DECODE_ENABLE_DP=${DECODE_ENABLE_DP} export DECODE_MTP_SIZE=${DECODE_MTP_SIZE} + +export NUM_NODES=$NUM_NODES +export GPUS_PER_NODE=$GPUS_PER_NODE +export MODEL_NAME=$MODEL_NAME export BENCH_INPUT_LEN=${ISL} export BENCH_OUTPUT_LEN=${OSL} -export BENCH_RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO} -export BENCH_NUM_PROMPTS_MULTIPLIER=10 +export BENCH_NUM_PROMPTS_MULTIPLIER=${BENCH_NUM_PROMPTS_MULTIPLIER:-10} export BENCH_MAX_CONCURRENCY=${CONCURRENCIES} export 
BENCH_REQUEST_RATE=${REQUEST_RATE} +export BENCH_RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO:-0.8} # Eval-related env vars (threaded from workflow → runner → here → job.slurm → Docker) export RUN_EVAL="${RUN_EVAL:-false}" @@ -118,13 +138,10 @@ export SPEC_DECODING="${SPEC_DECODING:-}" export IS_MULTINODE="${IS_MULTINODE:-false}" # Log directory: must be on NFS (shared filesystem) so the submit host can read SLURM output. -# SLURM writes output files on the batch node, so /tmp won't work (node-local). -# Defaults to a sibling directory of the submit working directory. export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" mkdir -p "$BENCHMARK_LOGS_DIR" # Optional: pass an explicit node list to sbatch. -# NODE_LIST is expected to be comma-separated hostnames. NODELIST_OPT=() if [[ -n "${NODE_LIST//[[:space:]]/}" ]]; then IFS=',' read -r -a NODE_ARR <<< "$NODE_LIST" @@ -137,6 +154,13 @@ if [[ -n "${NODE_LIST//[[:space:]]/}" ]]; then NODELIST_OPT=(--nodelist "$NODELIST_CSV") fi +# Optional: exclude specific nodes (e.g. nodes with broken Docker sockets). +# Set SLURM_EXCLUDE_NODES env var to a comma-separated list of hostnames. +EXCLUDE_OPT=() +if [[ -n "${SLURM_EXCLUDE_NODES:-}" ]]; then + EXCLUDE_OPT=(--exclude "$SLURM_EXCLUDE_NODES") +fi + # Construct the sbatch command sbatch_cmd=( sbatch @@ -145,6 +169,7 @@ sbatch_cmd=( -N "$NUM_NODES" -n "$NUM_NODES" "${NODELIST_OPT[@]}" + "${EXCLUDE_OPT[@]}" --time "$TIME_LIMIT" --partition "$SLURM_PARTITION" --account "$SLURM_ACCOUNT" @@ -154,7 +179,6 @@ sbatch_cmd=( "$(dirname "$0")/job.slurm" ) -# todo: --parsable outputs only the jobid and cluster name, test if jobid;clustername is correct JOB_ID=$("${sbatch_cmd[@]}") if [[ $? 
-ne 0 ]]; then echo "Error: Failed to submit job with sbatch" >&2 diff --git a/benchmarks/multi_node/amd_utils/sync.py b/benchmarks/multi_node/amd_utils/sync.py index 140951519..3678e7614 100755 --- a/benchmarks/multi_node/amd_utils/sync.py +++ b/benchmarks/multi_node/amd_utils/sync.py @@ -143,7 +143,10 @@ def close_port(): time.sleep(30) if args.enable_port: - time.sleep(30) + # Keep the port open long enough for slow nodes to pass their barrier. + # The previous 30s was too short when setup times vary by minutes. + grace = max(60, args.timeout // 2) if args.timeout > 0 else 300 + time.sleep(grace) close_port() diff --git a/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh index 6a7314ab4..d17d1a323 100644 --- a/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh +++ b/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh @@ -19,7 +19,8 @@ check_env_vars \ DECODE_DP_ATTN \ PREFILL_NODES \ DECODE_NODES \ - RANDOM_RANGE_RATIO + RANDOM_RANGE_RATIO \ + FRAMEWORK if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" diff --git a/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh index 0124d4b4d..a8c0d2743 100644 --- a/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh +++ b/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh @@ -19,7 +19,8 @@ check_env_vars \ DECODE_DP_ATTN \ PREFILL_NODES \ DECODE_NODES \ - RANDOM_RANGE_RATIO + RANDOM_RANGE_RATIO \ + FRAMEWORK if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" diff --git a/benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh b/benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh new file mode 100755 index 000000000..d7995fb25 --- /dev/null +++ b/benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh @@ -0,0 +1,80 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + 
CONC_LIST \ + ISL \ + OSL \ + IMAGE \ + SPEC_DECODING \ + MODEL_PATH \ + PREFILL_NUM_WORKERS \ + PREFILL_TP \ + PREFILL_EP \ + PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS \ + DECODE_TP \ + DECODE_EP \ + DECODE_DP_ATTN \ + PREFILL_NODES \ + DECODE_NODES \ + RANDOM_RANGE_RATIO \ + FRAMEWORK + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +set -x + +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 + +export TIME_LIMIT="08:00:00" +export MODEL_PATH=$MODEL_PATH +export MODEL_NAME=$MODEL_NAME +export CONTAINER_IMAGE=$IMAGE + +# Same EP/DP booleans as dsr1_fp8_mi355x_sglang-disagg.sh → amd_utils/submit.sh +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then + export PREFILL_ENABLE_EP=false +else + export PREFILL_ENABLE_EP=true +fi + +if [[ "$PREFILL_DP_ATTN" == "true" ]]; then + export PREFILL_ENABLE_DP=true +else + export PREFILL_ENABLE_DP=false +fi + +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then + export DECODE_ENABLE_EP=false +else + export DECODE_ENABLE_EP=true +fi + +if [[ "$DECODE_DP_ATTN" == "true" ]]; then + export DECODE_ENABLE_DP=true +else + export DECODE_ENABLE_DP=false +fi + +# Parameter order matches SGLang disagg submit.sh; arg 16 is optional NODELIST. +JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $ISL $OSL "${CONC_LIST// /x}" inf \ + ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ + ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + ${PREFILL_TP} ${DECODE_TP} \ + ${RANDOM_RANGE_RATIO} \ + "${NODELIST:-}") + +if [[ $? 
-ne 0 ]]; then + echo "Failed to submit job" >&2 + exit 1 +fi + +echo "$JOB_ID" diff --git a/benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh b/benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh new file mode 100644 index 000000000..a9a28d889 --- /dev/null +++ b/benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + CONC_LIST \ + ISL \ + OSL \ + IMAGE \ + SPEC_DECODING \ + MODEL_PATH \ + PREFILL_NUM_WORKERS \ + PREFILL_TP \ + PREFILL_EP \ + PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS \ + DECODE_TP \ + DECODE_EP \ + DECODE_DP_ATTN \ + PREFILL_NODES \ + DECODE_NODES \ + RANDOM_RANGE_RATIO \ + FRAMEWORK + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +set -x + +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 + +export TIME_LIMIT="08:00:00" +export MODEL_PATH=$MODEL_PATH +export MODEL_NAME=$MODEL_NAME +export CONTAINER_IMAGE=$IMAGE + +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then + export PREFILL_ENABLE_EP=false +else + export PREFILL_ENABLE_EP=true +fi + +if [[ "$PREFILL_DP_ATTN" == "true" ]]; then + export PREFILL_ENABLE_DP=true +else + export PREFILL_ENABLE_DP=false +fi + +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then + export DECODE_ENABLE_EP=false +else + export DECODE_ENABLE_EP=true +fi + +if [[ "$DECODE_DP_ATTN" == "true" ]]; then + export DECODE_ENABLE_DP=true +else + export DECODE_ENABLE_DP=false +fi + +JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $ISL $OSL "${CONC_LIST// /x}" inf \ + ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ + ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + ${PREFILL_TP} ${DECODE_TP} \ + ${RANDOM_RANGE_RATIO} \ + "${NODELIST:-}") + +if [[ $? 
-ne 0 ]]; then + echo "Failed to submit job" >&2 + exit 1 +fi + +echo "$JOB_ID" From 98ad4f30abf061235ac4a08f9bdba522baacc7e8 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 21 Apr 2026 07:57:08 +0000 Subject: [PATCH 03/98] use vLLM router as default router for vllm disagg Signed-off-by: Theresa Shan --- benchmarks/multi_node/amd_utils/job.slurm | 34 ++++++++++++++++ .../multi_node/amd_utils/server_vllm.sh | 40 +++++++++++-------- 2 files changed, 58 insertions(+), 16 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 56fefb0ed..491f27aa8 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -77,6 +77,11 @@ PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}" DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" DECODE_MTP_SIZE=${DECODE_MTP_SIZE:-0} +# Router selection: "vllm-router" (external container) or "moriio" (in-container proxy) +ROUTER_TYPE="${ROUTER_TYPE:-vllm-router}" +ROUTER_PORT="${ROUTER_PORT:-30000}" +PROXY_PING_PORT="${PROXY_PING_PORT:-36367}" + # ============================================================================= # Docker privilege detection # ============================================================================= @@ -289,6 +294,10 @@ export IS_MULTINODE="${IS_MULTINODE:-false}" SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_') export DOCKER_CONT_NAME="container_${ENGINE}_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" + +# vLLM external router container +VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-ghcr.io/simondanielsson/vllm-router:dev-streaming-cn-cjy}" +ROUTER_CONT_NAME="router_vllm_${SANITIZED_USER}_${SLURM_JOB_ID}" export RUN_FILE_FULL="$WS_PATH/${RUN_FILE}" SELECTED_NODELIST_SRUN=$(echo "$SELECTED_NODES" | paste -sd,) @@ -397,6 +406,24 @@ echo \"Rank \$SLURM_PROCID on \$(hostname)\" \$DOCKER_CMD ps -aq --filter \"$CONT_FILTER\" | xargs -r \$_DCMD rm -f || true \$DOCKER_CMD ps -aq | xargs -r \$_DCMD stop || true +# Start 
vLLM external router container on node 0 +if [[ \"$ENGINE\" == \"vllm\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \"\$SLURM_PROCID\" == \"0\" ]]; then + \$DOCKER_CMD rm -f \"$ROUTER_CONT_NAME\" 2>/dev/null || true + \$DOCKER_CMD run -d \\ + --name \"$ROUTER_CONT_NAME\" \\ + --network host \\ + \"$VLLM_ROUTER_IMAGE\" \\ + vllm-router \\ + --vllm-pd-disaggregation \\ + --vllm-discovery-address \"0.0.0.0:${PROXY_PING_PORT}\" \\ + --port \"${ROUTER_PORT}\" \\ + --host 0.0.0.0 \\ + --policy consistent_hash \\ + --prefill-policy consistent_hash \\ + --decode-policy consistent_hash \\ + --log-level info +fi + exec \$DOCKER_CMD run --rm \ --init \ --stop-timeout 10 \ @@ -446,3 +473,10 @@ fi " srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c '$DOCKER_CMD rm -f '"$DOCKER_CONT_NAME"' 2>/dev/null || true' + +# Clean up vLLM external router container on node 0 +if [[ "$ENGINE" == "vllm" && "$ROUTER_TYPE" == "vllm-router" ]]; then + srun --nodes=1 --ntasks=1 --nodelist="$MASTER_NODE" bash -c ' + '"$DOCKER_CMD"' rm -f '"$ROUTER_CONT_NAME"' 2>/dev/null || true + ' +fi diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh index a10e45d6d..6b70014ee 100755 --- a/benchmarks/multi_node/amd_utils/server_vllm.sh +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -282,19 +282,24 @@ if [ "$NODE_RANK" -eq 0 ]; then setup_vllm_env # Start MoRI-IO proxy FIRST — workers register via ZMQ on startup - echo "Starting MoRI-IO proxy (HTTP=$ROUTER_PORT, ZMQ=$PROXY_PING_PORT)..." - PROXY_CMD="PROXY_HTTP_PORT=$ROUTER_PORT PROXY_PING_PORT=$PROXY_PING_PORT \ - python3 $WS_PATH/moriio_proxy.py" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $PROXY_CMD" + # Skipped when ROUTER_TYPE=vllm-router (external router container started by job.slurm) + if [[ "${ROUTER_TYPE:-vllm-router}" == "moriio" ]]; then + echo "Starting MoRI-IO proxy (HTTP=$ROUTER_PORT, ZMQ=$PROXY_PING_PORT)..." 
+ PROXY_CMD="PROXY_HTTP_PORT=$ROUTER_PORT PROXY_PING_PORT=$PROXY_PING_PORT \ + python3 $WS_PATH/moriio_proxy.py" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $PROXY_CMD" + else + PROXY_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/moriio_proxy_${host_name}.log" + set -x + eval "$PROXY_CMD" > "$PROXY_LOG_FILE" 2>&1 & + set +x + proxy_pid=$! + sleep 3 + fi else - PROXY_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/moriio_proxy_${host_name}.log" - set -x - eval "$PROXY_CMD" > "$PROXY_LOG_FILE" 2>&1 & - set +x - proxy_pid=$! - sleep 3 + echo "Using external vLLM router (ROUTER_TYPE=${ROUTER_TYPE:-vllm-router})" fi PREFILL_CMD="vllm serve ${MODEL_PATH} \ @@ -368,13 +373,16 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}" fi - echo "Killing the proxy server and prefill server" + echo "Killing the prefill server" if [[ "$DRY_RUN" -eq 0 ]]; then - [[ -n "${proxy_pid:-}" ]] && kill $proxy_pid 2>/dev/null || true + if [[ "${ROUTER_TYPE:-vllm-router}" == "moriio" ]]; then + [[ -n "${proxy_pid:-}" ]] && kill $proxy_pid 2>/dev/null || true + fi [[ -n "${prefill_pid:-}" ]] && kill $prefill_pid 2>/dev/null || true sleep 2 - # Fallback: ensure no orphaned processes keep ports open - pkill -f moriio_proxy 2>/dev/null || true + if [[ "${ROUTER_TYPE:-vllm-router}" == "moriio" ]]; then + pkill -f moriio_proxy 2>/dev/null || true + fi pkill -f "vllm serve" 2>/dev/null || true fi From 1dbaad82749a54f8356669b8f0249a2798d8942e Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Thu, 23 Apr 2026 01:49:52 +0000 Subject: [PATCH 04/98] fix bugs Signed-off-by: Chun Fang --- benchmarks/multi_node/amd_utils/bench.sh | 6 +- benchmarks/multi_node/amd_utils/env.sh | 4 +- benchmarks/multi_node/amd_utils/job.slurm | 60 +- benchmarks/multi_node/amd_utils/server.sh | 712 +----------------- .../multi_node/amd_utils/server_vllm.sh | 54 +- benchmarks/multi_node/amd_utils/setup_deps.sh | 10 +- benchmarks/multi_node/amd_utils/submit.sh | 2 +- 7 files 
changed, 74 insertions(+), 774 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh index 87f3b1e8a..aecc29e83 100755 --- a/benchmarks/multi_node/amd_utils/bench.sh +++ b/benchmarks/multi_node/amd_utils/bench.sh @@ -11,7 +11,7 @@ # \ # -ENGINE="${ENGINE:-sglang}" +ENGINE="${ENGINE:-sglang-disagg}" n_prefill=$1 n_decode=$2 @@ -67,7 +67,7 @@ for max_concurrency in "${chosen_concurrencies[@]}"; do # Engine-specific extra flags extra_flags="" - if [[ "$ENGINE" == "vllm" ]]; then + if [[ "$ENGINE" == "vllm-disagg" ]]; then extra_flags="--trust-remote-code" else if [ "$IS_MTP" = "true" ]; then @@ -92,7 +92,7 @@ for max_concurrency in "${chosen_concurrencies[@]}"; do echo "-----------------------------------------" # vLLM: cooldown between rounds for idle KV block reaper - if [[ "$ENGINE" == "vllm" ]]; then + if [[ "$ENGINE" == "vllm-disagg" ]]; then echo "[BENCH] Cooldown: waiting 10s for idle KV block reaper..." sleep 10 fi diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index c5a438541..81da415e8 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -9,7 +9,7 @@ # Set by runner or auto-detected from hostname. 
set -x -ENGINE="${ENGINE:-sglang}" +ENGINE="${ENGINE:-sglang-disagg}" export PYTHONDONTWRITEBYTECODE=1 # ============================================================================= @@ -43,7 +43,7 @@ export NCCL_IB_HCA=${NCCL_IB_HCA:-$IBDEVICES} # Engine-specific environment # ============================================================================= -if [[ "$ENGINE" == "vllm" ]]; then +if [[ "$ENGINE" == "vllm-disagg" ]]; then # ========================================================================= # vLLM/Nixl-specific environment # ========================================================================= diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 491f27aa8..b9a83941a 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -8,7 +8,7 @@ #SBATCH --time=24:00:00 # --output and --error are set by submit.sh via BENCHMARK_LOGS_DIR -ENGINE="${ENGINE:-sglang}" +ENGINE="${ENGINE:-sglang-disagg}" echo "=== Job Start Time ===" echo "UTC Time: $(TZ=UTC date '+%Y-%m-%d %H:%M:%S %Z')" @@ -23,7 +23,7 @@ echo "" # Use $(pwd) not BASH_SOURCE — sbatch copies the script to /var/spool/slurmd/ # at runtime, but the CWD remains the submit-time directory (amd_utils/). 
-if [[ "$ENGINE" == "vllm" ]]; then +if [[ "$ENGINE" == "vllm-disagg" ]]; then MODELS_YAML="$(pwd)/models_vllm.yaml" else MODELS_YAML="$(pwd)/models.yaml" @@ -111,7 +111,7 @@ if [[ -z "$MODEL_DIR" ]]; then fi export MODEL_DIR -if [[ "$ENGINE" == "vllm" ]]; then +if [[ "$ENGINE" == "vllm-disagg" ]]; then # vLLM: Extract hf_dir from models.yaml, search multiple paths, resolve HF cache snapshots DISK_DIR_NAME=$(awk '/^'"$MODEL_NAME"':/{found=1; next} found && /^[^ ]/{exit} @@ -278,6 +278,7 @@ export BENCH_MAX_CONCURRENCY=$BENCH_MAX_CONCURRENCY export BENCH_REQUEST_RATE=$BENCH_REQUEST_RATE export DRY_RUN="${DRY_RUN:-0}" export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" +export KEEP_CONTAINERS="${KEEP_CONTAINERS:-0}" export ENGINE=$ENGINE # Eval-related env vars (threaded from submit.sh) @@ -367,7 +368,7 @@ DOCKER_ENV_COMMON=( ) # Engine-specific env vars -if [[ "$ENGINE" == "vllm" ]]; then +if [[ "$ENGINE" == "vllm-disagg" ]]; then DOCKER_ENV_ENGINE=( -e VLLM_WS_PATH=${WS_PATH} -e MODEL_PATH=$DOCKER_MODEL_PATH @@ -403,28 +404,29 @@ set -euo pipefail echo \"Rank \$SLURM_PROCID on \$(hostname)\" # Pre-clean (idempotent) -\$DOCKER_CMD ps -aq --filter \"$CONT_FILTER\" | xargs -r \$_DCMD rm -f || true -\$DOCKER_CMD ps -aq | xargs -r \$_DCMD stop || true +\$DOCKER_CMD ps -aq --filter \"$CONT_FILTER\" | xargs -r \$DOCKER_CMD rm -f || true +\$DOCKER_CMD ps -aq | xargs -r \$DOCKER_CMD stop || true # Start vLLM external router container on node 0 -if [[ \"$ENGINE\" == \"vllm\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \"\$SLURM_PROCID\" == \"0\" ]]; then +if [[ \"$ENGINE\" == \"vllm-disagg\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \"\$SLURM_PROCID\" == \"0\" ]]; then \$DOCKER_CMD rm -f \"$ROUTER_CONT_NAME\" 2>/dev/null || true - \$DOCKER_CMD run -d \\ - --name \"$ROUTER_CONT_NAME\" \\ - --network host \\ - \"$VLLM_ROUTER_IMAGE\" \\ - vllm-router \\ - --vllm-pd-disaggregation \\ - --vllm-discovery-address \"0.0.0.0:${PROXY_PING_PORT}\" \\ - --port 
\"${ROUTER_PORT}\" \\ - --host 0.0.0.0 \\ - --policy consistent_hash \\ - --prefill-policy consistent_hash \\ - --decode-policy consistent_hash \\ - --log-level info + \$DOCKER_CMD run -d \ + --name \"$ROUTER_CONT_NAME\" \ + --network host \ + -v /tmp:/run_logs \ + \"$VLLM_ROUTER_IMAGE\" \ + bash -lc \"mkdir -p /run_logs/slurm_job-${SLURM_JOB_ID} && exec vllm-router \ + --vllm-pd-disaggregation \ + --vllm-discovery-address 0.0.0.0:${PROXY_PING_PORT} \ + --port ${ROUTER_PORT} \ + --host 0.0.0.0 \ + --policy consistent_hash \ + --prefill-policy consistent_hash \ + --decode-policy consistent_hash \ + --log-level info 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/vllm_router_\$(hostname).log \" fi -exec \$DOCKER_CMD run --rm \ +exec \$DOCKER_CMD run \ --init \ --stop-timeout 10 \ --device /dev/dri \ @@ -472,11 +474,13 @@ if [[ \$DOCKER_EXIT_CODE -ne 0 ]]; then fi " -srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c '$DOCKER_CMD rm -f '"$DOCKER_CONT_NAME"' 2>/dev/null || true' +if [[ "${KEEP_CONTAINERS}" != "1" ]]; then + srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c '$DOCKER_CMD rm -f '"$DOCKER_CONT_NAME"' 2>/dev/null || true' -# Clean up vLLM external router container on node 0 -if [[ "$ENGINE" == "vllm" && "$ROUTER_TYPE" == "vllm-router" ]]; then - srun --nodes=1 --ntasks=1 --nodelist="$MASTER_NODE" bash -c ' - '"$DOCKER_CMD"' rm -f '"$ROUTER_CONT_NAME"' 2>/dev/null || true - ' -fi + # Clean up vLLM external router container on node 0 + if [[ "$ENGINE" == "vllm-disagg" && "$ROUTER_TYPE" == "vllm-router" ]]; then + srun --nodes=1 --ntasks=1 --nodelist="$MASTER_NODE" bash -c ' + '"$DOCKER_CMD"' rm -f '"$ROUTER_CONT_NAME"' 2>/dev/null || true + ' + fi +fi \ No newline at end of file diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 3c92422be..5c441a793 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -2,722 +2,18 @@ # Dual-Engine Disaggregated Server 
Dispatcher # ============================================================================= # Dispatches to the engine-specific server launcher based on ENGINE env var. -# ENGINE=sglang (default) -> server_sglang.sh (SGLang + MoRI) -# ENGINE=vllm -> server_vllm.sh (vLLM + Nixl/MoRI-IO) +# ENGINE=sglang-disagg (default) -> server_sglang.sh (SGLang + MoRI) +# ENGINE=vllm-disagg -> server_vllm.sh (vLLM + Nixl/MoRI-IO) # ============================================================================= -ENGINE="${ENGINE:-sglang}" +ENGINE="${ENGINE:-sglang-disagg}" WS_PATH="${WS_PATH:-${SGLANG_WS_PATH:-${VLLM_WS_PATH:-$(dirname "${BASH_SOURCE[0]}")}}}" export WS_PATH ENGINE echo "[DISPATCHER] ENGINE=$ENGINE WS_PATH=$WS_PATH" -if [[ "$ENGINE" == "vllm" ]]; then +if [[ "$ENGINE" == "vllm-disagg" ]]; then source "$WS_PATH/server_vllm.sh" else source "$WS_PATH/server_sglang.sh" fi -<<<<<<< HEAD - -# ============================================================================= -# Model-Specific Configuration from YAML -# ============================================================================= -MODELS_YAML="${SGLANG_WS_PATH}/models.yaml" - -if [[ ! -f "$MODELS_YAML" ]]; then - echo "ERROR: models.yaml not found at $MODELS_YAML" - exit 1 -fi - -# Load model config via inline Python (PyYAML is available in SGLang containers) -# Formula evaluation (e.g. "SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK * TP * xP") -# is done here in Python to avoid bash glob-expanding the * characters. 
-eval "$(python3 -c " -import yaml, sys, os - -config_path = '${MODELS_YAML}' -model_name = '${MODEL_NAME}' - -with open(config_path) as f: - models = yaml.safe_load(f) - -if model_name not in models: - print(f'echo \"ERROR: Model {model_name} not in models.yaml\"; exit 1') - sys.exit(0) - -m = models[model_name] - -def eval_formula(val): - \"\"\"Evaluate chunked_prefill_size: if string, resolve variable names from env and compute.\"\"\" - if isinstance(val, (int, float)): - return int(val) - s = str(val) - # Build a namespace from env vars (convert numeric values to int) - ns = {} - for k, v in os.environ.items(): - try: - ns[k] = int(v) - except (ValueError, TypeError): - pass - try: - return int(eval(s, {'__builtins__': {}}, ns)) - except Exception as e: - print(f'echo \"WARNING: Cannot evaluate formula: {s} ({e})\"', file=sys.stderr) - return val - -def parse_range(cuda_range, default_start, default_end): - if '-' in str(cuda_range): - s, e = str(cuda_range).split('-') - return s, e - return str(default_start), str(default_end) - -# Output shell variables -print(f'MODEL_BASE_FLAGS=\"{m.get(\"base_flags\", \"\")}\"') -print(f'MODEL_MTP_FLAGS=\"{m.get(\"mtp_flags\", \"\")}\"') -print(f'MODEL_DP_FLAGS=\"{m.get(\"dp_flags\", \"\")}\"') - -prefill = m.get('prefill', {}) -decode = m.get('decode', {}) - -print(f'PREFILL_MEM_FRACTION_STATIC=\"{prefill.get(\"mem_fraction_static\", 0.8)}\"') -print(f'PREFILL_DISABLE_RADIX_CACHE=\"{prefill.get(\"disable_radix_cache\", True)}\"') - -dp = prefill.get('dp', {}) -no_dp = prefill.get('no_dp', {}) -print(f'PREFILL_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 24)}\"') -print(f'PREFILL_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"') -print(f'PREFILL_CUDA_GRAPH_BS_DP=\"{dp.get(\"cuda_graph_bs\", \"1 2 3\")}\"') -print(f'PREFILL_CONTEXT_LENGTH_DP=\"{dp.get(\"context_length\", \"\")}\"') -print(f'PREFILL_MAX_TOTAL_TOKENS_DP=\"{dp.get(\"max_total_tokens\", \"\")}\"') 
-print(f'PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP=\"{dp.get(\"enable_two_batch_overlap\", False)}\"') -print(f'PREFILL_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"') -print(f'PREFILL_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"') -s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128) -print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"') -print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"') - -print(f'DECODE_MEM_FRACTION_STATIC=\"{decode.get(\"mem_fraction_static\", 0.85)}\"') -print(f'DECODE_PREFILL_ROUND_ROBIN_BALANCE=\"{decode.get(\"prefill_round_robin_balance\", True)}\"') - -dp = decode.get('dp', {}) -ep_only = decode.get('ep_only', {}) -no_dp = decode.get('no_dp', {}) - -# Decode DP config -print(f'DECODE_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 4096)}\"') -print(f'DECODE_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"') -s, e = parse_range(dp.get('cuda_graph_bs_range', '1-160'), 1, 160) -print(f'DECODE_CUDA_GRAPH_BS_DP_START=\"{s}\"') -print(f'DECODE_CUDA_GRAPH_BS_DP_END=\"{e}\"') - -# Decode EP-only config (EP enabled but DP disabled) -print(f'DECODE_MAX_RUNNING_REQUESTS_EP_ONLY=\"{ep_only.get(\"max_running_requests\", 256)}\"') -print(f'DECODE_CHUNKED_PREFILL_SIZE_EP_ONLY=\"{eval_formula(ep_only.get(\"chunked_prefill_size\", 262144))}\"') -s, e = parse_range(ep_only.get('cuda_graph_bs_range', '1-256'), 1, 256) -print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_START=\"{s}\"') -print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_END=\"{e}\"') - -# Decode no-DP config -print(f'DECODE_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"') -print(f'DECODE_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"') -s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128) -print(f'DECODE_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"') -print(f'DECODE_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"') -")" - -echo 
"Loaded model configuration for: $MODEL_NAME" - -# Compute DP-dependent prefill parameters -if [[ "$PREFILL_ENABLE_DP" == "true" ]]; then - prefill_cuda_graph_bs=($PREFILL_CUDA_GRAPH_BS_DP) - prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_DP - prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_DP - prefill_context_length=$PREFILL_CONTEXT_LENGTH_DP - prefill_max_total_tokens=$PREFILL_MAX_TOTAL_TOKENS_DP - prefill_enable_two_batch_overlap=$PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP -else - prefill_cuda_graph_bs=($(seq $PREFILL_CUDA_GRAPH_BS_NO_DP_START $PREFILL_CUDA_GRAPH_BS_NO_DP_END)) - prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_NO_DP - prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_NO_DP - prefill_context_length="" - prefill_max_total_tokens="" - prefill_enable_two_batch_overlap="false" -fi - -# Compute DP-dependent decode parameters (3-way: DP > EP-only > no_dp) -if [[ "$DECODE_ENABLE_DP" == "true" ]]; then - decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_DP_START $DECODE_CUDA_GRAPH_BS_DP_END)) - decode_max_running_requests=$((DECODE_CUDA_GRAPH_BS_DP_END * DECODE_TP_SIZE)) -elif [[ "$DECODE_ENABLE_EP" == "true" ]]; then - decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_EP_ONLY_START $DECODE_CUDA_GRAPH_BS_EP_ONLY_END)) - decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_EP_ONLY -else - decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_NO_DP_START $DECODE_CUDA_GRAPH_BS_NO_DP_END)) - decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP -fi - -# Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS) -PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} " -if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then - 
PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache" -fi -if [[ -n "$prefill_context_length" ]]; then - PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --context-length ${prefill_context_length}" -fi -if [[ -n "$prefill_max_total_tokens" ]]; then - PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --max-total-tokens ${prefill_max_total_tokens}" -fi -if [[ "$prefill_enable_two_batch_overlap" == "True" ]] || [[ "$prefill_enable_two_batch_overlap" == "true" ]]; then - PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --enable-two-batch-overlap" - PREFILL_SDMA_ENV="MORI_ENABLE_SDMA=true" -fi - -DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} " - -if [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "True" ]] || [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "true" ]]; then - DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS --prefill-round-robin-balance" -fi - -if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then - MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) - MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MOE_MAX_INPUT_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) -fi - -# ============================================================================= -# Cluster Topology Configuration -# ============================================================================= -IFS=',' read -ra IP_ARRAY <<< "$IPADDRS" - -# Ceiling division by GPUS_PER_NODE for nodes-per-worker -PREFILL_NODES_PER_WORKER=$(((PREFILL_TP_SIZE + 7) / GPUS_PER_NODE)) -DECODE_NODES_PER_WORKER=$(((DECODE_TP_SIZE + 7) / GPUS_PER_NODE)) -NODE_OFFSET=$((PREFILL_NODES_PER_WORKER * xP)) - -# Build prefill arguments dynamically based on xP -PREFILL_HEADNODE_URLS=() -PREFILL_ARGS="" -for i in $(seq 0 $((xP - 1))); do - prefill_idx=$((i * PREFILL_NODES_PER_WORKER)) - PREFILL_HEADNODE_URLS[$i]="${IP_ARRAY[$prefill_idx]}:${HEADNODE_PORT}" - PREFILL_ARGS="$PREFILL_ARGS --prefill 
http://${IP_ARRAY[$prefill_idx]}:8000" -done - -# Build decode arguments dynamically based on yD -DECODE_HEADNODE_URLS=() -DECODE_ARGS="" -for i in $(seq 0 $((yD - 1))); do - decode_idx=$((i * DECODE_NODES_PER_WORKER + NODE_OFFSET)) - DECODE_HEADNODE_URLS[$i]="${IP_ARRAY[$decode_idx]}:${HEADNODE_PORT}" - DECODE_ARGS="$DECODE_ARGS --decode http://${IP_ARRAY[$decode_idx]}:8000" -done - -echo "Prefill worker headnode list: ${PREFILL_HEADNODE_URLS[@]}" -echo "Decode worker headnode list: ${DECODE_HEADNODE_URLS[@]}" - -# ============================================================================= -# Configuration Builder Functions -# ============================================================================= - -build_server_config() { - local mode="$1" - local model_name="$2" - local tp_size="$3" - local enable_ep="$4" - local enable_dp="$5" - local decode_mtp_size="$6" - - # Calculate EP and DP sizes based on enable flags - local ep_size=1 - local dp_size=1 - - if [[ "$enable_ep" == "true" ]]; then - ep_size=$tp_size - fi - - if [[ "$enable_dp" == "true" ]]; then - dp_size=$tp_size - fi - - # Build parallelism arguments - local parallel_args="--tp-size ${tp_size}" - - if [[ "$enable_ep" == "true" ]]; then - parallel_args="$parallel_args --ep-size ${ep_size}" - fi - - if [[ "$enable_dp" == "true" ]]; then - parallel_args="$parallel_args --dp-size ${dp_size}" - fi - - # Get model-specific configuration from YAML-loaded variables - local base_config="$MODEL_BASE_FLAGS" - local mtp_config="" - local dp_config="" - local specific_config="" - - # MTP config (only if MTP is enabled and mode is decode) - if [ "$decode_mtp_size" -gt 0 ]; then - mtp_config="${MODEL_MTP_FLAGS} --speculative-num-steps ${decode_mtp_size} --speculative-num-draft-tokens $((decode_mtp_size + 1))" - fi - - # DP config (only if DP is enabled) - if [[ "$enable_dp" == "true" ]]; then - dp_config="$MODEL_DP_FLAGS" - fi - - # Mode-specific config - if [[ "$mode" == "prefill" ]]; then - 
specific_config="$PREFILL_MODE_FLAGS" - elif [[ "$mode" == "decode" ]]; then - specific_config="$DECODE_MODE_FLAGS" - fi - - # Combine: parallel args + base config + mtp config (decode only) + dp config + specific config - local full_config="$parallel_args" - if [[ -n "$base_config" ]]; then - full_config="$full_config $base_config" - fi - if [[ -n "$mtp_config" ]] && [[ "$mode" == "decode" ]]; then - full_config="$full_config $mtp_config" - fi - if [[ -n "$dp_config" ]]; then - full_config="$full_config $dp_config" - fi - if [[ -n "$specific_config" ]]; then - full_config="$full_config $specific_config" - fi - - echo "$full_config" -} - -# Build complete server configurations -PREFILL_SERVER_CONFIG=$(build_server_config "prefill" "$MODEL_NAME" "$PREFILL_TP_SIZE" "$PREFILL_ENABLE_EP" "$PREFILL_ENABLE_DP" "$DECODE_MTP_SIZE") -DECODE_SERVER_CONFIG=$(build_server_config "decode" "$MODEL_NAME" "$DECODE_TP_SIZE" "$DECODE_ENABLE_EP" "$DECODE_ENABLE_DP" "$DECODE_MTP_SIZE") - -if [[ -n "$MODEL_NAME" ]]; then - echo "Using model-specific configuration for: $MODEL_NAME" -fi - -if [[ "${EVAL_ONLY:-false}" == "true" ]] || [[ "${RUN_EVAL:-false}" == "true" ]]; then - PREFILL_SERVER_CONFIG=$(echo "$PREFILL_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g') - DECODE_SERVER_CONFIG=$(echo "$DECODE_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g') - unset MORI_MOE_MAX_INPUT_TOKENS_PREFILL - unset MORI_MOE_MAX_INPUT_TOKENS_DECODE - # NOTE: that currently with fp8_combine set, the evals do not pass on InferenceX eval harness - # or on SGLang native harness for high concurrency 4k and gets no where near the golden score of - # 0.95 on even basic GSM8k grade school math as confirmed by @billishyahao from AMD - # and as confirmed by @Oseltamivir. 
This was initally merged with @billishyahao promising - # that an fast follow PR to fix the evals via having quant correction in the fp8 combine -fi - -# ============================================================================= -# Container Synchronization -# ============================================================================= - -echo "Waiting at the container creation barrier on $host_name" -python3 $SGLANG_WS_PATH/sync.py barrier \ - --local-ip ${host_ip} \ - --local-port 5000 \ - --enable-port \ - --node-ips ${IPADDRS} \ - --node-ports 5000 \ - --wait-for-all-ports \ - --timeout 300 - - -# ============================================================================= -# Node Role Assignment and Server Launch -# ============================================================================= - -if [ "$NODE_RANK" -eq 0 ]; then - echo "NODE INFO =======================================" - echo "================================================" - echo "Node List : ${SLURM_JOB_NODELIST}" - echo "Node IPs : ${IPADDRS}" - echo "Model Name : ${MODEL_NAME:-'Not specified'}" - echo "================================================" - - echo "CLUSTER INFO ====================================" - echo "================================================" - echo "${host_name}:${host_ip} is Proxy Node and Prefill Node" - echo "Using prefill config: $PREFILL_SERVER_CONFIG" - echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}" - echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}" - echo "Prefill servers ($((PREFILL_TP_SIZE/GPUS_PER_NODE)) nodes): ${PREFILL_ARGS}" - echo "Decode servers ($((DECODE_TP_SIZE/GPUS_PER_NODE)) nodes): ${DECODE_ARGS}" - echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL}" - echo "Decode env: 
SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} " - echo "Decode env: SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE} " - - echo "================================================" - - # start the head prefill server - PREFILL_MORI_MOE_ENV="" - set -x - if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_PREFILL" ]]; then - PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}" - fi - set +x - PREFILL_CMD="${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ - --model-path $MODEL_DIR/$MODEL_NAME \ - --disaggregation-mode prefill \ - --disaggregation-ib-device ${IBDEVICES} \ - --host 0.0.0.0 \ - --port 8000 \ - --trust-remote-code \ - ${PREFILL_SERVER_CONFIG} " - - if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then - PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[0]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank 0" - fi - - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $PREFILL_CMD" - else - set -x - eval "$PREFILL_CMD" \ - 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log & - set +x - prefill0_pid=$! - fi - - - echo "Waiting for all prefill and decode servers to be up . . ." - - - BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ - --node-ips ${IPADDRS} \ - --node-ports 8000 \ - --wait-for-all-ports \ - --timeout 1800" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $BARRIER_CMD" - else - eval "$BARRIER_CMD" - fi - echo "Congratulations!!! All prefill and decode servers are up . . ." 
- - ROUTER_CMD="python -m sglang_router.launch_router \ - --pd-disaggregation \ - --port 30000 \ - --policy random \ - --prefill-policy random \ - --decode-policy random \ - ${PREFILL_ARGS} \ - ${DECODE_ARGS}" - - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $ROUTER_CMD" - else - ROUTER_LOG_FILE="/tmp/slurm_job-${SLURM_JOB_ID}_proxy_${host_name}.log" - set -x - if [[ "${SGLANG_ROUTER_STDOUT_LOGS:-0}" == "1" ]]; then - eval "$ROUTER_CMD" 2>&1 | tee "$ROUTER_LOG_FILE" & - else - eval "$ROUTER_CMD" >"$ROUTER_LOG_FILE" 2>&1 & - fi - set +x - proxy_pid=$! - - # Wait for router to be ready via health endpoint - HEALTH_BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ - --node-ips ${NODE0_ADDR} \ - --node-ports 30000 \ - --wait-for-all-health \ - --health-endpoint /readiness \ - --timeout 1800" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $HEALTH_BARRIER_CMD" - else - eval "$HEALTH_BARRIER_CMD" - fi - - echo "Router is ready for benchmarking" - fi - - - echo "Ready for benchmarking on ${host_name}:${host_ip}" - - echo "Benchmarking on ${host_name}:${host_ip}" - cd $SGLANG_WS_PATH - - # Export IS_MTP based on whether MTP is enabled - if [ "$DECODE_MTP_SIZE" -gt 0 ]; then - export IS_MTP=true - else - export IS_MTP=false - fi - - # n_prefill n_decode prefill_gpus decode_gpus model_dir model_name log_path isl osl concurrency_list req_rate random_range_ratio num_prompts_multiplier - BENCH_CMD="bash $SGLANG_WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \ - $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \ - ${BENCH_OUTPUT_LEN} "${BENCH_MAX_CONCURRENCY}" ${BENCH_REQUEST_RATE} \ - ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}" - - if [[ "${EVAL_ONLY:-false}" == "true" ]]; then - echo "EVAL_ONLY mode: skipping throughput benchmark" - elif [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $BENCH_CMD" - else - set -x - eval "$BENCH_CMD" - set +x - fi - - # Run evaluation if requested (before 
killing router) - if [[ "${RUN_EVAL:-false}" == "true" ]]; then - echo "Running lm-eval evaluation on Node 0..." - - # Health check: verify the router is still serving before running eval. - # The throughput benchmark may have crashed/exhausted decode workers. - EVAL_HEALTH_OK=false - for _attempt in 1 2 3; do - if curl -sf --max-time 10 "http://0.0.0.0:30000/readiness" >/dev/null 2>&1; then - EVAL_HEALTH_OK=true - break - fi - echo "Eval health check attempt $_attempt failed, retrying in 10s..." - sleep 10 - done - - if [[ "$EVAL_HEALTH_OK" != "true" ]]; then - echo "WARNING: Router health check failed after 3 attempts. Skipping eval." - else - # Must run from repo root so utils/evals/${task}.yaml resolves - pushd /workspace - - # Source eval functions from benchmark_lib.sh - source /workspace/benchmarks/benchmark_lib.sh - - # Use EVAL_CONC from workflow if set, otherwise fall back to max of conc list - if [[ -n "${EVAL_CONC:-}" ]]; then - export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC}" - else - export EVAL_CONCURRENT_REQUESTS=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1) - fi - - # Override eval context length with model's configured context_length - if [[ -n "$prefill_context_length" ]]; then - export EVAL_MAX_MODEL_LEN="$prefill_context_length" - fi - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: run_eval --framework lm-eval --port 30000 (conc=${EVAL_CONCURRENT_REQUESTS}, ctx=${EVAL_MAX_MODEL_LEN:-auto})" - else - # Run lm-eval against the router on port 30000 - run_eval --framework lm-eval --port 30000 - eval_rc=$? 
- - if [[ $eval_rc -ne 0 ]]; then - echo "ERROR: run_eval exited rc=$eval_rc; skipping metadata write and eval artifact staging" >&2 - EVAL_FAILED=1 - else - # Set metadata env vars for append_lm_eval_summary - export TP="${PREFILL_TP_SIZE}" - export CONC="${EVAL_CONCURRENT_REQUESTS}" - export EP_SIZE=1 - [[ "${PREFILL_ENABLE_EP}" == "true" ]] && EP_SIZE="${PREFILL_TP_SIZE}" - export PREFILL_TP="${PREFILL_TP_SIZE}" - export PREFILL_EP=1 - [[ "${PREFILL_ENABLE_EP}" == "true" ]] && PREFILL_EP="${PREFILL_TP_SIZE}" - export PREFILL_NUM_WORKERS="${xP}" - export DECODE_TP="${DECODE_TP_SIZE}" - export DECODE_EP=1 - [[ "${DECODE_ENABLE_EP}" == "true" ]] && DECODE_EP="${DECODE_TP_SIZE}" - export DECODE_NUM_WORKERS="${yD}" - export DP_ATTENTION="${PREFILL_ENABLE_DP}" - export PREFILL_DP_ATTENTION="${PREFILL_ENABLE_DP}" - export DECODE_DP_ATTENTION="${DECODE_ENABLE_DP}" - export ISL="${BENCH_INPUT_LEN}" - export OSL="${BENCH_OUTPUT_LEN}" - # IS_MULTINODE, FRAMEWORK, PRECISION, MODEL_PREFIX, RUNNER_TYPE, - # RESULT_FILENAME are already set via Docker -e flags from job.slurm - - append_lm_eval_summary - # Files (meta_env.json, results*.json, sample*.jsonl) are now in /workspace - - # Copy eval artifacts to run_logs for NFS extraction by runner - EVAL_COPY_DIR="/run_logs/slurm_job-${SLURM_JOB_ID}/eval_results" - mkdir -p "$EVAL_COPY_DIR" - for f in meta_env.json; do - [ -e "/workspace/$f" ] && cp -f "/workspace/$f" "$EVAL_COPY_DIR/" - done - # Use find for glob patterns to avoid "no match" errors - find /workspace -maxdepth 1 -name 'results*.json' -exec cp -f {} "$EVAL_COPY_DIR/" \; - find /workspace -maxdepth 1 -name 'sample*.jsonl' -exec cp -f {} "$EVAL_COPY_DIR/" \; - - echo "Eval completed. 
Artifacts staged in $EVAL_COPY_DIR" - fi - fi - - popd - fi - fi - - # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host) - LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs" - mkdir -p "$LOGS_OUTPUT" - - if [[ "$DRY_RUN" -eq 0 ]]; then - cp -r /run_logs/slurm_job-${SLURM_JOB_ID} "$LOGS_OUTPUT/" - echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}" - fi - - echo "Killing the proxy server and prefill server" - - if [[ "$DRY_RUN" -eq 0 ]]; then - kill $proxy_pid - kill $prefill0_pid - fi - - if [[ "${EVAL_FAILED:-0}" -eq 1 ]]; then - echo "ERROR: eval failed; exiting node-0 with rc=1" - exit 1 - fi - -elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then - echo "${host_name}:${host_ip} is Prefill Node (Model: ${MODEL_NAME:-'default'})" - echo "Using prefill config: $PREFILL_SERVER_CONFIG" - echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}" - - PREFILL_MORI_MOE_ENV="" - set -x - if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_PREFILL" ]]; then - PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}" - fi - set +x - PREFILL_CMD="${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ - --model-path $MODEL_DIR/${MODEL_NAME} \ - --disaggregation-mode prefill \ - --disaggregation-ib-device ${IBDEVICES} \ - --host 0.0.0.0 \ - --port 8000 \ - --trust-remote-code \ - ${PREFILL_SERVER_CONFIG} " - - if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then - rank=$((NODE_RANK % PREFILL_NODES_PER_WORKER)) - prefill_idx=$((NODE_RANK / PREFILL_NODES_PER_WORKER)) - PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[$prefill_idx]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank $rank" - fi - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $PREFILL_CMD" - else - set -x - eval "$PREFILL_CMD" \ - 2>&1 | tee 
/run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log & - set +x - prefill_pid=$! - fi - - echo "Waiting for proxy server to be up..." - BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ - --node-ips ${NODE0_ADDR} \ - --node-ports 30000 \ - --wait-for-all-ports \ - --timeout 1800" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $BARRIER_CMD" - else - eval "$BARRIER_CMD" - fi - - echo "Waiting until proxy server closes..." - WAIT_CMD="python3 $SGLANG_WS_PATH/sync.py wait \ - --remote-ip ${NODE0_ADDR} \ - --remote-port 30000" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $WAIT_CMD" - else - eval "$WAIT_CMD" - fi - - echo "Killing the rank $NODE_RANK prefill server" - - if [[ "$DRY_RUN" -eq 0 ]]; then - kill $prefill_pid - fi - -else - RANK=$((NODE_RANK - xP * PREFILL_NODES_PER_WORKER)) - echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME:-'default'})" - echo "Using decode config: $DECODE_SERVER_CONFIG" - echo "Decode node rank: $RANK" - echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}" - - DECODE_MORI_MOE_ENV="" - set -x - if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_DECODE" ]]; then - DECODE_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE}" - fi - set +x - DECODE_CMD="${DECODE_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \ - --model-path ${MODEL_DIR}/${MODEL_NAME} \ - --disaggregation-mode decode \ - --disaggregation-ib-device ${IBDEVICES} \ - --host 0.0.0.0 \ - --port 8000 \ - --trust-remote-code \ - ${DECODE_SERVER_CONFIG} " - - if [ "$DECODE_NODES_PER_WORKER" -gt 1 ]; then - rank=$((RANK % DECODE_NODES_PER_WORKER)) - decode_idx=$((RANK / DECODE_NODES_PER_WORKER)) - DECODE_CMD="$DECODE_CMD --dist-init-addr ${DECODE_HEADNODE_URLS[$decode_idx]} --nnodes ${DECODE_NODES_PER_WORKER} --node-rank $rank" - fi - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: 
$DECODE_CMD" - else - set -x - eval "$DECODE_CMD" \ - 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log & - - set +x - decode_pid=$! - fi - - - echo "Waiting for proxy server to be up..." - BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ - --node-ips ${NODE0_ADDR} \ - --node-ports 30000 \ - --wait-for-all-ports \ - --timeout 1800" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $BARRIER_CMD" - else - eval "$BARRIER_CMD" - fi - - - echo "Waiting until proxy server closes..." - WAIT_CMD="python3 $SGLANG_WS_PATH/sync.py wait \ - --remote-ip ${NODE0_ADDR} \ - --remote-port 30000" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $WAIT_CMD" - else - eval "$WAIT_CMD" - fi - - echo "Killing the rank $RANK decode server" - if [[ "$DRY_RUN" -eq 0 ]]; then - kill $decode_pid - fi - -fi - -echo "Script completed successfully" -exit 0 -======= ->>>>>>> 766ba4ee (consolidate amd_utils for sglang and vllm) diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh index 6b70014ee..73cad3adc 100755 --- a/benchmarks/multi_node/amd_utils/server_vllm.sh +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -199,29 +199,29 @@ python3 $WS_PATH/sync.py barrier \ # ETCD Server Setup # ============================================================================= -echo "Proceeding to start etcd server on $host_name" -bash ${WS_PATH}/start_etcd.sh > /dev/null 2>&1 & -etcd_pid=$! 
- -echo "Waiting at etcd server barrier on $host_name" -python3 $WS_PATH/sync.py barrier \ - --node-ips ${IPADDRS} \ - --node-ports 2379 \ - --wait-for-all-ports \ - --timeout 300 - -echo "All etcd servers are up : $host_name" -sleep 3 - -echo "etcd endpoint health==================" -etcdctl endpoint health 2>&1 || /usr/local/bin/etcd/etcdctl endpoint health 2>&1 || true -echo "======================================" - -python3 $WS_PATH/sync.py barrier \ - --node-ips ${IPADDRS} \ - --node-ports 2379 \ - --wait-for-all-ports \ - --timeout 300 +# echo "Proceeding to start etcd server on $host_name" +# bash ${WS_PATH}/start_etcd.sh > /dev/null 2>&1 & +# etcd_pid=$! + +# echo "Waiting at etcd server barrier on $host_name" +# python3 $WS_PATH/sync.py barrier \ +# --node-ips ${IPADDRS} \ +# --node-ports 2379 \ +# --wait-for-all-ports \ +# --timeout 300 + +# echo "All etcd servers are up : $host_name" +# sleep 3 + +# echo "etcd endpoint health==================" +# etcdctl endpoint health 2>&1 || /usr/local/bin/etcd/etcdctl endpoint health 2>&1 || true +# echo "======================================" + +# python3 $WS_PATH/sync.py barrier \ +# --node-ips ${IPADDRS} \ +# --node-ports 2379 \ +# --wait-for-all-ports \ +# --timeout 300 # ============================================================================= # Cluster Topology Configuration @@ -343,7 +343,7 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "DRY RUN: $HEALTH_BARRIER_CMD" else eval "$HEALTH_BARRIER_CMD" - echo "MoRI-IO proxy is ready for benchmarking" + echo "${ROUTER_TYPE} is ready for benchmarking" fi echo "Ready for benchmarking on ${host_name}:${host_ip}" @@ -490,9 +490,9 @@ else [[ "$DRY_RUN" -eq 0 ]] && kill $decode_pid 2>/dev/null || true fi -echo "Killing the etcd server" -kill $etcd_pid 2>/dev/null || true -pkill -f etcd 2>/dev/null || true +# echo "Killing the etcd server" +# kill $etcd_pid 2>/dev/null || true +# pkill -f etcd 2>/dev/null || true echo "Script completed successfully" exit 0 diff --git 
a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh index 8c7a9f07a..589399f74 100644 --- a/benchmarks/multi_node/amd_utils/setup_deps.sh +++ b/benchmarks/multi_node/amd_utils/setup_deps.sh @@ -875,11 +875,11 @@ except Exception as e: # Run installers # ============================================================================= -install_ucx -install_rixl -install_etcd -install_libionic -install_mori +# install_ucx +# install_rixl +# install_etcd +# install_libionic +# install_mori install_amd_quark install_mori_proxy_deps patch_mori_fp8_compat diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh index a77462fc5..f6670b5ee 100755 --- a/benchmarks/multi_node/amd_utils/submit.sh +++ b/benchmarks/multi_node/amd_utils/submit.sh @@ -100,7 +100,7 @@ export DOCKER_IMAGE_NAME=$CONTAINER_IMAGE export PROFILER_ARGS=$profiler_args # Engine-specific xP/yD semantics and TP exports -if [[ "$ENGINE" == "vllm" ]]; then +if [[ "$ENGINE" == "vllm-disagg" ]]; then export PROXY_STREAM_IDLE_TIMEOUT=${PROXY_STREAM_IDLE_TIMEOUT:-300} export VLLM_MORIIO_CONNECTOR_READ_MODE=${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} fi From ee133d7d71732e4f913760c08a6342fa0f27d588 Mon Sep 17 00:00:00 2001 From: Simon Danielsson <70206058+simondanielsson@users.noreply.github.com> Date: Mon, 4 May 2026 12:58:19 +0200 Subject: [PATCH 05/98] [AMD] Bump to nightly vllm and vllm-router images (#1208) --------- Signed-off-by: Simon Danielsson --- .github/configs/amd-master.yaml | 4 +- benchmarks/multi_node/amd_utils/env.sh | 9 +- benchmarks/multi_node/amd_utils/job.slurm | 5 +- .../multi_node/amd_utils/moriio_proxy.py | 327 ------------------ .../amd_utils/patches/minimax_m2.py | 4 +- .../multi_node/amd_utils/server_vllm.sh | 32 +- benchmarks/multi_node/amd_utils/setup_deps.sh | 46 +-- 7 files changed, 43 insertions(+), 384 deletions(-) delete mode 100644 benchmarks/multi_node/amd_utils/moriio_proxy.py diff --git 
a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index a8480f4b9..b8f7b679e 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1146,7 +1146,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" kimik2.5-fp4-mi355x-vllm-disagg: - image: vllm/vllm-openai-rocm:v0.18.0 + image: vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c model: amd/Kimi-K2.5-MXFP4 model-prefix: kimik2.5 runner: mi355x-disagg @@ -1199,7 +1199,7 @@ kimik2.5-fp4-mi355x-vllm-disagg: - "DECODE_NODES=2" minimaxm2.5-fp8-mi355x-vllm-disagg: - image: vllm/vllm-openai-rocm:v0.18.0 + image: vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi355x-disagg diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 81da415e8..cd4794ed5 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -32,8 +32,13 @@ fi export IBDEVICES # Shared: Auto-detect default network interface (portable across clusters) -export GLOO_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1) -export NCCL_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1) +# Only auto-detect if not already set by the runner/environment +if [[ -z "$GLOO_SOCKET_IFNAME" ]]; then + export GLOO_SOCKET_IFNAME=$(ip route 2>/dev/null | grep '^default' | awk '{print $5}' | head -n 1) +fi +if [[ -z "$NCCL_SOCKET_IFNAME" ]]; then + export NCCL_SOCKET_IFNAME=$(ip route 2>/dev/null | grep '^default' | awk '{print $5}' | head -n 1) +fi set +x diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index b9a83941a..70f501df6 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -297,7 +297,7 @@ SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_') export 
DOCKER_CONT_NAME="container_${ENGINE}_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" # vLLM external router container -VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-ghcr.io/simondanielsson/vllm-router:dev-streaming-cn-cjy}" +VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-vllm/vllm-router:nightly-20260503-e8992ca}" ROUTER_CONT_NAME="router_vllm_${SANITIZED_USER}_${SLURM_JOB_ID}" export RUN_FILE_FULL="$WS_PATH/${RUN_FILE}" @@ -417,6 +417,7 @@ if [[ \"$ENGINE\" == \"vllm-disagg\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \ \"$VLLM_ROUTER_IMAGE\" \ bash -lc \"mkdir -p /run_logs/slurm_job-${SLURM_JOB_ID} && exec vllm-router \ --vllm-pd-disaggregation \ + --kv-connector moriio \ --vllm-discovery-address 0.0.0.0:${PROXY_PING_PORT} \ --port ${ROUTER_PORT} \ --host 0.0.0.0 \ @@ -483,4 +484,4 @@ if [[ "${KEEP_CONTAINERS}" != "1" ]]; then '"$DOCKER_CMD"' rm -f '"$ROUTER_CONT_NAME"' 2>/dev/null || true ' fi -fi \ No newline at end of file +fi diff --git a/benchmarks/multi_node/amd_utils/moriio_proxy.py b/benchmarks/multi_node/amd_utils/moriio_proxy.py deleted file mode 100644 index 7d1e8454b..000000000 --- a/benchmarks/multi_node/amd_utils/moriio_proxy.py +++ /dev/null @@ -1,327 +0,0 @@ -#!/usr/bin/env python3 -# MoRI-IO proxy server for vLLM PD disaggregation. -# -# Based on vLLM's examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py -# with the following adaptations for production multi-node use: -# - Ports configurable via PROXY_HTTP_PORT / PROXY_PING_PORT env vars -# - /health endpoint for sync.py barrier readiness checks -# - Uses stdlib `re` instead of `regex` to avoid extra dep -# -# The proxy performs two roles that vllm-router cannot: -# 1. ZMQ service discovery — prefill/decode workers register their RDMA ports -# 2. 
Request enrichment — injects remote endpoint info into kv_transfer_params - -import asyncio -import copy -import logging -import os -import re -import socket -import threading -import time -import uuid - -import aiohttp -import msgpack -import zmq -from quart import Quart, make_response, request - -logger = logging.getLogger("moriio_proxy") -logger.setLevel(logging.DEBUG) -handler = logging.StreamHandler() -handler.setFormatter(logging.Formatter( - "%(asctime)s %(levelname)s [%(name)s] %(message)s")) -logger.addHandler(handler) - -prefill_instances: list[dict] = [] -decode_instances: list[dict] = [] -request_nums = 0 -app = Quart(__name__) - -STREAM_IDLE_TIMEOUT = int(os.environ.get("PROXY_STREAM_IDLE_TIMEOUT", "300")) - -IP_PORT_PATTERN = re.compile(r"//(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d+)") - -TRANSFER_TYPE = None - - -def _append_whole_dict_unique(target_list, data_dict): - new_filtered = {k: v for k, v in data_dict.items() if k != "index"} - for existed in target_list: - existed_filtered = {k: v for k, v in existed.items() if k != "index"} - if existed_filtered == new_filtered: - return False - logger.info("Registered instance: role=%s addr=%s hs_port=%s notify=%s dp=%s tp=%s", - data_dict.get("role"), data_dict.get("request_address"), - data_dict.get("handshake_port"), data_dict.get("notify_port"), - data_dict.get("dp_size"), data_dict.get("tp_size")) - target_list.append(data_dict) - transfer_mode = data_dict.get("transfer_mode", "unknown") - global TRANSFER_TYPE - - if TRANSFER_TYPE is None: - TRANSFER_TYPE = transfer_mode - logger.info("Transfer mode set to: %s", TRANSFER_TYPE) - elif transfer_mode != TRANSFER_TYPE: - raise ValueError(f"mismatched transfer mode {TRANSFER_TYPE} vs {transfer_mode}") - - return True - - -_list_lock = threading.RLock() - - -def _listen_for_register(hostname, port): - context = zmq.Context() - router_socket = context.socket(zmq.ROUTER) - router_socket.bind(f"tcp://{hostname}:{port}") - poller = zmq.Poller() - 
poller.register(router_socket, zmq.POLLIN) - global prefill_instances - global decode_instances - - while True: - socks = dict(poller.poll()) - if router_socket in socks: - remote_addr, msg = router_socket.recv_multipart() - data = msgpack.loads(msg) - if data["type"] == "HELLO": - pass - elif ( - data["type"] == "register" - and data["role"] == "P" - and data["request_address"] not in prefill_instances - ): - with _list_lock: - _append_whole_dict_unique(prefill_instances, data) - - elif ( - data["type"] == "register" - and data["role"] == "D" - and data["request_address"] not in decode_instances - ): - with _list_lock: - _append_whole_dict_unique(decode_instances, data) - - -def start_service_discovery(hostname, port): - if not hostname: - hostname = socket.gethostname() - if port == 0: - raise ValueError("Port cannot be 0") - - _listener_thread = threading.Thread( - target=_listen_for_register, args=(hostname, port), daemon=True - ) - _listener_thread.start() - logger.info("Service discovery listening on %s:%s", hostname, port) - return _listener_thread - - -async def send_request_to_prefill( - endpoint, req_data, request_id, d_endpoint, dip, dport, selected_prefill_dp_rank -): - req_data_copy = req_data - - req_data_copy["kv_transfer_params"].update( - { - "do_remote_decode": True, - "do_remote_prefill": False, - "remote_handshake_port": d_endpoint["handshake_port"], - "remote_notify_port": d_endpoint["notify_port"], - "remote_engine_id": None, - "remote_block_ids": None, - "remote_host": dip, - "remote_port": dport, - } - ) - req_data_copy["stream"] = False - req_data_copy["max_tokens"] = 1 - if "max_completion_tokens" in req_data_copy: - req_data_copy["max_completion_tokens"] = 1 - if "stream_options" in req_data_copy: - del req_data_copy["stream_options"] - async with aiohttp.ClientSession( - timeout=aiohttp.ClientTimeout(total=6 * 6000 * 6000) - ) as session: - headers = { - "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", - "X-Request-Id": 
request_id, - } - if selected_prefill_dp_rank is not None: - headers["X-data-parallel-rank"] = str(selected_prefill_dp_rank) - async with session.post( - url=endpoint, json=req_data_copy, headers=headers - ) as response: - if response.status == 200: - return await response.json() - else: - raise RuntimeError( - f"Prefill response status={response.status}" - ) - - -async def start_decode_request(endpoint, req_data, request_id): - session = aiohttp.ClientSession( - timeout=aiohttp.ClientTimeout(total=6 * 6000 * 6000) - ) - headers = { - "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", - "X-Request-Id": request_id, - } - response = await session.post(url=endpoint, json=req_data, headers=headers) - return session, response - - -async def stream_decode_response(session, response, request_id): - try: - if response.status == 200: - chunk_iter = response.content.iter_chunked(1024).__aiter__() - while True: - try: - chunk_bytes = await asyncio.wait_for( - chunk_iter.__anext__(), timeout=STREAM_IDLE_TIMEOUT, - ) - yield chunk_bytes - except StopAsyncIteration: - break - except asyncio.TimeoutError: - logger.error( - "Decode stream %s idle for %ds, aborting", - request_id, STREAM_IDLE_TIMEOUT, - ) - break - else: - raise RuntimeError( - f"Decode response status={response.status}" - ) - finally: - await response.release() - await session.close() - - -@app.route("/health", methods=["GET"]) -async def health_check(): - with _list_lock: - p_count = len(prefill_instances) - d_count = len(decode_instances) - return await make_response( - ({"status": "ok", "prefill_instances": p_count, "decode_instances": d_count}, 200) - ) - - -@app.route("/v1/completions", methods=["POST"]) -@app.route("/v1/chat/completions", methods=["POST"]) -async def handle_request(): - try: - with _list_lock: - global request_nums - request_nums += 1 - - def extract_ip_port_fast(url): - match = IP_PORT_PATTERN.search(url) - if not match: - raise ValueError(f"Invalid URL format: {url}") - return 
match.groups() - - req_data = await request.get_json() - request_id = str(uuid.uuid4()) - - if not prefill_instances or not decode_instances: - return await make_response( - ("Service Unavailable: No prefill or decode instances registered.", 503) - ) - - pid = request_nums % len(prefill_instances) - did = request_nums % len(decode_instances) - prefill_instance_endpoint = prefill_instances[pid] - decode_instance_endpoint = decode_instances[did] - - selected_prefill_dp_rank = None - if prefill_instance_endpoint["dp_size"] > 1: - selected_prefill_dp_rank = request_nums % prefill_instance_endpoint["dp_size"] - - dip, dport = extract_ip_port_fast(decode_instance_endpoint["request_address"]) - - req_data_to_prefill = copy.deepcopy(req_data) - req_data_to_prefill["kv_transfer_params"] = {"transfer_id": request_id} - req_data["kv_transfer_params"] = {"transfer_id": request_id} - req_data_to_prefill["kv_transfer_params"]["remote_dp_size"] = ( - decode_instance_endpoint["dp_size"] - ) - req_data_to_prefill["kv_transfer_params"]["remote_tp_size"] = ( - decode_instance_endpoint["tp_size"] - ) - - send_prefill_task = asyncio.create_task( - send_request_to_prefill( - prefill_instance_endpoint["request_address"], - req_data_to_prefill, - request_id, - decode_instance_endpoint, - dip, - dport, - selected_prefill_dp_rank, - ) - ) - ip, port = extract_ip_port_fast(prefill_instance_endpoint["request_address"]) - - req_data["max_tokens"] -= 1 - - req_data["kv_transfer_params"] = { - "transfer_id": request_id, - "do_remote_decode": False, - "do_remote_prefill": True, - "remote_handshake_port": prefill_instance_endpoint["handshake_port"], - "remote_notify_port": prefill_instance_endpoint["notify_port"], - "remote_engine_id": None, - "remote_block_ids": None, - "remote_host": ip, - "remote_port": port, - } - if TRANSFER_TYPE == "READ": - prefill_response = await send_prefill_task - req_data["kv_transfer_params"]["remote_engine_id"] = prefill_response[ - "kv_transfer_params" - 
]["remote_engine_id"] - req_data["kv_transfer_params"]["remote_block_ids"] = prefill_response[ - "kv_transfer_params" - ]["remote_block_ids"] - - req_data["kv_transfer_params"]["remote_dp_size"] = prefill_instance_endpoint[ - "dp_size" - ] - req_data["kv_transfer_params"]["remote_tp_size"] = prefill_instance_endpoint[ - "tp_size" - ] - - if selected_prefill_dp_rank is not None: - req_data["kv_transfer_params"]["remote_dp_rank"] = selected_prefill_dp_rank - - decode_request_task = asyncio.create_task( - start_decode_request( - decode_instance_endpoint["request_address"], req_data, request_id - ) - ) - - session, decode_response = await decode_request_task - stream_generator = stream_decode_response(session, decode_response, request_id) - response = await make_response(stream_generator) - return response - except Exception as e: - logger.exception("Error handling request: %s", e) - return await make_response((f"Internal Server Error: {e!s}", 500)) - - -if __name__ == "__main__": - http_port = int(os.environ.get("PROXY_HTTP_PORT", "30000")) - ping_port = int(os.environ.get("PROXY_PING_PORT", "36367")) - - t = start_service_discovery("0.0.0.0", ping_port) - app.debug = False - app.config["BODY_TIMEOUT"] = 360000 - app.config["RESPONSE_TIMEOUT"] = 360000 - - logger.info("MoRI-IO proxy starting: HTTP=%d, ZMQ=%d", http_port, ping_port) - app.run(host="0.0.0.0", port=http_port) - t.join() diff --git a/benchmarks/multi_node/amd_utils/patches/minimax_m2.py b/benchmarks/multi_node/amd_utils/patches/minimax_m2.py index 8290276fb..ac830eb1f 100644 --- a/benchmarks/multi_node/amd_utils/patches/minimax_m2.py +++ b/benchmarks/multi_node/amd_utils/patches/minimax_m2.py @@ -137,7 +137,6 @@ def __init__( top_k=config.num_experts_per_tok, hidden_size=config.hidden_size, intermediate_size=config.intermediate_size, - reduce_results=False, renormalize=True, scoring_func=getattr(config, "scoring_func", "softmax"), e_score_correction_bias=self.e_score_correction_bias, @@ -185,7 +184,8 @@ 
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: ) final_hidden_states = final_hidden_states[:num_tokens] elif self.tp_size > 1: - final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( + from vllm.distributed.communication_op import tensor_model_parallel_all_reduce + final_hidden_states = tensor_model_parallel_all_reduce( final_hidden_states ) diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh index 73cad3adc..9acb05f54 100755 --- a/benchmarks/multi_node/amd_utils/server_vllm.sh +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -242,7 +242,7 @@ done echo "Prefill node IPs: ${PREFILL_ARGS}" echo "Decode node IPs: ${DECODE_ARGS}" -# MoRI-IO proxy ZMQ registration port (must match moriio_proxy.py PROXY_PING_PORT) +# MoRI-IO proxy ZMQ registration port (must match vllm-router --vllm-discovery-address) PROXY_PING_PORT="${PROXY_PING_PORT:-36367}" # vLLM environment (UCX transport vars are set at the Docker level in job.slurm) @@ -281,26 +281,8 @@ if [ "$NODE_RANK" -eq 0 ]; then setup_vllm_env - # Start MoRI-IO proxy FIRST — workers register via ZMQ on startup - # Skipped when ROUTER_TYPE=vllm-router (external router container started by job.slurm) - if [[ "${ROUTER_TYPE:-vllm-router}" == "moriio" ]]; then - echo "Starting MoRI-IO proxy (HTTP=$ROUTER_PORT, ZMQ=$PROXY_PING_PORT)..." - PROXY_CMD="PROXY_HTTP_PORT=$ROUTER_PORT PROXY_PING_PORT=$PROXY_PING_PORT \ - python3 $WS_PATH/moriio_proxy.py" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $PROXY_CMD" - else - PROXY_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/moriio_proxy_${host_name}.log" - set -x - eval "$PROXY_CMD" > "$PROXY_LOG_FILE" 2>&1 & - set +x - proxy_pid=$! 
- sleep 3 - fi - else - echo "Using external vLLM router (ROUTER_TYPE=${ROUTER_TYPE:-vllm-router})" - fi + # Router is started as an external container by job.slurm (VLLM_ROUTER_IMAGE) + echo "Using external vllm-router container (started by job.slurm on this node)" PREFILL_CMD="vllm serve ${MODEL_PATH} \ --port $SERVER_PORT \ @@ -343,7 +325,7 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "DRY RUN: $HEALTH_BARRIER_CMD" else eval "$HEALTH_BARRIER_CMD" - echo "${ROUTER_TYPE} is ready for benchmarking" + echo "MoRI-IO proxy is ready for benchmarking" fi echo "Ready for benchmarking on ${host_name}:${host_ip}" @@ -375,14 +357,8 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Killing the prefill server" if [[ "$DRY_RUN" -eq 0 ]]; then - if [[ "${ROUTER_TYPE:-vllm-router}" == "moriio" ]]; then - [[ -n "${proxy_pid:-}" ]] && kill $proxy_pid 2>/dev/null || true - fi [[ -n "${prefill_pid:-}" ]] && kill $prefill_pid 2>/dev/null || true sleep 2 - if [[ "${ROUTER_TYPE:-vllm-router}" == "moriio" ]]; then - pkill -f moriio_proxy 2>/dev/null || true - fi pkill -f "vllm serve" 2>/dev/null || true fi diff --git a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh index 589399f74..958cb9808 100644 --- a/benchmarks/multi_node/amd_utils/setup_deps.sh +++ b/benchmarks/multi_node/amd_utils/setup_deps.sh @@ -242,43 +242,48 @@ patch_mori_fp8_compat() { import re, os, sys patched = [] -# 1. 
Patch layer.py: remove multi-line AITER assertion for MoRI +# Patch layer.py: remove AITER requirement assertion(s) for MoRI try: import vllm.model_executor.layers.fused_moe.layer as lm f = lm.__file__ src = open(f).read() - if "Mori needs to be used with aiter" in src: + if "[PATCHED] AITER requirement removed for MoRI-EP + FP8" in src: + print("[SETUP] layer.py MoRI-FP8 patch already applied") + elif "Mori needs to be used with aiter" in src: + # v0.19+: two consecutive assertions inside `if self.moe_config.use_mori_kernels:` new = re.sub( - r"assert self\.rocm_aiter_fmoe_enabled,\s*\([^)]*Mori needs[^)]*\)", + r"assert self\.rocm_aiter_fmoe_enabled,\s*\([^)]*Mori needs[^)]*\)\s*" + r"assert not self\.aiter_fmoe_shared_expert_enabled,\s*\([^)]*\)", "pass # [PATCHED] AITER requirement removed for MoRI-EP + FP8", src, flags=re.DOTALL) + if new == src: + # v0.17.1/v0.18.0: only the first assertion existed + new = re.sub( + r"assert self\.rocm_aiter_fmoe_enabled,\s*\([^)]*Mori needs[^)]*\)", + "pass # [PATCHED] AITER requirement removed for MoRI-EP + FP8", + src, flags=re.DOTALL) if new != src: open(f, "w").write(new) patched.append("layer.py") + else: + print("[SETUP] ERROR: layer.py pattern found but regex had no effect", file=sys.stderr) + sys.exit(1) + else: + print("[SETUP] ERROR: layer.py AITER assertion pattern not found — vLLM API may have changed", file=sys.stderr) + sys.exit(1) except Exception as e: - print(f"[SETUP] WARN patch layer.py: {e}", file=sys.stderr) + print(f"[SETUP] ERROR patch layer.py: {e}", file=sys.stderr) + sys.exit(1) -# 2. 
Patch mori_prepare_finalize.py: remove defer_input_quant restriction -try: - import vllm.model_executor.layers.fused_moe.mori_prepare_finalize as mm - f = mm.__file__ - src = open(f).read() - if "defer_input_quant" in src: - new = re.sub( - r"raise NotImplementedError\([^)]*defer_input_quant[^)]*\)", - "pass # [PATCHED] defer_input_quant check removed for MoRI-EP + FP8", - src) - if new != src: - open(f, "w").write(new) - patched.append("mori_prepare_finalize.py") -except Exception as e: - print(f"[SETUP] WARN patch mori_pf: {e}", file=sys.stderr) +# prepare_finalize/mori.py (v0.19+) already handles defer_input_quant correctly +# (skips FP8 quant when True). No patch needed for that file. +# Added in 0.18.1: https://github.com/vllm-project/vllm/commit/6a9cceb219fcbd6b1eb540ddfdc77ec160f0e209 if patched: print(f"[SETUP] Patched: {chr(44).join(patched)}") else: print("[SETUP] No MoRI-FP8 patches needed") -' +' || exit 1 _SETUP_INSTALLED+=("MoRI-FP8-patch") } @@ -881,7 +886,6 @@ except Exception as e: # install_libionic # install_mori install_amd_quark -install_mori_proxy_deps patch_mori_fp8_compat patch_moriio_save_kv_timeout patch_moriio_transfer_timeout From 49401536acbf6476b6a615ea8048137fa365fe04 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 12 May 2026 08:33:11 +0000 Subject: [PATCH 06/98] update vllm image and vllm router image --- .github/configs/amd-master.yaml | 2 +- benchmarks/multi_node/amd_utils/job.slurm | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index b8f7b679e..1244f51b6 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1146,7 +1146,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" kimik2.5-fp4-mi355x-vllm-disagg: - image: vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c + image: aigmkt/vllm-dev:ainic2 model: amd/Kimi-K2.5-MXFP4 model-prefix: kimik2.5 runner: mi355x-disagg diff --git 
a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 70f501df6..47eed2149 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -297,7 +297,7 @@ SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_') export DOCKER_CONT_NAME="container_${ENGINE}_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" # vLLM external router container -VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-vllm/vllm-router:nightly-20260503-e8992ca}" +VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-vllm/vllm-router:nightly-20260511-e667ebb}" ROUTER_CONT_NAME="router_vllm_${SANITIZED_USER}_${SLURM_JOB_ID}" export RUN_FILE_FULL="$WS_PATH/${RUN_FILE}" From d1004548aef3b2429eb0754e76691e52e3bdec90 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 12 May 2026 10:12:22 +0000 Subject: [PATCH 07/98] update the interface prefix for tw cluster Signed-off-by: Theresa Shan --- benchmarks/multi_node/amd_utils/env.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index cd4794ed5..ffdc9682e 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -54,9 +54,9 @@ if [[ "$ENGINE" == "vllm-disagg" ]]; then # ========================================================================= set -x - # UCX_NET_DEVICES: Use the first benic interface for UCX TCP transport + # UCX_NET_DEVICES: Use the first tw-eth interface for UCX TCP transport if [[ -z "$UCX_NET_DEVICES" ]]; then - UCX_NET_DEV=$(ip -o link show 2>/dev/null | awk -F': ' '/benic1p1/{print $2}' | head -1) + UCX_NET_DEV=$(ip -o link show 2>/dev/null | awk -F': ' '/tw-eth/{print $2}' | head -1) if [[ -n "$UCX_NET_DEV" ]]; then export UCX_NET_DEVICES="$UCX_NET_DEV" else From 2fa7ee3405cf88de2f1f0cfc9abea84c5b276675 Mon Sep 17 00:00:00 2001 From: Shan Theresa Date: Wed, 13 May 2026 06:33:57 +0000 Subject: [PATCH 08/98] add deps for ib 
device auto-detection Signed-off-by: Shan Theresa --- benchmarks/multi_node/amd_utils/env.sh | 4 ++ benchmarks/multi_node/amd_utils/setup_deps.sh | 31 ++++++------ benchmarks/multi_node/amd_utils/submit.sh | 49 +++++++++++++++++++ 3 files changed, 68 insertions(+), 16 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index ffdc9682e..e01365503 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -56,7 +56,11 @@ if [[ "$ENGINE" == "vllm-disagg" ]]; then # UCX_NET_DEVICES: Use the first tw-eth interface for UCX TCP transport if [[ -z "$UCX_NET_DEVICES" ]]; then +<<<<<<< Updated upstream UCX_NET_DEV=$(ip -o link show 2>/dev/null | awk -F': ' '/tw-eth/{print $2}' | head -1) +======= + UCX_NET_DEV=$(ip -o link show 2>/dev/null | awk -F': ' '/tw-eth0/{print $2}' | head -1) +>>>>>>> Stashed changes if [[ -n "$UCX_NET_DEV" ]]; then export UCX_NET_DEVICES="$UCX_NET_DEV" else diff --git a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh index 958cb9808..860cecf96 100644 --- a/benchmarks/multi_node/amd_utils/setup_deps.sh +++ b/benchmarks/multi_node/amd_utils/setup_deps.sh @@ -144,28 +144,26 @@ install_libionic() { } # --------------------------------------------------------------------------- -# 5. MoRI-IO proxy deps (Python packages for the MoRI-IO-aware proxy server) -# The proxy replaces vllm-router: it handles both HTTP routing AND the -# MoRI-IO ZMQ registration/request-enrichment protocol. -# Only needed on NODE_RANK=0 (proxy node). +# 5. Container RDMA/net tools +# - ibv_devinfo comes from ibverbs-utils +# - iproute2 provides the `ip` command +# Used for in-container NIC/RDMA validation and routing checks. 
# --------------------------------------------------------------------------- -install_mori_proxy_deps() { - if python3 -c "import quart, aiohttp, msgpack, zmq" 2>/dev/null; then - echo "[SETUP] MoRI-IO proxy Python deps already present" +install_recipe_deps() { + if command -v ibv_devinfo >/dev/null 2>&1 && command -v ip >/dev/null 2>&1; then + echo "[SETUP] Container RDMA/net tools already present" return 0 fi - echo "[SETUP] Installing MoRI-IO proxy Python deps..." - # v0.18.0 ships aiohttp, pyzmq, blinker(distutils); only quart and msgpack - # are missing. --ignore-installed blinker avoids pip's distutils uninstall - # error when quart pulls a newer blinker version. - pip install --quiet --ignore-installed blinker - pip install --quiet quart msgpack + echo "[SETUP] Installing ibv_devinfo + iproute2 in container..." + apt-get update -q -y && apt-get install -q -y \ + ibverbs-utils iproute2 \ + && rm -rf /var/lib/apt/lists/* - if ! python3 -c "import quart, aiohttp, msgpack, zmq" 2>/dev/null; then - echo "[SETUP] ERROR: MoRI-IO proxy deps install failed"; exit 1 + if ! command -v ibv_devinfo >/dev/null 2>&1 || ! 
command -v ip >/dev/null 2>&1; then + echo "[SETUP] ERROR: Failed to install ibv_devinfo/iproute2"; exit 1 fi - _SETUP_INSTALLED+=("mori-proxy-deps") + _SETUP_INSTALLED+=("ibverbs-utils+iproute2") } # --------------------------------------------------------------------------- @@ -885,6 +883,7 @@ except Exception as e: # install_etcd # install_libionic # install_mori +install_recipe_deps install_amd_quark patch_mori_fp8_compat patch_moriio_save_kv_timeout diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh index f6670b5ee..524b00c65 100755 --- a/benchmarks/multi_node/amd_utils/submit.sh +++ b/benchmarks/multi_node/amd_utils/submit.sh @@ -161,6 +161,55 @@ if [[ -n "${SLURM_EXCLUDE_NODES:-}" ]]; then EXCLUDE_OPT=(--exclude "$SLURM_EXCLUDE_NODES") fi +# ============================================================================= +# Reuse existing allocation (skip sbatch) +# ============================================================================= +# When SLURM_REUSE_JOBID is set, run job.slurm directly in the current shell, +# attaching to the existing allocation. Inner `srun` calls pick up the +# allocation via SLURM_JOB_ID; SLURM_OVERLAP=1 lets them share task slots with +# the interactive shell already holding the allocation. +if [[ -n "${SLURM_REUSE_JOBID:-}" ]]; then + REUSE_JID="$SLURM_REUSE_JOBID" + echo "Reusing existing Slurm allocation ${REUSE_JID} (skipping sbatch)" >&2 + + # Resolve allocation's nodelist if not already provided. 
+ ALLOC_NODELIST="${SLURM_JOB_NODELIST:-$(squeue -h -j "$REUSE_JID" -o '%N' 2>/dev/null)}" + if [[ -z "$ALLOC_NODELIST" ]]; then + echo "Error: could not resolve nodelist for job ${REUSE_JID}" >&2 + exit 1 + fi + ALLOC_NNODES=$(scontrol show hostnames "$ALLOC_NODELIST" | wc -l) + if [[ "$ALLOC_NNODES" -lt "$NUM_NODES" ]]; then + echo "Error: allocation ${REUSE_JID} has ${ALLOC_NNODES} nodes, need ${NUM_NODES}" >&2 + exit 1 + fi + + export SLURM_JOB_ID="$REUSE_JID" + export SLURM_JOBID="$REUSE_JID" + export SLURM_JOB_NODELIST="$ALLOC_NODELIST" + export SLURM_NODELIST="$ALLOC_NODELIST" + export SLURM_NNODES="$ALLOC_NNODES" + export SLURM_JOB_NUM_NODES="$ALLOC_NNODES" + export SLURM_NTASKS="$ALLOC_NNODES" + export SLURM_NPROCS="$ALLOC_NNODES" + export SLURM_NTASKS_PER_NODE=1 + export SLURM_TASKS_PER_NODE="1(x${ALLOC_NNODES})" + export SLURM_OVERLAP=1 + export SLURM_SUBMIT_DIR="$(pwd)" + + STDOUT_LOG="${BENCHMARK_LOGS_DIR}/slurm_job-${REUSE_JID}.out" + STDERR_LOG="${BENCHMARK_LOGS_DIR}/slurm_job-${REUSE_JID}.err" + rm -f "$STDOUT_LOG" "$STDERR_LOG" + + nohup bash "$(dirname "$0")/job.slurm" >"$STDOUT_LOG" 2>"$STDERR_LOG" & + INLINE_PID=$! 
+ echo "$INLINE_PID" > "${BENCHMARK_LOGS_DIR}/slurm_job-${REUSE_JID}.pid" + echo "Started job.slurm (pid=${INLINE_PID}); logs: ${STDOUT_LOG}" >&2 + + echo "$REUSE_JID" + exit 0 +fi + # Construct the sbatch command sbatch_cmd=( sbatch From 9115482b18318b346826e009a05d22eeceef8c23 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Wed, 13 May 2026 10:42:03 +0000 Subject: [PATCH 09/98] update vllm image Signed-off-by: Theresa Shan --- .github/configs/amd-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 1244f51b6..06d292f9d 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1146,7 +1146,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" kimik2.5-fp4-mi355x-vllm-disagg: - image: aigmkt/vllm-dev:ainic2 + image: ghcr.io/simondanielsson/vllm-dev:ainic-test-hydra model: amd/Kimi-K2.5-MXFP4 model-prefix: kimik2.5 runner: mi355x-disagg From 784a5a048f14f01ba66b6bc7f02a50086bd34c5a Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Wed, 13 May 2026 13:57:43 +0000 Subject: [PATCH 10/98] fix indentation and add missing finally block in async_request_openai_chat_completions Co-Authored-By: Claude Opus 4 --- utils/bench_serving/backend_request_func.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/utils/bench_serving/backend_request_func.py b/utils/bench_serving/backend_request_func.py index 7f4a93284..e8577016a 100644 --- a/utils/bench_serving/backend_request_func.py +++ b/utils/bench_serving/backend_request_func.py @@ -421,10 +421,13 @@ async def async_request_openai_chat_completions( output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) + finally: + if _own_session: + await session.close() - if pbar: - pbar.update(1) - return output + if pbar: + pbar.update(1) + return output def get_model(pretrained_model_name_or_path: str) -> str: From 
d4e1daf0b31fa172125a9816ef55926a019327b4 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Wed, 13 May 2026 13:59:32 +0000 Subject: [PATCH 11/98] fix tw-eth interface detection pattern in env.sh Co-Authored-By: Claude Opus 4 --- benchmarks/multi_node/amd_utils/env.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index e01365503..ffdc9682e 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -56,11 +56,7 @@ if [[ "$ENGINE" == "vllm-disagg" ]]; then # UCX_NET_DEVICES: Use the first tw-eth interface for UCX TCP transport if [[ -z "$UCX_NET_DEVICES" ]]; then -<<<<<<< Updated upstream UCX_NET_DEV=$(ip -o link show 2>/dev/null | awk -F': ' '/tw-eth/{print $2}' | head -1) -======= - UCX_NET_DEV=$(ip -o link show 2>/dev/null | awk -F': ' '/tw-eth0/{print $2}' | head -1) ->>>>>>> Stashed changes if [[ -n "$UCX_NET_DEV" ]]; then export UCX_NET_DEVICES="$UCX_NET_DEV" else From e2d3a28d9cef670edb55524a8bed3830246ca1b3 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Wed, 13 May 2026 14:09:40 +0000 Subject: [PATCH 12/98] fix vllm-disagg config schema: use scenarios.fixed-seq-len Co-Authored-By: Claude Opus 4 --- .github/configs/amd-master.yaml | 178 ++++++++++++++++---------------- 1 file changed, 90 insertions(+), 88 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 06d292f9d..08ec35c0a 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1154,49 +1154,50 @@ kimik2.5-fp4-mi355x-vllm-disagg: framework: vllm-disagg multinode: true disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total - - spec-decoding: "none" - conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - 
"VLLM_MORIIO_CONNECTOR_READ_MODE=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - - isl: 8192 - osl: 1024 - search-space: - - spec-decoding: "none" - conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" minimaxm2.5-fp8-mi355x-vllm-disagg: image: vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c @@ -1207,51 +1208,52 @@ minimaxm2.5-fp8-mi355x-vllm-disagg: framework: vllm-disagg multinode: true disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total - # Prefill also needs EP=8: MiniMax M2.5 expert intermediate_size=1536, - # TP8 shards to 192 which is not divisible by FP8 block_n=128. 
- - spec-decoding: "none" - conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - - isl: 8192 - osl: 1024 - search-space: - - spec-decoding: "none" - conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total + # Prefill also needs EP=8: MiniMax M2.5 expert intermediate_size=1536, + # TP8 shards to 192 which is not divisible by FP8 block_n=128. + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" dsr1-fp4-mi355x-sglang-disagg: image: lmsysorg/sglang-rocm:v0.5.10.post1-rocm720-mi35x-20260501 From 83e75540de0200699c0f5f5306b98d287209c671 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Wed, 13 May 2026 15:10:04 +0000 Subject: [PATCH 13/98] fix vllm-disagg routing to multi_node benchmark subdir Co-Authored-By: Claude Opus 4 --- 
runners/launch_mi355x-amds.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index ec0881bdd..794f6b3cd 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -56,7 +56,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then trap 'sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true' EXIT SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_mi355x_${FRAMEWORK}.sh" - if [[ "$FRAMEWORK" == "sglang-disagg" ]]; then + if [[ "$FRAMEWORK" == "sglang-disagg" ]] || [[ "$FRAMEWORK" == "vllm-disagg" ]]; then BENCHMARK_SUBDIR="multi_node" else BENCHMARK_SUBDIR="single_node" From 3daec02e0f0cdc33175e3cf2e2b7da7fcd3dcc4b Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Wed, 13 May 2026 15:51:26 +0000 Subject: [PATCH 14/98] fix result collection to use FRAMEWORK as log directory prefix The inline collect_latest_results.py hardcoded "sglang" as the log directory prefix, causing "No logs directory found" for vllm-disagg runs where bench.sh creates directories named vllm-disagg_isl_X_osl_Y. 
Co-Authored-By: Claude Opus 4 --- runners/launch_mi355x-amds.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index 794f6b3cd..49300abae 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -108,12 +108,12 @@ if [[ "$IS_MULTINODE" == "true" ]]; then if [[ "${EVAL_ONLY:-false}" != "true" ]]; then cat > collect_latest_results.py <<'PY' import os, sys -sgl_job_dir, isl, osl, nexp = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]) -for path in sorted([f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}" for name in os.listdir(f"{sgl_job_dir}/logs/") if os.path.isdir(f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]: +sgl_job_dir, isl, osl, nexp, framework = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]), sys.argv[5] +for path in sorted([f"{sgl_job_dir}/logs/{name}/{framework}_isl_{isl}_osl_{osl}" for name in os.listdir(f"{sgl_job_dir}/logs/") if os.path.isdir(f"{sgl_job_dir}/logs/{name}/{framework}_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]: print(path) PY - LOGS_DIR=$(python3 collect_latest_results.py "$BENCHMARK_LOGS_DIR" "$ISL" "$OSL" 1) + LOGS_DIR=$(python3 collect_latest_results.py "$BENCHMARK_LOGS_DIR" "$ISL" "$OSL" 1 "$FRAMEWORK") if [ -z "$LOGS_DIR" ]; then echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" exit 1 From 51c92a7c24e1668b1336b0222c0da97455565459 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Thu, 14 May 2026 02:23:11 +0000 Subject: [PATCH 15/98] suppress tokenizer warnings and debug output in bench.sh Co-Authored-By: Claude Opus 4 --- benchmarks/multi_node/amd_utils/bench.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh index aecc29e83..33cc918bf 100755 --- a/benchmarks/multi_node/amd_utils/bench.sh +++ 
b/benchmarks/multi_node/amd_utils/bench.sh @@ -37,6 +37,9 @@ IFS='x' read -r -a chosen_concurrencies <<< "$concurrency_list" ROUTER_PORT="${ROUTER_PORT:-30000}" +export TRANSFORMERS_VERBOSITY=error +export TOKENIZERS_PARALLELISM=false + echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}" profile_folder="${log_path}/${ENGINE}_isl_${chosen_isl}_osl_${chosen_osl}" From 3569b0a228e02535ed1c13eb79e6cb7f083b2228 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Thu, 14 May 2026 02:44:58 +0000 Subject: [PATCH 16/98] fix vllm-disagg deadlock: stop router after rank 0 container exits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The vllm-router runs as a separate container on node 0. After node 0's main container finishes the benchmark and exits, decode nodes remain stuck waiting for the router port to close. The router cleanup in job.slurm can't run until srun completes, but srun can't complete because decode nodes are blocked — deadlock. Fix: skip exec on rank 0 for vllm-disagg so the srun bash script continues after docker exits and can stop the router container, allowing decode nodes to detect the port closure and exit. Co-Authored-By: Claude Opus 4 --- benchmarks/multi_node/amd_utils/job.slurm | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 47eed2149..20ecb6683 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -427,7 +427,16 @@ if [[ \"$ENGINE\" == \"vllm-disagg\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \ --log-level info 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/vllm_router_\$(hostname).log \" fi -exec \$DOCKER_CMD run \ +# Skip exec on vllm-disagg rank 0 so we can stop the router after the main +# container exits. 
Without this, decode nodes block forever waiting for the +# router port to close (the router is a separate container). +MAYBE_EXEC=exec +if [[ \"$ENGINE\" == \"vllm-disagg\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \"\$SLURM_PROCID\" == \"0\" ]]; then + MAYBE_EXEC= + set +e +fi + +\$MAYBE_EXEC \$DOCKER_CMD run \ --init \ --stop-timeout 10 \ --device /dev/dri \ @@ -468,11 +477,11 @@ exec \$DOCKER_CMD run \ '"$RUN_FILE_FULL"' 2>&1 | tee /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"'/server_\$(hostname).log ' +# Only reached when exec was skipped (vllm-disagg rank 0) DOCKER_EXIT_CODE=\$? -if [[ \$DOCKER_EXIT_CODE -ne 0 ]]; then - echo \"ERROR: docker exited rc=\$DOCKER_EXIT_CODE on \$(hostname)\" - exit \$DOCKER_EXIT_CODE -fi +echo \"[rank 0] Main container exited (rc=\$DOCKER_EXIT_CODE). Stopping vllm-router...\" +\$DOCKER_CMD rm -f \"$ROUTER_CONT_NAME\" 2>/dev/null || true +exit \$DOCKER_EXIT_CODE " if [[ "${KEEP_CONTAINERS}" != "1" ]]; then From 73d649a60859db2f0a81ecf258575758037de201 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Thu, 14 May 2026 02:57:46 +0000 Subject: [PATCH 17/98] reduce vllm-disagg concurrency sweep to single point for faster iteration Co-Authored-By: Claude Opus 4 --- .github/configs/amd-master.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 08ec35c0a..ba5ec1f2a 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1159,9 +1159,9 @@ kimik2.5-fp4-mi355x-vllm-disagg: - isl: 1024 osl: 1024 search-space: - # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total + # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total , 16, 32, 64, 128, 256, 512 - spec-decoding: "none" - conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + conc-list: [ 8 ] prefill: num-worker: 1 tp: 8 @@ -1182,7 +1182,7 @@ kimik2.5-fp4-mi355x-vllm-disagg: osl: 1024 search-space: - spec-decoding: "none" - 
conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + conc-list: [ 8 ] prefill: num-worker: 1 tp: 8 From 50864b45bce9963e8c126c0ecdf8d5295d646d28 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Thu, 14 May 2026 03:30:18 +0000 Subject: [PATCH 18/98] preserve slurm logs on failure and print stderr inline The EXIT trap deleted benchmark_logs/ before saving artifacts, making it impossible to debug container startup failures. Now the trap always copies slurm .out/.err to the artifact directory and prints the last 100 lines of .err inline in the CI output. Co-Authored-By: Claude Opus 4 --- runners/launch_mi355x-amds.sh | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index 49300abae..1b4b24ce7 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -52,8 +52,24 @@ if [[ "$IS_MULTINODE" == "true" ]]; then sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true # Ensure root-owned files are cleaned up even on early exit to prevent - # EACCES errors when the next GH Actions job checks out on this runner - trap 'sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true' EXIT + # EACCES errors when the next GH Actions job checks out on this runner. + # Always preserve slurm logs as CI artifacts for debugging. 
+ cleanup_and_save_logs() { + if [[ -n "${GITHUB_ACTIONS:-}" && -n "${JOB_ID:-}" ]]; then + local art_dir="$GITHUB_WORKSPACE/benchmark_artifacts" + mkdir -p "$art_dir" + cp -r "$BENCHMARK_LOGS_DIR"/slurm_job-${JOB_ID}.{out,err} "$art_dir/" 2>/dev/null || true + fi + # Print .err inline so failures are visible in CI output + local err_file="$BENCHMARK_LOGS_DIR/slurm_job-${JOB_ID:-unknown}.err" + if [[ -s "$err_file" ]]; then + echo "=== Slurm job stderr ===" + tail -100 "$err_file" + echo "========================" + fi + sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true + } + trap cleanup_and_save_logs EXIT SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_mi355x_${FRAMEWORK}.sh" if [[ "$FRAMEWORK" == "sglang-disagg" ]] || [[ "$FRAMEWORK" == "vllm-disagg" ]]; then @@ -162,16 +178,7 @@ PY sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true - # Upload logs as artifact if running in GitHub Actions - if [[ -n "${GITHUB_ACTIONS:-}" ]]; then - ARTIFACT_DIR="$GITHUB_WORKSPACE/benchmark_artifacts" - mkdir -p "$ARTIFACT_DIR" - cp -r "$BENCHMARK_LOGS_DIR"/slurm_job-${JOB_ID}.{out,err} "$ARTIFACT_DIR/" 2>/dev/null || true - echo "Logs copied to $ARTIFACT_DIR for artifact upload" - fi - - # Clean up root-owned files to prevent EACCES on GH Actions checkout cleanup - sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true + # Log preservation and cleanup handled by EXIT trap (cleanup_and_save_logs) else From 0454199b31fefef17b35de31130329b6f6514172 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Thu, 14 May 2026 09:16:42 +0000 Subject: [PATCH 19/98] enable set -x around docker privilege detection for CI debugging Co-Authored-By: Claude Opus 4 --- benchmarks/multi_node/amd_utils/job.slurm | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 20ecb6683..8d904044a 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -86,12 +86,14 @@ 
PROXY_PING_PORT="${PROXY_PING_PORT:-36367}" # Docker privilege detection # ============================================================================= # Detect on the batch host. Per-node detection happens inside srun below. +set -x if docker ps &>/dev/null; then DOCKER_CMD="docker" else DOCKER_CMD="sudo docker" fi export DOCKER_CMD +set +x # ============================================================================= # Model Path Resolution From 672e6932abdbdc48b334c2016df2119450dbc501 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Thu, 14 May 2026 10:16:43 +0000 Subject: [PATCH 20/98] fix docker detection: test on compute node, not batch host The batch host has docker socket permissions but the compute nodes do not, causing "permission denied" on all srun tasks. Move the detection after SELECTED_NODES is known and probe via srun. Co-Authored-By: Claude Opus 4 --- benchmarks/multi_node/amd_utils/job.slurm | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 8d904044a..1da4b4890 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -82,19 +82,6 @@ ROUTER_TYPE="${ROUTER_TYPE:-vllm-router}" ROUTER_PORT="${ROUTER_PORT:-30000}" PROXY_PING_PORT="${PROXY_PING_PORT:-36367}" -# ============================================================================= -# Docker privilege detection -# ============================================================================= -# Detect on the batch host. Per-node detection happens inside srun below. 
-set -x -if docker ps &>/dev/null; then - DOCKER_CMD="docker" -else - DOCKER_CMD="sudo docker" -fi -export DOCKER_CMD -set +x - # ============================================================================= # Model Path Resolution # ============================================================================= @@ -212,6 +199,16 @@ FULL_NODELIST=$(scontrol show hostnames "$SLURM_JOB_NODELIST") SELECTED_NODES=$(echo "$FULL_NODELIST" | head -n $NUM_NODES) SELECTED_NODELIST_STR=$(echo "$SELECTED_NODES" | tr '\n' ',' | sed 's/,$//') +# Docker privilege detection — test on a compute node, not the batch host. +FIRST_NODE=$(echo "$SELECTED_NODES" | head -1) +if srun --nodelist="$FIRST_NODE" -N1 -n1 --overlap bash -c 'docker ps &>/dev/null'; then + DOCKER_CMD="docker" +else + DOCKER_CMD="sudo docker" +fi +export DOCKER_CMD +echo "[docker-detect] DOCKER_CMD=$DOCKER_CMD (tested on $FIRST_NODE)" + # Update SLURM environment variables export SLURM_NNODES=$NUM_NODES export SLURM_NTASKS=$NUM_NODES From fb2d77113f6c9ec2a81cd20ad6ea93b97070d876 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Thu, 14 May 2026 10:50:01 +0000 Subject: [PATCH 21/98] fix docker detection: per-node probe since group membership varies Export DOCKER_CMD_DETECT as a shell snippet that each srun participant evaluates locally, instead of testing a single node and assuming all nodes have the same docker socket permissions. 
Co-Authored-By: Claude Opus 4 --- benchmarks/multi_node/amd_utils/job.slurm | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 1da4b4890..22b1ebcb3 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -199,15 +199,9 @@ FULL_NODELIST=$(scontrol show hostnames "$SLURM_JOB_NODELIST") SELECTED_NODES=$(echo "$FULL_NODELIST" | head -n $NUM_NODES) SELECTED_NODELIST_STR=$(echo "$SELECTED_NODES" | tr '\n' ',' | sed 's/,$//') -# Docker privilege detection — test on a compute node, not the batch host. -FIRST_NODE=$(echo "$SELECTED_NODES" | head -1) -if srun --nodelist="$FIRST_NODE" -N1 -n1 --overlap bash -c 'docker ps &>/dev/null'; then - DOCKER_CMD="docker" -else - DOCKER_CMD="sudo docker" -fi -export DOCKER_CMD -echo "[docker-detect] DOCKER_CMD=$DOCKER_CMD (tested on $FIRST_NODE)" +# Docker privilege detection — evaluated per-node since group membership varies. +# Exported as a snippet so every srun participant resolves it locally. 
+export DOCKER_CMD_DETECT='if docker ps &>/dev/null 2>&1; then DOCKER_CMD=docker; else DOCKER_CMD="sudo docker"; fi' # Update SLURM environment variables export SLURM_NNODES=$NUM_NODES @@ -402,6 +396,10 @@ set -euo pipefail echo \"Rank \$SLURM_PROCID on \$(hostname)\" +# Per-node docker privilege detection +eval \"\$DOCKER_CMD_DETECT\" +echo \"[docker-detect] rank \$SLURM_PROCID: DOCKER_CMD=\$DOCKER_CMD\" + # Pre-clean (idempotent) \$DOCKER_CMD ps -aq --filter \"$CONT_FILTER\" | xargs -r \$DOCKER_CMD rm -f || true \$DOCKER_CMD ps -aq | xargs -r \$DOCKER_CMD stop || true @@ -484,12 +482,12 @@ exit \$DOCKER_EXIT_CODE " if [[ "${KEEP_CONTAINERS}" != "1" ]]; then - srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c '$DOCKER_CMD rm -f '"$DOCKER_CONT_NAME"' 2>/dev/null || true' + srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c 'eval "$DOCKER_CMD_DETECT"; $DOCKER_CMD rm -f '"$DOCKER_CONT_NAME"' 2>/dev/null || true' # Clean up vLLM external router container on node 0 if [[ "$ENGINE" == "vllm-disagg" && "$ROUTER_TYPE" == "vllm-router" ]]; then srun --nodes=1 --ntasks=1 --nodelist="$MASTER_NODE" bash -c ' - '"$DOCKER_CMD"' rm -f '"$ROUTER_CONT_NAME"' 2>/dev/null || true + eval "$DOCKER_CMD_DETECT"; $DOCKER_CMD rm -f '"$ROUTER_CONT_NAME"' 2>/dev/null || true ' fi fi From cecd65a781c55dc11bb36d5fb62220a497e0c0fb Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Thu, 14 May 2026 14:19:27 +0000 Subject: [PATCH 22/98] add vllm-disagg changelog entries and update kimi conc-list - Add perf-changelog entries for kimik2.5-fp4-mi355x-vllm-disagg and minimaxm2.5-fp8-mi355x-vllm-disagg to trigger CI benchmarks - Update kimi 1k1k conc-list from [8] to [16] - Comment out kimi 8k1k config until eval pipeline is wired up Co-Authored-By: Claude Opus 4 --- .github/configs/amd-master.yaml | 44 ++++++++++++++++----------------- perf-changelog.yaml | 10 ++++++++ 2 files changed, 32 insertions(+), 22 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml 
index ba5ec1f2a..c29a4d972 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1159,9 +1159,9 @@ kimik2.5-fp4-mi355x-vllm-disagg: - isl: 1024 osl: 1024 search-space: - # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total , 16, 32, 64, 128, 256, 512 + # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total - spec-decoding: "none" - conc-list: [ 8 ] + conc-list: [ 16 ] prefill: num-worker: 1 tp: 8 @@ -1178,26 +1178,26 @@ kimik2.5-fp4-mi355x-vllm-disagg: additional-settings: - "DECODE_NODES=2" - - isl: 8192 - osl: 1024 - search-space: - - spec-decoding: "none" - conc-list: [ 8 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" + # - isl: 8192 + # osl: 1024 + # search-space: + # - spec-decoding: "none" + # conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: false + # additional-settings: + # - "PREFILL_NODES=1" + # - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + # decode: + # num-worker: 2 + # tp: 8 + # ep: 8 + # dp-attn: false + # additional-settings: + # - "DECODE_NODES=2" minimaxm2.5-fp8-mi355x-vllm-disagg: image: vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 1250f809e..1e3711a6c 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2430,3 +2430,13 @@ description: - "Update SGLang image from v0.5.10.post1-cu130 to v0.5.11-cu130" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1346 + +- config-keys: + - kimik2.5-fp4-mi355x-vllm-disagg + description: + - "Add vLLM disaggregated prefill-decode benchmark for Kimi-K2.5-MXFP4 on MI355X" + +- config-keys: + - minimaxm2.5-fp8-mi355x-vllm-disagg + description: + - "Add vLLM disaggregated 
prefill-decode benchmark for MiniMax-M2.5 on MI355X" From 5959f8dd52d0e5e35f8c53fd3419f4f3423fd678 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Thu, 14 May 2026 14:50:15 +0000 Subject: [PATCH 23/98] switch vllm-disagg to 8k1k config to trigger multi-node eval Comment out 1k1k config and enable 8k1k with conc-list [16] so mark_eval_entries picks it up for the eval pipeline. Co-Authored-By: Claude Opus 4 --- .github/configs/amd-master.yaml | 46 ++++++++++++++++----------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index c29a4d972..462df433b 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1156,31 +1156,10 @@ kimik2.5-fp4-mi355x-vllm-disagg: disagg: true scenarios: fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total - - spec-decoding: "none" - conc-list: [ 16 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - # - isl: 8192 + # - isl: 1024 # osl: 1024 # search-space: + # # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total # - spec-decoding: "none" # conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] # prefill: @@ -1199,6 +1178,27 @@ kimik2.5-fp4-mi355x-vllm-disagg: # additional-settings: # - "DECODE_NODES=2" + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 16 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + minimaxm2.5-fp8-mi355x-vllm-disagg: image: 
vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c model: MiniMaxAI/MiniMax-M2.5 From f479f0dec6b7a27c920296aca7b6d6b8c9a0053f Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Fri, 15 May 2026 02:52:17 +0000 Subject: [PATCH 24/98] add multi-node eval feature Signed-off-by: Theresa Shan --- .../multi_node/amd_utils/server_sglang.sh | 209 +++++++++++++++--- .../multi_node/amd_utils/server_vllm.sh | 84 ++++++- 2 files changed, 255 insertions(+), 38 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh index 53ca29cc5..b410bc978 100755 --- a/benchmarks/multi_node/amd_utils/server_sglang.sh +++ b/benchmarks/multi_node/amd_utils/server_sglang.sh @@ -43,7 +43,7 @@ GPUS_PER_NODE="${GPUS_PER_NODE:-8}" # ============================================================================= # Dependencies and Environment Setup # ============================================================================= -source $WS_PATH/env.sh +source $SGLANG_WS_PATH/env.sh host_ip=$(ip route get 1.1.1.1 | awk '/src/ {print $7}') host_name=$(hostname) @@ -62,7 +62,7 @@ fi # ============================================================================= # Model-Specific Configuration from YAML # ============================================================================= -MODELS_YAML="${WS_PATH}/models.yaml" +MODELS_YAML="${SGLANG_WS_PATH}/models.yaml" if [[ ! 
-f "$MODELS_YAML" ]]; then echo "ERROR: models.yaml not found at $MODELS_YAML" @@ -127,6 +127,9 @@ no_dp = prefill.get('no_dp', {}) print(f'PREFILL_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 24)}\"') print(f'PREFILL_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"') print(f'PREFILL_CUDA_GRAPH_BS_DP=\"{dp.get(\"cuda_graph_bs\", \"1 2 3\")}\"') +print(f'PREFILL_CONTEXT_LENGTH_DP=\"{dp.get(\"context_length\", \"\")}\"') +print(f'PREFILL_MAX_TOTAL_TOKENS_DP=\"{dp.get(\"max_total_tokens\", \"\")}\"') +print(f'PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP=\"{dp.get(\"enable_two_batch_overlap\", False)}\"') print(f'PREFILL_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"') print(f'PREFILL_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"') s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128) @@ -169,10 +172,16 @@ if [[ "$PREFILL_ENABLE_DP" == "true" ]]; then prefill_cuda_graph_bs=($PREFILL_CUDA_GRAPH_BS_DP) prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_DP prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_DP + prefill_context_length=$PREFILL_CONTEXT_LENGTH_DP + prefill_max_total_tokens=$PREFILL_MAX_TOTAL_TOKENS_DP + prefill_enable_two_batch_overlap=$PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP else prefill_cuda_graph_bs=($(seq $PREFILL_CUDA_GRAPH_BS_NO_DP_START $PREFILL_CUDA_GRAPH_BS_NO_DP_END)) prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_NO_DP prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_NO_DP + prefill_context_length="" + prefill_max_total_tokens="" + prefill_enable_two_batch_overlap="false" fi # Compute DP-dependent decode parameters (3-way: DP > EP-only > no_dp) @@ -187,29 +196,31 @@ else decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP fi -# Use Decode configuration to configure different TP/DP size between P and D -PREFILL_DECODE_DIFFERENT_TP="" -if [[ "$PREFILL_ENABLE_DP" != 
"$DECODE_ENABLE_DP" ]]; then - if [[ "$DECODE_ENABLE_DP" == "true" ]]; then - PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp ${DECODE_TP_SIZE}" - else - PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp 1" - fi -fi - # Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS) -PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} ${PREFILL_DECODE_DIFFERENT_TP}" +PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} " if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache" fi +if [[ -n "$prefill_context_length" ]]; then + PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --context-length ${prefill_context_length}" +fi +if [[ -n "$prefill_max_total_tokens" ]]; then + PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --max-total-tokens ${prefill_max_total_tokens}" +fi +if [[ "$prefill_enable_two_batch_overlap" == "True" ]] || [[ "$prefill_enable_two_batch_overlap" == "true" ]]; then + PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --enable-two-batch-overlap" + PREFILL_SDMA_ENV="MORI_ENABLE_SDMA=true" +fi + +DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} " -DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]}" if [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "True" ]] 
|| [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "true" ]]; then DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS --prefill-round-robin-balance" fi if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) + MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MOE_MAX_INPUT_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) fi # ============================================================================= @@ -327,12 +338,24 @@ if [[ -n "$MODEL_NAME" ]]; then echo "Using model-specific configuration for: $MODEL_NAME" fi +if [[ "${EVAL_ONLY:-false}" == "true" ]] || [[ "${RUN_EVAL:-false}" == "true" ]]; then + PREFILL_SERVER_CONFIG=$(echo "$PREFILL_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g') + DECODE_SERVER_CONFIG=$(echo "$DECODE_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g') + unset MORI_MOE_MAX_INPUT_TOKENS_PREFILL + unset MORI_MOE_MAX_INPUT_TOKENS_DECODE + # NOTE: currently, with fp8_combine set, the evals do not pass on the InferenceX eval harness + # or on the SGLang native harness for high-concurrency 4k, and get nowhere near the golden score of + # 0.95 on even basic GSM8k grade-school math, as confirmed by @billishyahao from AMD + # and as confirmed by @Oseltamivir.
This was initially merged with @billishyahao promising + # a fast-follow PR to fix the evals via having quant correction in the fp8 combine +fi + # ============================================================================= # Container Synchronization # ============================================================================= echo "Waiting at the container creation barrier on $host_name" -python3 $WS_PATH/sync.py barrier \ +python3 $SGLANG_WS_PATH/sync.py barrier \ --local-ip ${host_ip} \ --local-port 5000 \ --enable-port \ @@ -362,20 +385,27 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}" echo "Prefill servers ($((PREFILL_TP_SIZE/GPUS_PER_NODE)) nodes): ${PREFILL_ARGS}" echo "Decode servers ($((DECODE_TP_SIZE/GPUS_PER_NODE)) nodes): ${DECODE_ARGS}" - echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK: ${MORI_MAX_DISPATCH_TOKENS_PREFILL}" - echo "Decode env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE}" + echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL}" + echo "Decode env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} " + echo "Decode env: SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE} " + echo "================================================" # start the head prefill server - PREFILL_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ + PREFILL_MORI_MOE_ENV="" + set -x + if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_PREFILL" ]]; then + PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}" + fi + set +x + PREFILL_CMD="${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
--model-path $MODEL_DIR/$MODEL_NAME \ --disaggregation-mode prefill \ --disaggregation-ib-device ${IBDEVICES} \ --host 0.0.0.0 \ --port 8000 \ --trust-remote-code \ - ${PREFILL_SERVER_CONFIG} \ - --log-level-http warning" + ${PREFILL_SERVER_CONFIG} " if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[0]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank 0" @@ -396,7 +426,7 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Waiting for all prefill and decode servers to be up . . ." - BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ --node-ips ${IPADDRS} \ --node-ports 8000 \ --wait-for-all-ports \ @@ -433,7 +463,7 @@ if [ "$NODE_RANK" -eq 0 ]; then proxy_pid=$! # Wait for router to be ready via health endpoint - HEALTH_BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + HEALTH_BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ --node-ips ${NODE0_ADDR} \ --node-ports 30000 \ --wait-for-all-health \ @@ -453,7 +483,7 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Ready for benchmarking on ${host_name}:${host_ip}" echo "Benchmarking on ${host_name}:${host_ip}" - cd $WS_PATH + cd $SGLANG_WS_PATH # Export IS_MTP based on whether MTP is enabled if [ "$DECODE_MTP_SIZE" -gt 0 ]; then @@ -463,12 +493,14 @@ if [ "$NODE_RANK" -eq 0 ]; then fi # n_prefill n_decode prefill_gpus decode_gpus model_dir model_name log_path isl osl concurrency_list req_rate random_range_ratio num_prompts_multiplier - BENCH_CMD="bash $WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \ + BENCH_CMD="bash $SGLANG_WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \ $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \ ${BENCH_OUTPUT_LEN} "${BENCH_MAX_CONCURRENCY}" ${BENCH_REQUEST_RATE} \ ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}" - if [[ "$DRY_RUN" -eq 1 ]]; then + if [[ "${EVAL_ONLY:-false}" == "true" ]]; 
then + echo "EVAL_ONLY mode: skipping throughput benchmark" + elif [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: $BENCH_CMD" else set -x @@ -476,6 +508,96 @@ if [ "$NODE_RANK" -eq 0 ]; then set +x fi + # Run evaluation if requested (before killing router) + if [[ "${RUN_EVAL:-false}" == "true" ]]; then + echo "Running lm-eval evaluation on Node 0..." + + # Health check: verify the router is still serving before running eval. + # The throughput benchmark may have crashed/exhausted decode workers. + EVAL_HEALTH_OK=false + for _attempt in 1 2 3; do + if curl -sf --max-time 10 "http://0.0.0.0:30000/readiness" >/dev/null 2>&1; then + EVAL_HEALTH_OK=true + break + fi + echo "Eval health check attempt $_attempt failed, retrying in 10s..." + sleep 10 + done + + if [[ "$EVAL_HEALTH_OK" != "true" ]]; then + echo "WARNING: Router health check failed after 3 attempts. Skipping eval." + else + # Must run from repo root so utils/evals/${task}.yaml resolves + pushd /workspace + + # Source eval functions from benchmark_lib.sh + source /workspace/benchmarks/benchmark_lib.sh + + # Use EVAL_CONC from workflow if set, otherwise fall back to max of conc list + if [[ -n "${EVAL_CONC:-}" ]]; then + export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC}" + else + export EVAL_CONCURRENT_REQUESTS=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1) + fi + + # Override eval context length with model's configured context_length + if [[ -n "$prefill_context_length" ]]; then + export EVAL_MAX_MODEL_LEN="$prefill_context_length" + fi + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: run_eval --framework lm-eval --port 30000 (conc=${EVAL_CONCURRENT_REQUESTS}, ctx=${EVAL_MAX_MODEL_LEN:-auto})" + else + # Run lm-eval against the router on port 30000 + run_eval --framework lm-eval --port 30000 + eval_rc=$? 
+ + if [[ $eval_rc -ne 0 ]]; then + echo "ERROR: run_eval exited rc=$eval_rc; skipping metadata write and eval artifact staging" >&2 + EVAL_FAILED=1 + else + # Set metadata env vars for append_lm_eval_summary + export TP="${PREFILL_TP_SIZE}" + export CONC="${EVAL_CONCURRENT_REQUESTS}" + export EP_SIZE=1 + [[ "${PREFILL_ENABLE_EP}" == "true" ]] && EP_SIZE="${PREFILL_TP_SIZE}" + export PREFILL_TP="${PREFILL_TP_SIZE}" + export PREFILL_EP=1 + [[ "${PREFILL_ENABLE_EP}" == "true" ]] && PREFILL_EP="${PREFILL_TP_SIZE}" + export PREFILL_NUM_WORKERS="${xP}" + export DECODE_TP="${DECODE_TP_SIZE}" + export DECODE_EP=1 + [[ "${DECODE_ENABLE_EP}" == "true" ]] && DECODE_EP="${DECODE_TP_SIZE}" + export DECODE_NUM_WORKERS="${yD}" + export DP_ATTENTION="${PREFILL_ENABLE_DP}" + export PREFILL_DP_ATTENTION="${PREFILL_ENABLE_DP}" + export DECODE_DP_ATTENTION="${DECODE_ENABLE_DP}" + export ISL="${BENCH_INPUT_LEN}" + export OSL="${BENCH_OUTPUT_LEN}" + # IS_MULTINODE, FRAMEWORK, PRECISION, MODEL_PREFIX, RUNNER_TYPE, + # RESULT_FILENAME are already set via Docker -e flags from job.slurm + + append_lm_eval_summary + # Files (meta_env.json, results*.json, sample*.jsonl) are now in /workspace + + # Copy eval artifacts to run_logs for NFS extraction by runner + EVAL_COPY_DIR="/run_logs/slurm_job-${SLURM_JOB_ID}/eval_results" + mkdir -p "$EVAL_COPY_DIR" + for f in meta_env.json; do + [ -e "/workspace/$f" ] && cp -f "/workspace/$f" "$EVAL_COPY_DIR/" + done + # Use find for glob patterns to avoid "no match" errors + find /workspace -maxdepth 1 -name 'results*.json' -exec cp -f {} "$EVAL_COPY_DIR/" \; + find /workspace -maxdepth 1 -name 'sample*.jsonl' -exec cp -f {} "$EVAL_COPY_DIR/" \; + + echo "Eval completed. 
Artifacts staged in $EVAL_COPY_DIR" + fi + fi + + popd + fi + fi + # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host) LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs" mkdir -p "$LOGS_OUTPUT" @@ -492,20 +614,30 @@ if [ "$NODE_RANK" -eq 0 ]; then kill $prefill0_pid fi + if [[ "${EVAL_FAILED:-0}" -eq 1 ]]; then + echo "ERROR: eval failed; exiting node-0 with rc=1" + exit 1 + fi + elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then echo "${host_name}:${host_ip} is Prefill Node (Model: ${MODEL_NAME:-'default'})" echo "Using prefill config: $PREFILL_SERVER_CONFIG" echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}" - PREFILL_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ + PREFILL_MORI_MOE_ENV="" + set -x + if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_PREFILL" ]]; then + PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}" + fi + set +x + PREFILL_CMD="${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ --model-path $MODEL_DIR/${MODEL_NAME} \ --disaggregation-mode prefill \ --disaggregation-ib-device ${IBDEVICES} \ --host 0.0.0.0 \ --port 8000 \ --trust-remote-code \ - ${PREFILL_SERVER_CONFIG} \ - --log-level-http warning" + ${PREFILL_SERVER_CONFIG} " if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then rank=$((NODE_RANK % PREFILL_NODES_PER_WORKER)) @@ -524,7 +656,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then fi echo "Waiting for proxy server to be up..." 
- BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ --node-ips ${NODE0_ADDR} \ --node-ports 30000 \ --wait-for-all-ports \ @@ -537,7 +669,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then fi echo "Waiting until proxy server closes..." - WAIT_CMD="python3 $WS_PATH/sync.py wait \ + WAIT_CMD="python3 $SGLANG_WS_PATH/sync.py wait \ --remote-ip ${NODE0_ADDR} \ --remote-port 30000" @@ -560,15 +692,20 @@ else echo "Decode node rank: $RANK" echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}" - DECODE_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \ + DECODE_MORI_MOE_ENV="" + set -x + if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_DECODE" ]]; then + DECODE_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE}" + fi + set +x + DECODE_CMD="${DECODE_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \ --model-path ${MODEL_DIR}/${MODEL_NAME} \ --disaggregation-mode decode \ --disaggregation-ib-device ${IBDEVICES} \ --host 0.0.0.0 \ --port 8000 \ --trust-remote-code \ - ${DECODE_SERVER_CONFIG} \ - --log-level-http warning" + ${DECODE_SERVER_CONFIG} " if [ "$DECODE_NODES_PER_WORKER" -gt 1 ]; then rank=$((RANK % DECODE_NODES_PER_WORKER)) @@ -589,7 +726,7 @@ else echo "Waiting for proxy server to be up..." - BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ --node-ips ${NODE0_ADDR} \ --node-ports 30000 \ --wait-for-all-ports \ @@ -603,7 +740,7 @@ else echo "Waiting until proxy server closes..." 
- WAIT_CMD="python3 $WS_PATH/sync.py wait \ + WAIT_CMD="python3 $SGLANG_WS_PATH/sync.py wait \ --remote-ip ${NODE0_ADDR} \ --remote-port 30000" @@ -621,4 +758,4 @@ else fi echo "Script completed successfully" -exit 0 +exit 0 \ No newline at end of file diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh index 9acb05f54..60b0adb92 100755 --- a/benchmarks/multi_node/amd_utils/server_vllm.sh +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -338,7 +338,9 @@ if [ "$NODE_RANK" -eq 0 ]; then ${BENCH_OUTPUT_LEN} \"${BENCH_MAX_CONCURRENCY}\" ${BENCH_REQUEST_RATE} \ ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}" - if [[ "$DRY_RUN" -eq 1 ]]; then + if [[ "${EVAL_ONLY:-false}" == "true" ]]; then + echo "EVAL_ONLY mode: skipping throughput benchmark" + elif [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: $BENCH_CMD" else set -x @@ -346,7 +348,80 @@ if [ "$NODE_RANK" -eq 0 ]; then set +x fi - # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host) + # Run evaluation if requested (before killing router) + if [[ "${RUN_EVAL:-false}" == "true" ]]; then + echo "Running lm-eval evaluation on Node 0..." + + EVAL_HEALTH_OK=false + for _attempt in 1 2 3; do + if curl -sf --max-time 10 "http://0.0.0.0:${ROUTER_PORT}/health" >/dev/null 2>&1; then + EVAL_HEALTH_OK=true + break + fi + echo "Eval health check attempt $_attempt failed, retrying in 10s..." + sleep 10 + done + + if [[ "$EVAL_HEALTH_OK" != "true" ]]; then + echo "WARNING: Router health check failed after 3 attempts. Skipping eval." 
+ else + pushd /workspace + + source /workspace/benchmarks/benchmark_lib.sh + + if [[ -n "${EVAL_CONC:-}" ]]; then + export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC}" + else + export EVAL_CONCURRENT_REQUESTS=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1) + fi + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: run_eval --framework lm-eval --port $ROUTER_PORT (conc=${EVAL_CONCURRENT_REQUESTS}, ctx=${EVAL_MAX_MODEL_LEN:-auto})" + else + run_eval --framework lm-eval --port "$ROUTER_PORT" + eval_rc=$? + + if [[ $eval_rc -ne 0 ]]; then + echo "ERROR: run_eval exited rc=$eval_rc; skipping metadata write and eval artifact staging" >&2 + EVAL_FAILED=1 + else + export TP="${PREFILL_TP_SIZE}" + export CONC="${EVAL_CONCURRENT_REQUESTS}" + export EP_SIZE=1 + [[ "${PREFILL_ENABLE_EP}" == "true" ]] && EP_SIZE="${PREFILL_TP_SIZE}" + export PREFILL_TP="${PREFILL_TP_SIZE}" + export PREFILL_EP=1 + [[ "${PREFILL_ENABLE_EP}" == "true" ]] && PREFILL_EP="${PREFILL_TP_SIZE}" + export PREFILL_NUM_WORKERS="${xP}" + export DECODE_TP="${DECODE_TP_SIZE}" + export DECODE_EP=1 + [[ "${DECODE_ENABLE_EP}" == "true" ]] && DECODE_EP="${DECODE_TP_SIZE}" + export DECODE_NUM_WORKERS="${yD}" + export DP_ATTENTION="${PREFILL_ENABLE_DP}" + export PREFILL_DP_ATTENTION="${PREFILL_ENABLE_DP}" + export DECODE_DP_ATTENTION="${DECODE_ENABLE_DP}" + export ISL="${BENCH_INPUT_LEN}" + export OSL="${BENCH_OUTPUT_LEN}" + + append_lm_eval_summary + + EVAL_COPY_DIR="/run_logs/slurm_job-${SLURM_JOB_ID}/eval_results" + mkdir -p "$EVAL_COPY_DIR" + for f in meta_env.json; do + [ -e "/workspace/$f" ] && cp -f "/workspace/$f" "$EVAL_COPY_DIR/" + done + find /workspace -maxdepth 1 -name 'results*.json' -exec cp -f {} "$EVAL_COPY_DIR/" \; + find /workspace -maxdepth 1 -name 'sample*.jsonl' -exec cp -f {} "$EVAL_COPY_DIR/" \; + + echo "Eval completed. 
Artifacts staged in $EVAL_COPY_DIR" + fi + fi + + popd + fi + fi + + # Copy benchmark/eval results to BENCHMARK_LOGS_DIR (mounted from host) LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs" mkdir -p "$LOGS_OUTPUT" @@ -362,6 +437,11 @@ if [ "$NODE_RANK" -eq 0 ]; then pkill -f "vllm serve" 2>/dev/null || true fi + if [[ "${EVAL_FAILED:-0}" -eq 1 ]]; then + echo "ERROR: eval failed; exiting node-0 with rc=1" + exit 1 + fi + elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then echo "${host_name}:${host_ip} is Additional Prefill Node (Model: ${MODEL_NAME})" echo "Using prefill config: $PREFILL_SERVER_CONFIG" From 7f80da743d6165e8f676152a210f47a0479b7e80 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Fri, 15 May 2026 02:53:02 +0000 Subject: [PATCH 25/98] remove start_etcd.sh Signed-off-by: Theresa Shan --- benchmarks/multi_node/amd_utils/start_etcd.sh | 47 ------------------- 1 file changed, 47 deletions(-) delete mode 100755 benchmarks/multi_node/amd_utils/start_etcd.sh diff --git a/benchmarks/multi_node/amd_utils/start_etcd.sh b/benchmarks/multi_node/amd_utils/start_etcd.sh deleted file mode 100755 index 46bbd2964..000000000 --- a/benchmarks/multi_node/amd_utils/start_etcd.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash -set -x - -IPADDRS="${IPADDRS:-localhost}" - -# Use management network IP (matching what the Slurm script resolved) -host_ip=$(ip route get 1.1.1.1 2>/dev/null | sed -n 's/.*src \([^ ]*\).*/\1/p') -if [[ -z "$host_ip" ]]; then - host_ip=$(hostname -I | awk '{print $1}') -fi - -IFS=',' read -ra ADDR <<< "$IPADDRS" - -# Determine node name based on position in the IPADDRS list -index=0 -for ip in "${ADDR[@]}"; do - if [[ "$ip" == "$host_ip" ]]; then - break - fi - index=$((index + 1)) -done -node_name="etcd-$((index+1))" - -# Build initial cluster string -initial_cluster="" -for i in "${!ADDR[@]}"; do - peer_name="etcd-$((i+1))" - initial_cluster+="$peer_name=http://${ADDR[i]}:2380" - if [[ $i -lt $((${#ADDR[@]} - 1)) ]]; then - 
initial_cluster+="," - fi -done - -mkdir -p /var/lib/etcd -rm -rf /var/lib/etcd/* - -/usr/local/bin/etcd/etcd \ - --name "$node_name" \ - --data-dir /var/lib/etcd \ - --initial-advertise-peer-urls http://$host_ip:2380 \ - --listen-peer-urls http://0.0.0.0:2380 \ - --listen-client-urls http://0.0.0.0:2379 \ - --advertise-client-urls http://$host_ip:2379 \ - --initial-cluster-token etcd-cluster-1 \ - --initial-cluster "$initial_cluster" \ - --initial-cluster-state new \ - 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/etcd_NODE${NODE_RANK}.log From 0238ad1af7cbdf1f3db78fa3ae1a4a962b83245a Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Fri, 15 May 2026 03:03:23 +0000 Subject: [PATCH 26/98] change decode to 1, easier for testing Signed-off-by: Theresa Shan --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 462df433b..8dd41de75 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1192,12 +1192,12 @@ kimik2.5-fp4-mi355x-vllm-disagg: - "PREFILL_NODES=1" - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" decode: - num-worker: 2 + num-worker: 1 tp: 8 ep: 8 dp-attn: false additional-settings: - - "DECODE_NODES=2" + - "DECODE_NODES=1" minimaxm2.5-fp8-mi355x-vllm-disagg: image: vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c From 5a3a390fb874f68a0b7ffc5236c013628b8d8b70 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Fri, 15 May 2026 06:49:13 +0000 Subject: [PATCH 27/98] add --served-model-name to vllm serve commands and wire up eval Set --served-model-name on all prefill/decode vllm serve commands so the model name matches what run_lm_eval sends in API requests. Also add eval pipeline support (health check, run_eval, artifact staging) mirroring server_sglang.sh. 
Co-Authored-By: Claude Opus 4 --- benchmarks/multi_node/amd_utils/server_vllm.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh index 60b0adb92..35da4ad27 100755 --- a/benchmarks/multi_node/amd_utils/server_vllm.sh +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -284,7 +284,9 @@ if [ "$NODE_RANK" -eq 0 ]; then # Router is started as an external container by job.slurm (VLLM_ROUTER_IMAGE) echo "Using external vllm-router container (started by job.slurm on this node)" + SERVED_MODEL="${MODEL:-${MODEL_NAME}}" PREFILL_CMD="vllm serve ${MODEL_PATH} \ + --served-model-name ${SERVED_MODEL} \ --port $SERVER_PORT \ --trust-remote-code \ --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ @@ -448,7 +450,9 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then setup_vllm_env + SERVED_MODEL="${MODEL:-${MODEL_NAME}}" PREFILL_CMD="vllm serve ${MODEL_PATH} \ + --served-model-name ${SERVED_MODEL} \ --port $SERVER_PORT \ --trust-remote-code \ --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ @@ -502,7 +506,9 @@ else echo "[DECODE_ENV] $env_pair" done + SERVED_MODEL="${MODEL:-${MODEL_NAME}}" DECODE_CMD="vllm serve ${MODEL_PATH} \ + --served-model-name ${SERVED_MODEL} \ --port $SERVER_PORT \ --trust-remote-code \ --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_consumer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ From 
b4657452d74d9ed271585ac8ed346c9a56e83be7 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Fri, 15 May 2026 08:31:41 +0000 Subject: [PATCH 28/98] fix model name consistency between vllm serve and bench client bench.sh now uses MODEL_NAME for vllm-disagg to match --served-model-name, and MODEL_PATH for sglang to match its default. Simplified SERVED_MODEL to use MODEL_NAME directly since MODEL env var is not available inside the container. Co-Authored-By: Claude Opus 4 --- benchmarks/multi_node/amd_utils/bench.sh | 8 +++++++- benchmarks/multi_node/amd_utils/server_vllm.sh | 6 +++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh index 33cc918bf..24dfbf587 100755 --- a/benchmarks/multi_node/amd_utils/bench.sh +++ b/benchmarks/multi_node/amd_utils/bench.sh @@ -20,6 +20,12 @@ decode_gpus=$4 model_path=$5 model_name=$6 MODEL_PATH="${MODEL_PATH:-${model_path}/${model_name}}" +# vllm-disagg uses --served-model-name MODEL_NAME; sglang defaults to MODEL_PATH +if [[ "$ENGINE" == "vllm-disagg" ]]; then + BENCH_MODEL="${MODEL_NAME:-${MODEL_PATH}}" +else + BENCH_MODEL="${MODEL_PATH}" +fi log_path=$7 chosen_isl=${8:-1024} @@ -80,7 +86,7 @@ for max_concurrency in "${chosen_concurrencies[@]}"; do run_benchmark_serving \ --bench-serving-dir "$REPO_ROOT" \ - --model "$MODEL_PATH" \ + --model "$BENCH_MODEL" \ --port "$ROUTER_PORT" \ --backend openai \ --input-len "$chosen_isl" \ diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh index 35da4ad27..ecab81656 100755 --- a/benchmarks/multi_node/amd_utils/server_vllm.sh +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -284,7 +284,7 @@ if [ "$NODE_RANK" -eq 0 ]; then # Router is started as an external container by job.slurm (VLLM_ROUTER_IMAGE) echo "Using external vllm-router container (started by job.slurm on this node)" - SERVED_MODEL="${MODEL:-${MODEL_NAME}}" + 
SERVED_MODEL="${MODEL_NAME}" PREFILL_CMD="vllm serve ${MODEL_PATH} \ --served-model-name ${SERVED_MODEL} \ --port $SERVER_PORT \ @@ -450,7 +450,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then setup_vllm_env - SERVED_MODEL="${MODEL:-${MODEL_NAME}}" + SERVED_MODEL="${MODEL_NAME}" PREFILL_CMD="vllm serve ${MODEL_PATH} \ --served-model-name ${SERVED_MODEL} \ --port $SERVER_PORT \ @@ -506,7 +506,7 @@ else echo "[DECODE_ENV] $env_pair" done - SERVED_MODEL="${MODEL:-${MODEL_NAME}}" + SERVED_MODEL="${MODEL_NAME}" DECODE_CMD="vllm serve ${MODEL_PATH} \ --served-model-name ${SERVED_MODEL} \ --port $SERVER_PORT \ From 7240dcf6475d6ddc9d0ff95053eb75c9186c9f16 Mon Sep 17 00:00:00 2001 From: simondanielsson Date: Fri, 15 May 2026 08:49:06 +0000 Subject: [PATCH 29/98] Initial commit Signed-off-by: simondanielsson From 41b2fc583ac2de978093ed1e0e3c3f96ec3c6e52 Mon Sep 17 00:00:00 2001 From: simondanielsson Date: Fri, 15 May 2026 09:11:00 +0000 Subject: [PATCH 30/98] feat: add configs for minimax on mi300 and mi325 Signed-off-by: simondanielsson --- .github/configs/amd-master.yaml | 110 +++++++++++++++++- .../minimaxm2.5_fp8_mi300x_vllm-disagg.sh | 78 +++++++++++++ .../minimaxm2.5_fp8_mi325x_vllm-disagg.sh | 78 +++++++++++++ 3 files changed, 264 insertions(+), 2 deletions(-) create mode 100644 benchmarks/multi_node/minimaxm2.5_fp8_mi300x_vllm-disagg.sh create mode 100644 benchmarks/multi_node/minimaxm2.5_fp8_mi325x_vllm-disagg.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 8dd41de75..eba01ea32 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1146,7 +1146,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" kimik2.5-fp4-mi355x-vllm-disagg: - image: ghcr.io/simondanielsson/vllm-dev:ainic-test-hydra + image: vllm/vllm-openai-rocm:nightly-bf610c2f56764e1b30bc6065f4ceace3d6e59036 model: amd/Kimi-K2.5-MXFP4 model-prefix: kimik2.5 runner: mi355x-disagg @@ -1200,7 +1200,7 @@ 
kimik2.5-fp4-mi355x-vllm-disagg: - "DECODE_NODES=1" minimaxm2.5-fp8-mi355x-vllm-disagg: - image: vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c + image: vllm/vllm-openai-rocm:nightly-bf610c2f56764e1b30bc6065f4ceace3d6e59036 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi355x-disagg @@ -1255,6 +1255,112 @@ minimaxm2.5-fp8-mi355x-vllm-disagg: additional-settings: - "DECODE_NODES=2" +minimaxm2.5-fp8-mi300x-vllm-disagg: + image: ghcr.io/simondanielsson/vllm/vllm-openai-rocm:fix-moriio-hangs-high-concurrency-bnxt + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + runner: mi300x-disagg + precision: fp8 + framework: vllm-disagg + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + +minimaxm2.5-fp8-mi325x-vllm-disagg: + image: ghcr.io/simondanielsson/vllm/vllm-openai-rocm:fix-moriio-hangs-high-concurrency-bnxt + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + runner: mi325x-disagg + precision: fp8 + framework: vllm-disagg + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - 
"PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + dsr1-fp4-mi355x-sglang-disagg: image: lmsysorg/sglang-rocm:v0.5.10.post1-rocm720-mi35x-20260501 model: amd/DeepSeek-R1-0528-MXFP4-v2 diff --git a/benchmarks/multi_node/minimaxm2.5_fp8_mi300x_vllm-disagg.sh b/benchmarks/multi_node/minimaxm2.5_fp8_mi300x_vllm-disagg.sh new file mode 100644 index 000000000..a9a28d889 --- /dev/null +++ b/benchmarks/multi_node/minimaxm2.5_fp8_mi300x_vllm-disagg.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + CONC_LIST \ + ISL \ + OSL \ + IMAGE \ + SPEC_DECODING \ + MODEL_PATH \ + PREFILL_NUM_WORKERS \ + PREFILL_TP \ + PREFILL_EP \ + PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS \ + DECODE_TP \ + DECODE_EP \ + DECODE_DP_ATTN \ + PREFILL_NODES \ + DECODE_NODES \ + RANDOM_RANGE_RATIO \ + FRAMEWORK + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +set -x + +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 + +export TIME_LIMIT="08:00:00" +export MODEL_PATH=$MODEL_PATH +export MODEL_NAME=$MODEL_NAME +export CONTAINER_IMAGE=$IMAGE + +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then + export PREFILL_ENABLE_EP=false +else + export PREFILL_ENABLE_EP=true +fi + +if [[ "$PREFILL_DP_ATTN" == "true" ]]; then + export PREFILL_ENABLE_DP=true +else + export PREFILL_ENABLE_DP=false +fi + +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then + export DECODE_ENABLE_EP=false +else + export DECODE_ENABLE_EP=true +fi + +if [[ "$DECODE_DP_ATTN" == 
"true" ]]; then + export DECODE_ENABLE_DP=true +else + export DECODE_ENABLE_DP=false +fi + +JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $ISL $OSL "${CONC_LIST// /x}" inf \ + ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ + ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + ${PREFILL_TP} ${DECODE_TP} \ + ${RANDOM_RANGE_RATIO} \ + "${NODELIST:-}") + +if [[ $? -ne 0 ]]; then + echo "Failed to submit job" >&2 + exit 1 +fi + +echo "$JOB_ID" diff --git a/benchmarks/multi_node/minimaxm2.5_fp8_mi325x_vllm-disagg.sh b/benchmarks/multi_node/minimaxm2.5_fp8_mi325x_vllm-disagg.sh new file mode 100644 index 000000000..a9a28d889 --- /dev/null +++ b/benchmarks/multi_node/minimaxm2.5_fp8_mi325x_vllm-disagg.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + CONC_LIST \ + ISL \ + OSL \ + IMAGE \ + SPEC_DECODING \ + MODEL_PATH \ + PREFILL_NUM_WORKERS \ + PREFILL_TP \ + PREFILL_EP \ + PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS \ + DECODE_TP \ + DECODE_EP \ + DECODE_DP_ATTN \ + PREFILL_NODES \ + DECODE_NODES \ + RANDOM_RANGE_RATIO \ + FRAMEWORK + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +set -x + +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 + +export TIME_LIMIT="08:00:00" +export MODEL_PATH=$MODEL_PATH +export MODEL_NAME=$MODEL_NAME +export CONTAINER_IMAGE=$IMAGE + +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then + export PREFILL_ENABLE_EP=false +else + export PREFILL_ENABLE_EP=true +fi + +if [[ "$PREFILL_DP_ATTN" == "true" ]]; then + export PREFILL_ENABLE_DP=true +else + export PREFILL_ENABLE_DP=false +fi + +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then + export DECODE_ENABLE_EP=false +else + export DECODE_ENABLE_EP=true +fi + +if [[ "$DECODE_DP_ATTN" == "true" ]]; then + export DECODE_ENABLE_DP=true +else + export DECODE_ENABLE_DP=false +fi + +JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + 
$DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $ISL $OSL "${CONC_LIST// /x}" inf \ + ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ + ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + ${PREFILL_TP} ${DECODE_TP} \ + ${RANDOM_RANGE_RATIO} \ + "${NODELIST:-}") + +if [[ $? -ne 0 ]]; then + echo "Failed to submit job" >&2 + exit 1 +fi + +echo "$JOB_ID" From bfbaed7fe82c3a9b96b59e79a6c80ee576f91cc2 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Fri, 15 May 2026 09:28:13 +0000 Subject: [PATCH 31/98] add token patch to bench for vllm Signed-off-by: Theresa Shan --- benchmarks/multi_node/amd_utils/bench.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh index 24dfbf587..554db8b91 100755 --- a/benchmarks/multi_node/amd_utils/bench.sh +++ b/benchmarks/multi_node/amd_utils/bench.sh @@ -77,7 +77,7 @@ for max_concurrency in "${chosen_concurrencies[@]}"; do # Engine-specific extra flags extra_flags="" if [[ "$ENGINE" == "vllm-disagg" ]]; then - extra_flags="--trust-remote-code" + extra_flags="--trust-remote-code --tokenizer $MODEL_PATH" else if [ "$IS_MTP" = "true" ]; then extra_flags="--use-chat-template" From cd374a1f7fca0c149fe81e17c8109c231d5817a7 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Fri, 15 May 2026 09:50:34 +0000 Subject: [PATCH 32/98] add --tokenizer passthrough to run_benchmark_serving MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit benchmark_lib.sh rejected unknown flags — add --tokenizer support so vllm-disagg bench can resolve the tokenizer from the local model path instead of attempting an HF download with the short model name. 
Co-Authored-By: Claude Opus 4 --- benchmarks/benchmark_lib.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 0cb8fdcd0..9fc9986d9 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -208,6 +208,7 @@ run_benchmark_serving() { local dsv4=false local trust_remote_code=false local server_pid="" + local tokenizer="" while [[ $# -gt 0 ]]; do case $1 in @@ -276,6 +277,10 @@ run_benchmark_serving() { server_pid="$2" shift 2 ;; + --tokenizer) + tokenizer="$2" + shift 2 + ;; *) echo "Unknown parameter: $1" return 1 @@ -383,6 +388,10 @@ run_benchmark_serving() { benchmark_cmd+=(--trust-remote-code) fi + if [[ -n "$tokenizer" ]]; then + benchmark_cmd+=(--tokenizer "$tokenizer") + fi + # Run benchmark with optional server monitoring set -x if [[ -n "$server_pid" ]]; then From 4e138f44947775926b5ff7c85cab61aeb0ba56ed Mon Sep 17 00:00:00 2001 From: Shan Theresa Date: Fri, 15 May 2026 10:38:14 +0000 Subject: [PATCH 33/98] update vllm image for kimi2.5 and Minimax disagg. 
Signed-off-by: Shan Theresa --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 8dd41de75..4ebd7c0c0 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1146,7 +1146,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" kimik2.5-fp4-mi355x-vllm-disagg: - image: ghcr.io/simondanielsson/vllm-dev:ainic-test-hydra + image: vllm/vllm-openai-rocm:nightly-bf610c2f56764e1b30bc6065f4ceace3d6e59036 model: amd/Kimi-K2.5-MXFP4 model-prefix: kimik2.5 runner: mi355x-disagg @@ -1200,7 +1200,7 @@ kimik2.5-fp4-mi355x-vllm-disagg: - "DECODE_NODES=1" minimaxm2.5-fp8-mi355x-vllm-disagg: - image: vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c + image: vllm/vllm-openai-rocm:nightly-bf610c2f56764e1b30bc6065f4ceace3d6e59036 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi355x-disagg From 0a50b080181c9a75c9d97c8f79a93600423c1d38 Mon Sep 17 00:00:00 2001 From: simondanielsson Date: Fri, 15 May 2026 10:49:21 +0000 Subject: [PATCH 34/98] fix: add tokenizer path optionally Signed-off-by: simondanielsson --- benchmarks/benchmark_lib.sh | 9 +++++++++ benchmarks/multi_node/amd_utils/bench.sh | 3 ++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 0cb8fdcd0..167b55d93 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -204,6 +204,7 @@ run_benchmark_serving() { local result_filename="" local result_dir="" local workspace_dir="" + local tokenizer="" local use_chat_template=false local dsv4=false local trust_remote_code=false @@ -268,6 +269,10 @@ run_benchmark_serving() { use_chat_template=true shift ;; + --tokenizer) + tokenizer="$2" + shift 2 + ;; --trust-remote-code) trust_remote_code=true shift @@ -383,6 +388,10 @@ run_benchmark_serving() { benchmark_cmd+=(--trust-remote-code) fi + if 
[[ -n "$tokenizer" ]]; then + benchmark_cmd+=(--tokenizer "$tokenizer") + fi + # Run benchmark with optional server monitoring set -x if [[ -n "$server_pid" ]]; then diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh index 24dfbf587..866f5a778 100755 --- a/benchmarks/multi_node/amd_utils/bench.sh +++ b/benchmarks/multi_node/amd_utils/bench.sh @@ -77,7 +77,8 @@ for max_concurrency in "${chosen_concurrencies[@]}"; do # Engine-specific extra flags extra_flags="" if [[ "$ENGINE" == "vllm-disagg" ]]; then - extra_flags="--trust-remote-code" + extra_flags="--trust-remote-code --tokenizer ${MODEL_PATH}" + else if [ "$IS_MTP" = "true" ]; then extra_flags="--use-chat-template" From 2989913361afa081bba1e24f7ee3755c3d8ab291 Mon Sep 17 00:00:00 2001 From: simondanielsson Date: Fri, 15 May 2026 15:45:38 +0000 Subject: [PATCH 35/98] fix: remove minimax wideep patch which caused gibberish output Signed-off-by: simondanielsson --- benchmarks/multi_node/amd_utils/setup_deps.sh | 40 ------------------- 1 file changed, 40 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh index 860cecf96..44434d64e 100644 --- a/benchmarks/multi_node/amd_utils/setup_deps.sh +++ b/benchmarks/multi_node/amd_utils/setup_deps.sh @@ -835,45 +835,6 @@ except Exception as e: _SETUP_INSTALLED+=("idle-kv-reaper") } -# --------------------------------------------------------------------------- -# 13. Patch MiniMax M2.5 WideEP + MoRI + EPLB support -# Replaces the upstream minimax_m2.py with our patched version that adds -# GateLinear, EP group integration, sequence parallelism, and the -# MixtureOfExperts EPLB protocol. Idempotent: skips if already patched. -# --------------------------------------------------------------------------- -patch_minimax_m2_wideep_mori() { - local patch_file="${WS_PATH:-${VLLM_WS_PATH:-$(dirname "${BASH_SOURCE[0]}")}}/patches/minimax_m2.py" - if [[ ! 
-f "$patch_file" ]]; then - # Also check the Docker-baked location - patch_file="/opt/vllm_disagg/patches/minimax_m2.py" - fi - if [[ ! -f "$patch_file" ]]; then - echo "[SETUP] minimax_m2.py patch not found, skipping (WideEP/MoRI not patched)" - return 0 - fi - - python3 -c ' -import os, sys, shutil - -try: - import vllm.model_executor.models.minimax_m2 as mmod - target = mmod.__file__ - src = sys.argv[1] - - with open(target) as f: - if "get_ep_group" in f.read(): - print("[SETUP] minimax_m2.py already has WideEP+MoRI support") - sys.exit(0) - - shutil.copy2(src, target) - print(f"[SETUP] Patched minimax_m2.py: {src} -> {target}") - -except Exception as e: - print(f"[SETUP] WARN patch minimax_m2: {e}", file=sys.stderr) -' "$patch_file" - _SETUP_INSTALLED+=("minimax-m2-wideep-mori") -} - # ============================================================================= # Run installers # ============================================================================= @@ -891,7 +852,6 @@ patch_moriio_transfer_timeout patch_moriio_load_kv_timeout patch_scheduler_read_mode_fix patch_prefill_idle_kv_reaper -patch_minimax_m2_wideep_mori # ============================================================================= # Export paths (persists for server.sh since this file is sourced) From 927064dc100259114a09a89670ccc8c678c2e47c Mon Sep 17 00:00:00 2001 From: simondanielsson Date: Fri, 15 May 2026 15:48:26 +0000 Subject: [PATCH 36/98] fix: remove patch causing gibberish output Signed-off-by: simondanielsson --- .../amd_utils/patches/minimax_m2.py | 672 ------------------ 1 file changed, 672 deletions(-) delete mode 100644 benchmarks/multi_node/amd_utils/patches/minimax_m2.py diff --git a/benchmarks/multi_node/amd_utils/patches/minimax_m2.py b/benchmarks/multi_node/amd_utils/patches/minimax_m2.py deleted file mode 100644 index ac830eb1f..000000000 --- a/benchmarks/multi_node/amd_utils/patches/minimax_m2.py +++ /dev/null @@ -1,672 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Copyright 2025 The MiniMax AI team. -# Copyright 2023 The vLLM team. -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Inference-only MiniMaxM2/M2.5 model.""" - -from collections.abc import Iterable -from typing import Any - -import torch -from torch import nn -from transformers import PretrainedConfig - -from vllm._aiter_ops import rocm_aiter_ops -from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config -from vllm.distributed import ( - get_ep_group, - get_pp_group, - get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, - tensor_model_parallel_all_gather, -) -from vllm.logger import init_logger -from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import FusedMoE, GateLinear -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import ( - QKVParallelLinear, - RowParallelLinear, -) -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.mamba.linear_attn import MiniMaxText01RMSNormTP -from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.vocab_parallel_embedding import ( - ParallelLMHead, - VocabParallelEmbedding, -) -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, - maybe_remap_kv_scale_name, -) -from vllm.model_executor.models.utils import sequence_parallel_chunk -from vllm.sequence import IntermediateTensors - -from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP -from .utils import ( - AutoWeightsLoader, - PPMissingLayer, - is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, - make_layers, - maybe_prefix, -) - -logger = init_logger(__name__) - - -class MiniMaxM2MoE(nn.Module): - """MoE layer for MiniMax M2/M2.5 with EP/WideEP/Mori support. 
- - Follows the DeepSeek V2 MoE pattern: GateLinear + FusedMoE with - expert parallelism, EPLB, and sequence parallel awareness. - """ - - def __init__( - self, - config: PretrainedConfig, - quant_config: QuantizationConfig | None = None, - prefix: str = "", - ): - super().__init__() - vllm_config = get_current_vllm_config() - parallel_config = vllm_config.parallel_config - - self.tp_size = get_tensor_model_parallel_world_size() - self.tp_rank = get_tensor_model_parallel_rank() - - self.ep_group = get_ep_group().device_group - self.ep_rank = get_ep_group().rank_in_group - self.ep_size = self.ep_group.size() - - self.n_routed_experts: int = config.num_local_experts - self.n_shared_experts: int = 0 - - self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe - self.routed_scaling_factor = getattr(config, "routed_scaling_factor", 1.0) - self.is_rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled() - - eplb_config = parallel_config.eplb_config - self.enable_eplb = parallel_config.enable_eplb - self.n_redundant_experts = eplb_config.num_redundant_experts - self.n_logical_experts = self.n_routed_experts - self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts - self.n_local_physical_experts = self.n_physical_experts // self.ep_size - - self.use_routing_bias = getattr(config, "use_routing_bias", False) - if self.use_routing_bias: - self.e_score_correction_bias = nn.Parameter( - torch.empty(config.num_local_experts, dtype=torch.float32) - ) - self.e_score_correction_bias.weight_loader = ( - MiniMaxM2MoE.ebias_weight_loader - ) - else: - self.e_score_correction_bias = None - - self.gate = GateLinear( - config.hidden_size, - config.num_local_experts, - out_dtype=torch.float32, - prefix=f"{prefix}.gate", - ) - - self.experts = FusedMoE( - num_experts=config.num_local_experts, - top_k=config.num_experts_per_tok, - hidden_size=config.hidden_size, - intermediate_size=config.intermediate_size, - renormalize=True, - 
scoring_func=getattr(config, "scoring_func", "softmax"), - e_score_correction_bias=self.e_score_correction_bias, - quant_config=quant_config, - prefix=f"{prefix}.experts", - enable_eplb=self.enable_eplb, - num_redundant_experts=self.n_redundant_experts, - is_sequence_parallel=self.is_sequence_parallel, - router_logits_dtype=torch.float32, - gate=self.gate, - routed_scaling_factor=1.0 - if not self.is_rocm_aiter_moe_enabled - else self.routed_scaling_factor, - ) - - @staticmethod - def ebias_weight_loader(param: nn.Parameter, loaded_weight: torch.Tensor) -> None: - assert param.size() == loaded_weight.size() - param.data.copy_(loaded_weight.to(torch.float32)) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - num_tokens, hidden_dim = hidden_states.shape - hidden_states = hidden_states.view(-1, hidden_dim) - - if self.is_sequence_parallel: - hidden_states = sequence_parallel_chunk(hidden_states) - - if self.experts.is_internal_router: - final_hidden_states = self.experts( - hidden_states=hidden_states, router_logits=hidden_states - ) - else: - router_logits, _ = self.gate(hidden_states) - final_hidden_states = self.experts( - hidden_states=hidden_states, router_logits=router_logits - ) - - if hidden_states.dtype != torch.float16: - if not self.is_rocm_aiter_moe_enabled: - final_hidden_states = final_hidden_states * self.routed_scaling_factor - - if self.is_sequence_parallel: - final_hidden_states = tensor_model_parallel_all_gather( - final_hidden_states, 0 - ) - final_hidden_states = final_hidden_states[:num_tokens] - elif self.tp_size > 1: - from vllm.distributed.communication_op import tensor_model_parallel_all_reduce - final_hidden_states = tensor_model_parallel_all_reduce( - final_hidden_states - ) - - return final_hidden_states.view(num_tokens, hidden_dim) - - -class MiniMaxM2Attention(nn.Module): - def __init__( - self, - hidden_size: int, - num_heads: int, - num_kv_heads: int, - rotary_dim: int, - rope_parameters: dict[str, Any] | None = 
None, - attn_window_size: int | None = None, - max_position_embeddings: int = 8192, - head_dim: int | None = None, - rms_norm_eps: float = 1e-06, - qkv_bias: bool = False, - cache_config: CacheConfig | None = None, - quant_config: QuantizationConfig | None = None, - prefix: str = "", - ) -> None: - super().__init__() - self.hidden_size = hidden_size - tp_size = get_tensor_model_parallel_world_size() - self.total_num_heads = num_heads - assert self.total_num_heads % tp_size == 0 - self.num_heads = self.total_num_heads // tp_size - self.total_num_kv_heads = num_kv_heads - if self.total_num_kv_heads >= tp_size: - # Number of KV heads is greater than TP size, so we partition - # the KV heads across multiple tensor parallel GPUs. - assert self.total_num_kv_heads % tp_size == 0 - else: - # Number of KV heads is less than TP size, so we replicate - # the KV heads across multiple tensor parallel GPUs. - assert tp_size % self.total_num_kv_heads == 0 - self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) - self.head_dim = head_dim or (hidden_size // self.total_num_heads) - self.q_size = self.num_heads * self.head_dim - self.kv_size = self.num_kv_heads * self.head_dim - self.scaling = self.head_dim**-0.5 - self.max_position_embeddings = max_position_embeddings - - self.qkv_proj = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_kv_heads, - bias=qkv_bias, - quant_config=quant_config, - prefix=f"{prefix}.qkv_proj", - ) - - self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.o_proj", - ) - - if ( - rope_parameters is not None - and "partial_rotary_factor" not in rope_parameters - ): - rope_parameters["partial_rotary_factor"] = rotary_dim / self.head_dim - self.rotary_emb = get_rope( - self.head_dim, - max_position=max_position_embeddings, - rope_parameters=rope_parameters, - ) - self.attn = Attention( - self.num_heads, - 
self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - per_layer_sliding_window=attn_window_size, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.attn", - ) - - self.q_norm = MiniMaxText01RMSNormTP( - self.head_dim * self.total_num_heads, eps=rms_norm_eps - ) - self.k_norm = MiniMaxText01RMSNormTP( - self.head_dim * self.total_num_kv_heads, eps=rms_norm_eps - ) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - q, k = MiniMaxText01RMSNormTP.forward_qk( - self.q_norm, self.k_norm, q.contiguous(), k.contiguous() - ) - q, k = self.rotary_emb(positions, q, k) - attn_output = self.attn(q, k, v) - output, _ = self.o_proj(attn_output) - return output - - -class MiniMaxM2DecoderLayer(nn.Module): - def __init__( - self, - config: PretrainedConfig, - prefix: str, - model_config: ModelConfig, - cache_config: CacheConfig | None = None, - quant_config: QuantizationConfig | None = None, - ) -> None: - super().__init__() - self.hidden_size = config.hidden_size - max_position_embeddings = getattr(config, "max_position_embeddings", 8192) - if hasattr(config, "max_model_len") and isinstance(config.max_model_len, int): - max_position_embeddings = max( - config.max_position_embeddings, config.max_model_len - ) - # DecoderLayers are created with `make_layers` which passes the prefix - # with the layer's index. 
- layer_idx = int(prefix.split(sep=".")[-1]) - - self.layer_idx = layer_idx - self.self_attn = MiniMaxM2Attention( - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - num_kv_heads=config.num_key_value_heads, - rotary_dim=config.rotary_dim, - rope_parameters=config.rope_parameters, - max_position_embeddings=max_position_embeddings, - rms_norm_eps=config.rms_norm_eps, - qkv_bias=getattr(config, "attention_bias", False), - head_dim=getattr(config, "head_dim", None), - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.self_attn", - ) - - self.block_sparse_moe = MiniMaxM2MoE( - config=config, - quant_config=quant_config, - prefix=f"{prefix}.mlp", - ) - self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm( - config.hidden_size, eps=config.rms_norm_eps - ) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - residual: torch.Tensor | None, - ) -> torch.Tensor: - # Self Attention - if residual is None: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - else: - hidden_states, residual = self.input_layernorm(hidden_states, residual) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - ) - - # Fully Connected - hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) - - hidden_states = self.block_sparse_moe(hidden_states) - - return hidden_states, residual - - -@support_torch_compile -class MiniMaxM2Model(nn.Module): - fall_back_to_pt_during_load = False - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - - config = vllm_config.model_config.hf_config - model_config = vllm_config.model_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - self.config = config - - self.vocab_size = config.vocab_size - - if get_pp_group().is_first_rank: - self.embed_tokens = 
VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - quant_config=None, - prefix=f"{prefix}.embed_tokens", - ) - else: - self.embed_tokens = PPMissingLayer() - - self.start_layer, self.end_layer, self.layers = make_layers( - config.num_hidden_layers, - lambda prefix: MiniMaxM2DecoderLayer( - config, - prefix, - model_config=model_config, - cache_config=cache_config, - quant_config=quant_config, - ), - prefix=f"{prefix}.layers", - ) - - if get_pp_group().is_last_rank: - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - else: - self.norm = PPMissingLayer() - self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( - ["hidden_states", "residual"], config.hidden_size - ) - - def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.embed_tokens(input_ids) - - def forward( - self, - input_ids: torch.Tensor | None, - positions: torch.Tensor, - intermediate_tensors: IntermediateTensors | None, - inputs_embeds: torch.Tensor | None = None, - ) -> torch.Tensor | IntermediateTensors: - if get_pp_group().is_first_rank: - if inputs_embeds is not None: - hidden_states = inputs_embeds - else: - hidden_states = self.embed_input_ids(input_ids) - residual = None - else: - assert intermediate_tensors is not None - hidden_states = intermediate_tensors["hidden_states"] - residual = intermediate_tensors["residual"] - - for layer in self.layers[self.start_layer : self.end_layer]: - hidden_states, residual = layer(positions, hidden_states, residual) - - if not get_pp_group().is_last_rank: - return IntermediateTensors( - {"hidden_states": hidden_states, "residual": residual} - ) - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states - - def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: - return FusedMoE.make_expert_params_mapping( - self, - ckpt_gate_proj_name="w1", - ckpt_down_proj_name="w2", - ckpt_up_proj_name="w3", - num_experts=self.config.num_local_experts, - 
num_redundant_experts=0, - ) - - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ] - - # Params for weights, fp8 weight scales, fp8 activation scales - # (param_name, weight_name, expert_id, shard_id) - expert_params_mapping = self.get_expert_mapping() - - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - - spec_layer = get_spec_layer_idx_from_weight_name(self.config, name) - if spec_layer is not None: - continue # skip spec decode layers for main model - - for param_name, weight_name, shard_id in stacked_params_mapping: - # Skip non-stacked layers and experts (experts handled below). - if weight_name not in name: - continue - # We have mlp.experts[0].gate_proj in the checkpoint. - # Since we handle the experts below in expert_params_mapping, - # we need to skip here BEFORE we update the name, otherwise - # name will be updated to mlp.experts[0].gate_up_proj, which - # will then be updated below in expert_params_mapping - # for mlp.experts[0].gate_gate_up_proj, which breaks load. - if ("mlp.experts." in name) and name not in params_dict: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. 
- if name.endswith(".bias") and name not in params_dict: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - for mapping in expert_params_mapping: - param_name, weight_name, expert_id, shard_id = mapping - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader( - param, - loaded_weight, - name, - shard_id=shard_id, - expert_id=expert_id, - ) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - - # Remapping the name of FP8 kv-scale. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = getattr( - param, "weight_loader", default_weight_loader - ) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - - -class MiniMaxM2MixtureOfExperts(MixtureOfExperts): - """EPLB protocol implementation for MiniMax M2/M2.5.""" - - moe_mlp_layers: list[MiniMaxM2MoE] - - def extract_moe_parameters(self, example_moe: MiniMaxM2MoE | None): - if example_moe is None: - self.num_moe_layers = 0 - self.num_expert_groups = 0 - self.num_logical_experts = 0 - self.num_physical_experts = 0 - self.num_local_physical_experts = 0 - self.num_routed_experts = 0 - self.num_shared_experts = 0 - self.num_redundant_experts = 0 - logger.warning("MiniMax M2: No MoE layer found in model.layers.") - else: - self.num_logical_experts = example_moe.n_logical_experts - self.num_physical_experts = example_moe.n_physical_experts - self.num_local_physical_experts = example_moe.n_local_physical_experts - self.num_routed_experts = example_moe.n_routed_experts - 
self.num_shared_experts = example_moe.n_shared_experts - self.num_redundant_experts = example_moe.n_redundant_experts - - def update_physical_experts_metadata( - self, - num_physical_experts: int, - num_local_physical_experts: int, - ) -> None: - assert self.num_local_physical_experts == num_local_physical_experts - self.num_physical_experts = num_physical_experts - self.num_local_physical_experts = num_local_physical_experts - self.num_redundant_experts = num_physical_experts - self.num_logical_experts - for moe in self.moe_mlp_layers: - moe.n_local_physical_experts = num_local_physical_experts - moe.n_physical_experts = num_physical_experts - moe.n_redundant_experts = self.num_redundant_experts - moe.experts.update_expert_map() - - -class MiniMaxM2ForCausalLM( - nn.Module, SupportsLoRA, SupportsPP, MiniMaxM2MixtureOfExperts -): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - } - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - self.config = config - self.quant_config = quant_config - if hasattr(vllm_config.model_config, "max_model_len"): - self.config.max_model_len = vllm_config.model_config.max_model_len - self.model = MiniMaxM2Model( - vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") - ) - if get_pp_group().is_last_rank: - self.lm_head = ParallelLMHead( - config.vocab_size, config.hidden_size, quant_config=None - ) - else: - self.lm_head = PPMissingLayer() - self.logits_processor = LogitsProcessor(config.vocab_size) - self.make_empty_intermediate_tensors = ( - self.model.make_empty_intermediate_tensors - ) - - self.num_moe_layers = config.num_hidden_layers - self._set_moe_parameters() - - def _set_moe_parameters(self): - self.expert_weights: list = [] - self.num_expert_groups = 1 - self.moe_layers: list = [] - self.moe_mlp_layers: list[MiniMaxM2MoE] = [] - example_moe = None - for 
layer in self.model.layers: - if isinstance(layer, PPMissingLayer): - continue - assert isinstance(layer, MiniMaxM2DecoderLayer) - if isinstance(layer.block_sparse_moe, MiniMaxM2MoE): - example_moe = layer.block_sparse_moe - self.moe_mlp_layers.append(layer.block_sparse_moe) - self.moe_layers.append(layer.block_sparse_moe.experts) - self.extract_moe_parameters(example_moe) - - def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.model.embed_input_ids(input_ids) - - def forward( - self, - input_ids: torch.Tensor | None, - positions: torch.Tensor, - intermediate_tensors: IntermediateTensors | None = None, - inputs_embeds: torch.Tensor | None = None, - **kwargs, - ) -> torch.Tensor | IntermediateTensors: - hidden_states = self.model( - input_ids, positions, intermediate_tensors, inputs_embeds - ) - return hidden_states - - def compute_logits( - self, - hidden_states: torch.Tensor, - ) -> torch.Tensor | None: - logits = self.logits_processor(self.lm_head, hidden_states) - return logits - - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader(self) - return loader.load_weights(weights) - - def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: - return self.model.get_expert_mapping() - - -def get_spec_layer_idx_from_weight_name( - config: PretrainedConfig, weight_name: str -) -> int | None: - if hasattr(config, "num_mtp_modules") and (config.num_mtp_modules > 0): - layer_idx = config.num_hidden_layers - for i in range(config.num_mtp_modules): - if weight_name.startswith(f"model.layers.{layer_idx + i}."): - return layer_idx + i - return None From 73bd20fd0bca47379f10175b12458e4cf06458af Mon Sep 17 00:00:00 2001 From: simondanielsson Date: Mon, 18 May 2026 11:47:11 +0000 Subject: [PATCH 37/98] fix: install BNXT userspace libs at runtime and remove unused patches Signed-off-by: simondanielsson --- .github/configs/amd-master.yaml | 4 +- 
benchmarks/multi_node/amd_utils/setup_deps.sh | 596 ++---------------- 2 files changed, 45 insertions(+), 555 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index eba01ea32..9ff607888 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1256,7 +1256,7 @@ minimaxm2.5-fp8-mi355x-vllm-disagg: - "DECODE_NODES=2" minimaxm2.5-fp8-mi300x-vllm-disagg: - image: ghcr.io/simondanielsson/vllm/vllm-openai-rocm:fix-moriio-hangs-high-concurrency-bnxt + image: ghcr.io/simondanielsson/vllm/vllm-openai-rocm:fix-moriio-hangs-high-concurrency model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi300x-disagg @@ -1309,7 +1309,7 @@ minimaxm2.5-fp8-mi300x-vllm-disagg: - "DECODE_NODES=2" minimaxm2.5-fp8-mi325x-vllm-disagg: - image: ghcr.io/simondanielsson/vllm/vllm-openai-rocm:fix-moriio-hangs-high-concurrency-bnxt + image: ghcr.io/simondanielsson/vllm/vllm-openai-rocm:fix-moriio-hangs-high-concurrency model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi325x-disagg diff --git a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh index 44434d64e..cca6785b6 100644 --- a/benchmarks/multi_node/amd_utils/setup_deps.sh +++ b/benchmarks/multi_node/amd_utils/setup_deps.sh @@ -286,572 +286,62 @@ else: } # --------------------------------------------------------------------------- -# 8. Patch vLLM MoRI-IO save_kv_layer busy-spin (C128 tail-batch deadlock) -# In WRITE mode, save_kv_layer spins forever waiting for the handshake -# callback to set write_ready_flags. This blocks the model worker thread, -# preventing it from responding to EngineCore shm_broadcast, causing a -# TimeoutError cascade and crash. -# Patch: add time.sleep(0.001) and a 30s timeout to yield CPU and prevent -# the model worker from deadlocking. +# 8. 
Broadcom bnxt RDMA userspace libraries (libbnxt_re verbs provider) +# Required on nodes with Broadcom Thor2 NICs (bcm5760x) when the base +# image does not ship the bnxt_re verbs provider. # --------------------------------------------------------------------------- -patch_moriio_save_kv_timeout() { - python3 -c ' -import os, sys - -try: - import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_connector as mc - f = mc.__file__ - src = open(f).read() - - # Already patched? - if "[PATCHED] save_kv_layer timeout" in src: - print("[SETUP] save_kv_layer timeout patch already applied") - sys.exit(0) - - old = """ while True: - if ( - self._ready_requests.empty() - and remote_engine_id not in self.write_ready_flags - ): - continue""" - - if old not in src: - print("[SETUP] WARN: save_kv_layer busy-spin pattern not found, skipping patch") - sys.exit(0) - - new = """ # [PATCHED] save_kv_layer — null guard + timeout + sleep - if remote_engine_id is None: - return - import time as _time, os as _os - _wait_start = _time.monotonic() - _SAVE_KV_TIMEOUT = float(_os.environ.get("VLLM_MORIIO_HANDSHAKE_TIMEOUT", "30")) - while True: - if ( - self._ready_requests.empty() - and remote_engine_id not in self.write_ready_flags - ): - _elapsed = _time.monotonic() - _wait_start - if _elapsed > _SAVE_KV_TIMEOUT: - import logging as _logging - _logging.getLogger("vllm.moriio").warning( - "[HANGFIX] save_kv_layer: timeout (%.1fs) waiting for " - "write_ready_flags[%s], breaking to unblock model " - "worker", _elapsed, remote_engine_id) - break - _time.sleep(0.001) - continue""" - - new_src = src.replace(old, new) - if new_src == src: - print("[SETUP] WARN: replacement had no effect") - sys.exit(0) - - open(f, "w").write(new_src) - print("[SETUP] Patched save_kv_layer: null guard + timeout + sleep") -except Exception as e: - print(f"[SETUP] WARN patch save_kv_layer: {e}", file=sys.stderr) -' - _SETUP_INSTALLED+=("MoRIIO-save-kv-timeout-patch") -} - -# 
--------------------------------------------------------------------------- -# 9. Patch MoRIIO waiting_for_transfer_complete with bounded timeout -# The original status.Wait() blocks forever if an RDMA completion never -# arrives (e.g., NIC queue saturation at C256). This replaces the unbounded -# wait with a polling loop using status.Succeeded() + configurable timeout. -# Also adds error handling to the write worker loop so a single failed -# transfer doesn't kill the background thread. -# --------------------------------------------------------------------------- -patch_moriio_transfer_timeout() { - python3 -c ' -import os, sys, textwrap - -try: - import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_engine as me - f = me.__file__ - src = open(f).read() - - if "[PATCHED] transfer completion timeout" in src: - print("[SETUP] transfer completion timeout patch already applied") - sys.exit(0) - - # --- Patch 1: Replace waiting_for_transfer_complete with polling + timeout --- - old_wait = """ def waiting_for_transfer_complete(self): - if not self.transfer_status: - return - - transfers_to_wait = [] - with self.lock: - transfers_to_wait = self.transfer_status[:] - self.transfer_status.clear() - - for status in transfers_to_wait: - try: - status.Wait() - if not status.Succeeded(): - logger.error( - "Transfer failed: %s, Code: %s", status.Message(), status.Code() - ) - raise TransferError("MoRIIO transfer failed!") - except Exception as e: - logger.error("Transfer %s failed: %s", status, e) - raise""" - - new_wait = """ def waiting_for_transfer_complete(self): - # [PATCHED] transfer completion timeout — bounded polling loop - import time as _time, os as _os - if not self.transfer_status: - return - - _timeout = float(_os.environ.get("VLLM_MORIIO_TRANSFER_TIMEOUT", "120")) - - transfers_to_wait = [] - with self.lock: - transfers_to_wait = self.transfer_status[:] - self.transfer_status.clear() - - _start = _time.monotonic() - remaining = list(transfers_to_wait) 
- _polls = 0 - _completed = 0 - - while remaining: - _elapsed = _time.monotonic() - _start - if _elapsed > _timeout: - logger.error( - "[HANGFIX] transfer_timeout elapsed=%.1fs " - "pending=%d/%d completed=%d polls=%d " - "action=raise_transfer_error", - _elapsed, len(remaining), len(transfers_to_wait), - _completed, _polls, - ) - raise TransferError( - f"RDMA transfer timeout after {_elapsed:.1f}s, " - f"{len(remaining)}/{len(transfers_to_wait)} pending" - ) - - still_waiting = [] - for status in remaining: - try: - if status.Succeeded(): - _completed += 1 - continue - still_waiting.append(status) - except Exception as e: - logger.error( - "[HANGFIX] transfer_poll_error error=%s", e) - raise TransferError( - f"Transfer failed during poll: {e}" - ) from e - - remaining = still_waiting - if remaining: - _time.sleep(0.005) - _polls += 1 - if _polls % 2000 == 0: - logger.warning( - "[HANGFIX] transfer_wait pending=%d " - "completed=%d elapsed=%.1fs timeout=%.0fs", - len(remaining), _completed, - _time.monotonic() - _start, _timeout, - )""" - - if old_wait not in src: - print("[SETUP] WARN: waiting_for_transfer_complete pattern not found") - sys.exit(0) - - new_src = src.replace(old_wait, new_wait) - - # --- Patch 2: Add error handling + cleanup to _write_worker_loop --- - old_loop = """ self._execute_write_task(task)""" - - new_loop = """ try: - self._execute_write_task(task) - except Exception as _e: - logger.error( - "[HANGFIX] req=%s write_task_failed error=%s " - "action=cleanup_and_mark_done", - task.request_id, _e, - ) - try: - _wr = self.worker.moriio_wrapper - with _wr.lock: - _wr.done_req_ids.append(task.request_id) - _wr.done_remote_allocate_req_dict.pop( - task.request_id, None - ) - except Exception: - pass""" - - if old_loop in new_src: - new_src = new_src.replace(old_loop, new_loop, 1) - else: - print("[SETUP] WARN: _write_worker_loop pattern not found for error handling") - - # --- Patch 3: Add deferred task timeout to _process_deferred_tasks --- - 
old_deferred = """ def _process_deferred_tasks(self) -> None: - \"\"\"Process tasks that were previously deferred.\"\"\" - if not self._deferred_tasks: - return - - still_deferred: list[WriteTask] = [] - for task in self._deferred_tasks: - if self._is_remote_ready(task): - self._execute_write_task(task) - else: - still_deferred.append(task) - - self._deferred_tasks = still_deferred""" - - new_deferred = """ def _process_deferred_tasks(self) -> None: - \"\"\"Process tasks that were previously deferred.\"\"\" - # [PATCHED] deferred task timeout — prune stale tasks - import time as _time, os as _os - if not self._deferred_tasks: - return - - _DEFER_TIMEOUT = float( - _os.environ.get("VLLM_MORIIO_DEFER_TIMEOUT", "60")) - - still_deferred: list[WriteTask] = [] - for task in self._deferred_tasks: - _age = _time.monotonic() - getattr(task, "_defer_ts", _time.monotonic()) - if _age > _DEFER_TIMEOUT: - logger.error( - "[HANGFIX] req=%s deferred_task_expired age=%.1fs " - "action=drop_and_mark_done", - task.request_id, _age, - ) - try: - _wr = self.worker.moriio_wrapper - with _wr.lock: - _wr.done_req_ids.append(task.request_id) - _wr.done_remote_allocate_req_dict.pop( - task.request_id, None) - except Exception: - pass - continue - if self._is_remote_ready(task): - try: - self._execute_write_task(task) - except Exception as _e: - logger.error( - "[HANGFIX] req=%s deferred_write_failed error=%s", - task.request_id, _e, - ) - try: - _wr = self.worker.moriio_wrapper - with _wr.lock: - _wr.done_req_ids.append(task.request_id) - _wr.done_remote_allocate_req_dict.pop( - task.request_id, None) - except Exception: - pass - else: - still_deferred.append(task) - - self._deferred_tasks = still_deferred""" - - if old_deferred in new_src: - new_src = new_src.replace(old_deferred, new_deferred, 1) - else: - print("[SETUP] WARN: _process_deferred_tasks pattern not found") - - # --- Patch 4: Stamp defer time when task is deferred --- - old_defer_add = """ 
self._deferred_tasks.append(task)""" - new_defer_add = """ import time as _time2 - if not hasattr(task, "_defer_ts"): - task._defer_ts = _time2.monotonic() - self._deferred_tasks.append(task)""" - if old_defer_add in new_src: - new_src = new_src.replace(old_defer_add, new_defer_add, 1) - else: - print("[SETUP] WARN: deferred task timestamp patch target not found") - - open(f, "w").write(new_src) - print("[SETUP] Patched: transfer timeout + writer error handling") - -except Exception as e: - print(f"[SETUP] WARN patch transfer_timeout: {e}", file=sys.stderr) -' - _SETUP_INSTALLED+=("MoRIIO-transfer-timeout-patch") -} - -# --------------------------------------------------------------------------- -# 10. Patch MoRIIO start_load_kv busy-spin (same pattern as save_kv_layer) -# The READ-mode spin loop in start_load_kv has the same unbounded-spin -# issue as save_kv_layer. Add timeout + sleep + null guard. -# --------------------------------------------------------------------------- -patch_moriio_load_kv_timeout() { - python3 -c ' -import os, sys - -try: - import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_connector as mc - f = mc.__file__ - src = open(f).read() - - if "[PATCHED] start_load_kv timeout" in src: - print("[SETUP] start_load_kv timeout patch already applied") - sys.exit(0) - - old = """ while True: - if ( - self._ready_requests.empty() - and remote_engine_id not in self.load_ready_flag - and wait_handshake_readd_req - ): - continue""" - - if old not in src: - print("[SETUP] WARN: start_load_kv busy-spin pattern not found, skipping") - sys.exit(0) - - new = """ # [PATCHED] start_load_kv timeout — prevent model worker deadlock - if remote_engine_id is None and not wait_handshake_readd_req: - self._reqs_to_send.update(metadata.reqs_to_send) - return - import time as _time, os as _os - _wait_start = _time.monotonic() - _LOAD_KV_TIMEOUT = float(_os.environ.get("VLLM_MORIIO_HANDSHAKE_TIMEOUT", "30")) - while True: - if ( - 
self._ready_requests.empty() - and remote_engine_id not in self.load_ready_flag - and wait_handshake_readd_req - ): - if _time.monotonic() - _wait_start > _LOAD_KV_TIMEOUT: - import logging as _logging - _logging.getLogger("vllm.moriio").warning( - "[HANGFIX] start_load_kv: timeout (%.1fs) waiting for " - "load_ready_flag[%s]", _time.monotonic() - _wait_start, - remote_engine_id) - break - _time.sleep(0.001) - continue""" - - new_src = src.replace(old, new) - if new_src == src: - print("[SETUP] WARN: start_load_kv replacement had no effect") - sys.exit(0) - - open(f, "w").write(new_src) - print("[SETUP] Patched start_load_kv busy-spin with timeout + sleep") -except Exception as e: - print(f"[SETUP] WARN patch start_load_kv: {e}", file=sys.stderr) -' - _SETUP_INSTALLED+=("MoRIIO-load-kv-timeout-patch") -} - -# --------------------------------------------------------------------------- -# 11. Fix READ-mode scheduler assertion in _update_from_kv_xfer_finished -# vLLM asserts that a request in finished_recving must be either -# WAITING_FOR_REMOTE_KVS or finished. In READ mode the request can -# transition to RUNNING before the aggregated recv notification arrives, -# crashing the engine with AssertionError. 
-# (present in v0.17.1 & v0.18.0) -# --------------------------------------------------------------------------- -patch_scheduler_read_mode_fix() { - python3 -c ' -import os, sys - -try: - import vllm.v1.core.sched.scheduler as smod - f = smod.__file__ - src = open(f).read() - - if "[PATCHED] read-mode recv assertion" in src: - print("[SETUP] scheduler read-mode assertion fix already applied") - sys.exit(0) - - old_recv = """ for req_id in kv_connector_output.finished_recving or (): - logger.debug("Finished recving KV transfer for request %s", req_id) - assert req_id in self.requests - req = self.requests[req_id] - if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS: - self.finished_recving_kv_req_ids.add(req_id) - else: - assert RequestStatus.is_finished(req.status) - self._free_blocks(self.requests[req_id])""" - - new_recv = """ # [PATCHED] read-mode recv assertion — handle intermediate states - for req_id in kv_connector_output.finished_recving or (): - logger.debug("Finished recving KV transfer for request %s", req_id) - if req_id not in self.requests: - logger.debug("Request %s already removed, skipping recv", req_id) - continue - req = self.requests[req_id] - if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS: - self.finished_recving_kv_req_ids.add(req_id) - elif RequestStatus.is_finished(req.status): - self._free_blocks(self.requests[req_id]) - else: - logger.debug( - "Request %s recv finished but status=%s (not " - "WAITING_FOR_REMOTE_KVS or finished), skipping " - "block free — will be freed on request completion", - req_id, req.status.name)""" - - if old_recv not in src: - print("[SETUP] WARN: scheduler finished_recving pattern not found, skipping") - sys.exit(0) - - new_src = src.replace(old_recv, new_recv, 1) - - old_send = """ for req_id in kv_connector_output.finished_sending or (): - logger.debug("Finished sending KV transfer for request %s", req_id) - assert req_id in self.requests - self._free_blocks(self.requests[req_id])""" - - new_send = 
""" for req_id in kv_connector_output.finished_sending or (): - logger.debug("Finished sending KV transfer for request %s", req_id) - if req_id not in self.requests: - logger.debug("Request %s already removed, skipping send", req_id) - continue - self._free_blocks(self.requests[req_id])""" - - if old_send in new_src: - new_src = new_src.replace(old_send, new_send, 1) - else: - print("[SETUP] WARN: scheduler finished_sending pattern not found") - - open(f, "w").write(new_src) - print("[SETUP] Patched: scheduler _update_from_kv_xfer_finished read-mode fix") - -except Exception as e: - print(f"[SETUP] WARN patch scheduler read-mode: {e}", file=sys.stderr) -' - _SETUP_INSTALLED+=("scheduler-read-mode-fix") -} - -# --------------------------------------------------------------------------- -# 12. Idle KV block reaper for disaggregated prefill (READ mode) -# The RIXL notification path can lose `finished_sending` signals under -# high concurrency with ibv_post_send failures. This leaves KV blocks -# permanently allocated on the prefill engine even after the decode has -# finished reading. Over multiple benchmark rounds, leaked blocks -# accumulate and eventually saturate the prefill KV cache. -# -# Fix: instrument the scheduler's `schedule()` method to detect idle -# periods (0 running, 0 waiting for >5s) and force-free blocks for -# any remaining requests whose status is finished. -# --------------------------------------------------------------------------- -patch_prefill_idle_kv_reaper() { - python3 -c ' -import os, sys - -try: - import vllm.v1.core.sched.scheduler as smod - f = smod.__file__ - src = open(f).read() - - if "[PATCHED] idle-kv-reaper" in src: - print("[SETUP] idle KV block reaper already applied") - sys.exit(0) - - # Find the _update_from_kv_xfer_finished method end and add reaper logic - # We inject into the method that processes KV transfer completions. 
- marker = "[PATCHED] read-mode recv assertion" - if marker not in src: - print("[SETUP] WARN: scheduler read-mode patch not found, skipping reaper") - sys.exit(0) - - # Add reaper state initialization to __init__ - old_init_marker = "self.finished_recving_kv_req_ids" - if old_init_marker not in src: - print("[SETUP] WARN: finished_recving_kv_req_ids not found in scheduler") - sys.exit(0) - - # Find the first occurrence to insert reaper state - init_pos = src.find(old_init_marker) - # Find the line containing it - line_end = src.find("\n", init_pos) - init_line = src[init_pos:line_end] - - # Add reaper state after this line - reaper_init = init_line + """ - # [PATCHED] idle-kv-reaper state - self._idle_kv_reaper_ts = 0.0 - self._idle_kv_reaper_active = False""" - - src = src.replace(init_line, reaper_init, 1) - - # Now add the reaper logic at the end of _update_from_kv_xfer_finished - # Find the finished_sending handler we patched - send_handler = """ for req_id in kv_connector_output.finished_sending or (): - logger.debug("Finished sending KV transfer for request %s", req_id) - if req_id not in self.requests: - logger.debug("Request %s already removed, skipping send", req_id) - continue - self._free_blocks(self.requests[req_id])""" - - reaper_logic = send_handler + """ - - # [PATCHED] idle-kv-reaper — force-free leaked prefill KV blocks - import time as _time - _REAPER_IDLE_SECS = 5.0 - _num_running = sum(1 for r in self.requests.values() - if r.status == RequestStatus.RUNNING) - _should_reap = (_num_running == 0) - - if _should_reap: - if not self._idle_kv_reaper_active: - self._idle_kv_reaper_active = True - self._idle_kv_reaper_ts = _time.monotonic() - elif _time.monotonic() - self._idle_kv_reaper_ts > _REAPER_IDLE_SECS: - _reaped = 0 - _reap_ids = [] - for _rid, _req in list(self.requests.items()): - if RequestStatus.is_finished(_req.status): - _reap_ids.append(_rid) - for _rid in _reap_ids: - try: - _req = self.requests[_rid] - self._free_blocks(_req) - 
_reaped += 1 - except Exception as _e: - logger.debug("[KV-REAPER] free_blocks failed for %s: %s", _rid, _e) - if _reaped > 0: - logger.warning( - "[KV-REAPER] Force-freed blocks for %d finished " - "requests after %.1fs idle", - _reaped, _time.monotonic() - self._idle_kv_reaper_ts) - self._idle_kv_reaper_ts = _time.monotonic() - else: - self._idle_kv_reaper_active = False""" +install_bnxt_rdma() { + local existing + existing=$(find /usr/local/lib /usr/lib64 /usr/lib -name "libbnxt_re-rdmav*.so" 2>/dev/null) + if [[ -n "$existing" ]]; then + echo "[SETUP] Existing bnxt RDMA libraries found (will override):" + for lib in $existing; do + echo " $lib ($(ls -l "$lib" 2>/dev/null | awk '{print $5, $6, $7, $8}'))" + done + fi - if send_handler in src: - src = src.replace(send_handler, reaper_logic, 1) - else: - print("[SETUP] WARN: send handler not found for reaper injection") - sys.exit(0) + echo "[SETUP] Installing bnxt RDMA build dependencies..." + apt-get update -q -y && apt-get install -q -y \ + wget unzip autoconf automake libtool pkg-config \ + libibverbs-dev librdmacm-dev ibverbs-utils \ + && rm -rf /var/lib/apt/lists/* - open(f, "w").write(src) - print("[SETUP] Patched: idle KV block reaper for prefill") + echo "[SETUP] Downloading and building Broadcom bnxt RDMA userspace libraries..." + ( + set -e + cd /tmp + wget -q https://docs.broadcom.com/docs-and-downloads/ethernet-network-adapters/NXE/Thor2/GCA1/bcm5760x_230.2.52.0a.zip + unzip -q bcm5760x_230.2.52.0a.zip + cd bcm5760x_230.2.52.0a/drivers_linux/bnxt_rocelib/ + results=$(find . -name "libbnxt*.tar.gz") + tar -xf $results + untar_dir=$(find . -maxdepth 1 -type d -name "libbnxt*" ! 
-name "*.tar.gz" | head -n 1) + cd "$untar_dir" + sh autogen.sh + ./configure + make -j"$(nproc)" + find /usr/lib64/ /usr/lib -name "libbnxt_re-rdmav*.so" -exec mv {} {}.inbox \; 2>/dev/null || true + make install all + echo /usr/local/lib >> /etc/ld.so.conf + ldconfig + cp -f bnxt_re.driver /etc/libibverbs.d/ + ) + rm -rf /tmp/bcm5760x_230.2.52.0a /tmp/bcm5760x_230.2.52.0a.zip -except Exception as e: - print(f"[SETUP] WARN patch idle-kv-reaper: {e}", file=sys.stderr) -' - _SETUP_INSTALLED+=("idle-kv-reaper") + if ! ibv_devices 2>/dev/null; then + echo "[SETUP] WARN: ibv_devices failed after bnxt install (may be OK if no Broadcom NIC on this node)" + fi + _SETUP_INSTALLED+=("bnxt-rdma") } # ============================================================================= # Run installers # ============================================================================= -# install_ucx -# install_rixl -# install_etcd -# install_libionic -# install_mori +install_bnxt_rdma install_recipe_deps install_amd_quark patch_mori_fp8_compat -patch_moriio_save_kv_timeout -patch_moriio_transfer_timeout -patch_moriio_load_kv_timeout -patch_scheduler_read_mode_fix -patch_prefill_idle_kv_reaper # ============================================================================= # Export paths (persists for server.sh since this file is sourced) From fef0c72e81f05359b3c545603e9e41b8a9d9633b Mon Sep 17 00:00:00 2001 From: simondanielsson Date: Mon, 18 May 2026 14:04:59 +0000 Subject: [PATCH 38/98] fix: use read mode for decode instances as well Signed-off-by: simondanielsson --- .github/configs/amd-master.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 9ff607888..5712d20c7 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1307,6 +1307,7 @@ minimaxm2.5-fp8-mi300x-vllm-disagg: dp-attn: false additional-settings: - "DECODE_NODES=2" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" 
minimaxm2.5-fp8-mi325x-vllm-disagg: image: ghcr.io/simondanielsson/vllm/vllm-openai-rocm:fix-moriio-hangs-high-concurrency @@ -1360,6 +1361,7 @@ minimaxm2.5-fp8-mi325x-vllm-disagg: dp-attn: false additional-settings: - "DECODE_NODES=2" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" dsr1-fp4-mi355x-sglang-disagg: image: lmsysorg/sglang-rocm:v0.5.10.post1-rocm720-mi35x-20260501 From f89d2b612563aea18524ebc59e4a97ade52ddea5 Mon Sep 17 00:00:00 2001 From: simondanielsson Date: Tue, 19 May 2026 10:17:12 +0200 Subject: [PATCH 39/98] fix: pin non-down mi300 nodes Signed-off-by: simondanielsson --- runners/launch_mi300x-amds.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/runners/launch_mi300x-amds.sh b/runners/launch_mi300x-amds.sh index 20addccf4..aa56bb8f2 100644 --- a/runners/launch_mi300x-amds.sh +++ b/runners/launch_mi300x-amds.sh @@ -9,7 +9,9 @@ LOCK_FILE="${SQUASH_FILE}.lock" set -x -JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=256 --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') +# Pin to the known-good mi300x nodes as of May 19; others are unavailable: +# Down: chi-mi300x-033, chi-mi300x-035, chi-mi300x-037, chi-mi300x-049 +JOB_ID=$(salloc --partition=$PARTITION --nodelist=chi-mi300x-[034,036,054,057,058].ord.vultr.cpe.ice.amd.com --gres=gpu:$TP --cpus-per-task=256 --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') if [ -z "$JOB_ID" ]; then echo "ERROR: salloc failed to allocate a job" @@ -37,4 +39,4 @@ srun --jobid=$JOB_ID \ --no-container-entrypoint --export=ALL \ bash benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_mi300x.sh -scancel $JOB_ID \ No newline at end of file +scancel $JOB_ID From 83f160977921d912566284820d37a2c32b6365ba Mon Sep 17 00:00:00 2001 From: simondanielsson Date: Tue, 19 May 2026 10:49:02 +0200 Subject: [PATCH 40/98] feat: multi-node support 
for gfx942 Signed-off-by: simondanielsson --- runners/launch_mi300x-amds.sh | 224 +++++++++++++++++++++++++++++----- runners/launch_mi325x-amds.sh | 222 ++++++++++++++++++++++++++++----- 2 files changed, 382 insertions(+), 64 deletions(-) diff --git a/runners/launch_mi300x-amds.sh b/runners/launch_mi300x-amds.sh index aa56bb8f2..2c51729ad 100644 --- a/runners/launch_mi300x-amds.sh +++ b/runners/launch_mi300x-amds.sh @@ -1,42 +1,200 @@ #!/usr/bin/env bash -export HF_HUB_CACHE_MOUNT="/raid/hf-hub-cache/" -export PORT=8888 +scancel_sync() { + local jobid=$1 + local timeout=${2:-600} + local interval=10 + local start + start=$(date +%s) -PARTITION="compute" -SQUASH_FILE="/home/gharunner/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" -LOCK_FILE="${SQUASH_FILE}.lock" + echo "[scancel_sync] Requesting cancel of job $jobid" + scancel "$jobid" || true -set -x + while [[ -n "$(squeue -j "$jobid" --noheader 2>/dev/null)" ]]; do + local now + now=$(date +%s) + if (( now - start >= timeout )); then + echo "[scancel_sync][WARN] job $jobid still present after ${timeout}s" + return 1 + fi + echo "[scancel_sync] waiting for job $jobid to exit. $((timeout-(now-start))) secs remaining..." + sleep "$interval" + done + echo "[scancel_sync] job $jobid exited" + return 0 +} -# Pin to the known-good mi300x nodes as of May 19; others are unavailable: -# Down: chi-mi300x-033, chi-mi300x-035, chi-mi300x-037, chi-mi300x-049 -JOB_ID=$(salloc --partition=$PARTITION --nodelist=chi-mi300x-[034,036,054,057,058].ord.vultr.cpe.ice.amd.com --gres=gpu:$TP --cpus-per-task=256 --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') +# Default exclude list for known-down mi300x nodes; override via SLURM_EXCLUDE_NODES env. 
+export SLURM_EXCLUDE_NODES="${SLURM_EXCLUDE_NODES:-chi-mi300x-033,chi-mi300x-035,chi-mi300x-037,chi-mi300x-049}" -if [ -z "$JOB_ID" ]; then - echo "ERROR: salloc failed to allocate a job" - exit 1 -fi +if [[ "$IS_MULTINODE" == "true" ]]; then + set -x + + export SLURM_ACCOUNT="$USER" + export SLURM_PARTITION="compute" + export SLURM_JOB_NAME="benchmark-${FRAMEWORK}.job" + + export MODEL_NAME=${MODEL##*/} + export MODEL_PATH="/raid/hf-hub-cache" + export IBDEVICES="bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7" + export MORI_RDMA_TC=104 + + export MODEL_DIR="$MODEL_PATH" + export GPUS_PER_NODE=8 + + export ISL="$ISL" + export OSL="$OSL" -# Use flock to serialize concurrent imports to the same squash file -srun --jobid=$JOB_ID --job-name="$RUNNER_NAME" bash -c " - exec 9>\"$LOCK_FILE\" - flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; } - if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then - echo 'Squash file already exists and is valid, skipping import' + export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$GITHUB_WORKSPACE/benchmark_logs}" + mkdir -p "$BENCHMARK_LOGS_DIR" + sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true + + cleanup_and_save_logs() { + if [[ -n "${GITHUB_ACTIONS:-}" && -n "${JOB_ID:-}" ]]; then + local art_dir="$GITHUB_WORKSPACE/benchmark_artifacts" + mkdir -p "$art_dir" + cp -r "$BENCHMARK_LOGS_DIR"/slurm_job-${JOB_ID}.{out,err} "$art_dir/" 2>/dev/null || true + fi + local err_file="$BENCHMARK_LOGS_DIR/slurm_job-${JOB_ID:-unknown}.err" + if [[ -s "$err_file" ]]; then + echo "=== Slurm job stderr ===" + tail -100 "$err_file" + echo "========================" + fi + sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true + } + trap cleanup_and_save_logs EXIT + + SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_mi300x_${FRAMEWORK}.sh" + if [[ "$FRAMEWORK" == "sglang-disagg" ]] || [[ "$FRAMEWORK" == "vllm-disagg" ]]; then + BENCHMARK_SUBDIR="multi_node" else - rm -f \"$SQUASH_FILE\" - enroot 
import -o \"$SQUASH_FILE\" docker://$IMAGE + BENCHMARK_SUBDIR="single_node" + fi + JOB_ID=$(bash "benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME}") + + LOG_FILE="$BENCHMARK_LOGS_DIR/slurm_job-${JOB_ID}.out" + + sleep 10 + + while ! ls "$LOG_FILE" &>/dev/null; do + if ! squeue -u "$USER" --noheader --format='%i' | grep -q "$JOB_ID"; then + echo "ERROR: Job $JOB_ID failed before creating log file" + scontrol show job "$JOB_ID" + exit 1 + fi + sleep 5 + done + + set +x + + ( + while squeue -u $USER --noheader --format='%i' | grep -q "$JOB_ID"; do + sleep 10 + done + ) & + POLL_PID=$! + + tail -F -s 2 -n+1 "$LOG_FILE" --pid=$POLL_PID 2>/dev/null + + wait $POLL_PID + + set -x + + if [[ "${EVAL_ONLY:-false}" != "true" ]]; then + cat > collect_latest_results.py <<'PY' +import os, sys +sgl_job_dir, isl, osl, nexp, framework = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]), sys.argv[5] +for path in sorted([f"{sgl_job_dir}/logs/{name}/{framework}_isl_{isl}_osl_{osl}" for name in os.listdir(f"{sgl_job_dir}/logs/") if os.path.isdir(f"{sgl_job_dir}/logs/{name}/{framework}_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]: + print(path) +PY + + LOGS_DIR=$(python3 collect_latest_results.py "$BENCHMARK_LOGS_DIR" "$ISL" "$OSL" 1 "$FRAMEWORK") + if [ -z "$LOGS_DIR" ]; then + echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" + exit 1 + fi + + echo "Found logs directory: $LOGS_DIR" + ls -la "$LOGS_DIR" + + for result_file in $(find $LOGS_DIR -type f); do + file_name=$(basename $result_file) + if [ -f $result_file ]; then + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${file_name}" + echo "Found result file ${result_file}. 
Copying it to ${WORKSPACE_RESULT_FILE}" + cp $result_file $WORKSPACE_RESULT_FILE + fi + done + fi + + if [[ "${RUN_EVAL:-false}" == "true" ]]; then + EVAL_DIR=$(find "$BENCHMARK_LOGS_DIR/logs" -type d -name eval_results 2>/dev/null | head -1) + if [ -n "$EVAL_DIR" ] && [ -d "$EVAL_DIR" ]; then + echo "Extracting eval results from $EVAL_DIR" + shopt -s nullglob + for eval_file in "$EVAL_DIR"/*; do + [ -f "$eval_file" ] || continue + cp "$eval_file" "$GITHUB_WORKSPACE/" + echo "Copied eval artifact: $(basename "$eval_file")" + done + shopt -u nullglob + else + echo "WARNING: RUN_EVAL=true but no eval results found under $BENCHMARK_LOGS_DIR/logs" + fi + fi + + echo "All result files processed" + set +x + scancel_sync $JOB_ID + set -x + echo "Canceled the slurm job $JOB_ID" + + sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true + +else + + export HF_HUB_CACHE_MOUNT="/raid/hf-hub-cache/" + export PORT=8888 + + PARTITION="compute" + SQUASH_FILE="/home/gharunner/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + LOCK_FILE="${SQUASH_FILE}.lock" + + set -x + + EXCLUDE_OPT=() + if [[ -n "${SLURM_EXCLUDE_NODES:-}" ]]; then + EXCLUDE_OPT=(--exclude "$SLURM_EXCLUDE_NODES") fi -" -srun --jobid=$JOB_ID \ ---container-image=$SQUASH_FILE \ ---container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ ---container-mount-home \ ---container-writable \ ---container-remap-root \ ---container-workdir=/workspace/ \ ---no-container-entrypoint --export=ALL \ -bash benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_mi300x.sh - -scancel $JOB_ID + + JOB_ID=$(salloc --partition=$PARTITION "${EXCLUDE_OPT[@]}" --gres=gpu:$TP --cpus-per-task=256 --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') + + if [ -z "$JOB_ID" ]; then + echo "ERROR: salloc failed to allocate a job" + exit 1 + fi + + # Use flock to serialize concurrent imports to the same squash file + srun 
--jobid=$JOB_ID --job-name="$RUNNER_NAME" bash -c " + exec 9>\"$LOCK_FILE\" + flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; } + if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then + echo 'Squash file already exists and is valid, skipping import' + else + rm -f \"$SQUASH_FILE\" + enroot import -o \"$SQUASH_FILE\" docker://$IMAGE + fi + " + srun --jobid=$JOB_ID \ + --container-image=$SQUASH_FILE \ + --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ + --container-mount-home \ + --container-writable \ + --container-remap-root \ + --container-workdir=/workspace/ \ + --no-container-entrypoint --export=ALL \ + bash benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_mi300x.sh + + scancel $JOB_ID +fi diff --git a/runners/launch_mi325x-amds.sh b/runners/launch_mi325x-amds.sh index 144b54646..d13887d34 100644 --- a/runners/launch_mi325x-amds.sh +++ b/runners/launch_mi325x-amds.sh @@ -1,40 +1,200 @@ #!/usr/bin/env bash -export HF_HUB_CACHE_MOUNT="/nfsdata/sa/gharunner/gharunners/hf-hub-cache/" -export PORT=8888 +scancel_sync() { + local jobid=$1 + local timeout=${2:-600} + local interval=10 + local start + start=$(date +%s) -PARTITION="compute" -SQUASH_FILE="/nfsdata/sa/gharunner/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" -LOCK_FILE="${SQUASH_FILE}.lock" + echo "[scancel_sync] Requesting cancel of job $jobid" + scancel "$jobid" || true -set -x + while [[ -n "$(squeue -j "$jobid" --noheader 2>/dev/null)" ]]; do + local now + now=$(date +%s) + if (( now - start >= timeout )); then + echo "[scancel_sync][WARN] job $jobid still present after ${timeout}s" + return 1 + fi + echo "[scancel_sync] waiting for job $jobid to exit. $((timeout-(now-start))) secs remaining..." 
+ sleep "$interval" + done + echo "[scancel_sync] job $jobid exited" + return 0 +} -JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=256 --time=480 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') +# Override-able exclude list for known-down mi325x nodes via SLURM_EXCLUDE_NODES env. +export SLURM_EXCLUDE_NODES="${SLURM_EXCLUDE_NODES:-}" -if [ -z "$JOB_ID" ]; then - echo "ERROR: salloc failed to allocate a job" - exit 1 -fi +if [[ "$IS_MULTINODE" == "true" ]]; then + set -x + + export SLURM_ACCOUNT="$USER" + export SLURM_PARTITION="compute" + export SLURM_JOB_NAME="benchmark-${FRAMEWORK}.job" + + export MODEL_NAME=${MODEL##*/} + export MODEL_PATH="/nfsdata" + export IBDEVICES="rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7" + export MORI_RDMA_TC=104 + + export MODEL_DIR="$MODEL_PATH" + export GPUS_PER_NODE=8 + + export ISL="$ISL" + export OSL="$OSL" -# Use flock to serialize concurrent imports to the same squash file -srun --jobid=$JOB_ID --job-name="$RUNNER_NAME" bash -c " - exec 9>\"$LOCK_FILE\" - flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; } - if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then - echo 'Squash file already exists and is valid, skipping import' + export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$GITHUB_WORKSPACE/benchmark_logs}" + mkdir -p "$BENCHMARK_LOGS_DIR" + sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true + + cleanup_and_save_logs() { + if [[ -n "${GITHUB_ACTIONS:-}" && -n "${JOB_ID:-}" ]]; then + local art_dir="$GITHUB_WORKSPACE/benchmark_artifacts" + mkdir -p "$art_dir" + cp -r "$BENCHMARK_LOGS_DIR"/slurm_job-${JOB_ID}.{out,err} "$art_dir/" 2>/dev/null || true + fi + local err_file="$BENCHMARK_LOGS_DIR/slurm_job-${JOB_ID:-unknown}.err" + if [[ -s "$err_file" ]]; then + echo "=== Slurm job stderr ===" + tail -100 "$err_file" + echo "========================" + fi + sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true 
+ } + trap cleanup_and_save_logs EXIT + + SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_mi325x_${FRAMEWORK}.sh" + if [[ "$FRAMEWORK" == "sglang-disagg" ]] || [[ "$FRAMEWORK" == "vllm-disagg" ]]; then + BENCHMARK_SUBDIR="multi_node" else - rm -f \"$SQUASH_FILE\" - enroot import -o \"$SQUASH_FILE\" docker://$IMAGE + BENCHMARK_SUBDIR="single_node" + fi + JOB_ID=$(bash "benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME}") + + LOG_FILE="$BENCHMARK_LOGS_DIR/slurm_job-${JOB_ID}.out" + + sleep 10 + + while ! ls "$LOG_FILE" &>/dev/null; do + if ! squeue -u "$USER" --noheader --format='%i' | grep -q "$JOB_ID"; then + echo "ERROR: Job $JOB_ID failed before creating log file" + scontrol show job "$JOB_ID" + exit 1 + fi + sleep 5 + done + + set +x + + ( + while squeue -u $USER --noheader --format='%i' | grep -q "$JOB_ID"; do + sleep 10 + done + ) & + POLL_PID=$! + + tail -F -s 2 -n+1 "$LOG_FILE" --pid=$POLL_PID 2>/dev/null + + wait $POLL_PID + + set -x + + if [[ "${EVAL_ONLY:-false}" != "true" ]]; then + cat > collect_latest_results.py <<'PY' +import os, sys +sgl_job_dir, isl, osl, nexp, framework = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]), sys.argv[5] +for path in sorted([f"{sgl_job_dir}/logs/{name}/{framework}_isl_{isl}_osl_{osl}" for name in os.listdir(f"{sgl_job_dir}/logs/") if os.path.isdir(f"{sgl_job_dir}/logs/{name}/{framework}_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]: + print(path) +PY + + LOGS_DIR=$(python3 collect_latest_results.py "$BENCHMARK_LOGS_DIR" "$ISL" "$OSL" 1 "$FRAMEWORK") + if [ -z "$LOGS_DIR" ]; then + echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" + exit 1 + fi + + echo "Found logs directory: $LOGS_DIR" + ls -la "$LOGS_DIR" + + for result_file in $(find $LOGS_DIR -type f); do + file_name=$(basename $result_file) + if [ -f $result_file ]; then + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${file_name}" + echo "Found result file ${result_file}. 
Copying it to ${WORKSPACE_RESULT_FILE}" + cp $result_file $WORKSPACE_RESULT_FILE + fi + done + fi + + if [[ "${RUN_EVAL:-false}" == "true" ]]; then + EVAL_DIR=$(find "$BENCHMARK_LOGS_DIR/logs" -type d -name eval_results 2>/dev/null | head -1) + if [ -n "$EVAL_DIR" ] && [ -d "$EVAL_DIR" ]; then + echo "Extracting eval results from $EVAL_DIR" + shopt -s nullglob + for eval_file in "$EVAL_DIR"/*; do + [ -f "$eval_file" ] || continue + cp "$eval_file" "$GITHUB_WORKSPACE/" + echo "Copied eval artifact: $(basename "$eval_file")" + done + shopt -u nullglob + else + echo "WARNING: RUN_EVAL=true but no eval results found under $BENCHMARK_LOGS_DIR/logs" + fi + fi + + echo "All result files processed" + set +x + scancel_sync $JOB_ID + set -x + echo "Canceled the slurm job $JOB_ID" + + sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true + +else + + export HF_HUB_CACHE_MOUNT="/nfsdata/sa/gharunner/gharunners/hf-hub-cache/" + export PORT=8888 + + PARTITION="compute" + SQUASH_FILE="/nfsdata/sa/gharunner/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + LOCK_FILE="${SQUASH_FILE}.lock" + + set -x + + EXCLUDE_OPT=() + if [[ -n "${SLURM_EXCLUDE_NODES:-}" ]]; then + EXCLUDE_OPT=(--exclude "$SLURM_EXCLUDE_NODES") fi -" -srun --jobid=$JOB_ID \ ---container-image=$SQUASH_FILE \ ---container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ ---container-mount-home \ ---container-writable \ ---container-remap-root \ ---container-workdir=/workspace/ \ ---no-container-entrypoint --export=ALL \ -bash benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_mi325x.sh - -scancel $JOB_ID + + JOB_ID=$(salloc --partition=$PARTITION "${EXCLUDE_OPT[@]}" --gres=gpu:$TP --cpus-per-task=256 --time=480 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') + + if [ -z "$JOB_ID" ]; then + echo "ERROR: salloc failed to allocate a job" + exit 1 + fi + + # Use flock to serialize concurrent imports to 
the same squash file + srun --jobid=$JOB_ID --job-name="$RUNNER_NAME" bash -c " + exec 9>\"$LOCK_FILE\" + flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; } + if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then + echo 'Squash file already exists and is valid, skipping import' + else + rm -f \"$SQUASH_FILE\" + enroot import -o \"$SQUASH_FILE\" docker://$IMAGE + fi + " + srun --jobid=$JOB_ID \ + --container-image=$SQUASH_FILE \ + --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ + --container-mount-home \ + --container-writable \ + --container-remap-root \ + --container-workdir=/workspace/ \ + --no-container-entrypoint --export=ALL \ + bash benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_mi325x.sh + + scancel $JOB_ID +fi From cd3e243b3b662a6654429e92b8e5ad53bfaeb302 Mon Sep 17 00:00:00 2001 From: simondanielsson Date: Tue, 19 May 2026 15:43:48 +0200 Subject: [PATCH 41/98] fix: use full node name to mi300 exclude list Signed-off-by: simondanielsson --- runners/launch_mi300x-amds.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_mi300x-amds.sh b/runners/launch_mi300x-amds.sh index 2c51729ad..e793eb721 100644 --- a/runners/launch_mi300x-amds.sh +++ b/runners/launch_mi300x-amds.sh @@ -25,7 +25,7 @@ scancel_sync() { } # Default exclude list for known-down mi300x nodes; override via SLURM_EXCLUDE_NODES env. 
-export SLURM_EXCLUDE_NODES="${SLURM_EXCLUDE_NODES:-chi-mi300x-033,chi-mi300x-035,chi-mi300x-037,chi-mi300x-049}" +export SLURM_EXCLUDE_NODES="${SLURM_EXCLUDE_NODES:-chi-mi300x-033.ord.vultr.cpe.ice.amd.com,chi-mi300x-035.ord.vultr.cpe.ice.amd.com,chi-mi300x-037.ord.vultr.cpe.ice.amd.com,chi-mi300x-049.ord.vultr.cpe.ice.amd.com}" if [[ "$IS_MULTINODE" == "true" ]]; then set -x From 49112be682b05965d078e355f251161f56a58c43 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Mon, 13 Apr 2026 03:00:45 +0000 Subject: [PATCH 42/98] remove vllm disagg for dpsr1 and dpv3 Signed-off-by: Theresa Shan --- .github/configs/amd-master.yaml | 108 ++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 4c4ff67b8..23c7c7461 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1312,6 +1312,114 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=2" +kimik2.5-fp4-mi355x-vllm-disagg: + image: vllm/vllm-openai-rocm:v0.18.0 + model: amd/Kimi-K2.5-MXFP4 + model-prefix: kimik2.5 + runner: mi355x-disagg + precision: fp4 + framework: vllm-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: 
false + additional-settings: + - "DECODE_NODES=2" + +minimaxm2.5-fp8-mi355x-vllm-disagg: + image: vllm/vllm-openai-rocm:v0.18.0 + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + runner: mi355x-disagg + precision: fp8 + framework: vllm-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total + # Prefill also needs EP=8: MiniMax M2.5 expert intermediate_size=1536, + # TP8 shards to 192 which is not divisible by FP8 block_n=128. + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + dsr1-fp4-mi355x-sglang-disagg: image: lmsysorg/sglang-rocm:v0.5.10.post1-rocm720-mi35x-20260501 model: amd/DeepSeek-R1-0528-MXFP4-v2 From 78639d8049a49202ce009f3f0e205d7b1c3cef19 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 21 Apr 2026 06:40:27 +0000 Subject: [PATCH 43/98] consolidate amd_utils for sglang and vllm Signed-off-by: Theresa Shan --- benchmarks/multi_node/amd_utils/bench.sh | 72 +- benchmarks/multi_node/amd_utils/env.sh | 231 +++-- benchmarks/multi_node/amd_utils/job.slurm | 468 ++++----- .../multi_node/amd_utils/models_vllm.yaml | 42 + .../multi_node/amd_utils/moriio_proxy.py | 327 +++++++ .../amd_utils/patches/minimax_m2.py | 672 +++++++++++++ benchmarks/multi_node/amd_utils/server.sh | 66 +- 
.../multi_node/amd_utils/server_sglang.sh | 624 ++++++++++++ .../multi_node/amd_utils/server_vllm.sh | 490 ++++++++++ benchmarks/multi_node/amd_utils/setup_deps.sh | 908 ++++++++++++++++++ benchmarks/multi_node/amd_utils/start_etcd.sh | 47 + benchmarks/multi_node/amd_utils/submit.sh | 112 ++- benchmarks/multi_node/amd_utils/sync.py | 5 +- .../dsr1_fp4_mi355x_sglang-disagg.sh | 3 +- .../dsr1_fp8_mi355x_sglang-disagg.sh | 3 +- .../kimik2.5_fp4_mi355x_vllm-disagg.sh | 80 ++ .../minimaxm2.5_fp8_mi355x_vllm-disagg.sh | 78 ++ 17 files changed, 3800 insertions(+), 428 deletions(-) create mode 100644 benchmarks/multi_node/amd_utils/models_vllm.yaml create mode 100644 benchmarks/multi_node/amd_utils/moriio_proxy.py create mode 100644 benchmarks/multi_node/amd_utils/patches/minimax_m2.py create mode 100755 benchmarks/multi_node/amd_utils/server_sglang.sh create mode 100755 benchmarks/multi_node/amd_utils/server_vllm.sh create mode 100644 benchmarks/multi_node/amd_utils/setup_deps.sh create mode 100755 benchmarks/multi_node/amd_utils/start_etcd.sh create mode 100755 benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh create mode 100644 benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh index ac996c5a9..87f3b1e8a 100755 --- a/benchmarks/multi_node/amd_utils/bench.sh +++ b/benchmarks/multi_node/amd_utils/bench.sh @@ -1,4 +1,17 @@ #!/bin/bash +# Dual-Engine Disaggregated Benchmark Runner +# +# ENGINE=sglang (default): SGLang benchmark +# ENGINE=vllm: vLLM benchmark +# +# Produces JSON result files via benchmark_serving.py so that the CI pipeline +# can collect and process results. 
+# +# Usage: bash bench.sh \ +# \ +# + +ENGINE="${ENGINE:-sglang}" n_prefill=$1 n_decode=$2 @@ -6,58 +19,81 @@ prefill_gpus=$3 decode_gpus=$4 model_path=$5 model_name=$6 -MODEL_PATH="${model_path}/${model_name}" +MODEL_PATH="${MODEL_PATH:-${model_path}/${model_name}}" log_path=$7 chosen_isl=${8:-1024} chosen_osl=${9:-1024} concurrency_list=${10:-"512x1"} -chosen_req_rate=${11:-1} +if [[ "$ENGINE" == "vllm" ]]; then + chosen_req_rate=${11:-inf} +else + chosen_req_rate=${11:-1} +fi random_range_ratio=${12:-0.8} num_prompts_multiplier=${13:-10} IFS='x' read -r -a chosen_concurrencies <<< "$concurrency_list" -echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}" - -head_node="localhost" -head_port="30000" +ROUTER_PORT="${ROUTER_PORT:-30000}" +echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}" -profile_folder="${log_path}/sglang_isl_${chosen_isl}_osl_${chosen_osl}" -mkdir -p $profile_folder +profile_folder="${log_path}/${ENGINE}_isl_${chosen_isl}_osl_${chosen_osl}" +mkdir -p "$profile_folder" source "$(dirname "$0")/../../benchmark_lib.sh" -# Repo root inside the container (3 levels up from this script's directory) REPO_ROOT="$(cd "$(dirname "$0")/../../.." 
&& pwd)" -for max_concurrency in ${chosen_concurrencies[@]}; do +for max_concurrency in "${chosen_concurrencies[@]}"; do export_file="${profile_folder}/concurrency_${max_concurrency}_req_rate_${chosen_req_rate}_gpus_$((prefill_gpus+decode_gpus))_ctx_${prefill_gpus}_gen_${decode_gpus}" + num_prompts=$(( max_concurrency * num_prompts_multiplier )) + if [[ "$num_prompts" -lt 16 ]]; then + num_prompts=16 + fi + echo "profile_folder: $profile_folder" echo "max_concurrency: $max_concurrency" echo "chosen_req_rate: $chosen_req_rate" echo "MODEL_PATH: $MODEL_PATH" - echo "head_port: $head_port" + echo "ROUTER_PORT: $ROUTER_PORT" echo "chosen_isl: $chosen_isl" echo "chosen_osl: $chosen_osl" + echo "num_prompts: $num_prompts" echo "export_file: $export_file" + # Engine-specific extra flags + extra_flags="" + if [[ "$ENGINE" == "vllm" ]]; then + extra_flags="--trust-remote-code" + else + if [ "$IS_MTP" = "true" ]; then + extra_flags="--use-chat-template" + fi + fi + run_benchmark_serving \ --bench-serving-dir "$REPO_ROOT" \ - --model ${MODEL_PATH} \ - --port ${head_port} \ + --model "$MODEL_PATH" \ + --port "$ROUTER_PORT" \ --backend openai \ - --input-len ${chosen_isl} \ - --output-len ${chosen_osl} \ - --random-range-ratio ${random_range_ratio} \ - --num-prompts $(( $max_concurrency * $num_prompts_multiplier )) \ + --input-len "$chosen_isl" \ + --output-len "$chosen_osl" \ + --random-range-ratio "$random_range_ratio" \ + --num-prompts "$num_prompts" \ --max-concurrency "$max_concurrency" \ --result-filename "$export_file" \ --result-dir /workspace/ \ - $( [ "$IS_MTP" = "true" ] && echo "--use-chat-template" ) + $extra_flags echo "-----------------------------------------" + + # vLLM: cooldown between rounds for idle KV block reaper + if [[ "$ENGINE" == "vllm" ]]; then + echo "[BENCH] Cooldown: waiting 10s for idle KV block reaper..." 
+ sleep 10 + fi done diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index d0b99eddc..c5a438541 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -1,141 +1,198 @@ #!/bin/bash -# SGLang/MoRI environment setup for multi-node disaggregated serving. +# Dual-engine environment setup for multi-node disaggregated serving. +# +# ENGINE=sglang (default): SGLang/MoRI environment +# ENGINE=vllm: vLLM/Nixl environment # # REQUIRED ENVIRONMENT VARIABLES: # IBDEVICES - RDMA/InfiniBand device names (e.g., ionic_0,ionic_1,... or mlx5_0,mlx5_1,...) -# This must be set by the runner script (runners/launch_mi355x-amds.sh) -# -# OPTIONAL ENVIRONMENT VARIABLES: -# MORI_RDMA_TC - RDMA traffic class (e.g., 96, 104). Set by runner if cluster uses QoS. - +# Set by runner or auto-detected from hostname. set -x + +ENGINE="${ENGINE:-sglang}" export PYTHONDONTWRITEBYTECODE=1 -# IBDEVICES configuration +# ============================================================================= +# Shared: IBDEVICES detection +# ============================================================================= + # Prefer IBDEVICES set by runner (runners/launch_mi355x-amds.sh) # Fall back to hostname detection if not set (for direct script execution) if [[ -z "$IBDEVICES" ]]; then - NODENAME=$(hostname -s) - if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then - export IBDEVICES=ionic_0,ionic_1,ionic_2,ionic_3,ionic_4,ionic_5,ionic_6,ionic_7 - elif [[ $NODENAME == mia1* ]]; then - export IBDEVICES=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7 + DETECTED=$(ibv_devinfo 2>/dev/null | grep "hca_id:" | awk '{print $2}' | paste -sd',') + if [[ -n "$DETECTED" ]]; then + export IBDEVICES="$DETECTED" else - echo "ERROR: Unable to detect cluster from hostname $NODENAME and IBDEVICES not set" >&2 - exit 1 + echo "WARNING: Unable to detect RDMA devices. Set IBDEVICES explicitly." 
>&2 fi - echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES from hostname $NODENAME" + echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES from hostname $(hostname -s)" else echo "[INFO] Using IBDEVICES=$IBDEVICES (set by runner or environment)" fi export IBDEVICES -# Auto-detect default network interface (portable across clusters) +# Shared: Auto-detect default network interface (portable across clusters) export GLOO_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1) export NCCL_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1) +set +x -export NCCL_IB_HCA=$IBDEVICES +export NCCL_IB_HCA=${NCCL_IB_HCA:-$IBDEVICES} -export SGLANG_USE_AITER=1 +# ============================================================================= +# Engine-specific environment +# ============================================================================= -export SGLANG_MORI_DISPATCH_DTYPE=auto -export SGLANG_MORI_FP8_COMB=true -export SGLANG_MORI_QP_PER_TRANSFER=4 -export SGLANG_MORI_NUM_WORKERS=4 -export MORI_IO_SQ_BACKOFF_TIMEOUT_US=50000 +if [[ "$ENGINE" == "vllm" ]]; then + # ========================================================================= + # vLLM/Nixl-specific environment + # ========================================================================= + set -x -export MORI_IO_QP_MAX_SEND_WR=16384 -export MORI_IO_QP_MAX_CQE=32768 -export MORI_IO_QP_MAX_SGE=4 + # UCX_NET_DEVICES: Use the first benic interface for UCX TCP transport + if [[ -z "$UCX_NET_DEVICES" ]]; then + UCX_NET_DEV=$(ip -o link show 2>/dev/null | awk -F': ' '/benic1p1/{print $2}' | head -1) + if [[ -n "$UCX_NET_DEV" ]]; then + export UCX_NET_DEVICES="$UCX_NET_DEV" + else + FIRST_IB=$(echo "$IBDEVICES" | cut -d',' -f1) + if [[ -n "$FIRST_IB" ]]; then + export UCX_NET_DEVICES="${FIRST_IB}:1" + fi + fi + echo "[INFO] Auto-set UCX_NET_DEVICES=$UCX_NET_DEVICES" + else + echo "[INFO] Using UCX_NET_DEVICES=$UCX_NET_DEVICES (set by environment)" + fi -export MORI_IO_TC_DISABLE=0 
+ # RoCEv2: use IPv4-mapped GID (index 1) for inter-node RDMA routing + export UCX_IB_GID_INDEX=${UCX_IB_GID_INDEX:-1} -export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=3600 -export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=3600 + # QoS/DSCP configuration for lossless RoCEv2 fabric. + if [[ -n "$UCX_IB_TRAFFIC_CLASS" ]]; then + echo "[INFO] Using UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS (set by environment)" + elif command -v nicctl &> /dev/null; then + ND_PRIO=$(nicctl show qos 2>/dev/null | awk '/PFC no-drop priorities/ {print $NF; exit}') + ND_DSCP=$(nicctl show qos 2>/dev/null | awk -v p="$ND_PRIO" ' +$1 == "DSCP" && $2 == ":" && $NF == p { + print $3; exit +}') + if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then + export UCX_IB_TRAFFIC_CLASS=$(( 4 * ND_DSCP )) + export UCX_IB_SL=$ND_PRIO + echo "[INFO] Detected QoS from nicctl: UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS, UCX_IB_SL=$UCX_IB_SL" + else + echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection." + NODENAME=$(hostname -s) + if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then + export UCX_IB_TRAFFIC_CLASS=96 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + elif [[ $NODENAME == mia1* ]]; then + export UCX_IB_TRAFFIC_CLASS=104 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + fi + fi + else + NODENAME=$(hostname -s) + if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then + export UCX_IB_TRAFFIC_CLASS=96 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + elif [[ $NODENAME == mia1* ]]; then + export UCX_IB_TRAFFIC_CLASS=104 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + else + echo "[INFO] No nicctl and unable to detect from hostname. Skipping QoS configuration." 
+ fi + fi -# Disable allocating memory in one pass -export MORI_SHMEM_MODE=ISOLATION + set +x + echo "[INFO] IBDEVICES=$IBDEVICES UCX_NET_DEVICES=$UCX_NET_DEVICES NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME UCX_IB_GID_INDEX=$UCX_IB_GID_INDEX UCX_IB_TRAFFIC_CLASS=${UCX_IB_TRAFFIC_CLASS:-unset}" -# Enable spec v2 -export SGLANG_ENABLE_SPEC_V2=1 -export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1 +else + # ========================================================================= + # SGLang/MoRI-specific environment + # ========================================================================= -export SGLANG_LOG_MS=true -export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32 + export SGLANG_USE_AITER=1 + export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=1200 + export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=1200 -export MORI_MAX_DISPATCH_TOKENS_PREFILL=8192 -export MORI_MAX_DISPATCH_TOKENS_DECODE=512 + # Disable allocating memory in one pass + export MORI_SHMEM_MODE=ISOLATION + export SGLANG_MORI_FP8_DISP=True -export MORI_MOE_MAX_INPUT_TOKENS_PREFILL=32768 -export MORI_MOE_MAX_INPUT_TOKENS_DECODE=2703 + if [[ "$MODEL_NAME" == *mxfp4* ]]; then + export SGLANG_MORI_FP8_DISP=False + fi + + export SGLANG_MORI_FP4_DISP=False + export SGLANG_MORI_FP8_COMB=False -# set MTP size=1 when EP16 -export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2)) + # Per-role dispatch token limits (prefill uses higher throughput, decode uses lower) + export MORI_MAX_DISPATCH_TOKENS_PREFILL=16384 + if [[ "$MODEL_NAME" == *mxfp4* ]]; then + export MORI_MAX_DISPATCH_TOKENS_PREFILL=12288 + fi + export MORI_MAX_DISPATCH_TOKENS_DECODE=160 -export MORI_EP_LAUNCH_CONFIG_MODE=AUTO + # set MTP size=1 when EP16 + export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2)) + export MORI_EP_LAUNCH_CONFIG_MODE=AUTO + export MORI_IO_QP_MAX_SEND_WR=16384 + export MORI_IO_QP_MAX_CQE=32768 + export MORI_IO_QP_MAX_SGE=4 -export MORI_APP_LOG_LEVEL=INFO 
+ export MORI_APP_LOG_LEVEL=INFO -# Router logging control: -# 0 (default) keeps noisy per-request access logs out of stdout while still logging to file. -# 1 mirrors router logs to stdout via tee (useful for live debugging). -export SGLANG_ROUTER_STDOUT_LOGS="${SGLANG_ROUTER_STDOUT_LOGS:-0}" + # Router logging control + export SGLANG_ROUTER_STDOUT_LOGS="${SGLANG_ROUTER_STDOUT_LOGS:-0}" -# QoS/DSCP configuration -# Priority order: 1) Set by runner, 2) Detect via nicctl, 3) Detect from hostname -if [[ -n "$MORI_RDMA_TC" ]]; then - echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC (set by runner or environment)" -elif command -v nicctl &> /dev/null; then - ND_PRIO=$(nicctl show qos 2>/dev/null | awk '/PFC no-drop priorities/ {print $NF; exit}') - ND_DSCP=$(nicctl show qos 2>/dev/null| awk -v p="$ND_PRIO" ' + # QoS/DSCP configuration + if [[ -n "$MORI_RDMA_TC" ]]; then + echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC (set by runner or environment)" + elif command -v nicctl &> /dev/null; then + ND_PRIO=$(nicctl show qos 2>/dev/null | awk '/PFC no-drop priorities/ {print $NF; exit}') + ND_DSCP=$(nicctl show qos 2>/dev/null| awk -v p="$ND_PRIO" ' $1 == "DSCP" && $2 == ":" && $NF == p { print $3; exit }') - if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then - TC=$(( 4 * ND_DSCP )) - export MORI_RDMA_SL=$ND_PRIO - export MORI_IO_SL=$ND_PRIO - export MORI_RDMA_TC=$TC - export MORI_IO_TC=$TC - echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL, MORI_IO_TC=$MORI_IO_TC, MORI_IO_SL=$MORI_IO_SL" + if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then + TC=$(( 4 * ND_DSCP )) + export MORI_RDMA_SL=$ND_PRIO + export MORI_RDMA_TC=$TC + echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL" + else + echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection." 
+ # Fall back to hostname-based detection + NODENAME=$(hostname -s) + if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then + export MORI_RDMA_TC=96 + echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" + elif [[ $NODENAME == mia1* ]]; then + export MORI_RDMA_TC=104 + echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" + else + echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration." + fi + fi else - echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection." - # Fall back to hostname-based detection + # nicctl not available, try hostname-based detection NODENAME=$(hostname -s) if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then export MORI_RDMA_TC=96 - export MORI_IO_TC=96 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" elif [[ $NODENAME == mia1* ]]; then export MORI_RDMA_TC=104 - export MORI_IO_TC=104 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" else - echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration." + echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration." + echo " This is normal for clusters without QoS or outside Docker containers." fi fi -else - # nicctl not available, try hostname-based detection - NODENAME=$(hostname -s) - if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then - export MORI_RDMA_TC=96 - export MORI_IO_TC=96 - echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" - elif [[ $NODENAME == mia1* ]]; then - export MORI_RDMA_TC=104 - export MORI_IO_TC=104 - echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" - else - echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration." 
- echo " This is normal for clusters without QoS or outside Docker containers." - fi -fi - -# FIXME: WA for latest upstream 0305 image -export PYTHONPATH=/sgl-workspace/aiter:${PYTHONPATH} + # FIXME: WA for latest upstream 0305 image + export PYTHONPATH=/sgl-workspace/aiter:${PYTHONPATH} -set +x +fi diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 824605c46..56fefb0ed 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -1,265 +1,260 @@ #!/bin/bash -#SBATCH --job-name=1p2d_bench-serving # Specify a custom string for your slurm batch job -#SBATCH -N 3 # CHECK this to be right in batch jobs -#SBATCH -n 3 # CHECK this to be right in batch jobs +#SBATCH --job-name=disagg-bench +#SBATCH -N 3 # Overridden by submit.sh -N flag +#SBATCH -n 3 # Overridden by submit.sh -n flag #SBATCH --ntasks-per-node=1 #SBATCH --spread-job -#SBATCH --gres=gpu:8 # Request 8 GPUs and 8 NICs (use --gres if specific GPU resources are needed) -#SBATCH --time=24:00:00 # Set a time limit for the job (HH:MM:SS) +#SBATCH --gres=gpu:8 +#SBATCH --time=24:00:00 # --output and --error are set by submit.sh via BENCHMARK_LOGS_DIR +ENGINE="${ENGINE:-sglang}" -# ------------------------ -# Print current time in UTC and PST formats -# ------------------------ echo "=== Job Start Time ===" echo "UTC Time: $(TZ=UTC date '+%Y-%m-%d %H:%M:%S %Z')" echo "PST Time: $(TZ=America/Los_Angeles date '+%Y-%m-%d %H:%M:%S %Z')" +echo "ENGINE: $ENGINE" echo "=======================" echo "" # ============================================================================= -# Model validation from models.yaml (replaces hardcoded VALID_MODELS array) +# Model Validation # ============================================================================= -# DI_REPO_DIR is set below from $(pwd); use the submit-time working directory -# because sbatch copies this script to /var/spool/slurmd/ at runtime. 
-MODELS_YAML="$(pwd)/models.yaml" + +# Use $(pwd) not BASH_SOURCE — sbatch copies the script to /var/spool/slurmd/ +# at runtime, but the CWD remains the submit-time directory (amd_utils/). +if [[ "$ENGINE" == "vllm" ]]; then + MODELS_YAML="$(pwd)/models_vllm.yaml" +else + MODELS_YAML="$(pwd)/models.yaml" +fi if [[ ! -f "$MODELS_YAML" ]]; then - echo "Error: models.yaml not found at $MODELS_YAML" + echo "Error: models YAML not found at $MODELS_YAML" exit 1 fi -# Validate MODEL_NAME exists as a top-level key in models.yaml +if [[ -z "${DOCKER_IMAGE_NAME:-}" ]]; then + echo "Error: DOCKER_IMAGE_NAME is not set." + exit 1 +fi + +MODEL_NAME="${MODEL_NAME:-None}" if ! grep -q "^${MODEL_NAME}:" "$MODELS_YAML"; then - echo "Error: Model '$MODEL_NAME' not found in models.yaml" + echo "Error: Model '$MODEL_NAME' not found in $MODELS_YAML" echo "Available models:" grep -E '^[A-Za-z]' "$MODELS_YAML" | sed 's/:.*$//' | sed 's/^/ - /' exit 1 fi echo "Model found: $MODEL_NAME" -# All models use server.sh as the entrypoint RUN_FILE="server.sh" echo "Runfile set: $RUN_FILE" -if [[ -z "${DOCKER_IMAGE_NAME:-}" ]]; then - echo "Error: DOCKER_IMAGE_NAME is not set." - exit 1 -fi - -# DI_REPO_DIR points to the repo root so Docker can access both benchmarks/ and utils/. +# DI_REPO_DIR points to the repo root. # $(pwd) is amd_utils/ (the sbatch submit dir); go up 3 levels to reach the repo root. export DI_REPO_DIR=$(cd "$(pwd)/../../.." 
&& pwd) -xP="${xP:-1}" #-> Number of Prefill Workers -yD="${yD:-1}" #-> Number of Decode Workers +xP="${xP:-1}" +yD="${yD:-1}" -# Parallelism Configuration with defaults -PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}" -PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-true}" -PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-true}" -DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" -DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-true}" -DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-true}" -DECODE_MTP_SIZE=${DECODE_MTP_SIZE:-0} # 0 for disabling MTP - -# Benchmark Configuration with defaults +# Benchmark configuration BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}" BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}" BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}" BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" +BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" GPUS_PER_NODE="${GPUS_PER_NODE:-8}" -MODEL_NAME="${MODEL_NAME:-None}" +# Engine-specific defaults +PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-false}" +PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-false}" +DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-false}" +DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-false}" +PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}" +DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" +DECODE_MTP_SIZE=${DECODE_MTP_SIZE:-0} + +# ============================================================================= +# Docker privilege detection +# ============================================================================= +# Detect on the batch host. Per-node detection happens inside srun below. 
+if docker ps &>/dev/null; then + DOCKER_CMD="docker" +else + DOCKER_CMD="sudo docker" +fi +export DOCKER_CMD + +# ============================================================================= +# Model Path Resolution +# ============================================================================= # MODEL_DIR detection: prefer env var, fall back to hostname detection if [[ -z "$MODEL_DIR" ]]; then NODENAME=$(hostname -s) if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then MODEL_DIR="/nfsdata" - echo "[INFO] Auto-detected MODEL_DIR=$MODEL_DIR from hostname $NODENAME" elif [[ $NODENAME == mia1* ]]; then MODEL_DIR="/it-share/data" - echo "[INFO] Auto-detected MODEL_DIR=$MODEL_DIR from hostname $NODENAME" else - MODEL_DIR="/nfsdata" # Default fallback - echo "[INFO] Using default MODEL_DIR=$MODEL_DIR (hostname $NODENAME not recognized)" + MODEL_DIR="/nfsdata" fi + echo "[INFO] Auto-detected MODEL_DIR=$MODEL_DIR from hostname $(hostname -s)" fi export MODEL_DIR -# ------------------------ -# Model path validation and selection across all nodes -# ------------------------ -echo "Looking for model: $MODEL_NAME" -echo "Checking model availability across all allocated nodes..." 
- -# Get all allocated nodes -ALL_NODES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -TOTAL_NODES=$(echo "$ALL_NODES" | wc -l) - -echo "Total allocated nodes: $TOTAL_NODES" -echo "Nodes: $(echo "$ALL_NODES" | tr '\n' ' ')" - -# Function to check model path on all nodes -check_model_path() { - local path=$1 - local check_name=$2 - - echo "Checking $check_name: $path" +if [[ "$ENGINE" == "vllm" ]]; then + # vLLM: Extract hf_dir from models.yaml, search multiple paths, resolve HF cache snapshots + DISK_DIR_NAME=$(awk '/^'"$MODEL_NAME"':/{found=1; next} + found && /^[^ ]/{exit} + found && /hf_dir:/{gsub(/[" ]/, "", $2); print $2; exit}' "$MODELS_YAML") + DISK_DIR_NAME="${DISK_DIR_NAME:-$MODEL_NAME}" + echo "Looking for model: $MODEL_NAME (disk dir: $DISK_DIR_NAME)" + + resolve_hf_cache_path() { + local base_path=$1 + if [[ -d "${base_path}/snapshots" ]]; then + local snapshot=$(ls -1 "${base_path}/snapshots" 2>/dev/null | head -1) + if [[ -n "$snapshot" ]]; then + echo "${base_path}/snapshots/${snapshot}" + return 0 + fi + fi + echo "$base_path" + return 1 + } + + MODEL_PATH="" + SEARCH_PATHS=( + "${MODEL_DIR}/${DISK_DIR_NAME}" + "${MODEL_DIR}/${MODEL_NAME}" + "/nfsdata/hf_hub_cache-0/${DISK_DIR_NAME}" + "/nfsdata/hf_hub_cache-0/${MODEL_NAME}" + ) + + for search_path in "${SEARCH_PATHS[@]}"; do + if [[ -d "$search_path" ]]; then + RESOLVED=$(resolve_hf_cache_path "$search_path") + MODEL_PATH="$RESOLVED" + echo "Found MODEL_PATH: $MODEL_PATH" + break + fi + done - # Run check on all nodes in parallel - srun --nodes=$SLURM_NNODES --ntasks=$SLURM_NNODES /bin/bash -c " - if [ -d '$path' ]; then - echo \"\$(hostname): ✓ Found $path\" - exit 0 + if [[ -z "$MODEL_PATH" ]]; then + echo "FATAL: Model '$MODEL_NAME' not found. 
Searched:" + for p in "${SEARCH_PATHS[@]}"; do echo " - $p"; done + exit 1 + fi + echo "Final MODEL_PATH: $MODEL_PATH" +else + # SGLang: Validate model path across all allocated nodes + echo "Looking for model: $MODEL_NAME" + echo "Checking model availability across all allocated nodes..." + + ALL_NODES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") + TOTAL_NODES=$(echo "$ALL_NODES" | wc -l) + echo "Total allocated nodes: $TOTAL_NODES" + echo "Nodes: $(echo "$ALL_NODES" | tr '\n' ' ')" + + check_model_path() { + local path=$1 + local check_name=$2 + echo "Checking $check_name: $path" + srun --nodes=$SLURM_NNODES --ntasks=$SLURM_NNODES /bin/bash -c " + if [ -d '$path' ]; then + echo \"\$(hostname): Found $path\" + exit 0 + else + echo \"\$(hostname): Missing $path\" + exit 1 + fi + " + local exit_code=$? + if [ $exit_code -eq 0 ]; then + echo "$check_name available on ALL nodes" + return 0 else - echo \"\$(hostname): ✗ Missing $path\" - exit 1 + echo "$check_name NOT available on all nodes" + return 1 fi - " + } - # Check if all nodes succeeded (exit code 0) - local exit_code=$? 
- if [ $exit_code -eq 0 ]; then - echo "✓ $check_name available on ALL nodes" - return 0 + if check_model_path "$MODEL_DIR/$MODEL_NAME" "$MODEL_DIR"; then + MODEL_PATH="$MODEL_DIR/$MODEL_NAME" + echo "Selected MODEL_PATH: $MODEL_PATH (available on all nodes)" else - echo "✗ $check_name NOT available on all nodes" - return 1 + echo "FATAL ERROR: Model '$MODEL_NAME' not found on ALL allocated nodes in:" + echo " - $MODEL_DIR/$MODEL_NAME" + exit 1 fi -} - -# Check model weights exist on "$MODEL_DIR/$MODEL_NAME" -if check_model_path "$MODEL_DIR/$MODEL_NAME" "$MODEL_DIR"; then - MODEL_PATH="$MODEL_DIR/$MODEL_NAME" - echo "" - echo "✓ Selected MODEL_PATH: $MODEL_PATH (available on all nodes)" -else - echo "" - echo "✗ FATAL ERROR: Model '$MODEL_NAME' not found on ALL allocated nodes in the following:" - echo " - $MODEL_DIR/$MODEL_NAME" - echo "" - echo "Model must be accessible from all nodes for distributed execution." - echo "Please ensure the model is available on all allocated nodes." - exit 1 + echo "Final MODEL_PATH: $MODEL_PATH" fi -echo "Final MODEL_PATH: $MODEL_PATH" -echo "" - -NUM_NODES="${NUM_NODES}" +# ============================================================================= +# Node Selection +# ============================================================================= -# ------------------------ -# Extract first NUM_NODES from SLURM allocation and update SLURM variables -# ------------------------ -echo "Original SLURM allocation:" -echo "SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "SLURM_NNODES: $SLURM_NNODES" -echo "SLURM_NTASKS: $SLURM_NTASKS" +NUM_NODES=$((xP + yD)) +echo "NUM_NODES: $NUM_NODES (xP=$xP + yD=$yD)" -# Get the full nodelist and extract first NUM_NODES FULL_NODELIST=$(scontrol show hostnames "$SLURM_JOB_NODELIST") SELECTED_NODES=$(echo "$FULL_NODELIST" | head -n $NUM_NODES) SELECTED_NODELIST_STR=$(echo "$SELECTED_NODES" | tr '\n' ',' | sed 's/,$//') -# Create new nodelist in SLURM format -# This is a simplified approach - for 
complex ranges, you might need more sophisticated parsing -NEW_SLURM_NODELIST=$(echo "$SELECTED_NODES" | paste -sd, | sed 's/,/,/g') - # Update SLURM environment variables export SLURM_NNODES=$NUM_NODES export SLURM_NTASKS=$NUM_NODES export SLURM_JOB_NUM_NODES=$NUM_NODES export SLURM_NPROCS=$NUM_NODES -export SLURM_JOB_NODELIST="$NEW_SLURM_NODELIST" -export SLURM_NODELIST="$NEW_SLURM_NODELIST" - -# Keep other SLURM variables as they were or set defaults +export SLURM_JOB_NODELIST="$SELECTED_NODELIST_STR" +export SLURM_NODELIST="$SELECTED_NODELIST_STR" export SLURM_TASKS_PER_NODE="1(x$NUM_NODES)" -export SLURM_SUBMIT_DIR="${SLURM_SUBMIT_DIR:-$HOME}" -export SLURM_CLUSTER_NAME="${SLURM_CLUSTER_NAME}" # Let SLURM set this automatically -export SLURM_JOB_CPUS_PER_NODE="${SLURM_JOB_CPUS_PER_NODE}" -export SLURM_JOB_PARTITION="${SLURM_JOB_PARTITION}" # Should be set by sbatch/runner -export SLURM_JOBID="${SLURM_JOBID:-$SLURM_JOB_ID}" -export SLURM_JOB_QOS="${SLURM_JOB_QOS}" # Should be set by sbatch/runner if needed -export SLURM_JOB_ACCOUNT="${SLURM_JOB_ACCOUNT}" # Should be set by sbatch/runner export SLURM_NTASKS_PER_NODE=1 -export SLURM_SUBMIT_HOST="${SLURM_SUBMIT_HOST}" -export SLURM_JOB_ID="${SLURM_JOB_ID}" -# SLURM_CONF is auto-set by SLURM, no need to override -export SLURM_JOB_NAME="${SLURM_JOB_NAME:-1p1d_bench-serving}" echo "" -echo "Updated SLURM Environment Variables:" -echo "SLURM_JOB_ID: $SLURM_JOB_ID" -echo "SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "SLURM_NNODES: $SLURM_NNODES" -echo "SLURM_NTASKS: $SLURM_NTASKS" -echo "SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "SLURM_JOB_CPUS_PER_NODE: $SLURM_JOB_CPUS_PER_NODE" -echo "SLURM_JOB_PARTITION: $SLURM_JOB_PARTITION" -echo "SLURM_JOB_NUM_NODES: $SLURM_JOB_NUM_NODES" -echo "SLURM_JOBID: $SLURM_JOBID" -echo "SLURM_JOB_QOS: $SLURM_JOB_QOS" -echo "SLURM_NODELIST: $SLURM_NODELIST" -echo "SLURM_JOB_ACCOUNT: $SLURM_JOB_ACCOUNT" -echo "SLURM_NPROCS: $SLURM_NPROCS" -echo "SLURM_SUBMIT_HOST: 
$SLURM_SUBMIT_HOST" -echo "SLURM_CONF: $SLURM_CONF" -echo "SLURM_JOB_NAME: $SLURM_JOB_NAME" -echo "SLURM_NTASKS_PER_NODE: $SLURM_NTASKS_PER_NODE" -echo "SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" -echo "SLURM_CLUSTER_NAME: $SLURM_CLUSTER_NAME" -echo "ulimit: $(ulimit -a)" -echo "" -echo "Selected nodes for execution:" -echo "$SELECTED_NODES" -echo "" +echo "Selected nodes: $SELECTED_NODELIST_STR" + +# ============================================================================= +# IP Resolution +# ============================================================================= -# Node information USER_NAME=$(whoami) MASTER_NODE=$(echo "$SELECTED_NODES" | head -n 1) NODE0_ADDR=$(srun --nodes=1 --ntasks=1 --time=00:20:00 --nodelist="$MASTER_NODE" bash -c 'ip route get 1.1.1.1') NODE0_ADDR=$(echo "$NODE0_ADDR" | awk '/src/ {print $7}') IPS=() - -GW_NIC=$(ip route | awk '/^default/ {print $5; exit}') for NODE in $SELECTED_NODES; do IP=$(srun --nodes=1 --ntasks=1 --time=00:20:00 --nodelist="$NODE" bash -c 'ip route get 1.1.1.1') IP=$(echo "$IP" | awk '/src/ {print $7}') IPS+=("$IP") done -echo "Selected node IPs: ${IPS[*]}" | sed 's/ /,/g' +echo "Node IPs: ${IPS[*]}" DOCKER_MOUNT_PATH="/workspace" -SGLANG_WS_PATH="${DOCKER_MOUNT_PATH}/benchmarks/multi_node/amd_utils" -timestamp=$(date +"%Y-%m-%d_%H-%M-%S") +WS_PATH="${DOCKER_MOUNT_PATH}/benchmarks/multi_node/amd_utils" NNODES=$NUM_NODES -echo "MASTER_NODE is ${MASTER_NODE}" -echo "NODE0_ADDR is ${NODE0_ADDR}" -echo "NNODES is ${NNODES}" -echo "REPO Directory is ${DI_REPO_DIR}" -echo "USER_NAME is ${USER_NAME}" - -# Get the RDMA priority and DSCP value from the NIC -if ! command -v nicctl >/dev/null 2>&1; then - echo "Error: nicctl command not found. Please ensure nicctl is installed and available." 
>&2 - exit 1 -fi +echo "MASTER_NODE: ${MASTER_NODE}" +echo "NODE0_ADDR: ${NODE0_ADDR}" +echo "NNODES: ${NNODES}" +echo "REPO DIR: ${DI_REPO_DIR}" +echo "USER: ${USER_NAME}" # Reduce log spam export TQDM_MININTERVAL=20 +# Translate the host-resolved MODEL_PATH to the Docker mount namespace +DOCKER_MODEL_PATH="${MODEL_PATH/#$MODEL_DIR//models}" + export DI_REPO_DIR=$DI_REPO_DIR -export SGLANG_WS_PATH=$SGLANG_WS_PATH +export WS_PATH=$WS_PATH export NNODES=$NNODES export NODE0_ADDR=$NODE0_ADDR export MODEL_PATH=$MODEL_PATH @@ -269,21 +264,16 @@ export yD=$yD export MODEL_NAME=$MODEL_NAME export USER_NAME=$USER_NAME export IPADDRS="$(echo "${IPS[*]}" | sed 's/ /,/g')" -export PREFILL_TP_SIZE=$PREFILL_TP_SIZE -export PREFILL_ENABLE_EP=$PREFILL_ENABLE_EP -export PREFILL_ENABLE_DP=$PREFILL_ENABLE_DP -export DECODE_TP_SIZE=$DECODE_TP_SIZE -export DECODE_ENABLE_EP=$DECODE_ENABLE_EP -export DECODE_ENABLE_DP=$DECODE_ENABLE_DP -export DECODE_MTP_SIZE=$DECODE_MTP_SIZE export GPUS_PER_NODE=$GPUS_PER_NODE export BENCH_INPUT_LEN=$BENCH_INPUT_LEN export BENCH_OUTPUT_LEN=$BENCH_OUTPUT_LEN export BENCH_RANDOM_RANGE_RATIO=$BENCH_RANDOM_RANGE_RATIO export BENCH_NUM_PROMPTS_MULTIPLIER=$BENCH_NUM_PROMPTS_MULTIPLIER export BENCH_MAX_CONCURRENCY=$BENCH_MAX_CONCURRENCY +export BENCH_REQUEST_RATE=$BENCH_REQUEST_RATE export DRY_RUN="${DRY_RUN:-0}" export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" +export ENGINE=$ENGINE # Eval-related env vars (threaded from submit.sh) export RUN_EVAL="${RUN_EVAL:-false}" @@ -298,38 +288,101 @@ export SPEC_DECODING="${SPEC_DECODING:-}" export IS_MULTINODE="${IS_MULTINODE:-false}" SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_') -export DOCKER_CONT_NAME="container_sbatch_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" -export RUN_FILE_FULL="$SGLANG_WS_PATH/${RUN_FILE}" - +export DOCKER_CONT_NAME="container_${ENGINE}_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" +export RUN_FILE_FULL="$WS_PATH/${RUN_FILE}" -# Use only the 
selected nodes for srun execution SELECTED_NODELIST_SRUN=$(echo "$SELECTED_NODES" | paste -sd,) - cleanup() { - echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning stale logs folder..." - # clean up the logs folder - sudo rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true - + echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning up..." + rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true echo "[${SLURM_JOB_ID}] cleanup done." } trap cleanup INT TERM HUP - -# Force NFS cache refresh on all nodes before running Docker to avoid stale file handle errors +# Force NFS cache refresh on all nodes echo "Refreshing NFS caches on all nodes..." srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c ' sync - # Force re-stat of the mounted directory to refresh NFS handles ls -la '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils > /dev/null 2>&1 stat '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils/server.sh > /dev/null 2>&1 cat '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils/server.sh > /dev/null 2>&1 - # Drop caches if we have permission (optional, requires root) echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null 2>&1 || true echo "NFS cache refreshed on $(hostname)" ' +# ============================================================================= +# Build engine-specific Docker environment variables +# ============================================================================= + +# Common env vars (always passed) +DOCKER_ENV_COMMON=( + -e SLURM_JOB_ID=\$SLURM_JOB_ID + -e SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST + -e NNODES=\$NNODES + -e NODE_RANK=\$SLURM_PROCID + -e NODE0_ADDR=\$NODE0_ADDR + -e MODEL_DIR=/models + -e MODEL_NAME=\$MODEL_NAME + -e GPUS_PER_NODE=\$GPUS_PER_NODE + -e xP=\$xP + -e yD=\$yD + -e IPADDRS=\$IPADDRS + -e BENCH_INPUT_LEN=\$BENCH_INPUT_LEN + -e BENCH_OUTPUT_LEN=\$BENCH_OUTPUT_LEN + -e BENCH_RANDOM_RANGE_RATIO=\$BENCH_RANDOM_RANGE_RATIO + -e BENCH_NUM_PROMPTS_MULTIPLIER=\$BENCH_NUM_PROMPTS_MULTIPLIER + -e 
BENCH_MAX_CONCURRENCY=\$BENCH_MAX_CONCURRENCY
+    -e TQDM_MININTERVAL=\$TQDM_MININTERVAL
+    -e DRY_RUN=\$DRY_RUN
+    -e BENCHMARK_LOGS_DIR=/benchmark_logs
+    -e ENGINE=\$ENGINE
+    -e WS_PATH=${WS_PATH}
+    -e RUN_EVAL=\$RUN_EVAL
+    -e EVAL_ONLY=\$EVAL_ONLY
+    -e EVAL_CONC=\$EVAL_CONC
+    -e FRAMEWORK=\$FRAMEWORK
+    -e PRECISION=\$PRECISION
+    -e MODEL_PREFIX=\$MODEL_PREFIX
+    -e RUNNER_TYPE=\$RUNNER_TYPE
+    -e RESULT_FILENAME=\$RESULT_FILENAME
+    -e SPEC_DECODING=\$SPEC_DECODING
+    -e PREFILL_TP_SIZE=\$PREFILL_TP_SIZE
+    -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP
+    -e PREFILL_ENABLE_DP=\$PREFILL_ENABLE_DP
+    -e DECODE_TP_SIZE=\$DECODE_TP_SIZE
+    -e DECODE_ENABLE_EP=\$DECODE_ENABLE_EP
+    -e DECODE_ENABLE_DP=\$DECODE_ENABLE_DP
+    -e DECODE_MTP_SIZE=\$DECODE_MTP_SIZE
+)
+
+# Engine-specific env vars
+if [[ "$ENGINE" == "vllm" ]]; then
+    DOCKER_ENV_ENGINE=(
+        -e VLLM_WS_PATH=${WS_PATH}
+        -e MODEL_PATH=$DOCKER_MODEL_PATH
+        -e UCX_TLS=tcp,self,shm,rocm_ipc,rocm_copy,cma
+        -e UCX_SOCKADDR_TLS_PRIORITY=tcp
+        -e UCX_MEMTYPE_CACHE=y
+        -e UCX_RNDV_SCHEME=get_zcopy
+        -e UCX_RNDV_THRESH=4k
+        -e UCX_ROCM_IPC_MIN_ZCOPY=0
+        -e UCX_LOG_LEVEL=warn
+        -e HSA_ENABLE_SDMA=1
+        -e PROXY_STREAM_IDLE_TIMEOUT=\${PROXY_STREAM_IDLE_TIMEOUT:-300}
+        -e VLLM_MORIIO_CONNECTOR_READ_MODE=\${VLLM_MORIIO_CONNECTOR_READ_MODE:-1}
+        -e PYTHONPYCACHEPREFIX=/tmp/pycache
+    )
+else
+    DOCKER_ENV_ENGINE=(
+        -e SGLANG_WS_PATH=${WS_PATH}
+    )
+fi
+
+# Engine-specific container filter for pre-clean
+CONT_FILTER="name=^container_${ENGINE}_"
+
 srun \
     --nodelist="$SELECTED_NODELIST_SRUN" \
     --kill-on-bad-exit=1 \
@@ -341,10 +394,10 @@ set -euo pipefail
 echo \"Rank \$SLURM_PROCID on \$(hostname)\"
 
 # Pre-clean (idempotent)
-sudo docker ps -aq --filter \"name=^container_sbatch_\" | xargs -r sudo docker rm -f || true
-sudo docker ps -aq | xargs -r sudo docker stop || true
+\$DOCKER_CMD ps -aq --filter \"$CONT_FILTER\" | xargs -r \$DOCKER_CMD rm -f || true
+\$DOCKER_CMD ps -aq | xargs -r \$DOCKER_CMD stop || true
 
-exec sudo docker run --rm \
+exec 
\$DOCKER_CMD run --rm \ --init \ --stop-timeout 10 \ --device /dev/dri \ @@ -367,51 +420,18 @@ exec sudo docker run --rm \ --cap-add SYS_PTRACE \ --security-opt seccomp=unconfined \ --privileged \ + -v /sys:/sys \ + $(command -v nicctl >/dev/null 2>&1 && echo "-v $(which nicctl):/usr/sbin/nicctl") \ -v ${MODEL_DIR}:/models \ -v \$HOME/.ssh:/root/.ssh \ - -v $(which nicctl):/usr/sbin/nicctl \ --shm-size 128G \ -v /tmp:/run_logs \ -v ${BENCHMARK_LOGS_DIR}:/benchmark_logs \ -v ${DI_REPO_DIR}:${DOCKER_MOUNT_PATH} \ - -e SLURM_JOB_ID=\$SLURM_JOB_ID \ - -e SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST \ - -e NNODES=\$NNODES \ - -e NODE_RANK=\$SLURM_PROCID \ - -e NODE0_ADDR=\$NODE0_ADDR \ - -e MODEL_DIR=/models \ - -e SGLANG_WS_PATH=${SGLANG_WS_PATH} \ - -e GPUS_PER_NODE=\$GPUS_PER_NODE \ - -e xP=\$xP \ - -e yD=\$yD \ - -e MODEL_NAME=\$MODEL_NAME \ - -e IPADDRS=\$IPADDRS \ - -e PREFILL_TP_SIZE=\$PREFILL_TP_SIZE \ - -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP \ - -e PREFILL_ENABLE_DP=\$PREFILL_ENABLE_DP \ - -e DECODE_TP_SIZE=\$DECODE_TP_SIZE \ - -e DECODE_ENABLE_EP=\$DECODE_ENABLE_EP \ - -e DECODE_ENABLE_DP=\$DECODE_ENABLE_DP \ - -e DECODE_MTP_SIZE=\$DECODE_MTP_SIZE \ - -e BENCH_INPUT_LEN=\$BENCH_INPUT_LEN \ - -e BENCH_OUTPUT_LEN=\$BENCH_OUTPUT_LEN \ - -e BENCH_RANDOM_RANGE_RATIO=\$BENCH_RANDOM_RANGE_RATIO \ - -e BENCH_NUM_PROMPTS_MULTIPLIER=\$BENCH_NUM_PROMPTS_MULTIPLIER \ - -e BENCH_MAX_CONCURRENCY=\$BENCH_MAX_CONCURRENCY \ - -e TQDM_MININTERVAL=\$TQDM_MININTERVAL \ - -e DRY_RUN=\$DRY_RUN \ - -e BENCHMARK_LOGS_DIR=/benchmark_logs \ - -e RUN_EVAL=\$RUN_EVAL \ - -e EVAL_ONLY=\$EVAL_ONLY \ - -e EVAL_CONC=\$EVAL_CONC \ - -e FRAMEWORK=\$FRAMEWORK \ - -e PRECISION=\$PRECISION \ - -e MODEL_PREFIX=\$MODEL_PREFIX \ - -e RUNNER_TYPE=\$RUNNER_TYPE \ - -e RESULT_FILENAME=\$RESULT_FILENAME \ - -e SPEC_DECODING=\$SPEC_DECODING \ - -e IS_MULTINODE=\$IS_MULTINODE \ + ${DOCKER_ENV_COMMON[*]} \ + ${DOCKER_ENV_ENGINE[*]} \ --name \"$DOCKER_CONT_NAME\" \ + --entrypoint \"\" \ \"$DOCKER_IMAGE_NAME\" 
bash -lc ' set -o pipefail mkdir -p /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"' @@ -425,4 +445,4 @@ if [[ \$DOCKER_EXIT_CODE -ne 0 ]]; then fi " -srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c 'sudo docker rm -f $DOCKER_CONT_NAME 2>/dev/null || true' +srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c '$DOCKER_CMD rm -f '"$DOCKER_CONT_NAME"' 2>/dev/null || true' diff --git a/benchmarks/multi_node/amd_utils/models_vllm.yaml b/benchmarks/multi_node/amd_utils/models_vllm.yaml new file mode 100644 index 000000000..c68bb46e3 --- /dev/null +++ b/benchmarks/multi_node/amd_utils/models_vllm.yaml @@ -0,0 +1,42 @@ +# Model-specific vLLM server configurations for disaggregated inference. +# +# Each top-level key is a MODEL_NAME value (must match the model identifier +# used in amd-master.yaml and the directory/HF-cache name under MODEL_DIR). +# +# To add a new model: add a new top-level entry following the same schema. +# No script changes are required. +# +# Schema: +# : +# prefill_flags: str # vLLM CLI flags for prefill workers +# decode_flags: str # vLLM CLI flags for decode workers +# env: str # Space-separated KEY=VALUE pairs exported before vllm serve +# hf_dir: str # (optional) On-disk directory name if it differs from the key +# # e.g. 
HF cache layout: models--amd--Kimi-K2.5-MXFP4 + +Llama-3.1-405B-Instruct-FP8-KV: + prefill_flags: "--tensor-parallel-size 8 --kv-cache-dtype fp8" + decode_flags: "--tensor-parallel-size 8 --kv-cache-dtype fp8" + env: "VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1" + +amd-Llama-3.3-70B-Instruct-FP8-KV: + prefill_flags: "--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8" + decode_flags: "--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8" + env: "VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1" + +Kimi-K2.5-MXFP4: + prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data" + decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=3600" + hf_dir: "models--amd--Kimi-K2.5-MXFP4" + +MiniMax-M2.5: + prefill_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32" + decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching 
--gpu-memory-utilization 0.95 --block-size 32"
+  env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_ENGINE_READY_TIMEOUT_S=3600"
+  hf_dir: "models--MiniMaxAI--MiniMax-M2.5"
+
+gpt-oss-120b:
+  prefill_flags: "--tensor-parallel-size 8"
+  decode_flags: "--tensor-parallel-size 8"
+  env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0 ROCM_TRITON_MOE_PRESHUFFLE_SCALES=0"
diff --git a/benchmarks/multi_node/amd_utils/moriio_proxy.py b/benchmarks/multi_node/amd_utils/moriio_proxy.py
new file mode 100644
index 000000000..7d1e8454b
--- /dev/null
+++ b/benchmarks/multi_node/amd_utils/moriio_proxy.py
@@ -0,0 +1,393 @@
+#!/usr/bin/env python3
+# MoRI-IO proxy server for vLLM PD disaggregation.
+#
+# Based on vLLM's examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py
+# with the following adaptations for production multi-node use:
+#   - Ports configurable via PROXY_HTTP_PORT / PROXY_PING_PORT env vars
+#   - /health endpoint for sync.py barrier readiness checks
+#   - Uses stdlib `re` instead of `regex` to avoid extra dep
+#
+# The proxy performs two roles that vllm-router cannot:
+#   1. ZMQ service discovery — prefill/decode workers register their RDMA ports
+#   2. Request enrichment — injects remote endpoint info into kv_transfer_params
+
+import asyncio
+import copy
+import logging
+import os
+import re
+import socket
+import threading
+import time
+import uuid
+
+import aiohttp
+import msgpack
+import zmq
+from quart import Quart, make_response, request
+
+logger = logging.getLogger("moriio_proxy")
+logger.setLevel(logging.DEBUG)
+handler = logging.StreamHandler()
+handler.setFormatter(logging.Formatter(
+    "%(asctime)s %(levelname)s [%(name)s] %(message)s"))
+logger.addHandler(handler)
+
+prefill_instances: list[dict] = []
+decode_instances: list[dict] = []
+request_nums = 0
+app = Quart(__name__)
+
+STREAM_IDLE_TIMEOUT = int(os.environ.get("PROXY_STREAM_IDLE_TIMEOUT", "300"))
+
+IP_PORT_PATTERN = re.compile(r"//(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d+)")
+
+TRANSFER_TYPE = None
+
+# Strong references to in-flight prefill tasks. The event loop only keeps weak
+# references, and in non-READ mode nothing else awaits these tasks.
+_prefill_tasks: set = set()
+
+
+def _append_whole_dict_unique(target_list, data_dict):
+    """Append ``data_dict`` to ``target_list`` unless an equal entry exists.
+
+    Entries are compared with their "index" key ignored. Returns True when the
+    entry was appended, False when it was a duplicate. Raises ValueError, and
+    leaves ``target_list`` untouched, when the entry's transfer mode differs
+    from the mode fixed by the first registered instance.
+    """
+    global TRANSFER_TYPE
+
+    new_filtered = {k: v for k, v in data_dict.items() if k != "index"}
+    for existed in target_list:
+        existed_filtered = {k: v for k, v in existed.items() if k != "index"}
+        if existed_filtered == new_filtered:
+            return False
+
+    # Validate before appending so a rejected instance never receives traffic.
+    transfer_mode = data_dict.get("transfer_mode", "unknown")
+    if TRANSFER_TYPE is not None and transfer_mode != TRANSFER_TYPE:
+        raise ValueError(f"mismatched transfer mode {TRANSFER_TYPE} vs {transfer_mode}")
+
+    logger.info("Registered instance: role=%s addr=%s hs_port=%s notify=%s dp=%s tp=%s",
+                data_dict.get("role"), data_dict.get("request_address"),
+                data_dict.get("handshake_port"), data_dict.get("notify_port"),
+                data_dict.get("dp_size"), data_dict.get("tp_size"))
+    target_list.append(data_dict)
+    if TRANSFER_TYPE is None:
+        TRANSFER_TYPE = transfer_mode
+        logger.info("Transfer mode set to: %s", TRANSFER_TYPE)
+
+    return True
+
+
+_list_lock = threading.RLock()
+
+
+def _listen_for_register(hostname, port):
+    """Serve worker registrations on a ZMQ ROUTER socket; never returns.
+
+    Each message is a msgpack-encoded dict. "register" messages with role "P"
+    or "D" are recorded in ``prefill_instances`` / ``decode_instances``; any
+    other message type (including "HELLO") is ignored.
+    """
+    context = zmq.Context()
+    router_socket = context.socket(zmq.ROUTER)
+    router_socket.bind(f"tcp://{hostname}:{port}")
+    poller = zmq.Poller()
+    poller.register(router_socket, zmq.POLLIN)
+
+    while True:
+        socks = dict(poller.poll())
+        if router_socket not in socks:
+            continue
+        # A malformed or rejected message must not end this thread: nothing
+        # restarts it, so later workers could no longer register.
+        try:
+            _remote_addr, msg = router_socket.recv_multipart()
+            data = msgpack.loads(msg)
+            if data["type"] != "register":
+                continue
+            if data["role"] == "P":
+                target = prefill_instances
+            elif data["role"] == "D":
+                target = decode_instances
+            else:
+                continue
+            if "request_address" not in data:
+                raise ValueError("register message without request_address")
+            with _list_lock:
+                _append_whole_dict_unique(target, data)
+        except Exception:
+            logger.exception("Ignoring invalid service-discovery message")
+
+
+def start_service_discovery(hostname, port):
+    """Start the registration listener in a daemon thread and return it."""
+    if not hostname:
+        hostname = socket.gethostname()
+    if port == 0:
+        raise ValueError("Port cannot be 0")
+
+    _listener_thread = threading.Thread(
+        target=_listen_for_register, args=(hostname, port), daemon=True
+    )
+    _listener_thread.start()
+    logger.info("Service discovery listening on %s:%s", hostname, port)
+    return _listener_thread
+
+
+async def send_request_to_prefill(
+    endpoint, req_data, request_id, d_endpoint, dip, dport, selected_prefill_dp_rank
+):
+    """POST a one-token, non-streaming request to a prefill worker.
+
+    ``req_data`` is modified in place (the caller passes a private copy).
+    Returns the decoded JSON body; raises RuntimeError on a non-200 status.
+    """
+    req_data_copy = req_data
+
+    req_data_copy["kv_transfer_params"].update(
+        {
+            "do_remote_decode": True,
+            "do_remote_prefill": False,
+            "remote_handshake_port": d_endpoint["handshake_port"],
+            "remote_notify_port": d_endpoint["notify_port"],
+            "remote_engine_id": None,
+            "remote_block_ids": None,
+            "remote_host": dip,
+            "remote_port": dport,
+        }
+    )
+    req_data_copy["stream"] = False
+    req_data_copy["max_tokens"] = 1
+    if "max_completion_tokens" in req_data_copy:
+        req_data_copy["max_completion_tokens"] = 1
+    if "stream_options" in req_data_copy:
+        del req_data_copy["stream_options"]
+    async with aiohttp.ClientSession(
+        timeout=aiohttp.ClientTimeout(total=6 * 6000 * 6000)
+    ) as session:
+        headers = {
+            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+            "X-Request-Id": request_id,
+        }
+        if selected_prefill_dp_rank is not None:
+            headers["X-data-parallel-rank"] = str(selected_prefill_dp_rank)
+        async with session.post(
+            url=endpoint, json=req_data_copy, headers=headers
+        ) as response:
+            if response.status == 200:
+                return await response.json()
+            else:
+                raise RuntimeError(
+                    f"Prefill response status={response.status}"
+                )
+
+
+def _finish_prefill_task(task):
+    """Drop a finished prefill task and log a failure nobody awaited."""
+    _prefill_tasks.discard(task)
+    if not task.cancelled() and task.exception() is not None:
+        logger.error("Prefill request failed: %r", task.exception())
+
+
+async def start_decode_request(endpoint, req_data, request_id):
+    """POST ``req_data`` to a decode worker and return ``(session, response)``.
+
+    On success the caller owns both objects and must release/close them
+    (stream_decode_response does). If the POST fails the session is closed here.
+    """
+    session = aiohttp.ClientSession(
+        timeout=aiohttp.ClientTimeout(total=6 * 6000 * 6000)
+    )
+    headers = {
+        "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+        "X-Request-Id": request_id,
+    }
+    try:
+        response = await session.post(url=endpoint, json=req_data, headers=headers)
+    except BaseException:
+        # Includes cancellation; without this the session would leak.
+        await session.close()
+        raise
+    return session, response
+
+
+async def stream_decode_response(session, response, request_id):
+    """Yield the decode response body in chunks, then release the connection.
+
+    Stops early when no chunk arrives within STREAM_IDLE_TIMEOUT seconds.
+    Raises RuntimeError when the decode worker answered with a non-200 status.
+    """
+    try:
+        if response.status == 200:
+            chunk_iter = response.content.iter_chunked(1024).__aiter__()
+            while True:
+                try:
+                    chunk_bytes = await asyncio.wait_for(
+                        chunk_iter.__anext__(), timeout=STREAM_IDLE_TIMEOUT,
+                    )
+                    yield chunk_bytes
+                except StopAsyncIteration:
+                    break
+                except asyncio.TimeoutError:
+                    logger.error(
+                        "Decode stream %s idle for %ds, aborting",
+                        request_id, STREAM_IDLE_TIMEOUT,
+                    )
+                    break
+        else:
+            raise RuntimeError(
+                f"Decode response status={response.status}"
+            )
+    finally:
+        await response.release()
+        await session.close()
+
+
+@app.route("/health", methods=["GET"])
+async def health_check():
+    """Report liveness and the number of registered prefill/decode instances."""
+    with _list_lock:
+        p_count = len(prefill_instances)
+        d_count = len(decode_instances)
+    return await make_response(
+        ({"status": "ok", "prefill_instances": p_count, "decode_instances": d_count}, 200)
+    )
+
+
+@app.route("/v1/completions", methods=["POST"])
+@app.route("/v1/chat/completions", methods=["POST"])
+async def handle_request():
+    """Route one completion request through a prefill and a decode worker.
+
+    Workers are picked round-robin from the registered instances. In "READ"
+    transfer mode the prefill response is awaited and its engine/block ids are
+    forwarded to the decode worker; otherwise the prefill task is not awaited here.
+    """
+    try:
+        with _list_lock:
+            global request_nums
+            request_nums += 1
+
+        def extract_ip_port_fast(url):
+            match = IP_PORT_PATTERN.search(url)
+            if not match:
+                raise ValueError(f"Invalid URL format: {url}")
+            return match.groups()
+
+        req_data = await request.get_json()
+        request_id = str(uuid.uuid4())
+
+        if not isinstance(req_data, dict):
+            return await make_response(("Bad Request: JSON object body required.", 400))
+
+        if not prefill_instances or not decode_instances:
+            return await make_response(
+                ("Service Unavailable: No prefill or decode instances registered.", 503)
+            )
+
+        pid = request_nums % len(prefill_instances)
+        did = request_nums % len(decode_instances)
+        prefill_instance_endpoint = prefill_instances[pid]
+        decode_instance_endpoint = decode_instances[did]
+
+        selected_prefill_dp_rank = None
+        if prefill_instance_endpoint["dp_size"] > 1:
+            selected_prefill_dp_rank = request_nums % prefill_instance_endpoint["dp_size"]
+
+        dip, dport = extract_ip_port_fast(decode_instance_endpoint["request_address"])
+
+        req_data_to_prefill = copy.deepcopy(req_data)
+        req_data_to_prefill["kv_transfer_params"] = {"transfer_id": request_id}
+        req_data["kv_transfer_params"] = {"transfer_id": request_id}
+        req_data_to_prefill["kv_transfer_params"]["remote_dp_size"] = (
+            decode_instance_endpoint["dp_size"]
+        )
+        req_data_to_prefill["kv_transfer_params"]["remote_tp_size"] = (
+            decode_instance_endpoint["tp_size"]
+        )
+
+        send_prefill_task = asyncio.create_task(
+            send_request_to_prefill(
+                prefill_instance_endpoint["request_address"],
+                req_data_to_prefill,
+                request_id,
+                decode_instance_endpoint,
+                dip,
+                dport,
+                selected_prefill_dp_rank,
+            )
+        )
+        # Outside READ mode nothing awaits this task; keep it referenced and
+        # make sure a failure is logged instead of being silently dropped.
+        _prefill_tasks.add(send_prefill_task)
+        send_prefill_task.add_done_callback(_finish_prefill_task)
+        ip, port = extract_ip_port_fast(prefill_instance_endpoint["request_address"])
+
+        # The prefill request is capped at one token, so the decode budget is
+        # reduced by one. Either limit may be absent from the client request;
+        # the previous unconditional `-= 1` raised KeyError in that case.
+        for limit_key in ("max_tokens", "max_completion_tokens"):
+            if isinstance(req_data.get(limit_key), int):
+                req_data[limit_key] -= 1
+
+        req_data["kv_transfer_params"] = {
+            "transfer_id": request_id,
+            "do_remote_decode": False,
+            "do_remote_prefill": True,
+            "remote_handshake_port": prefill_instance_endpoint["handshake_port"],
+            "remote_notify_port": prefill_instance_endpoint["notify_port"],
+            "remote_engine_id": None,
+            "remote_block_ids": None,
+            "remote_host": ip,
+            "remote_port": port,
+        }
+        if TRANSFER_TYPE == "READ":
+            prefill_response = await send_prefill_task
+            req_data["kv_transfer_params"]["remote_engine_id"] = prefill_response[
+                "kv_transfer_params"
+            ]["remote_engine_id"]
+            req_data["kv_transfer_params"]["remote_block_ids"] = prefill_response[
+                "kv_transfer_params"
+            ]["remote_block_ids"]
+
+        req_data["kv_transfer_params"]["remote_dp_size"] = prefill_instance_endpoint[
+            "dp_size"
+        ]
+        req_data["kv_transfer_params"]["remote_tp_size"] = prefill_instance_endpoint[
+            "tp_size"
+        ]
+
+        if selected_prefill_dp_rank is not None:
+            req_data["kv_transfer_params"]["remote_dp_rank"] = selected_prefill_dp_rank
+
+        decode_request_task = asyncio.create_task(
+            start_decode_request(
+                decode_instance_endpoint["request_address"], req_data, request_id
+            )
+        )
+
+        session, decode_response = await decode_request_task
+        stream_generator = stream_decode_response(session, decode_response, request_id)
+        response = await make_response(stream_generator)
+        return response
+    except Exception as e:
+        logger.exception("Error handling request: %s", e)
+        return await make_response((f"Internal Server Error: {e!s}", 500))
+
+
+if __name__ == "__main__":
+    http_port = int(os.environ.get("PROXY_HTTP_PORT", "30000"))
+    ping_port = int(os.environ.get("PROXY_PING_PORT", "36367"))
+
+    t = start_service_discovery("0.0.0.0", ping_port)
+    app.debug = False
+    app.config["BODY_TIMEOUT"] = 360000
+    app.config["RESPONSE_TIMEOUT"] = 360000
+
+    logger.info("MoRI-IO proxy starting: HTTP=%d, ZMQ=%d", http_port, ping_port)
+    app.run(host="0.0.0.0", port=http_port)
+    t.join()
diff --git a/benchmarks/multi_node/amd_utils/patches/minimax_m2.py b/benchmarks/multi_node/amd_utils/patches/minimax_m2.py
new file mode 100644
index 000000000..8290276fb
--- /dev/null
+++ b/benchmarks/multi_node/amd_utils/patches/minimax_m2.py
@@ -0,0 +1,672 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Copyright 2025 The MiniMax AI team.
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only MiniMaxM2/M2.5 model."""
+
+from collections.abc import Iterable
+from typing import Any
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from vllm._aiter_ops import rocm_aiter_ops
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config
+from vllm.distributed import (
+    get_ep_group,
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_gather,
+)
+from vllm.logger import init_logger
+from vllm.model_executor.layers.attention import Attention
+from vllm.model_executor.layers.fused_moe import FusedMoE, GateLinear
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.mamba.linear_attn import MiniMaxText01RMSNormTP
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from vllm.model_executor.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from vllm.model_executor.models.utils import sequence_parallel_chunk
+from vllm.sequence import IntermediateTensors
+
+from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP
+from .utils import (
+    AutoWeightsLoader,
+    PPMissingLayer,
+    is_pp_missing_parameter,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
+
+logger = init_logger(__name__)
+
+
+class MiniMaxM2MoE(nn.Module):
+    """MoE layer for MiniMax M2/M2.5 with EP/WideEP/Mori support.
+
+    Follows the DeepSeek V2 MoE pattern: GateLinear + FusedMoE with
+    expert parallelism, EPLB, and sequence parallel awareness.
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        """Build the router gate and the fused experts.
+
+        Args:
+            config: HF model config; ``num_local_experts``,
+                ``num_experts_per_tok``, ``hidden_size`` and
+                ``intermediate_size`` are read, plus the optional
+                ``routed_scaling_factor``, ``use_routing_bias`` and
+                ``scoring_func``.
+            quant_config: Quantization config forwarded to ``FusedMoE``.
+            prefix: Parameter-name prefix of this layer.
+        """
+        super().__init__()
+        vllm_config = get_current_vllm_config()
+        parallel_config = vllm_config.parallel_config
+
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = get_tensor_model_parallel_rank()
+
+        self.ep_group = get_ep_group().device_group
+        self.ep_rank = get_ep_group().rank_in_group
+        self.ep_size = self.ep_group.size()
+
+        self.n_routed_experts: int = config.num_local_experts
+        self.n_shared_experts: int = 0
+
+        self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe
+        self.routed_scaling_factor = getattr(config, "routed_scaling_factor", 1.0)
+        self.is_rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled()
+
+        # Expert bookkeeping read back by MiniMaxM2MixtureOfExperts.
+        eplb_config = parallel_config.eplb_config
+        self.enable_eplb = parallel_config.enable_eplb
+        self.n_redundant_experts = eplb_config.num_redundant_experts
+        self.n_logical_experts = self.n_routed_experts
+        self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts
+        self.n_local_physical_experts = self.n_physical_experts // self.ep_size
+
+        self.use_routing_bias = getattr(config, "use_routing_bias", False)
+        if self.use_routing_bias:
+            self.e_score_correction_bias = nn.Parameter(
+                torch.empty(config.num_local_experts, dtype=torch.float32)
+            )
+            self.e_score_correction_bias.weight_loader = (
+                MiniMaxM2MoE.ebias_weight_loader
+            )
+        else:
+            self.e_score_correction_bias = None
+
+        self.gate = GateLinear(
+            config.hidden_size,
+            config.num_local_experts,
+            out_dtype=torch.float32,
+            prefix=f"{prefix}.gate",
+        )
+
+        self.experts = FusedMoE(
+            num_experts=config.num_local_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            reduce_results=False,
+            renormalize=True,
+            scoring_func=getattr(config, "scoring_func", "softmax"),
+            e_score_correction_bias=self.e_score_correction_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.experts",
+            enable_eplb=self.enable_eplb,
+            num_redundant_experts=self.n_redundant_experts,
+            is_sequence_parallel=self.is_sequence_parallel,
+            router_logits_dtype=torch.float32,
+            gate=self.gate,
+            # The scaling factor is handed to FusedMoE only on the AITER
+            # path; otherwise forward() multiplies by it afterwards.
+            routed_scaling_factor=1.0
+            if not self.is_rocm_aiter_moe_enabled
+            else self.routed_scaling_factor,
+        )
+
+    @staticmethod
+    def ebias_weight_loader(param: nn.Parameter, loaded_weight: torch.Tensor) -> None:
+        """Copy the checkpoint routing bias into ``param`` as float32."""
+        assert param.size() == loaded_weight.size()
+        param.data.copy_(loaded_weight.to(torch.float32))
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Route tokens through the experts.
+
+        Args:
+            hidden_states: 2-D tensor unpacked as ``(num_tokens, hidden_dim)``.
+
+        Returns:
+            Tensor viewed back to ``(num_tokens, hidden_dim)``.
+        """
+        num_tokens, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+
+        if self.is_sequence_parallel:
+            hidden_states = sequence_parallel_chunk(hidden_states)
+
+        if self.experts.is_internal_router:
+            # FusedMoE owns the gate here and receives the raw activations
+            # in place of precomputed router logits.
+            final_hidden_states = self.experts(
+                hidden_states=hidden_states, router_logits=hidden_states
+            )
+        else:
+            router_logits, _ = self.gate(hidden_states)
+            final_hidden_states = self.experts(
+                hidden_states=hidden_states, router_logits=router_logits
+            )
+
+        # NOTE(review): for float16 inputs the scaling factor is never applied
+        # on the non-AITER path, and nothing visible in this file compensates
+        # for that. Harmless while routed_scaling_factor keeps its 1.0
+        # default -- confirm if a config ever sets it.
+        if hidden_states.dtype != torch.float16:
+            if not self.is_rocm_aiter_moe_enabled:
+                final_hidden_states = final_hidden_states * self.routed_scaling_factor
+
+        if self.is_sequence_parallel:
+            final_hidden_states = tensor_model_parallel_all_gather(
+                final_hidden_states, 0
+            )
+            # Trim back to the original token count after the gather.
+            final_hidden_states = final_hidden_states[:num_tokens]
+        elif self.tp_size > 1:
+            final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel(
+                final_hidden_states
+            )
+
+        return final_hidden_states.view(num_tokens, hidden_dim)
+
+
+class MiniMaxM2Attention(nn.Module):
+    """Self-attention block: fused QKV projection, Q/K RMS norm, RoPE, attention."""
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        rotary_dim: int,
+        rope_parameters: dict[str, Any] | None = None,
+        attn_window_size: int | None = None,
+        max_position_embeddings: int = 8192,
+        head_dim: int | None = None,
+        rms_norm_eps: float = 1e-06,
+        qkv_bias: bool = False,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        """Create the projections, rotary embedding, attention op and Q/K norms.
+
+        ``num_heads`` must be divisible by the tensor-parallel size, and
+        ``num_kv_heads`` must divide, or be divisible by, that size (both are
+        asserted). ``head_dim`` defaults to ``hidden_size // num_heads``.
+        """
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = head_dim or (hidden_size // self.total_num_heads)
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=qkv_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        # Derive the partial rotary factor from rotary_dim unless the config
+        # already provides one. Note this writes into the caller's dict.
+        if (
+            rope_parameters is not None
+            and "partial_rotary_factor" not in rope_parameters
+        ):
+            rope_parameters["partial_rotary_factor"] = rotary_dim / self.head_dim
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            max_position=max_position_embeddings,
+            rope_parameters=rope_parameters,
+        )
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            per_layer_sliding_window=attn_window_size,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
+
+        self.q_norm = MiniMaxText01RMSNormTP(
+            self.head_dim * self.total_num_heads, eps=rms_norm_eps
+        )
+        self.k_norm = MiniMaxText01RMSNormTP(
+            self.head_dim * self.total_num_kv_heads, eps=rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        """Project to Q/K/V, normalise Q and K, apply RoPE, attend, project out."""
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = MiniMaxText01RMSNormTP.forward_qk(
+            self.q_norm, self.k_norm, q.contiguous(), k.contiguous()
+        )
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class MiniMaxM2DecoderLayer(nn.Module):
+    """One transformer layer: pre-norm self-attention followed by pre-norm MoE."""
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        prefix: str,
+        model_config: ModelConfig,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+    ) -> None:
+        """Build the attention block, the MoE block and both layer norms.
+
+        ``prefix`` must end in the layer index (``"...layers.<idx>"``).
+        """
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        if hasattr(config, "max_model_len") and isinstance(config.max_model_len, int):
+            # Compare against the value resolved above rather than
+            # config.max_position_embeddings, which raised AttributeError in
+            # exactly the case the getattr default exists for.
+            max_position_embeddings = max(
+                max_position_embeddings, config.max_model_len
+            )
+        # DecoderLayers are created with `make_layers` which passes the prefix
+        # with the layer's index.
+        layer_idx = int(prefix.split(sep=".")[-1])
+
+        self.layer_idx = layer_idx
+        self.self_attn = MiniMaxM2Attention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            rotary_dim=config.rotary_dim,
+            rope_parameters=config.rope_parameters,
+            max_position_embeddings=max_position_embeddings,
+            rms_norm_eps=config.rms_norm_eps,
+            qkv_bias=getattr(config, "attention_bias", False),
+            head_dim=getattr(config, "head_dim", None),
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+        )
+
+        self.block_sparse_moe = MiniMaxM2MoE(
+            config=config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor | None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Run attention then MoE; return ``(hidden_states, residual)``.
+
+        ``residual`` is ``None`` for the first layer, in which case the input
+        itself becomes the residual.
+        """
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+
+        hidden_states = self.block_sparse_moe(hidden_states)
+
+        return hidden_states, residual
+
+
+@support_torch_compile
+class MiniMaxM2Model(nn.Module):
+    """Decoder stack: embedding, this pipeline stage's layers, final norm."""
+
+    fall_back_to_pt_during_load = False
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        """Create the modules owned by this pipeline-parallel rank.
+
+        The embedding exists only on the first rank and the final norm only
+        on the last; other ranks hold ``PPMissingLayer`` placeholders.
+        """
+        super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        model_config = vllm_config.model_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+        self.config = config
+
+        self.vocab_size = config.vocab_size
+
+        if get_pp_group().is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=None,
+                prefix=f"{prefix}.embed_tokens",
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        self.start_layer, self.end_layer, self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda prefix: MiniMaxM2DecoderLayer(
+                config,
+                prefix,
+                model_config=model_config,
+                cache_config=cache_config,
+                quant_config=quant_config,
+            ),
+            prefix=f"{prefix}.layers",
+        )
+
+        if get_pp_group().is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer()
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        """Look up token embeddings for ``input_ids``."""
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        """Run this rank's layers.
+
+        Returns the normalised hidden states on the last pipeline rank, and
+        otherwise an ``IntermediateTensors`` holding ``hidden_states`` and
+        ``residual`` for the next rank.
+        """
+        if get_pp_group().is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds
+            else:
+                hidden_states = self.embed_input_ids(input_ids)
+            residual = None
+        else:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+            residual = intermediate_tensors["residual"]
+
+        for layer in self.layers[self.start_layer : self.end_layer]:
+            hidden_states, residual = layer(positions, hidden_states, residual)
+
+        if not get_pp_group().is_last_rank:
+            return IntermediateTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        """Return the checkpoint-to-parameter mapping for expert weights."""
+        # NOTE(review): num_redundant_experts is fixed at 0 here while
+        # MiniMaxM2MoE passes the EPLB value to FusedMoE -- confirm that
+        # weight loading is correct when redundant experts are configured.
+        return FusedMoE.make_expert_params_mapping(
+            self,
+            ckpt_gate_proj_name="w1",
+            ckpt_down_proj_name="w2",
+            ckpt_up_proj_name="w3",
+            num_experts=self.config.num_local_experts,
+            num_redundant_experts=0,
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        """Load checkpoint tensors into this module's parameters.
+
+        Each weight is tried against the stacked QKV mapping, then the expert
+        mapping, then loaded directly. Rotary ``inv_freq`` buffers and
+        speculative-decoding layers are skipped.
+
+        Returns:
+            Names of the parameters that were processed.
+        """
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        expert_params_mapping = self.get_expert_mapping()
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            spec_layer = get_spec_layer_idx_from_weight_name(self.config, name)
+            if spec_layer is not None:
+                continue  # skip spec decode layers for main model
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                # Skip non-stacked layers and experts (experts handled below).
+                if weight_name not in name:
+                    continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                # NOTE(review): the comment above reads as inherited from a
+                # model with gate/up stacking; only q/k/v are stacked here.
+                if ("mlp.experts." in name) and name not in params_dict:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                if is_pp_missing_parameter(name, self):
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Not a stacked QKV weight: try the expert mapping next.
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+
+                    if is_pp_missing_parameter(name, self):
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    # Neither stacked nor expert: load by name.
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+
+                    # Remapping the name of FP8 kv-scale.
+                    name = maybe_remap_kv_scale_name(name, params_dict)
+                    if name is None:
+                        continue
+
+                    if is_pp_missing_parameter(name, self):
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+
+class MiniMaxM2MixtureOfExperts(MixtureOfExperts):
+    """EPLB protocol implementation for MiniMax M2/M2.5."""
+
+    moe_mlp_layers: list[MiniMaxM2MoE]
+
+    def extract_moe_parameters(self, example_moe: MiniMaxM2MoE | None):
+        """Copy expert counts from ``example_moe``, or zero them when it is None."""
+        if example_moe is None:
+            self.num_moe_layers = 0
+            self.num_expert_groups = 0
+            self.num_logical_experts = 0
+            self.num_physical_experts = 0
+            self.num_local_physical_experts = 0
+            self.num_routed_experts = 0
+            self.num_shared_experts = 0
+            self.num_redundant_experts = 0
+            logger.warning("MiniMax M2: No MoE layer found in model.layers.")
+        else:
+            self.num_logical_experts = example_moe.n_logical_experts
+            self.num_physical_experts = example_moe.n_physical_experts
+            self.num_local_physical_experts = example_moe.n_local_physical_experts
+            self.num_routed_experts = example_moe.n_routed_experts
+            self.num_shared_experts = example_moe.n_shared_experts
+            self.num_redundant_experts = example_moe.n_redundant_experts
+
+    def update_physical_experts_metadata(
+        self,
+        num_physical_experts: int,
+        num_local_physical_experts: int,
+    ) -> None:
+        """Propagate new physical-expert counts to every MoE layer.
+
+        The local physical expert count must be unchanged (asserted).
+        """
+        assert self.num_local_physical_experts == num_local_physical_experts
+        self.num_physical_experts = num_physical_experts
+        self.num_local_physical_experts = num_local_physical_experts
+        self.num_redundant_experts = num_physical_experts - self.num_logical_experts
+        for moe in self.moe_mlp_layers:
+            moe.n_local_physical_experts = num_local_physical_experts
+            moe.n_physical_experts = num_physical_experts
+            moe.n_redundant_experts = self.num_redundant_experts
+            moe.experts.update_expert_map()
+
+
+class MiniMaxM2ForCausalLM(
+    nn.Module, SupportsLoRA, SupportsPP, MiniMaxM2MixtureOfExperts
+):
+    """MiniMax M2/M2.5 causal LM: decoder stack plus LM head and logits processor."""
+
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+    }
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        """Build the model; the LM head exists only on the last pipeline rank."""
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        self.config = config
+        self.quant_config = quant_config
+        # Expose max_model_len on the HF config; MiniMaxM2DecoderLayer reads
+        # it when sizing the rotary embedding.
+        if hasattr(vllm_config.model_config, "max_model_len"):
+            self.config.max_model_len = vllm_config.model_config.max_model_len
+        self.model = MiniMaxM2Model(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+        if get_pp_group().is_last_rank:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size, config.hidden_size, quant_config=None
+            )
+        else:
+            self.lm_head = PPMissingLayer()
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors
+        )
+
+        self.num_moe_layers = config.num_hidden_layers
+        self._set_moe_parameters()
+
+    def _set_moe_parameters(self):
+        """Collect the MoE layers present on this rank and record their expert counts."""
+        self.expert_weights: list = []
+        self.num_expert_groups = 1
+        self.moe_layers: list = []
+        self.moe_mlp_layers: list[MiniMaxM2MoE] = []
+        example_moe = None
+        for layer in self.model.layers:
+            # Layers owned by other pipeline ranks are placeholders.
+            if isinstance(layer, PPMissingLayer):
+                continue
+            assert isinstance(layer, MiniMaxM2DecoderLayer)
+            if isinstance(layer.block_sparse_moe, MiniMaxM2MoE):
+                example_moe = layer.block_sparse_moe
+                self.moe_mlp_layers.append(layer.block_sparse_moe)
+                self.moe_layers.append(layer.block_sparse_moe.experts)
+        self.extract_moe_parameters(example_moe)
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        """Look up token embeddings via the inner model."""
+        return self.model.embed_input_ids(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs,
+    ) -> torch.Tensor | IntermediateTensors:
+        """Delegate to the inner model; extra keyword arguments are ignored."""
+        hidden_states = self.model(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor | None:
+        """Turn final hidden states into vocabulary logits."""
+        logits = self.logits_processor(self.lm_head, hidden_states)
+        return logits
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        """Load checkpoint weights through ``AutoWeightsLoader``."""
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights)
+
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        """Return the inner model's expert weight mapping."""
+        return self.model.get_expert_mapping()
+
+
+def get_spec_layer_idx_from_weight_name(
+    config: PretrainedConfig, weight_name: str
+) -> int | None:
+    """Return the MTP layer index ``weight_name`` belongs to, or ``None``.
+
+    MTP layers are taken to be the ``config.num_mtp_modules`` layers numbered
+    from ``config.num_hidden_layers`` upwards; ``None`` is returned when the
+    config has no MTP modules or the name matches none of them.
+    """
+    if hasattr(config, "num_mtp_modules") and (config.num_mtp_modules > 0):
+        layer_idx = config.num_hidden_layers
+        for i in range(config.num_mtp_modules):
+            if weight_name.startswith(f"model.layers.{layer_idx + i}."):
+                return layer_idx + i
+    return None
diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index bbe8de6aa..3c92422be 100755 ---
a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -1,63 +1,25 @@ #!/bin/bash -# SGLang Disaggregated Server Launcher with Model-Specific Configurations +# Dual-Engine Disaggregated Server Dispatcher # ============================================================================= - -# ============================================================================= -# Environment Configuration +# Dispatches to the engine-specific server launcher based on ENGINE env var. +# ENGINE=sglang (default) -> server_sglang.sh (SGLang + MoRI) +# ENGINE=vllm -> server_vllm.sh (vLLM + Nixl/MoRI-IO) # ============================================================================= -NODE0_ADDR="${NODE0_ADDR:-localhost}" -NODE_RANK="${NODE_RANK:-0}" -MODEL_DIR="${MODEL_DIR:-}" -MODEL_NAME="${MODEL_NAME:-}" - -xP="${xP:-1}" #-> Number of Prefill Workers -yD="${yD:-1}" #-> Number of Decode Workers - -IPADDRS="${IPADDRS:-localhost}" -HEADNODE_PORT="${HEADNODE_PORT:-20000}" -# Parallelism Configuration -PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}" -PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-true}" -PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-true}" -DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" -DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-true}" -DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-true}" -DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-0}" - -# Benchmark Configuration -BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}" -BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}" -BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}" -BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" -BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" -BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" - -# Dry Run for debugging purpose -DRY_RUN="${DRY_RUN:-0}" - -# GPU count (expandable for different hardware) -GPUS_PER_NODE="${GPUS_PER_NODE:-8}" - - -# ============================================================================= -# Dependencies and Environment Setup -# 
============================================================================= -source $SGLANG_WS_PATH/env.sh +ENGINE="${ENGINE:-sglang}" +WS_PATH="${WS_PATH:-${SGLANG_WS_PATH:-${VLLM_WS_PATH:-$(dirname "${BASH_SOURCE[0]}")}}}" +export WS_PATH ENGINE -host_ip=$(ip route get 1.1.1.1 | awk '/src/ {print $7}') -host_name=$(hostname) +echo "[DISPATCHER] ENGINE=$ENGINE WS_PATH=$WS_PATH" -# MORI_RDMA_TC configuration (optional) -# If set by runner, use it for RDMA traffic class configuration -# If not set, RDMA operations will proceed without QoS/traffic class settings -if [[ -n "${MORI_RDMA_TC}" ]]; then - echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC for RDMA traffic class configuration" - echo "[INFO] Host '$host_name' configured with MORI_RDMA_TC=$MORI_RDMA_TC" +if [[ "$ENGINE" == "vllm" ]]; then + source "$WS_PATH/server_vllm.sh" else - echo "[INFO] MORI_RDMA_TC not set. Skipping RDMA traffic class configuration." - echo "[INFO] This is normal for clusters without QoS requirements." + source "$WS_PATH/server_sglang.sh" fi +# Dispatch is complete: stop here with the launcher's exit status. The legacy SGLang launcher body +# still follows in this file and must never run; delete it once it is confirmed unused. +exit $? # ============================================================================= # Model-Specific Configuration from YAML diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh new file mode 100755 index 000000000..53ca29cc5 --- /dev/null +++ b/benchmarks/multi_node/amd_utils/server_sglang.sh @@ -0,0 +1,624 @@ +#!/bin/bash +# SGLang Disaggregated Server Launcher with Model-Specific Configurations +# ============================================================================= + +# ============================================================================= +# Environment Configuration +# ============================================================================= + +NODE0_ADDR="${NODE0_ADDR:-localhost}" +NODE_RANK="${NODE_RANK:-0}"
+MODEL_DIR="${MODEL_DIR:-}" +MODEL_NAME="${MODEL_NAME:-}" + +xP="${xP:-1}" #-> Number of Prefill Workers +yD="${yD:-1}" #-> Number of Decode Workers + +IPADDRS="${IPADDRS:-localhost}" +HEADNODE_PORT="${HEADNODE_PORT:-20000}" +# Parallelism Configuration +PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}" +PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-true}" +PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-true}" +DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" +DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-true}" +DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-true}" +DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-0}" + +# Benchmark Configuration +BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}" +BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}" +BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}" +BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" +BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" +BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" + +# Dry Run for debugging purpose +DRY_RUN="${DRY_RUN:-0}" + +# GPU count (expandable for different hardware) +GPUS_PER_NODE="${GPUS_PER_NODE:-8}" + + +# ============================================================================= +# Dependencies and Environment Setup +# ============================================================================= +source $WS_PATH/env.sh + +host_ip=$(ip route get 1.1.1.1 | awk '/src/ {print $7}') +host_name=$(hostname) + +# MORI_RDMA_TC configuration (optional) +# If set by runner, use it for RDMA traffic class configuration +# If not set, RDMA operations will proceed without QoS/traffic class settings +if [[ -n "${MORI_RDMA_TC}" ]]; then + echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC for RDMA traffic class configuration" + echo "[INFO] Host '$host_name' configured with MORI_RDMA_TC=$MORI_RDMA_TC" +else + echo "[INFO] MORI_RDMA_TC not set. Skipping RDMA traffic class configuration." + echo "[INFO] This is normal for clusters without QoS requirements." 
+fi + +# ============================================================================= +# Model-Specific Configuration from YAML +# ============================================================================= +MODELS_YAML="${WS_PATH}/models.yaml" + +if [[ ! -f "$MODELS_YAML" ]]; then + echo "ERROR: models.yaml not found at $MODELS_YAML" + exit 1 +fi + +# Load model config via inline Python (PyYAML is available in SGLang containers) +# Formula evaluation (e.g. "SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK * TP * xP") +# is done here in Python to avoid bash glob-expanding the * characters. +eval "$(python3 -c " +import yaml, sys, os + +config_path = '${MODELS_YAML}' +model_name = '${MODEL_NAME}' + +with open(config_path) as f: + models = yaml.safe_load(f) + +if model_name not in models: + print(f'echo \"ERROR: Model {model_name} not in models.yaml\"; exit 1') + sys.exit(0) + +m = models[model_name] + +def eval_formula(val): + \"\"\"Evaluate chunked_prefill_size: if string, resolve variable names from env and compute.\"\"\" + if isinstance(val, (int, float)): + return int(val) + s = str(val) + # Build a namespace from env vars (convert numeric values to int) + ns = {} + for k, v in os.environ.items(): + try: + ns[k] = int(v) + except (ValueError, TypeError): + pass + try: + return int(eval(s, {'__builtins__': {}}, ns)) + except Exception as e: + print(f'echo \"WARNING: Cannot evaluate formula: {s} ({e})\"', file=sys.stderr) + return val + +def parse_range(cuda_range, default_start, default_end): + if '-' in str(cuda_range): + s, e = str(cuda_range).split('-') + return s, e + return str(default_start), str(default_end) + +# Output shell variables +print(f'MODEL_BASE_FLAGS=\"{m.get(\"base_flags\", \"\")}\"') +print(f'MODEL_MTP_FLAGS=\"{m.get(\"mtp_flags\", \"\")}\"') +print(f'MODEL_DP_FLAGS=\"{m.get(\"dp_flags\", \"\")}\"') + +prefill = m.get('prefill', {}) +decode = m.get('decode', {}) + +print(f'PREFILL_MEM_FRACTION_STATIC=\"{prefill.get(\"mem_fraction_static\", 
0.8)}\"') +print(f'PREFILL_DISABLE_RADIX_CACHE=\"{prefill.get(\"disable_radix_cache\", True)}\"') + +dp = prefill.get('dp', {}) +no_dp = prefill.get('no_dp', {}) +print(f'PREFILL_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 24)}\"') +print(f'PREFILL_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"') +print(f'PREFILL_CUDA_GRAPH_BS_DP=\"{dp.get(\"cuda_graph_bs\", \"1 2 3\")}\"') +print(f'PREFILL_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"') +print(f'PREFILL_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"') +s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128) +print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"') +print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"') + +print(f'DECODE_MEM_FRACTION_STATIC=\"{decode.get(\"mem_fraction_static\", 0.85)}\"') +print(f'DECODE_PREFILL_ROUND_ROBIN_BALANCE=\"{decode.get(\"prefill_round_robin_balance\", True)}\"') + +dp = decode.get('dp', {}) +ep_only = decode.get('ep_only', {}) +no_dp = decode.get('no_dp', {}) + +# Decode DP config +print(f'DECODE_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 4096)}\"') +print(f'DECODE_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"') +s, e = parse_range(dp.get('cuda_graph_bs_range', '1-160'), 1, 160) +print(f'DECODE_CUDA_GRAPH_BS_DP_START=\"{s}\"') +print(f'DECODE_CUDA_GRAPH_BS_DP_END=\"{e}\"') + +# Decode EP-only config (EP enabled but DP disabled) +print(f'DECODE_MAX_RUNNING_REQUESTS_EP_ONLY=\"{ep_only.get(\"max_running_requests\", 256)}\"') +print(f'DECODE_CHUNKED_PREFILL_SIZE_EP_ONLY=\"{eval_formula(ep_only.get(\"chunked_prefill_size\", 262144))}\"') +s, e = parse_range(ep_only.get('cuda_graph_bs_range', '1-256'), 1, 256) +print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_START=\"{s}\"') +print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_END=\"{e}\"') + +# Decode no-DP config 
+print(f'DECODE_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"') +print(f'DECODE_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"') +s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128) +print(f'DECODE_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"') +print(f'DECODE_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"') +")" + +echo "Loaded model configuration for: $MODEL_NAME" + +# Compute DP-dependent prefill parameters +if [[ "$PREFILL_ENABLE_DP" == "true" ]]; then + prefill_cuda_graph_bs=($PREFILL_CUDA_GRAPH_BS_DP) + prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_DP + prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_DP +else + prefill_cuda_graph_bs=($(seq $PREFILL_CUDA_GRAPH_BS_NO_DP_START $PREFILL_CUDA_GRAPH_BS_NO_DP_END)) + prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_NO_DP + prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_NO_DP +fi + +# Compute DP-dependent decode parameters (3-way: DP > EP-only > no_dp) +if [[ "$DECODE_ENABLE_DP" == "true" ]]; then + decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_DP_START $DECODE_CUDA_GRAPH_BS_DP_END)) + decode_max_running_requests=$((DECODE_CUDA_GRAPH_BS_DP_END * DECODE_TP_SIZE)) +elif [[ "$DECODE_ENABLE_EP" == "true" ]]; then + decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_EP_ONLY_START $DECODE_CUDA_GRAPH_BS_EP_ONLY_END)) + decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_EP_ONLY +else + decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_NO_DP_START $DECODE_CUDA_GRAPH_BS_NO_DP_END)) + decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP +fi + +# Use Decode configuration to configure different TP/DP size between P and D +PREFILL_DECODE_DIFFERENT_TP="" +if [[ "$PREFILL_ENABLE_DP" != "$DECODE_ENABLE_DP" ]]; then + if [[ "$DECODE_ENABLE_DP" == "true" ]]; then + PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp ${DECODE_TP_SIZE}" + else + 
PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp 1" + fi +fi + +# Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS) +PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} ${PREFILL_DECODE_DIFFERENT_TP}" +if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then + PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache" +fi + +DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]}" +if [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "True" ]] || [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "true" ]]; then + DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS --prefill-round-robin-balance" +fi + +if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then + MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) +fi + +# ============================================================================= +# Cluster Topology Configuration +# ============================================================================= +IFS=',' read -ra IP_ARRAY <<< "$IPADDRS" + +# Nodes per worker = ceil(TP_SIZE / GPUS_PER_NODE); uses GPUS_PER_NODE - 1 (not a literal 7) so it holds for any node size +PREFILL_NODES_PER_WORKER=$(((PREFILL_TP_SIZE + GPUS_PER_NODE - 1) / GPUS_PER_NODE)) +DECODE_NODES_PER_WORKER=$(((DECODE_TP_SIZE + GPUS_PER_NODE - 1) / GPUS_PER_NODE)) +NODE_OFFSET=$((PREFILL_NODES_PER_WORKER * xP)) + +# Build prefill arguments dynamically based on xP +PREFILL_HEADNODE_URLS=() +PREFILL_ARGS="" +for i in $(seq 0 $((xP - 1))); do + prefill_idx=$((i * PREFILL_NODES_PER_WORKER)) + PREFILL_HEADNODE_URLS[$i]="${IP_ARRAY[$prefill_idx]}:${HEADNODE_PORT}" + PREFILL_ARGS="$PREFILL_ARGS --prefill http://${IP_ARRAY[$prefill_idx]}:8000" +done + +# Build
decode arguments dynamically based on yD +DECODE_HEADNODE_URLS=() +DECODE_ARGS="" +for i in $(seq 0 $((yD - 1))); do + decode_idx=$((i * DECODE_NODES_PER_WORKER + NODE_OFFSET)) + DECODE_HEADNODE_URLS[$i]="${IP_ARRAY[$decode_idx]}:${HEADNODE_PORT}" + DECODE_ARGS="$DECODE_ARGS --decode http://${IP_ARRAY[$decode_idx]}:8000" +done + +echo "Prefill worker headnode list: ${PREFILL_HEADNODE_URLS[@]}" +echo "Decode worker headnode list: ${DECODE_HEADNODE_URLS[@]}" + +# ============================================================================= +# Configuration Builder Functions +# ============================================================================= + +build_server_config() { + local mode="$1" + local model_name="$2" + local tp_size="$3" + local enable_ep="$4" + local enable_dp="$5" + local decode_mtp_size="$6" + + # Calculate EP and DP sizes based on enable flags + local ep_size=1 + local dp_size=1 + + if [[ "$enable_ep" == "true" ]]; then + ep_size=$tp_size + fi + + if [[ "$enable_dp" == "true" ]]; then + dp_size=$tp_size + fi + + # Build parallelism arguments + local parallel_args="--tp-size ${tp_size}" + + if [[ "$enable_ep" == "true" ]]; then + parallel_args="$parallel_args --ep-size ${ep_size}" + fi + + if [[ "$enable_dp" == "true" ]]; then + parallel_args="$parallel_args --dp-size ${dp_size}" + fi + + # Get model-specific configuration from YAML-loaded variables + local base_config="$MODEL_BASE_FLAGS" + local mtp_config="" + local dp_config="" + local specific_config="" + + # MTP config (only if MTP is enabled and mode is decode) + if [ "$decode_mtp_size" -gt 0 ]; then + mtp_config="${MODEL_MTP_FLAGS} --speculative-num-steps ${decode_mtp_size} --speculative-num-draft-tokens $((decode_mtp_size + 1))" + fi + + # DP config (only if DP is enabled) + if [[ "$enable_dp" == "true" ]]; then + dp_config="$MODEL_DP_FLAGS" + fi + + # Mode-specific config + if [[ "$mode" == "prefill" ]]; then + specific_config="$PREFILL_MODE_FLAGS" + elif [[ "$mode" == "decode" ]]; 
then + specific_config="$DECODE_MODE_FLAGS" + fi + + # Combine: parallel args + base config + mtp config (decode only) + dp config + specific config + local full_config="$parallel_args" + if [[ -n "$base_config" ]]; then + full_config="$full_config $base_config" + fi + if [[ -n "$mtp_config" ]] && [[ "$mode" == "decode" ]]; then + full_config="$full_config $mtp_config" + fi + if [[ -n "$dp_config" ]]; then + full_config="$full_config $dp_config" + fi + if [[ -n "$specific_config" ]]; then + full_config="$full_config $specific_config" + fi + + echo "$full_config" +} + +# Build complete server configurations +PREFILL_SERVER_CONFIG=$(build_server_config "prefill" "$MODEL_NAME" "$PREFILL_TP_SIZE" "$PREFILL_ENABLE_EP" "$PREFILL_ENABLE_DP" "$DECODE_MTP_SIZE") +DECODE_SERVER_CONFIG=$(build_server_config "decode" "$MODEL_NAME" "$DECODE_TP_SIZE" "$DECODE_ENABLE_EP" "$DECODE_ENABLE_DP" "$DECODE_MTP_SIZE") + +if [[ -n "$MODEL_NAME" ]]; then + echo "Using model-specific configuration for: $MODEL_NAME" +fi + +# ============================================================================= +# Container Synchronization +# ============================================================================= + +echo "Waiting at the container creation barrier on $host_name" +python3 $WS_PATH/sync.py barrier \ + --local-ip ${host_ip} \ + --local-port 5000 \ + --enable-port \ + --node-ips ${IPADDRS} \ + --node-ports 5000 \ + --wait-for-all-ports \ + --timeout 300 + + +# ============================================================================= +# Node Role Assignment and Server Launch +# ============================================================================= + +if [ "$NODE_RANK" -eq 0 ]; then + echo "NODE INFO =======================================" + echo "================================================" + echo "Node List : ${SLURM_JOB_NODELIST}" + echo "Node IPs : ${IPADDRS}" + echo "Model Name : ${MODEL_NAME:-'Not specified'}" + echo 
"================================================" + + echo "CLUSTER INFO ====================================" + echo "================================================" + echo "${host_name}:${host_ip} is Proxy Node and Prefill Node" + echo "Using prefill config: $PREFILL_SERVER_CONFIG" + echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}" + echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}" + echo "Prefill servers ($((PREFILL_TP_SIZE/GPUS_PER_NODE)) nodes): ${PREFILL_ARGS}" + echo "Decode servers ($((DECODE_TP_SIZE/GPUS_PER_NODE)) nodes): ${DECODE_ARGS}" + echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK: ${MORI_MAX_DISPATCH_TOKENS_PREFILL}" + echo "Decode env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE}" + echo "================================================" + + # start the head prefill server + PREFILL_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ + --model-path $MODEL_DIR/$MODEL_NAME \ + --disaggregation-mode prefill \ + --disaggregation-ib-device ${IBDEVICES} \ + --host 0.0.0.0 \ + --port 8000 \ + --trust-remote-code \ + ${PREFILL_SERVER_CONFIG} \ + --log-level-http warning" + + if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then + PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[0]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank 0" + fi + + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $PREFILL_CMD" + else + set -x + eval "$PREFILL_CMD" \ + 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log & + set +x + prefill0_pid=$! + fi + + + echo "Waiting for all prefill and decode servers to be up . . ." 
+ + + BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + --node-ips ${IPADDRS} \ + --node-ports 8000 \ + --wait-for-all-ports \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BARRIER_CMD" + else + eval "$BARRIER_CMD" + fi + echo "Congratulations!!! All prefill and decode servers are up . . ." + + ROUTER_CMD="python -m sglang_router.launch_router \ + --pd-disaggregation \ + --port 30000 \ + --policy random \ + --prefill-policy random \ + --decode-policy random \ + ${PREFILL_ARGS} \ + ${DECODE_ARGS}" + + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $ROUTER_CMD" + else + ROUTER_LOG_FILE="/tmp/slurm_job-${SLURM_JOB_ID}_proxy_${host_name}.log" + set -x + if [[ "${SGLANG_ROUTER_STDOUT_LOGS:-0}" == "1" ]]; then + eval "$ROUTER_CMD" 2>&1 | tee "$ROUTER_LOG_FILE" & + else + eval "$ROUTER_CMD" >"$ROUTER_LOG_FILE" 2>&1 & + fi + set +x + proxy_pid=$! + + # Wait for router to be ready via health endpoint + HEALTH_BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports 30000 \ + --wait-for-all-health \ + --health-endpoint /readiness \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $HEALTH_BARRIER_CMD" + else + eval "$HEALTH_BARRIER_CMD" + fi + + echo "Router is ready for benchmarking" + fi + + + echo "Ready for benchmarking on ${host_name}:${host_ip}" + + echo "Benchmarking on ${host_name}:${host_ip}" + cd $WS_PATH + + # Export IS_MTP based on whether MTP is enabled + if [ "$DECODE_MTP_SIZE" -gt 0 ]; then + export IS_MTP=true + else + export IS_MTP=false + fi + + # Args: n_prefill n_decode prefill_gpus decode_gpus model_dir model_name log_path isl osl concurrency_list req_rate random_range_ratio num_prompts_multiplier (concurrency_list keeps escaped quotes so it stays one argument through eval) + BENCH_CMD="bash $WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \ + $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \ + ${BENCH_OUTPUT_LEN} \"${BENCH_MAX_CONCURRENCY}\" ${BENCH_REQUEST_RATE} \ + ${BENCH_RANDOM_RANGE_RATIO}
${BENCH_NUM_PROMPTS_MULTIPLIER}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BENCH_CMD" + else + set -x + eval "$BENCH_CMD" + set +x + fi + + # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host) + LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs" + mkdir -p "$LOGS_OUTPUT" + + if [[ "$DRY_RUN" -eq 0 ]]; then + cp -r /run_logs/slurm_job-${SLURM_JOB_ID} "$LOGS_OUTPUT/" + echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}" + fi + + echo "Killing the proxy server and prefill server" + + if [[ "$DRY_RUN" -eq 0 ]]; then + kill $proxy_pid + kill $prefill0_pid + fi + +elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then + echo "${host_name}:${host_ip} is Prefill Node (Model: ${MODEL_NAME:-'default'})" + echo "Using prefill config: $PREFILL_SERVER_CONFIG" + echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}" + + PREFILL_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ + --model-path $MODEL_DIR/${MODEL_NAME} \ + --disaggregation-mode prefill \ + --disaggregation-ib-device ${IBDEVICES} \ + --host 0.0.0.0 \ + --port 8000 \ + --trust-remote-code \ + ${PREFILL_SERVER_CONFIG} \ + --log-level-http warning" + + if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then + rank=$((NODE_RANK % PREFILL_NODES_PER_WORKER)) + prefill_idx=$((NODE_RANK / PREFILL_NODES_PER_WORKER)) + PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[$prefill_idx]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank $rank" + fi + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $PREFILL_CMD" + else + set -x + eval "$PREFILL_CMD" \ + 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log & + set +x + prefill_pid=$! + fi + + echo "Waiting for proxy server to be up..." 
+ BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports 30000 \ + --wait-for-all-ports \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BARRIER_CMD" + else + eval "$BARRIER_CMD" + fi + + echo "Waiting until proxy server closes..." + WAIT_CMD="python3 $WS_PATH/sync.py wait \ + --remote-ip ${NODE0_ADDR} \ + --remote-port 30000" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $WAIT_CMD" + else + eval "$WAIT_CMD" + fi + + echo "Killing the rank $NODE_RANK prefill server" + + if [[ "$DRY_RUN" -eq 0 ]]; then + kill $prefill_pid + fi + +else + RANK=$((NODE_RANK - xP * PREFILL_NODES_PER_WORKER)) + echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME:-'default'})" + echo "Using decode config: $DECODE_SERVER_CONFIG" + echo "Decode node rank: $RANK" + echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}" + + DECODE_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \ + --model-path ${MODEL_DIR}/${MODEL_NAME} \ + --disaggregation-mode decode \ + --disaggregation-ib-device ${IBDEVICES} \ + --host 0.0.0.0 \ + --port 8000 \ + --trust-remote-code \ + ${DECODE_SERVER_CONFIG} \ + --log-level-http warning" + + if [ "$DECODE_NODES_PER_WORKER" -gt 1 ]; then + rank=$((RANK % DECODE_NODES_PER_WORKER)) + decode_idx=$((RANK / DECODE_NODES_PER_WORKER)) + DECODE_CMD="$DECODE_CMD --dist-init-addr ${DECODE_HEADNODE_URLS[$decode_idx]} --nnodes ${DECODE_NODES_PER_WORKER} --node-rank $rank" + fi + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $DECODE_CMD" + else + set -x + eval "$DECODE_CMD" \ + 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log & + + set +x + decode_pid=$! + fi + + + echo "Waiting for proxy server to be up..." 
+ BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports 30000 \ + --wait-for-all-ports \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BARRIER_CMD" + else + eval "$BARRIER_CMD" + fi + + + echo "Waiting until proxy server closes..." + WAIT_CMD="python3 $WS_PATH/sync.py wait \ + --remote-ip ${NODE0_ADDR} \ + --remote-port 30000" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $WAIT_CMD" + else + eval "$WAIT_CMD" + fi + + echo "Killing the rank $RANK decode server" + if [[ "$DRY_RUN" -eq 0 ]]; then + kill $decode_pid + fi + +fi + +echo "Script completed successfully" +exit 0 diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh new file mode 100755 index 000000000..a10e45d6d --- /dev/null +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -0,0 +1,490 @@ +#!/bin/bash +# vLLM Disaggregated Server Launcher with Model-Specific Configurations +# ============================================================================= +# +# Node role assignment (by NODE_RANK): +# 0 -> Proxy/Router + first Prefill node (kv_producer) +# 1..xP-1 -> Additional Prefill nodes (kv_producer) +# xP..xP+yD-1 -> Decode nodes (kv_consumer) +# +# Total nodes = xP + yD (router co-located with first prefill, like SGLang). 
+ +# ============================================================================= +# Dependency Setup (idempotent; required when using base vLLM image) +# ============================================================================= +source "$(dirname "${BASH_SOURCE[0]}")/setup_deps.sh" + +# ============================================================================= +# Environment Configuration +# ============================================================================= + +NODE0_ADDR="${NODE0_ADDR:-localhost}" +NODE_RANK="${NODE_RANK:-0}" +MODEL_DIR="${MODEL_DIR:-}" +MODEL_NAME="${MODEL_NAME:-}" + +xP="${xP:-1}" +yD="${yD:-1}" + +IPADDRS="${IPADDRS:-localhost}" + +# Benchmark Configuration +BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}" +BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}" +BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}" +BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" +BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" +BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" + +DRY_RUN="${DRY_RUN:-0}" +GPUS_PER_NODE="${GPUS_PER_NODE:-8}" + +ROUTER_PORT="${ROUTER_PORT:-30000}" +SERVER_PORT="${SERVER_PORT:-2584}" +ENGINE_ID="${ENGINE_ID:-${MODEL_NAME}-pd-run}" + +# Prefer MODEL_PATH from job.slurm (handles HF cache snapshot resolution) +MODEL_PATH="${MODEL_PATH:-${MODEL_DIR}/${MODEL_NAME}}" + +# ============================================================================= +# Dependencies and Environment Setup +# ============================================================================= +source $WS_PATH/env.sh + +host_ip=$(ip route get 1.1.1.1 2>/dev/null | awk '/src/ {print $7}') +# RDMA IP for Nixl KV transfer (prefer 192.168.x.x subnet if available) +rdma_ip=$(hostname -I | tr ' ' '\n' | grep '^192\.168\.' 
| head -1) +rdma_ip="${rdma_ip:-$host_ip}" +host_name=$(hostname) + +echo "[INFO] Management IP (barriers/proxy): $host_ip" +echo "[INFO] RDMA IP (Nixl KV transfer): $rdma_ip" + +# ============================================================================= +# RDMA / Nixl Workarounds +# ============================================================================= + +setup_rdma_env() { + # Pensando ionic (RoCEv2) point-to-point /31 route fix. + # Each benic interface has a /31 to the TOR switch. Without explicit routes, + # traffic to other nodes' RDMA IPs falls through to the management network. + if [[ "$rdma_ip" =~ ^192\.168\.([0-9]+)\.([0-9]+)$ ]]; then + local rdma_subnet="${BASH_REMATCH[1]}" + local rdma_host="${BASH_REMATCH[2]}" + local rdma_gw="192.168.${rdma_subnet}.$(( rdma_host | 1 ))" + local rdma_iface + rdma_iface=$(ip -o addr show | awk -v ip="$rdma_ip" '$4 ~ ip {print $2}' | head -1) + if [[ -n "$rdma_iface" ]]; then + ip route replace "192.168.${rdma_subnet}.0/24" via "$rdma_gw" dev "$rdma_iface" 2>/dev/null && \ + echo "[RDMA-ROUTE] Added 192.168.${rdma_subnet}.0/24 via $rdma_gw dev $rdma_iface" || \ + echo "[RDMA-ROUTE] Route add failed for 192.168.${rdma_subnet}.0/24" + fi + fi + + # Patch Nixl UCX backend: set ucx_error_handling_mode=none. + # Required for ALL NIC types under high concurrency (C512+). Without this, + # UCX's default UCP_ERR_HANDLING_MODE_PEER triggers transport-level error + # recovery on ibv_post_send failures, preventing RIXL RDMA READ retries from + # recovering gracefully. This causes the prefill KV cache to fill to 100% + # and deadlock the pipeline. On ionic NICs this was already applied (rdmacm + # incompatibility); on mlx5 NICs it was incorrectly skipped. + local nixl_api + nixl_api=$(python3 -c "import rixl._api; print(rixl._api.__file__)" 2>/dev/null) + if [[ -n "$nixl_api" ]]; then + if ! 
grep -q 'ucx_error_handling_mode' "$nixl_api"; then + sed -i '/self\.create_backend(bknd, init)/i\ init["ucx_error_handling_mode"] = "none"' "$nixl_api" + echo "[PATCH] Added ucx_error_handling_mode=none to $nixl_api (IBDEVICES=${IBDEVICES:-unset})" + else + echo "[PATCH] ucx_error_handling_mode already set in $nixl_api" + fi + fi +} + +setup_rdma_env + +if [[ -z "$UCX_NET_DEVICES" ]]; then + echo "Error: UCX_NET_DEVICES is empty after env.sh detection" >&2 + exit 1 +fi + +# ============================================================================= +# Model-Specific Configuration from YAML +# ============================================================================= +MODELS_YAML="${WS_PATH}/models_vllm.yaml" + +if [[ ! -f "$MODELS_YAML" ]]; then + echo "ERROR: models.yaml not found at $MODELS_YAML" + exit 1 +fi + +if [[ -z "$MODEL_NAME" ]]; then + echo "ERROR: MODEL_NAME is not set"; exit 1 +fi + +eval "$(python3 -c " +import yaml, sys + +with open('${MODELS_YAML}') as f: + models = yaml.safe_load(f) + +model_name = '${MODEL_NAME}' +if model_name not in models: + print(f'echo \"ERROR: Model {model_name} not in models.yaml\"; exit 1') + sys.exit(0) + +m = models[model_name] + +def bash_escape(s): + \"\"\"Escape a value for safe embedding in a bash double-quoted assignment.\"\"\" + return s.replace('\\\\', '\\\\\\\\').replace('\"', '\\\\\"').replace('\$', '\\\\\$').replace('\`', '\\\\\`') + +pf = bash_escape(m.get('prefill_flags', '--tensor-parallel-size 8')) +df = bash_escape(m.get('decode_flags', '--tensor-parallel-size 8')) +ev = bash_escape(m.get('env', '')) +dev = bash_escape(m.get('decode_env', '')) +print(f'PREFILL_SERVER_CONFIG=\"{pf}\"') +print(f'DECODE_SERVER_CONFIG=\"{df}\"') +print(f'MODEL_ENVS=\"{ev}\"') +print(f'DECODE_MODEL_ENVS=\"{dev}\"') +")" + +echo "Loaded model configuration for: $MODEL_NAME" + +# Apply tensor-parallel size and EP/DP flags from submit pipeline. 
+if [[ -n "${PREFILL_TP_SIZE:-}" ]]; then + if echo "$PREFILL_SERVER_CONFIG" | grep -q -- '--tensor-parallel-size'; then + PREFILL_SERVER_CONFIG=$(echo "$PREFILL_SERVER_CONFIG" | sed -E "s/--tensor-parallel-size[[:space:]]+[0-9]+/--tensor-parallel-size ${PREFILL_TP_SIZE}/g") + else + PREFILL_SERVER_CONFIG+=" --tensor-parallel-size ${PREFILL_TP_SIZE}" + fi +fi +if [[ -n "${DECODE_TP_SIZE:-}" ]]; then + if echo "$DECODE_SERVER_CONFIG" | grep -q -- '--tensor-parallel-size'; then + DECODE_SERVER_CONFIG=$(echo "$DECODE_SERVER_CONFIG" | sed -E "s/--tensor-parallel-size[[:space:]]+[0-9]+/--tensor-parallel-size ${DECODE_TP_SIZE}/g") + else + DECODE_SERVER_CONFIG+=" --tensor-parallel-size ${DECODE_TP_SIZE}" + fi +fi +if [[ "${PREFILL_ENABLE_EP:-false}" == "true" ]] && ! echo "$PREFILL_SERVER_CONFIG" | grep -q -- '--enable-expert-parallel'; then + PREFILL_SERVER_CONFIG+=" --enable-expert-parallel" +fi +if [[ "${PREFILL_ENABLE_DP:-false}" == "true" ]] && ! echo "$PREFILL_SERVER_CONFIG" | grep -q -- '--enable-dp-attention'; then + PREFILL_SERVER_CONFIG+=" --enable-dp-attention" +fi +if [[ "${DECODE_ENABLE_EP:-false}" == "true" ]] && ! echo "$DECODE_SERVER_CONFIG" | grep -q -- '--enable-expert-parallel'; then + DECODE_SERVER_CONFIG+=" --enable-expert-parallel" +fi +if [[ "${DECODE_ENABLE_DP:-false}" == "true" ]] && ! 
echo "$DECODE_SERVER_CONFIG" | grep -q -- '--enable-dp-attention'; then + DECODE_SERVER_CONFIG+=" --enable-dp-attention" +fi + +echo "PREFILL_SERVER_CONFIG (after TP/EP/DP): $PREFILL_SERVER_CONFIG" +echo "DECODE_SERVER_CONFIG (after TP/EP/DP): $DECODE_SERVER_CONFIG" + +# ============================================================================= +# Container Synchronization +# ============================================================================= + +echo "Waiting at the container creation barrier on $host_name" +python3 $WS_PATH/sync.py barrier \ + --local-ip ${host_ip} \ + --local-port 5000 \ + --enable-port \ + --node-ips ${IPADDRS} \ + --node-ports 5000 \ + --wait-for-all-ports \ + --timeout 600 + +# ============================================================================= +# ETCD Server Setup +# ============================================================================= + +echo "Proceeding to start etcd server on $host_name" +bash ${WS_PATH}/start_etcd.sh > /dev/null 2>&1 & +etcd_pid=$! + +echo "Waiting at etcd server barrier on $host_name" +python3 $WS_PATH/sync.py barrier \ + --node-ips ${IPADDRS} \ + --node-ports 2379 \ + --wait-for-all-ports \ + --timeout 300 + +echo "All etcd servers are up : $host_name" +sleep 3 + +echo "etcd endpoint health==================" +etcdctl endpoint health 2>&1 || /usr/local/bin/etcd/etcdctl endpoint health 2>&1 || true +echo "======================================" + +python3 $WS_PATH/sync.py barrier \ + --node-ips ${IPADDRS} \ + --node-ports 2379 \ + --wait-for-all-ports \ + --timeout 300 + +# ============================================================================= +# Cluster Topology Configuration +# ============================================================================= +IFS=',' read -ra IP_ARRAY <<< "$IPADDRS" + +PREFILL_ARGS="" +DECODE_ARGS="" + +for ((i=0; i "$PROXY_LOG_FILE" 2>&1 & + set +x + proxy_pid=$! 
+ sleep 3 + fi + + PREFILL_CMD="vllm serve ${MODEL_PATH} \ + --port $SERVER_PORT \ + --trust-remote-code \ + --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ + ${PREFILL_SERVER_CONFIG}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $PREFILL_CMD" + else + PREFILL_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log" + set -x + eval "$PREFILL_CMD" > "$PREFILL_LOG_FILE" 2>&1 & + set +x + prefill_pid=$! + fi + + echo "Waiting for all prefill and decode servers to be up . . ." + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: skipping barrier (wait-for-all-ports)" + else + python3 $WS_PATH/sync.py barrier \ + --node-ips ${IPADDRS} \ + --node-ports $SERVER_PORT \ + --wait-for-all-ports \ + --timeout 1800 + fi + + echo "Congratulations!!! All prefill and decode servers are up . . ." 
+ + # Wait for proxy /health to confirm it is accepting requests + HEALTH_BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports ${ROUTER_PORT} \ + --wait-for-all-health \ + --health-endpoint /health \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $HEALTH_BARRIER_CMD" + else + eval "$HEALTH_BARRIER_CMD" + echo "MoRI-IO proxy is ready for benchmarking" + fi + + echo "Ready for benchmarking on ${host_name}:${host_ip}" + echo "Benchmarking on ${host_name}:${host_ip}" + cd $WS_PATH + + export ROUTER_PORT=$ROUTER_PORT + BENCH_CMD="bash $WS_PATH/bench.sh ${xP} ${yD} $((GPUS_PER_NODE*xP)) $((GPUS_PER_NODE*yD)) \ + $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \ + ${BENCH_OUTPUT_LEN} \"${BENCH_MAX_CONCURRENCY}\" ${BENCH_REQUEST_RATE} \ + ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BENCH_CMD" + else + set -x + eval "$BENCH_CMD" + set +x + fi + + # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host) + LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs" + mkdir -p "$LOGS_OUTPUT" + + if [[ "$DRY_RUN" -eq 0 ]]; then + cp -r /run_logs/slurm_job-${SLURM_JOB_ID} "$LOGS_OUTPUT/" + echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}" + fi + + echo "Killing the proxy server and prefill server" + if [[ "$DRY_RUN" -eq 0 ]]; then + [[ -n "${proxy_pid:-}" ]] && kill $proxy_pid 2>/dev/null || true + [[ -n "${prefill_pid:-}" ]] && kill $prefill_pid 2>/dev/null || true + sleep 2 + # Fallback: ensure no orphaned processes keep ports open + pkill -f moriio_proxy 2>/dev/null || true + pkill -f "vllm serve" 2>/dev/null || true + fi + +elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then + echo "${host_name}:${host_ip} is Additional Prefill Node (Model: ${MODEL_NAME})" + echo "Using prefill config: $PREFILL_SERVER_CONFIG" + + setup_vllm_env + + PREFILL_CMD="vllm serve ${MODEL_PATH} \ + --port 
$SERVER_PORT \ + --trust-remote-code \ + --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ + ${PREFILL_SERVER_CONFIG}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $PREFILL_CMD" + else + PREFILL_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log" + set -x + eval "$PREFILL_CMD" > "$PREFILL_LOG_FILE" 2>&1 & + set +x + prefill_pid=$! + fi + + echo "Waiting for proxy server to be up..." + BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports ${ROUTER_PORT} \ + --wait-for-all-ports \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BARRIER_CMD" + else + eval "$BARRIER_CMD" + fi + + echo "Waiting until proxy server closes..." + WAIT_CMD="python3 $WS_PATH/sync.py wait \ + --remote-ip ${NODE0_ADDR} \ + --remote-port ${ROUTER_PORT}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $WAIT_CMD" + else + eval "$WAIT_CMD" + fi + + echo "Killing the prefill server" + [[ "$DRY_RUN" -eq 0 ]] && kill $prefill_pid 2>/dev/null || true + +else + echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME})" + echo "Using decode config: $DECODE_SERVER_CONFIG" + + setup_vllm_env + + for env_pair in ${DECODE_MODEL_ENVS}; do + export "$env_pair" + echo "[DECODE_ENV] $env_pair" + done + + DECODE_CMD="vllm serve ${MODEL_PATH} \ + --port $SERVER_PORT \ + --trust-remote-code \ + --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_consumer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ + ${DECODE_SERVER_CONFIG}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $DECODE_CMD" + else + DECODE_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log" + set -x + eval 
"$DECODE_CMD" > "$DECODE_LOG_FILE" 2>&1 & + set +x + decode_pid=$! + fi + + echo "Waiting for proxy server to be up..." + BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports ${ROUTER_PORT} \ + --wait-for-all-ports \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BARRIER_CMD" + else + eval "$BARRIER_CMD" + fi + + echo "Waiting until proxy server closes..." + WAIT_CMD="python3 $WS_PATH/sync.py wait \ + --remote-ip ${NODE0_ADDR} \ + --remote-port ${ROUTER_PORT}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $WAIT_CMD" + else + eval "$WAIT_CMD" + fi + + echo "Killing the decode server" + [[ "$DRY_RUN" -eq 0 ]] && kill $decode_pid 2>/dev/null || true +fi + +echo "Killing the etcd server" +kill $etcd_pid 2>/dev/null || true +pkill -f etcd 2>/dev/null || true + +echo "Script completed successfully" +exit 0 diff --git a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh new file mode 100644 index 000000000..8c7a9f07a --- /dev/null +++ b/benchmarks/multi_node/amd_utils/setup_deps.sh @@ -0,0 +1,908 @@ +#!/bin/bash +# ============================================================================= +# setup_deps.sh — Install missing vLLM disagg dependencies at container start. +# +# Base image: vllm/vllm-openai-rocm:v0.18.0 +# Sourced by server.sh so PATH / LD_LIBRARY_PATH exports persist. +# Idempotent: each component is skipped if already present. +# +# Build steps run in subshells to avoid CWD pollution between installers. 
+# ============================================================================= + +ROCM_PATH="${ROCM_PATH:-/opt/rocm}" +UCX_HOME="${UCX_HOME:-/usr/local/ucx}" +RIXL_HOME="${RIXL_HOME:-/usr/local/rixl}" + +_SETUP_START=$(date +%s) +_SETUP_INSTALLED=() + +git_clone_retry() { + local url="$1" dest="$2" max_tries=3 try=1 + while (( try <= max_tries )); do + if git clone --quiet "$url" "$dest" 2>/dev/null; then return 0; fi + echo "[SETUP] git clone attempt $try/$max_tries failed for $url, retrying in 10s..." + rm -rf "$dest" + sleep 10 + (( try++ )) + done + echo "[SETUP] git clone failed after $max_tries attempts: $url" + return 1 +} + +# --------------------------------------------------------------------------- +# 1. UCX (ROCm fork — required for GPU-direct RDMA via Nixl) +# --------------------------------------------------------------------------- +install_ucx() { + if [[ -x "${UCX_HOME}/bin/ucx_info" ]]; then + echo "[SETUP] UCX already present at ${UCX_HOME}" + return 0 + fi + + echo "[SETUP] Installing UCX build dependencies..." + apt-get update -q -y && apt-get install -q -y \ + autoconf automake libtool pkg-config \ + librdmacm-dev rdmacm-utils libibverbs-dev ibverbs-utils ibverbs-providers \ + infiniband-diags perftest ethtool rdma-core strace \ + && rm -rf /var/lib/apt/lists/* + + echo "[SETUP] Building UCX from source (ROCm/ucx @ da3fac2a)..." + ( + set -e + mkdir -p /usr/local/src && cd /usr/local/src + git_clone_retry https://github.com/ROCm/ucx.git ucx && cd ucx + git checkout da3fac2a + ./autogen.sh && mkdir -p build && cd build + ../configure \ + --prefix="${UCX_HOME}" \ + --enable-shared --disable-static \ + --disable-doxygen-doc --enable-optimizations \ + --enable-devel-headers --enable-mt \ + --with-rocm="${ROCM_PATH}" --with-verbs --with-dm + make -j"$(nproc)" && make install + ) + rm -rf /usr/local/src/ucx + + if [[ ! 
-x "${UCX_HOME}/bin/ucx_info" ]]; then + echo "[SETUP] ERROR: UCX build failed"; exit 1 + fi + _SETUP_INSTALLED+=("UCX") +} + +# --------------------------------------------------------------------------- +# 2. RIXL (ROCm fork of NIXL — KV cache transfer for disaggregated vLLM) +# --------------------------------------------------------------------------- +install_rixl() { + if python3 -c "import rixl" 2>/dev/null; then + echo "[SETUP] RIXL Python bindings already present" + return 0 + fi + + echo "[SETUP] Installing RIXL build dependencies..." + apt-get update -q -y && apt-get install -q -y \ + libgrpc-dev libgrpc++-dev libprotobuf-dev protobuf-compiler-grpc \ + libcpprest-dev libaio-dev \ + && rm -rf /var/lib/apt/lists/* + pip3 install --quiet meson "pybind11[global]" + + echo "[SETUP] Building RIXL from source (ROCm/RIXL @ f33a5599)..." + ( + set -e + git_clone_retry https://github.com/ROCm/RIXL.git /opt/rixl && cd /opt/rixl + git checkout f33a5599 + meson setup build --prefix="${RIXL_HOME}" \ + -Ducx_path="${UCX_HOME}" \ + -Drocm_path="${ROCM_PATH}" + cd build && ninja && ninja install + cd /opt/rixl + pip install --quiet \ + --config-settings=setup-args="-Drocm_path=${ROCM_PATH}" \ + --config-settings=setup-args="-Ducx_path=${UCX_HOME}" . + ) + rm -rf /opt/rixl + + if ! python3 -c "import rixl" 2>/dev/null; then + echo "[SETUP] ERROR: RIXL build failed"; exit 1 + fi + _SETUP_INSTALLED+=("RIXL") +} + +# --------------------------------------------------------------------------- +# 3. etcd (distributed KV store for vLLM disagg service discovery) +# --------------------------------------------------------------------------- +install_etcd() { + if [[ -x /usr/local/bin/etcd/etcd ]]; then + echo "[SETUP] etcd already present" + return 0 + fi + + local version="v3.6.0-rc.5" + echo "[SETUP] Downloading etcd ${version}..." 
+ wget -q "https://github.com/etcd-io/etcd/releases/download/${version}/etcd-${version}-linux-amd64.tar.gz" \ + -O /tmp/etcd.tar.gz + mkdir -p /usr/local/bin/etcd + tar -xf /tmp/etcd.tar.gz -C /usr/local/bin/etcd --strip-components=1 + rm /tmp/etcd.tar.gz + _SETUP_INSTALLED+=("etcd") +} + +# --------------------------------------------------------------------------- +# 4. libionic1 (Pensando ionic RDMA verbs provider for RoCEv2 KV transfer) +# Harmless on non-Pensando nodes (shared lib is simply unused). +# --------------------------------------------------------------------------- +install_libionic() { + if dpkg -l libionic1 2>/dev/null | grep -q '^ii'; then + echo "[SETUP] libionic1 already installed" + return 0 + fi + + echo "[SETUP] Downloading and installing libionic1..." + wget -q "https://repo.radeon.com/amdainic/pensando/ubuntu/1.117.5/pool/main/r/rdma-core/libionic1_54.0-149.g3304be71_amd64.deb" \ + -O /tmp/libionic1.deb + dpkg -i /tmp/libionic1.deb || true + rm -f /tmp/libionic1.deb + _SETUP_INSTALLED+=("libionic1") +} + +# --------------------------------------------------------------------------- +# 5. MoRI-IO proxy deps (Python packages for the MoRI-IO-aware proxy server) +# The proxy replaces vllm-router: it handles both HTTP routing AND the +# MoRI-IO ZMQ registration/request-enrichment protocol. +# Only needed on NODE_RANK=0 (proxy node). +# --------------------------------------------------------------------------- +install_mori_proxy_deps() { + if python3 -c "import quart, aiohttp, msgpack, zmq" 2>/dev/null; then + echo "[SETUP] MoRI-IO proxy Python deps already present" + return 0 + fi + + echo "[SETUP] Installing MoRI-IO proxy Python deps..." + # v0.18.0 ships aiohttp, pyzmq, blinker(distutils); only quart and msgpack + # are missing. --ignore-installed blinker avoids pip's distutils uninstall + # error when quart pulls a newer blinker version. + pip install --quiet --ignore-installed blinker + pip install --quiet quart msgpack + + if ! 
python3 -c "import quart, aiohttp, msgpack, zmq" 2>/dev/null; then + echo "[SETUP] ERROR: MoRI-IO proxy deps install failed"; exit 1 + fi + _SETUP_INSTALLED+=("mori-proxy-deps") +} + +# --------------------------------------------------------------------------- +# 6. MoRI (Modular RDMA Interface — EP dispatch/combine kernels for MoE) +# Required for --all2all-backend mori (Expert Parallelism via RDMA). +# GPU kernels are JIT-compiled on first use; no hipcc needed at install. +# +# v0.18.0 ships MoRI 0.1.dev185+g2d02c6a98, but it STILL has the PCI +# topology bug (TopoSystemPci::Load assertion failure on Broadcom +# PEX890xx switches). Always rebuild from our target commit b645fc8 +# which includes the dsp2dev subordinate-range fix. +# --------------------------------------------------------------------------- +install_mori() { + local MORI_TARGET_COMMIT="b645fc8" + local MORI_MARKER="/usr/local/lib/python3.*/dist-packages/.mori_commit_${MORI_TARGET_COMMIT}" + + if ls $MORI_MARKER &>/dev/null; then + echo "[SETUP] MoRI @ $MORI_TARGET_COMMIT already installed (marker found)" + return 0 + fi + + echo "[SETUP] Installing MoRI build dependencies..." + apt-get update -q -y && apt-get install -q -y \ + libopenmpi-dev openmpi-bin libpci-dev \ + && rm -rf /var/lib/apt/lists/* + + echo "[SETUP] Building MoRI from source (ROCm/mori @ $MORI_TARGET_COMMIT)..." + echo "[SETUP] (overriding image-provided version to fix PCI topology bug)" + ( + set -e + git_clone_retry https://github.com/ROCm/mori.git /opt/mori && cd /opt/mori + git checkout "$MORI_TARGET_COMMIT" + pip install --quiet --force-reinstall . + ) + rm -rf /opt/mori + + if ! 
python3 -c "import mori" 2>/dev/null; then + echo "[SETUP] ERROR: MoRI build failed"; exit 1 + fi + touch $(python3 -c "import sysconfig; print(sysconfig.get_paths()['purelib'])")/.mori_commit_${MORI_TARGET_COMMIT} + _SETUP_INSTALLED+=("MoRI@$MORI_TARGET_COMMIT") +} + +# --------------------------------------------------------------------------- +# 6b. amd-quark (MXFP4 quantization support for Kimi-K2.5-MXFP4 and similar) +# Required due to ROCm vLLM missing the quark dependency: +# https://github.com/vllm-project/vllm/issues/35633 +# --------------------------------------------------------------------------- +install_amd_quark() { + if python3 -c "import quark" 2>/dev/null; then + echo "[SETUP] amd-quark already present" + return 0 + fi + + echo "[SETUP] Installing amd-quark for MXFP4 quantization support..." + pip install --quiet amd-quark + + if ! python3 -c "import quark" 2>/dev/null; then + echo "[SETUP] WARN: amd-quark install failed (non-fatal for non-MXFP4 models)" + return 0 + fi + _SETUP_INSTALLED+=("amd-quark") +} + +# --------------------------------------------------------------------------- +# 7. Patch vLLM MoRI-EP + FP8 incompatibility (present in v0.17.1 & v0.18.0) +# vLLM asserts MoRI requires AITER fused_moe, but AITER's FP8 kernel +# uses defer_input_quant=True which MoRI's prepare/finalize rejects. +# Patch: remove both the AITER requirement assertion and the +# defer_input_quant NotImplementedError so non-AITER kernels work. +# --------------------------------------------------------------------------- +patch_mori_fp8_compat() { + python3 -c ' +import re, os, sys +patched = [] + +# 1. 
Patch layer.py: remove multi-line AITER assertion for MoRI +try: + import vllm.model_executor.layers.fused_moe.layer as lm + f = lm.__file__ + src = open(f).read() + if "Mori needs to be used with aiter" in src: + new = re.sub( + r"assert self\.rocm_aiter_fmoe_enabled,\s*\([^)]*Mori needs[^)]*\)", + "pass # [PATCHED] AITER requirement removed for MoRI-EP + FP8", + src, flags=re.DOTALL) + if new != src: + open(f, "w").write(new) + patched.append("layer.py") +except Exception as e: + print(f"[SETUP] WARN patch layer.py: {e}", file=sys.stderr) + +# 2. Patch mori_prepare_finalize.py: remove defer_input_quant restriction +try: + import vllm.model_executor.layers.fused_moe.mori_prepare_finalize as mm + f = mm.__file__ + src = open(f).read() + if "defer_input_quant" in src: + new = re.sub( + r"raise NotImplementedError\([^)]*defer_input_quant[^)]*\)", + "pass # [PATCHED] defer_input_quant check removed for MoRI-EP + FP8", + src) + if new != src: + open(f, "w").write(new) + patched.append("mori_prepare_finalize.py") +except Exception as e: + print(f"[SETUP] WARN patch mori_pf: {e}", file=sys.stderr) + +if patched: + print(f"[SETUP] Patched: {chr(44).join(patched)}") +else: + print("[SETUP] No MoRI-FP8 patches needed") +' + _SETUP_INSTALLED+=("MoRI-FP8-patch") +} + +# --------------------------------------------------------------------------- +# 8. Patch vLLM MoRI-IO save_kv_layer busy-spin (C128 tail-batch deadlock) +# In WRITE mode, save_kv_layer spins forever waiting for the handshake +# callback to set write_ready_flags. This blocks the model worker thread, +# preventing it from responding to EngineCore shm_broadcast, causing a +# TimeoutError cascade and crash. +# Patch: add time.sleep(0.001) and a 30s timeout to yield CPU and prevent +# the model worker from deadlocking. 
+# --------------------------------------------------------------------------- +patch_moriio_save_kv_timeout() { + python3 -c ' +import os, sys + +try: + import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_connector as mc + f = mc.__file__ + src = open(f).read() + + # Already patched? + if "[PATCHED] save_kv_layer timeout" in src: + print("[SETUP] save_kv_layer timeout patch already applied") + sys.exit(0) + + old = """ while True: + if ( + self._ready_requests.empty() + and remote_engine_id not in self.write_ready_flags + ): + continue""" + + if old not in src: + print("[SETUP] WARN: save_kv_layer busy-spin pattern not found, skipping patch") + sys.exit(0) + + new = """ # [PATCHED] save_kv_layer — null guard + timeout + sleep + if remote_engine_id is None: + return + import time as _time, os as _os + _wait_start = _time.monotonic() + _SAVE_KV_TIMEOUT = float(_os.environ.get("VLLM_MORIIO_HANDSHAKE_TIMEOUT", "30")) + while True: + if ( + self._ready_requests.empty() + and remote_engine_id not in self.write_ready_flags + ): + _elapsed = _time.monotonic() - _wait_start + if _elapsed > _SAVE_KV_TIMEOUT: + import logging as _logging + _logging.getLogger("vllm.moriio").warning( + "[HANGFIX] save_kv_layer: timeout (%.1fs) waiting for " + "write_ready_flags[%s], breaking to unblock model " + "worker", _elapsed, remote_engine_id) + break + _time.sleep(0.001) + continue""" + + new_src = src.replace(old, new) + if new_src == src: + print("[SETUP] WARN: replacement had no effect") + sys.exit(0) + + open(f, "w").write(new_src) + print("[SETUP] Patched save_kv_layer: null guard + timeout + sleep") +except Exception as e: + print(f"[SETUP] WARN patch save_kv_layer: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("MoRIIO-save-kv-timeout-patch") +} + +# --------------------------------------------------------------------------- +# 9. 
Patch MoRIIO waiting_for_transfer_complete with bounded timeout +# The original status.Wait() blocks forever if an RDMA completion never +# arrives (e.g., NIC queue saturation at C256). This replaces the unbounded +# wait with a polling loop using status.Succeeded() + configurable timeout. +# Also adds error handling to the write worker loop so a single failed +# transfer doesn't kill the background thread. +# --------------------------------------------------------------------------- +patch_moriio_transfer_timeout() { + python3 -c ' +import os, sys, textwrap + +try: + import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_engine as me + f = me.__file__ + src = open(f).read() + + if "[PATCHED] transfer completion timeout" in src: + print("[SETUP] transfer completion timeout patch already applied") + sys.exit(0) + + # --- Patch 1: Replace waiting_for_transfer_complete with polling + timeout --- + old_wait = """ def waiting_for_transfer_complete(self): + if not self.transfer_status: + return + + transfers_to_wait = [] + with self.lock: + transfers_to_wait = self.transfer_status[:] + self.transfer_status.clear() + + for status in transfers_to_wait: + try: + status.Wait() + if not status.Succeeded(): + logger.error( + "Transfer failed: %s, Code: %s", status.Message(), status.Code() + ) + raise TransferError("MoRIIO transfer failed!") + except Exception as e: + logger.error("Transfer %s failed: %s", status, e) + raise""" + + new_wait = """ def waiting_for_transfer_complete(self): + # [PATCHED] transfer completion timeout — bounded polling loop + import time as _time, os as _os + if not self.transfer_status: + return + + _timeout = float(_os.environ.get("VLLM_MORIIO_TRANSFER_TIMEOUT", "120")) + + transfers_to_wait = [] + with self.lock: + transfers_to_wait = self.transfer_status[:] + self.transfer_status.clear() + + _start = _time.monotonic() + remaining = list(transfers_to_wait) + _polls = 0 + _completed = 0 + + while remaining: + _elapsed = _time.monotonic() 
- _start + if _elapsed > _timeout: + logger.error( + "[HANGFIX] transfer_timeout elapsed=%.1fs " + "pending=%d/%d completed=%d polls=%d " + "action=raise_transfer_error", + _elapsed, len(remaining), len(transfers_to_wait), + _completed, _polls, + ) + raise TransferError( + f"RDMA transfer timeout after {_elapsed:.1f}s, " + f"{len(remaining)}/{len(transfers_to_wait)} pending" + ) + + still_waiting = [] + for status in remaining: + try: + if status.Succeeded(): + _completed += 1 + continue + still_waiting.append(status) + except Exception as e: + logger.error( + "[HANGFIX] transfer_poll_error error=%s", e) + raise TransferError( + f"Transfer failed during poll: {e}" + ) from e + + remaining = still_waiting + if remaining: + _time.sleep(0.005) + _polls += 1 + if _polls % 2000 == 0: + logger.warning( + "[HANGFIX] transfer_wait pending=%d " + "completed=%d elapsed=%.1fs timeout=%.0fs", + len(remaining), _completed, + _time.monotonic() - _start, _timeout, + )""" + + if old_wait not in src: + print("[SETUP] WARN: waiting_for_transfer_complete pattern not found") + sys.exit(0) + + new_src = src.replace(old_wait, new_wait) + + # --- Patch 2: Add error handling + cleanup to _write_worker_loop --- + old_loop = """ self._execute_write_task(task)""" + + new_loop = """ try: + self._execute_write_task(task) + except Exception as _e: + logger.error( + "[HANGFIX] req=%s write_task_failed error=%s " + "action=cleanup_and_mark_done", + task.request_id, _e, + ) + try: + _wr = self.worker.moriio_wrapper + with _wr.lock: + _wr.done_req_ids.append(task.request_id) + _wr.done_remote_allocate_req_dict.pop( + task.request_id, None + ) + except Exception: + pass""" + + if old_loop in new_src: + new_src = new_src.replace(old_loop, new_loop, 1) + else: + print("[SETUP] WARN: _write_worker_loop pattern not found for error handling") + + # --- Patch 3: Add deferred task timeout to _process_deferred_tasks --- + old_deferred = """ def _process_deferred_tasks(self) -> None: + \"\"\"Process tasks 
that were previously deferred.\"\"\" + if not self._deferred_tasks: + return + + still_deferred: list[WriteTask] = [] + for task in self._deferred_tasks: + if self._is_remote_ready(task): + self._execute_write_task(task) + else: + still_deferred.append(task) + + self._deferred_tasks = still_deferred""" + + new_deferred = """ def _process_deferred_tasks(self) -> None: + \"\"\"Process tasks that were previously deferred.\"\"\" + # [PATCHED] deferred task timeout — prune stale tasks + import time as _time, os as _os + if not self._deferred_tasks: + return + + _DEFER_TIMEOUT = float( + _os.environ.get("VLLM_MORIIO_DEFER_TIMEOUT", "60")) + + still_deferred: list[WriteTask] = [] + for task in self._deferred_tasks: + _age = _time.monotonic() - getattr(task, "_defer_ts", _time.monotonic()) + if _age > _DEFER_TIMEOUT: + logger.error( + "[HANGFIX] req=%s deferred_task_expired age=%.1fs " + "action=drop_and_mark_done", + task.request_id, _age, + ) + try: + _wr = self.worker.moriio_wrapper + with _wr.lock: + _wr.done_req_ids.append(task.request_id) + _wr.done_remote_allocate_req_dict.pop( + task.request_id, None) + except Exception: + pass + continue + if self._is_remote_ready(task): + try: + self._execute_write_task(task) + except Exception as _e: + logger.error( + "[HANGFIX] req=%s deferred_write_failed error=%s", + task.request_id, _e, + ) + try: + _wr = self.worker.moriio_wrapper + with _wr.lock: + _wr.done_req_ids.append(task.request_id) + _wr.done_remote_allocate_req_dict.pop( + task.request_id, None) + except Exception: + pass + else: + still_deferred.append(task) + + self._deferred_tasks = still_deferred""" + + if old_deferred in new_src: + new_src = new_src.replace(old_deferred, new_deferred, 1) + else: + print("[SETUP] WARN: _process_deferred_tasks pattern not found") + + # --- Patch 4: Stamp defer time when task is deferred --- + old_defer_add = """ self._deferred_tasks.append(task)""" + new_defer_add = """ import time as _time2 + if not hasattr(task, "_defer_ts"): 
+ task._defer_ts = _time2.monotonic() + self._deferred_tasks.append(task)""" + if old_defer_add in new_src: + new_src = new_src.replace(old_defer_add, new_defer_add, 1) + else: + print("[SETUP] WARN: deferred task timestamp patch target not found") + + open(f, "w").write(new_src) + print("[SETUP] Patched: transfer timeout + writer error handling") + +except Exception as e: + print(f"[SETUP] WARN patch transfer_timeout: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("MoRIIO-transfer-timeout-patch") +} + +# --------------------------------------------------------------------------- +# 10. Patch MoRIIO start_load_kv busy-spin (same pattern as save_kv_layer) +# The READ-mode spin loop in start_load_kv has the same unbounded-spin +# issue as save_kv_layer. Add timeout + sleep + null guard. +# --------------------------------------------------------------------------- +patch_moriio_load_kv_timeout() { + python3 -c ' +import os, sys + +try: + import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_connector as mc + f = mc.__file__ + src = open(f).read() + + if "[PATCHED] start_load_kv timeout" in src: + print("[SETUP] start_load_kv timeout patch already applied") + sys.exit(0) + + old = """ while True: + if ( + self._ready_requests.empty() + and remote_engine_id not in self.load_ready_flag + and wait_handshake_readd_req + ): + continue""" + + if old not in src: + print("[SETUP] WARN: start_load_kv busy-spin pattern not found, skipping") + sys.exit(0) + + new = """ # [PATCHED] start_load_kv timeout — prevent model worker deadlock + if remote_engine_id is None and not wait_handshake_readd_req: + self._reqs_to_send.update(metadata.reqs_to_send) + return + import time as _time, os as _os + _wait_start = _time.monotonic() + _LOAD_KV_TIMEOUT = float(_os.environ.get("VLLM_MORIIO_HANDSHAKE_TIMEOUT", "30")) + while True: + if ( + self._ready_requests.empty() + and remote_engine_id not in self.load_ready_flag + and wait_handshake_readd_req + ): + if _time.monotonic() - 
_wait_start > _LOAD_KV_TIMEOUT: + import logging as _logging + _logging.getLogger("vllm.moriio").warning( + "[HANGFIX] start_load_kv: timeout (%.1fs) waiting for " + "load_ready_flag[%s]", _time.monotonic() - _wait_start, + remote_engine_id) + break + _time.sleep(0.001) + continue""" + + new_src = src.replace(old, new) + if new_src == src: + print("[SETUP] WARN: start_load_kv replacement had no effect") + sys.exit(0) + + open(f, "w").write(new_src) + print("[SETUP] Patched start_load_kv busy-spin with timeout + sleep") +except Exception as e: + print(f"[SETUP] WARN patch start_load_kv: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("MoRIIO-load-kv-timeout-patch") +} + +# --------------------------------------------------------------------------- +# 11. Fix READ-mode scheduler assertion in _update_from_kv_xfer_finished +# vLLM asserts that a request in finished_recving must be either +# WAITING_FOR_REMOTE_KVS or finished. In READ mode the request can +# transition to RUNNING before the aggregated recv notification arrives, +# crashing the engine with AssertionError. 
+# (present in v0.17.1 & v0.18.0) +# --------------------------------------------------------------------------- +patch_scheduler_read_mode_fix() { + python3 -c ' +import os, sys + +try: + import vllm.v1.core.sched.scheduler as smod + f = smod.__file__ + src = open(f).read() + + if "[PATCHED] read-mode recv assertion" in src: + print("[SETUP] scheduler read-mode assertion fix already applied") + sys.exit(0) + + old_recv = """ for req_id in kv_connector_output.finished_recving or (): + logger.debug("Finished recving KV transfer for request %s", req_id) + assert req_id in self.requests + req = self.requests[req_id] + if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS: + self.finished_recving_kv_req_ids.add(req_id) + else: + assert RequestStatus.is_finished(req.status) + self._free_blocks(self.requests[req_id])""" + + new_recv = """ # [PATCHED] read-mode recv assertion — handle intermediate states + for req_id in kv_connector_output.finished_recving or (): + logger.debug("Finished recving KV transfer for request %s", req_id) + if req_id not in self.requests: + logger.debug("Request %s already removed, skipping recv", req_id) + continue + req = self.requests[req_id] + if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS: + self.finished_recving_kv_req_ids.add(req_id) + elif RequestStatus.is_finished(req.status): + self._free_blocks(self.requests[req_id]) + else: + logger.debug( + "Request %s recv finished but status=%s (not " + "WAITING_FOR_REMOTE_KVS or finished), skipping " + "block free — will be freed on request completion", + req_id, req.status.name)""" + + if old_recv not in src: + print("[SETUP] WARN: scheduler finished_recving pattern not found, skipping") + sys.exit(0) + + new_src = src.replace(old_recv, new_recv, 1) + + old_send = """ for req_id in kv_connector_output.finished_sending or (): + logger.debug("Finished sending KV transfer for request %s", req_id) + assert req_id in self.requests + self._free_blocks(self.requests[req_id])""" + + new_send = 
""" for req_id in kv_connector_output.finished_sending or (): + logger.debug("Finished sending KV transfer for request %s", req_id) + if req_id not in self.requests: + logger.debug("Request %s already removed, skipping send", req_id) + continue + self._free_blocks(self.requests[req_id])""" + + if old_send in new_src: + new_src = new_src.replace(old_send, new_send, 1) + else: + print("[SETUP] WARN: scheduler finished_sending pattern not found") + + open(f, "w").write(new_src) + print("[SETUP] Patched: scheduler _update_from_kv_xfer_finished read-mode fix") + +except Exception as e: + print(f"[SETUP] WARN patch scheduler read-mode: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("scheduler-read-mode-fix") +} + +# --------------------------------------------------------------------------- +# 12. Idle KV block reaper for disaggregated prefill (READ mode) +# The RIXL notification path can lose `finished_sending` signals under +# high concurrency with ibv_post_send failures. This leaves KV blocks +# permanently allocated on the prefill engine even after the decode has +# finished reading. Over multiple benchmark rounds, leaked blocks +# accumulate and eventually saturate the prefill KV cache. +# +# Fix: instrument the scheduler's `schedule()` method to detect idle +# periods (0 running, 0 waiting for >5s) and force-free blocks for +# any remaining requests whose status is finished. +# --------------------------------------------------------------------------- +patch_prefill_idle_kv_reaper() { + python3 -c ' +import os, sys + +try: + import vllm.v1.core.sched.scheduler as smod + f = smod.__file__ + src = open(f).read() + + if "[PATCHED] idle-kv-reaper" in src: + print("[SETUP] idle KV block reaper already applied") + sys.exit(0) + + # Find the _update_from_kv_xfer_finished method end and add reaper logic + # We inject into the method that processes KV transfer completions. 
+ marker = "[PATCHED] read-mode recv assertion" + if marker not in src: + print("[SETUP] WARN: scheduler read-mode patch not found, skipping reaper") + sys.exit(0) + + # Add reaper state initialization to __init__ + old_init_marker = "self.finished_recving_kv_req_ids" + if old_init_marker not in src: + print("[SETUP] WARN: finished_recving_kv_req_ids not found in scheduler") + sys.exit(0) + + # Find the first occurrence to insert reaper state + init_pos = src.find(old_init_marker) + # Find the line containing it + line_end = src.find("\n", init_pos) + init_line = src[init_pos:line_end] + + # Add reaper state after this line + reaper_init = init_line + """ + # [PATCHED] idle-kv-reaper state + self._idle_kv_reaper_ts = 0.0 + self._idle_kv_reaper_active = False""" + + src = src.replace(init_line, reaper_init, 1) + + # Now add the reaper logic at the end of _update_from_kv_xfer_finished + # Find the finished_sending handler we patched + send_handler = """ for req_id in kv_connector_output.finished_sending or (): + logger.debug("Finished sending KV transfer for request %s", req_id) + if req_id not in self.requests: + logger.debug("Request %s already removed, skipping send", req_id) + continue + self._free_blocks(self.requests[req_id])""" + + reaper_logic = send_handler + """ + + # [PATCHED] idle-kv-reaper — force-free leaked prefill KV blocks + import time as _time + _REAPER_IDLE_SECS = 5.0 + _num_running = sum(1 for r in self.requests.values() + if r.status == RequestStatus.RUNNING) + _should_reap = (_num_running == 0) + + if _should_reap: + if not self._idle_kv_reaper_active: + self._idle_kv_reaper_active = True + self._idle_kv_reaper_ts = _time.monotonic() + elif _time.monotonic() - self._idle_kv_reaper_ts > _REAPER_IDLE_SECS: + _reaped = 0 + _reap_ids = [] + for _rid, _req in list(self.requests.items()): + if RequestStatus.is_finished(_req.status): + _reap_ids.append(_rid) + for _rid in _reap_ids: + try: + _req = self.requests[_rid] + self._free_blocks(_req) + 
_reaped += 1 + except Exception as _e: + logger.debug("[KV-REAPER] free_blocks failed for %s: %s", _rid, _e) + if _reaped > 0: + logger.warning( + "[KV-REAPER] Force-freed blocks for %d finished " + "requests after %.1fs idle", + _reaped, _time.monotonic() - self._idle_kv_reaper_ts) + self._idle_kv_reaper_ts = _time.monotonic() + else: + self._idle_kv_reaper_active = False""" + + if send_handler in src: + src = src.replace(send_handler, reaper_logic, 1) + else: + print("[SETUP] WARN: send handler not found for reaper injection") + sys.exit(0) + + open(f, "w").write(src) + print("[SETUP] Patched: idle KV block reaper for prefill") + +except Exception as e: + print(f"[SETUP] WARN patch idle-kv-reaper: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("idle-kv-reaper") +} + +# --------------------------------------------------------------------------- +# 13. Patch MiniMax M2.5 WideEP + MoRI + EPLB support +# Replaces the upstream minimax_m2.py with our patched version that adds +# GateLinear, EP group integration, sequence parallelism, and the +# MixtureOfExperts EPLB protocol. Idempotent: skips if already patched. +# --------------------------------------------------------------------------- +patch_minimax_m2_wideep_mori() { + local patch_file="${WS_PATH:-${VLLM_WS_PATH:-$(dirname "${BASH_SOURCE[0]}")}}/patches/minimax_m2.py" + if [[ ! -f "$patch_file" ]]; then + # Also check the Docker-baked location + patch_file="/opt/vllm_disagg/patches/minimax_m2.py" + fi + if [[ ! 
-f "$patch_file" ]]; then + echo "[SETUP] minimax_m2.py patch not found, skipping (WideEP/MoRI not patched)" + return 0 + fi + + python3 -c ' +import os, sys, shutil + +try: + import vllm.model_executor.models.minimax_m2 as mmod + target = mmod.__file__ + src = sys.argv[1] + + with open(target) as f: + if "get_ep_group" in f.read(): + print("[SETUP] minimax_m2.py already has WideEP+MoRI support") + sys.exit(0) + + shutil.copy2(src, target) + print(f"[SETUP] Patched minimax_m2.py: {src} -> {target}") + +except Exception as e: + print(f"[SETUP] WARN patch minimax_m2: {e}", file=sys.stderr) +' "$patch_file" + _SETUP_INSTALLED+=("minimax-m2-wideep-mori") +} + +# ============================================================================= +# Run installers +# ============================================================================= + +install_ucx +install_rixl +install_etcd +install_libionic +install_mori +install_amd_quark +install_mori_proxy_deps +patch_mori_fp8_compat +patch_moriio_save_kv_timeout +patch_moriio_transfer_timeout +patch_moriio_load_kv_timeout +patch_scheduler_read_mode_fix +patch_prefill_idle_kv_reaper +patch_minimax_m2_wideep_mori + +# ============================================================================= +# Export paths (persists for server.sh since this file is sourced) +# ============================================================================= + +export ROCM_PATH="${ROCM_PATH}" +export UCX_HOME="${UCX_HOME}" +export RIXL_HOME="${RIXL_HOME}" +export PATH="${UCX_HOME}/bin:/usr/local/bin/etcd:/root/.cargo/bin:${PATH}" +export LD_LIBRARY_PATH="${UCX_HOME}/lib:${RIXL_HOME}/lib:${RIXL_HOME}/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}" + +_SETUP_END=$(date +%s) +if [[ ${#_SETUP_INSTALLED[@]} -eq 0 ]]; then + echo "[SETUP] All dependencies already present (${_SETUP_END}s wallclock)" +else + echo "[SETUP] Installed: ${_SETUP_INSTALLED[*]} in $(( _SETUP_END - _SETUP_START ))s" +fi diff --git a/benchmarks/multi_node/amd_utils/start_etcd.sh 
b/benchmarks/multi_node/amd_utils/start_etcd.sh new file mode 100755 index 000000000..46bbd2964 --- /dev/null +++ b/benchmarks/multi_node/amd_utils/start_etcd.sh @@ -0,0 +1,59 @@ +#!/bin/bash +# Start one member of a static etcd cluster on this node. +# IPADDRS: comma-separated member IPs (default: localhost); this host's IP +# must appear in the list, and its position selects the member name etcd-<N>. +# Output is tee'd to /run_logs/slurm_job-${SLURM_JOB_ID}/etcd_NODE${NODE_RANK}.log. +set -x + +IPADDRS="${IPADDRS:-localhost}" + +# Use management network IP (matching what the Slurm script resolved) +host_ip=$(ip route get 1.1.1.1 2>/dev/null | sed -n 's/.*src \([^ ]*\).*/\1/p') +if [[ -z "$host_ip" ]]; then + host_ip=$(hostname -I | awk '{print $1}') +fi + +IFS=',' read -ra ADDR <<< "$IPADDRS" + +# Determine node name based on position in the IPADDRS list +index=0 +found=0 +for ip in "${ADDR[@]}"; do + if [[ "$ip" == "$host_ip" ]]; then + found=1 + break + fi + index=$((index + 1)) +done +# Without a match the member name would not be listed in --initial-cluster and +# etcd could not join; fail early with an explicit message instead. +if [[ "$found" -ne 1 ]]; then + echo "[ETCD] ERROR: host IP '$host_ip' not found in IPADDRS='$IPADDRS'" >&2 + exit 1 +fi +node_name="etcd-$((index+1))" + +# Build initial cluster string +initial_cluster="" +for i in "${!ADDR[@]}"; do + peer_name="etcd-$((i+1))" + initial_cluster+="$peer_name=http://${ADDR[i]}:2380" + if [[ $i -lt $((${#ADDR[@]} - 1)) ]]; then + initial_cluster+="," + fi +done + +mkdir -p /var/lib/etcd +rm -rf /var/lib/etcd/* + +/usr/local/bin/etcd/etcd \ + --name "$node_name" \ + --data-dir /var/lib/etcd \ + --initial-advertise-peer-urls "http://${host_ip}:2380" \ + --listen-peer-urls http://0.0.0.0:2380 \ + --listen-client-urls http://0.0.0.0:2379 \ + --advertise-client-urls "http://${host_ip}:2379" \ + --initial-cluster-token etcd-cluster-1 \ + --initial-cluster "$initial_cluster" \ + --initial-cluster-state new \ + 2>&1 | tee "/run_logs/slurm_job-${SLURM_JOB_ID}/etcd_NODE${NODE_RANK}.log" diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh index d2c49bc9e..a77462fc5 100755 --- a/benchmarks/multi_node/amd_utils/submit.sh +++ b/benchmarks/multi_node/amd_utils/submit.sh @@ -2,37 +2,51 @@ # # Cluster Configuration Template for Multi-Node Disaggregated Serving # -# This script submits a multi-node SGLang disaggregated benchmark job to SLURM. +# This script submits a multi-node disaggregated benchmark job to SLURM.
# It must be configured for your specific cluster before use. +# +# ENGINE=sglang (default): SGLang disaggregated serving +# ENGINE=vllm: vLLM disaggregated serving +# +# Router is co-located with the first prefill node (same for both engines), +# so NUM_NODES = PREFILL_NODES + DECODE_NODES. usage() { cat << 'USAGE' -This script aims to provide a one-liner call to the submit_job_script.py, -so that the deployment process can be further simplified. - -To use this script, fill in the following script and run it under your `slurm_jobs` directory: -======== begin script area ======== -# REQUIRED: Cluster-specific configuration -export SLURM_ACCOUNT= # Your SLURM account name -export SLURM_PARTITION= # SLURM partition to submit to -export TIME_LIMIT= # Job time limit (e.g., "08:00:00") - -# REQUIRED: Model and container paths -export MODEL_PATH= # Path to model directory (e.g., /mnt/models, /nfsdata) -export CONTAINER_IMAGE= # Path to container squash file - -# REQUIRED: Hardware configuration -export GPUS_PER_NODE= # GPUs per node (e.g., 8 for MI355X, 4 for MI325X) - -# OPTIONAL: RDMA/Network configuration (set in runners/launch_mi355x-amds.sh for AMD) -# export IBDEVICES= # RDMA device names (e.g., ionic_0,ionic_1,... or mlx5_0,mlx5_1,...) 
-# export MORI_RDMA_TC= # RDMA traffic class (e.g., 96, 104) - -bash submit.sh \ -$PREFILL_NODES $PREFILL_WORKERS $DECODE_NODES $DECODE_WORKERS \ -$ADDITIONAL_FRONTENDS \ -$ISL $OSL $CONCURRENCIES $REQUEST_RATE -======== end script area ======== +Usage: + bash submit.sh \ + \ + \ + \ + \ + [NODE_LIST] + +Arguments: + PREFILL_NODES Number of prefill nodes + PREFILL_WORKERS Number of prefill workers (usually 1) + DECODE_NODES Number of decode nodes + DECODE_WORKERS Number of decode workers (usually 1) + ISL Input sequence length + OSL Output sequence length + CONCURRENCIES Concurrency levels, delimited by 'x' (e.g., "8x16x32") + REQUEST_RATE Request rate ("inf" for max throughput) + PREFILL_ENABLE_EP true/false or 1/0 (expert parallelism on prefill) + PREFILL_ENABLE_DP true/false or 1/0 (data-parallel attention on prefill) + DECODE_ENABLE_EP true/false or 1/0 (expert parallelism on decode) + DECODE_ENABLE_DP true/false or 1/0 (data-parallel attention on decode) + PREFILL_TP Tensor parallel size per prefill node + DECODE_TP Tensor parallel size per decode node + RANDOM_RANGE_RATIO Random range ratio for benchmark client + NODE_LIST Optional: comma-separated hostnames (must match NUM_NODES) + +Required environment variables: + SLURM_ACCOUNT SLURM account name + SLURM_PARTITION SLURM partition + TIME_LIMIT Job time limit (e.g., "08:00:00") + MODEL_PATH Path to model directory (e.g., /nfsdata) + MODEL_NAME Model name directory + CONTAINER_IMAGE Docker image name (e.g., vllm_disagg_pd:latest) + RUNNER_NAME Runner identifier (for job name) USAGE } @@ -53,6 +67,7 @@ check_env MODEL_PATH check_env MODEL_NAME check_env CONTAINER_IMAGE check_env RUNNER_NAME +check_env FRAMEWORK # GPUS_PER_NODE defaults to 8 (MI355X). Set to 4 for MI325X if needed. 
GPUS_PER_NODE="${GPUS_PER_NODE:-8}" @@ -66,31 +81,32 @@ ISL=$5 OSL=$6 CONCURRENCIES=$7 REQUEST_RATE=$8 -PREFILL_ENABLE_EP=${9:-1} -PREFILL_ENABLE_DP=${10:-1} -DECODE_ENABLE_EP=${11:-1} -DECODE_ENABLE_DP=${12:-1} +PREFILL_ENABLE_EP=${9:-true} +PREFILL_ENABLE_DP=${10:-true} +DECODE_ENABLE_EP=${11:-true} +DECODE_ENABLE_DP=${12:-true} PREFILL_TP=${13:-8} DECODE_TP=${14:-8} -RANDOM_RANGE_RATIO=${15} +RANDOM_RANGE_RATIO=${15:-0.8} NODE_LIST=${16} - NUM_NODES=$((PREFILL_NODES + DECODE_NODES)) profiler_args="${ISL} ${OSL} ${CONCURRENCIES} ${REQUEST_RATE}" # Export variables for the SLURM job +export ENGINE="${FRAMEWORK:-sglang}" export MODEL_DIR=$MODEL_PATH export DOCKER_IMAGE_NAME=$CONTAINER_IMAGE export PROFILER_ARGS=$profiler_args - - +# Engine-specific xP/yD semantics and TP exports +if [[ "$ENGINE" == "vllm" ]]; then + export PROXY_STREAM_IDLE_TIMEOUT=${PROXY_STREAM_IDLE_TIMEOUT:-300} + export VLLM_MORIIO_CONNECTOR_READ_MODE=${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} +fi +# xP = prefill workers, yD = decode workers (may span multiple nodes) export xP=$PREFILL_WORKERS export yD=$DECODE_WORKERS -export NUM_NODES=$NUM_NODES -export GPUS_PER_NODE=$GPUS_PER_NODE -export MODEL_NAME=$MODEL_NAME export PREFILL_TP_SIZE=$(( $PREFILL_NODES * $PREFILL_TP / $PREFILL_WORKERS )) export PREFILL_ENABLE_EP=${PREFILL_ENABLE_EP} export PREFILL_ENABLE_DP=${PREFILL_ENABLE_DP} @@ -98,12 +114,16 @@ export DECODE_TP_SIZE=$(( $DECODE_NODES * $DECODE_TP / $DECODE_WORKERS )) export DECODE_ENABLE_EP=${DECODE_ENABLE_EP} export DECODE_ENABLE_DP=${DECODE_ENABLE_DP} export DECODE_MTP_SIZE=${DECODE_MTP_SIZE} + +export NUM_NODES=$NUM_NODES +export GPUS_PER_NODE=$GPUS_PER_NODE +export MODEL_NAME=$MODEL_NAME export BENCH_INPUT_LEN=${ISL} export BENCH_OUTPUT_LEN=${OSL} -export BENCH_RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO} -export BENCH_NUM_PROMPTS_MULTIPLIER=10 +export BENCH_NUM_PROMPTS_MULTIPLIER=${BENCH_NUM_PROMPTS_MULTIPLIER:-10} export BENCH_MAX_CONCURRENCY=${CONCURRENCIES} export 
BENCH_REQUEST_RATE=${REQUEST_RATE} +export BENCH_RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO:-0.8} # Eval-related env vars (threaded from workflow → runner → here → job.slurm → Docker) export RUN_EVAL="${RUN_EVAL:-false}" @@ -118,13 +138,10 @@ export SPEC_DECODING="${SPEC_DECODING:-}" export IS_MULTINODE="${IS_MULTINODE:-false}" # Log directory: must be on NFS (shared filesystem) so the submit host can read SLURM output. -# SLURM writes output files on the batch node, so /tmp won't work (node-local). -# Defaults to a sibling directory of the submit working directory. export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" mkdir -p "$BENCHMARK_LOGS_DIR" # Optional: pass an explicit node list to sbatch. -# NODE_LIST is expected to be comma-separated hostnames. NODELIST_OPT=() if [[ -n "${NODE_LIST//[[:space:]]/}" ]]; then IFS=',' read -r -a NODE_ARR <<< "$NODE_LIST" @@ -137,6 +154,13 @@ if [[ -n "${NODE_LIST//[[:space:]]/}" ]]; then NODELIST_OPT=(--nodelist "$NODELIST_CSV") fi +# Optional: exclude specific nodes (e.g. nodes with broken Docker sockets). +# Set SLURM_EXCLUDE_NODES env var to a comma-separated list of hostnames. +EXCLUDE_OPT=() +if [[ -n "${SLURM_EXCLUDE_NODES:-}" ]]; then + EXCLUDE_OPT=(--exclude "$SLURM_EXCLUDE_NODES") +fi + # Construct the sbatch command sbatch_cmd=( sbatch @@ -145,6 +169,7 @@ sbatch_cmd=( -N "$NUM_NODES" -n "$NUM_NODES" "${NODELIST_OPT[@]}" + "${EXCLUDE_OPT[@]}" --time "$TIME_LIMIT" --partition "$SLURM_PARTITION" --account "$SLURM_ACCOUNT" @@ -154,7 +179,6 @@ sbatch_cmd=( "$(dirname "$0")/job.slurm" ) -# todo: --parsable outputs only the jobid and cluster name, test if jobid;clustername is correct JOB_ID=$("${sbatch_cmd[@]}") if [[ $? 
-ne 0 ]]; then echo "Error: Failed to submit job with sbatch" >&2 diff --git a/benchmarks/multi_node/amd_utils/sync.py b/benchmarks/multi_node/amd_utils/sync.py index 140951519..3678e7614 100755 --- a/benchmarks/multi_node/amd_utils/sync.py +++ b/benchmarks/multi_node/amd_utils/sync.py @@ -143,7 +143,10 @@ def close_port(): time.sleep(30) if args.enable_port: - time.sleep(30) + # Keep the port open long enough for slow nodes to pass their barrier. + # The previous 30s was too short when setup times vary by minutes. + grace = max(60, args.timeout // 2) if args.timeout > 0 else 300 + time.sleep(grace) close_port() diff --git a/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh index 6a7314ab4..d17d1a323 100644 --- a/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh +++ b/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh @@ -19,7 +19,8 @@ check_env_vars \ DECODE_DP_ATTN \ PREFILL_NODES \ DECODE_NODES \ - RANDOM_RANGE_RATIO + RANDOM_RANGE_RATIO \ + FRAMEWORK if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" diff --git a/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh index 0124d4b4d..a8c0d2743 100644 --- a/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh +++ b/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh @@ -19,7 +19,8 @@ check_env_vars \ DECODE_DP_ATTN \ PREFILL_NODES \ DECODE_NODES \ - RANDOM_RANGE_RATIO + RANDOM_RANGE_RATIO \ + FRAMEWORK if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" diff --git a/benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh b/benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh new file mode 100755 index 000000000..d7995fb25 --- /dev/null +++ b/benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh @@ -0,0 +1,80 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + 
CONC_LIST \ + ISL \ + OSL \ + IMAGE \ + SPEC_DECODING \ + MODEL_PATH \ + PREFILL_NUM_WORKERS \ + PREFILL_TP \ + PREFILL_EP \ + PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS \ + DECODE_TP \ + DECODE_EP \ + DECODE_DP_ATTN \ + PREFILL_NODES \ + DECODE_NODES \ + RANDOM_RANGE_RATIO \ + FRAMEWORK + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +set -x + +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 + +export TIME_LIMIT="08:00:00" +export MODEL_PATH=$MODEL_PATH +export MODEL_NAME=$MODEL_NAME +export CONTAINER_IMAGE=$IMAGE + +# Same EP/DP booleans as dsr1_fp8_mi355x_sglang-disagg.sh → amd_utils/submit.sh +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then + export PREFILL_ENABLE_EP=false +else + export PREFILL_ENABLE_EP=true +fi + +if [[ "$PREFILL_DP_ATTN" == "true" ]]; then + export PREFILL_ENABLE_DP=true +else + export PREFILL_ENABLE_DP=false +fi + +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then + export DECODE_ENABLE_EP=false +else + export DECODE_ENABLE_EP=true +fi + +if [[ "$DECODE_DP_ATTN" == "true" ]]; then + export DECODE_ENABLE_DP=true +else + export DECODE_ENABLE_DP=false +fi + +# Parameter order matches SGLang disagg submit.sh; arg 16 is optional NODELIST. +JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $ISL $OSL "${CONC_LIST// /x}" inf \ + ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ + ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + ${PREFILL_TP} ${DECODE_TP} \ + ${RANDOM_RANGE_RATIO} \ + "${NODELIST:-}") + +if [[ $? 
-ne 0 ]]; then + echo "Failed to submit job" >&2 + exit 1 +fi + +echo "$JOB_ID" diff --git a/benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh b/benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh new file mode 100644 index 000000000..a9a28d889 --- /dev/null +++ b/benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + CONC_LIST \ + ISL \ + OSL \ + IMAGE \ + SPEC_DECODING \ + MODEL_PATH \ + PREFILL_NUM_WORKERS \ + PREFILL_TP \ + PREFILL_EP \ + PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS \ + DECODE_TP \ + DECODE_EP \ + DECODE_DP_ATTN \ + PREFILL_NODES \ + DECODE_NODES \ + RANDOM_RANGE_RATIO \ + FRAMEWORK + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +set -x + +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 + +export TIME_LIMIT="08:00:00" +export MODEL_PATH=$MODEL_PATH +export MODEL_NAME=$MODEL_NAME +export CONTAINER_IMAGE=$IMAGE + +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then + export PREFILL_ENABLE_EP=false +else + export PREFILL_ENABLE_EP=true +fi + +if [[ "$PREFILL_DP_ATTN" == "true" ]]; then + export PREFILL_ENABLE_DP=true +else + export PREFILL_ENABLE_DP=false +fi + +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then + export DECODE_ENABLE_EP=false +else + export DECODE_ENABLE_EP=true +fi + +if [[ "$DECODE_DP_ATTN" == "true" ]]; then + export DECODE_ENABLE_DP=true +else + export DECODE_ENABLE_DP=false +fi + +JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $ISL $OSL "${CONC_LIST// /x}" inf \ + ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ + ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + ${PREFILL_TP} ${DECODE_TP} \ + ${RANDOM_RANGE_RATIO} \ + "${NODELIST:-}") + +if [[ $? 
-ne 0 ]]; then + echo "Failed to submit job" >&2 + exit 1 +fi + +echo "$JOB_ID" From 0fc3679dbdca210e35c7e5e0d4691b678a892c4f Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 21 Apr 2026 07:57:08 +0000 Subject: [PATCH 44/98] use vLLM router as default router for vllm disagg Signed-off-by: Theresa Shan --- benchmarks/multi_node/amd_utils/job.slurm | 34 ++++++++++++++++ .../multi_node/amd_utils/server_vllm.sh | 40 +++++++++++-------- 2 files changed, 58 insertions(+), 16 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 56fefb0ed..491f27aa8 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -77,6 +77,11 @@ PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}" DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" DECODE_MTP_SIZE=${DECODE_MTP_SIZE:-0} +# Router selection: "vllm-router" (external container) or "moriio" (in-container proxy) +ROUTER_TYPE="${ROUTER_TYPE:-vllm-router}" +ROUTER_PORT="${ROUTER_PORT:-30000}" +PROXY_PING_PORT="${PROXY_PING_PORT:-36367}" + # ============================================================================= # Docker privilege detection # ============================================================================= @@ -289,6 +294,10 @@ export IS_MULTINODE="${IS_MULTINODE:-false}" SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_') export DOCKER_CONT_NAME="container_${ENGINE}_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" + +# vLLM external router container +VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-ghcr.io/simondanielsson/vllm-router:dev-streaming-cn-cjy}" +ROUTER_CONT_NAME="router_vllm_${SANITIZED_USER}_${SLURM_JOB_ID}" export RUN_FILE_FULL="$WS_PATH/${RUN_FILE}" SELECTED_NODELIST_SRUN=$(echo "$SELECTED_NODES" | paste -sd,) @@ -397,6 +406,24 @@ echo \"Rank \$SLURM_PROCID on \$(hostname)\" \$DOCKER_CMD ps -aq --filter \"$CONT_FILTER\" | xargs -r \$_DCMD rm -f || true \$DOCKER_CMD ps -aq | xargs -r \$_DCMD stop || true +# Start 
vLLM external router container on node 0 +if [[ \"$ENGINE\" == \"vllm\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \"\$SLURM_PROCID\" == \"0\" ]]; then + \$DOCKER_CMD rm -f \"$ROUTER_CONT_NAME\" 2>/dev/null || true + \$DOCKER_CMD run -d \\ + --name \"$ROUTER_CONT_NAME\" \\ + --network host \\ + \"$VLLM_ROUTER_IMAGE\" \\ + vllm-router \\ + --vllm-pd-disaggregation \\ + --vllm-discovery-address \"0.0.0.0:${PROXY_PING_PORT}\" \\ + --port \"${ROUTER_PORT}\" \\ + --host 0.0.0.0 \\ + --policy consistent_hash \\ + --prefill-policy consistent_hash \\ + --decode-policy consistent_hash \\ + --log-level info +fi + exec \$DOCKER_CMD run --rm \ --init \ --stop-timeout 10 \ @@ -446,3 +473,10 @@ fi " srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c '$DOCKER_CMD rm -f '"$DOCKER_CONT_NAME"' 2>/dev/null || true' + +# Clean up vLLM external router container on node 0 +if [[ "$ENGINE" == "vllm" && "$ROUTER_TYPE" == "vllm-router" ]]; then + srun --nodes=1 --ntasks=1 --nodelist="$MASTER_NODE" bash -c ' + '"$DOCKER_CMD"' rm -f '"$ROUTER_CONT_NAME"' 2>/dev/null || true + ' +fi diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh index a10e45d6d..6b70014ee 100755 --- a/benchmarks/multi_node/amd_utils/server_vllm.sh +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -282,19 +282,24 @@ if [ "$NODE_RANK" -eq 0 ]; then setup_vllm_env # Start MoRI-IO proxy FIRST — workers register via ZMQ on startup - echo "Starting MoRI-IO proxy (HTTP=$ROUTER_PORT, ZMQ=$PROXY_PING_PORT)..." - PROXY_CMD="PROXY_HTTP_PORT=$ROUTER_PORT PROXY_PING_PORT=$PROXY_PING_PORT \ - python3 $WS_PATH/moriio_proxy.py" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $PROXY_CMD" + # Skipped when ROUTER_TYPE=vllm-router (external router container started by job.slurm) + if [[ "${ROUTER_TYPE:-vllm-router}" == "moriio" ]]; then + echo "Starting MoRI-IO proxy (HTTP=$ROUTER_PORT, ZMQ=$PROXY_PING_PORT)..." 
+ PROXY_CMD="PROXY_HTTP_PORT=$ROUTER_PORT PROXY_PING_PORT=$PROXY_PING_PORT \ + python3 $WS_PATH/moriio_proxy.py" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $PROXY_CMD" + else + PROXY_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/moriio_proxy_${host_name}.log" + set -x + eval "$PROXY_CMD" > "$PROXY_LOG_FILE" 2>&1 & + set +x + proxy_pid=$! + sleep 3 + fi else - PROXY_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/moriio_proxy_${host_name}.log" - set -x - eval "$PROXY_CMD" > "$PROXY_LOG_FILE" 2>&1 & - set +x - proxy_pid=$! - sleep 3 + echo "Using external vLLM router (ROUTER_TYPE=${ROUTER_TYPE:-vllm-router})" fi PREFILL_CMD="vllm serve ${MODEL_PATH} \ @@ -368,13 +373,16 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}" fi - echo "Killing the proxy server and prefill server" + echo "Killing the prefill server" if [[ "$DRY_RUN" -eq 0 ]]; then - [[ -n "${proxy_pid:-}" ]] && kill $proxy_pid 2>/dev/null || true + if [[ "${ROUTER_TYPE:-vllm-router}" == "moriio" ]]; then + [[ -n "${proxy_pid:-}" ]] && kill $proxy_pid 2>/dev/null || true + fi [[ -n "${prefill_pid:-}" ]] && kill $prefill_pid 2>/dev/null || true sleep 2 - # Fallback: ensure no orphaned processes keep ports open - pkill -f moriio_proxy 2>/dev/null || true + if [[ "${ROUTER_TYPE:-vllm-router}" == "moriio" ]]; then + pkill -f moriio_proxy 2>/dev/null || true + fi pkill -f "vllm serve" 2>/dev/null || true fi From 9d6c39b70c89c830d755a359815c60f3c2daacfe Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Thu, 23 Apr 2026 01:49:52 +0000 Subject: [PATCH 45/98] fix bugs Signed-off-by: Chun Fang --- benchmarks/multi_node/amd_utils/bench.sh | 6 +- benchmarks/multi_node/amd_utils/env.sh | 4 +- benchmarks/multi_node/amd_utils/job.slurm | 60 +- benchmarks/multi_node/amd_utils/server.sh | 712 +----------------- .../multi_node/amd_utils/server_vllm.sh | 54 +- benchmarks/multi_node/amd_utils/setup_deps.sh | 10 +- benchmarks/multi_node/amd_utils/submit.sh | 2 +- 7 files 
changed, 74 insertions(+), 774 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh index 87f3b1e8a..aecc29e83 100755 --- a/benchmarks/multi_node/amd_utils/bench.sh +++ b/benchmarks/multi_node/amd_utils/bench.sh @@ -11,7 +11,7 @@ # \ # -ENGINE="${ENGINE:-sglang}" +ENGINE="${ENGINE:-sglang-disagg}" n_prefill=$1 n_decode=$2 @@ -67,7 +67,7 @@ for max_concurrency in "${chosen_concurrencies[@]}"; do # Engine-specific extra flags extra_flags="" - if [[ "$ENGINE" == "vllm" ]]; then + if [[ "$ENGINE" == "vllm-disagg" ]]; then extra_flags="--trust-remote-code" else if [ "$IS_MTP" = "true" ]; then @@ -92,7 +92,7 @@ for max_concurrency in "${chosen_concurrencies[@]}"; do echo "-----------------------------------------" # vLLM: cooldown between rounds for idle KV block reaper - if [[ "$ENGINE" == "vllm" ]]; then + if [[ "$ENGINE" == "vllm-disagg" ]]; then echo "[BENCH] Cooldown: waiting 10s for idle KV block reaper..." sleep 10 fi diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index c5a438541..81da415e8 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -9,7 +9,7 @@ # Set by runner or auto-detected from hostname. 
set -x -ENGINE="${ENGINE:-sglang}" +ENGINE="${ENGINE:-sglang-disagg}" export PYTHONDONTWRITEBYTECODE=1 # ============================================================================= @@ -43,7 +43,7 @@ export NCCL_IB_HCA=${NCCL_IB_HCA:-$IBDEVICES} # Engine-specific environment # ============================================================================= -if [[ "$ENGINE" == "vllm" ]]; then +if [[ "$ENGINE" == "vllm-disagg" ]]; then # ========================================================================= # vLLM/Nixl-specific environment # ========================================================================= diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 491f27aa8..b9a83941a 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -8,7 +8,7 @@ #SBATCH --time=24:00:00 # --output and --error are set by submit.sh via BENCHMARK_LOGS_DIR -ENGINE="${ENGINE:-sglang}" +ENGINE="${ENGINE:-sglang-disagg}" echo "=== Job Start Time ===" echo "UTC Time: $(TZ=UTC date '+%Y-%m-%d %H:%M:%S %Z')" @@ -23,7 +23,7 @@ echo "" # Use $(pwd) not BASH_SOURCE — sbatch copies the script to /var/spool/slurmd/ # at runtime, but the CWD remains the submit-time directory (amd_utils/). 
-if [[ "$ENGINE" == "vllm" ]]; then +if [[ "$ENGINE" == "vllm-disagg" ]]; then MODELS_YAML="$(pwd)/models_vllm.yaml" else MODELS_YAML="$(pwd)/models.yaml" @@ -111,7 +111,7 @@ if [[ -z "$MODEL_DIR" ]]; then fi export MODEL_DIR -if [[ "$ENGINE" == "vllm" ]]; then +if [[ "$ENGINE" == "vllm-disagg" ]]; then # vLLM: Extract hf_dir from models.yaml, search multiple paths, resolve HF cache snapshots DISK_DIR_NAME=$(awk '/^'"$MODEL_NAME"':/{found=1; next} found && /^[^ ]/{exit} @@ -278,6 +278,7 @@ export BENCH_MAX_CONCURRENCY=$BENCH_MAX_CONCURRENCY export BENCH_REQUEST_RATE=$BENCH_REQUEST_RATE export DRY_RUN="${DRY_RUN:-0}" export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" +export KEEP_CONTAINERS="${KEEP_CONTAINERS:-0}" export ENGINE=$ENGINE # Eval-related env vars (threaded from submit.sh) @@ -367,7 +368,7 @@ DOCKER_ENV_COMMON=( ) # Engine-specific env vars -if [[ "$ENGINE" == "vllm" ]]; then +if [[ "$ENGINE" == "vllm-disagg" ]]; then DOCKER_ENV_ENGINE=( -e VLLM_WS_PATH=${WS_PATH} -e MODEL_PATH=$DOCKER_MODEL_PATH @@ -403,28 +404,29 @@ set -euo pipefail echo \"Rank \$SLURM_PROCID on \$(hostname)\" # Pre-clean (idempotent) -\$DOCKER_CMD ps -aq --filter \"$CONT_FILTER\" | xargs -r \$_DCMD rm -f || true -\$DOCKER_CMD ps -aq | xargs -r \$_DCMD stop || true +\$DOCKER_CMD ps -aq --filter \"$CONT_FILTER\" | xargs -r \$DOCKER_CMD rm -f || true +\$DOCKER_CMD ps -aq | xargs -r \$DOCKER_CMD stop || true # Start vLLM external router container on node 0 -if [[ \"$ENGINE\" == \"vllm\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \"\$SLURM_PROCID\" == \"0\" ]]; then +if [[ \"$ENGINE\" == \"vllm-disagg\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \"\$SLURM_PROCID\" == \"0\" ]]; then \$DOCKER_CMD rm -f \"$ROUTER_CONT_NAME\" 2>/dev/null || true - \$DOCKER_CMD run -d \\ - --name \"$ROUTER_CONT_NAME\" \\ - --network host \\ - \"$VLLM_ROUTER_IMAGE\" \\ - vllm-router \\ - --vllm-pd-disaggregation \\ - --vllm-discovery-address \"0.0.0.0:${PROXY_PING_PORT}\" \\ - --port 
\"${ROUTER_PORT}\" \\ - --host 0.0.0.0 \\ - --policy consistent_hash \\ - --prefill-policy consistent_hash \\ - --decode-policy consistent_hash \\ - --log-level info + \$DOCKER_CMD run -d \ + --name \"$ROUTER_CONT_NAME\" \ + --network host \ + -v /tmp:/run_logs \ + \"$VLLM_ROUTER_IMAGE\" \ + bash -lc \"mkdir -p /run_logs/slurm_job-${SLURM_JOB_ID} && exec vllm-router \ + --vllm-pd-disaggregation \ + --vllm-discovery-address 0.0.0.0:${PROXY_PING_PORT} \ + --port ${ROUTER_PORT} \ + --host 0.0.0.0 \ + --policy consistent_hash \ + --prefill-policy consistent_hash \ + --decode-policy consistent_hash \ + --log-level info 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/vllm_router_\$(hostname).log \" fi -exec \$DOCKER_CMD run --rm \ +exec \$DOCKER_CMD run \ --init \ --stop-timeout 10 \ --device /dev/dri \ @@ -472,11 +474,13 @@ if [[ \$DOCKER_EXIT_CODE -ne 0 ]]; then fi " -srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c '$DOCKER_CMD rm -f '"$DOCKER_CONT_NAME"' 2>/dev/null || true' +if [[ "${KEEP_CONTAINERS}" != "1" ]]; then + srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c '$DOCKER_CMD rm -f '"$DOCKER_CONT_NAME"' 2>/dev/null || true' -# Clean up vLLM external router container on node 0 -if [[ "$ENGINE" == "vllm" && "$ROUTER_TYPE" == "vllm-router" ]]; then - srun --nodes=1 --ntasks=1 --nodelist="$MASTER_NODE" bash -c ' - '"$DOCKER_CMD"' rm -f '"$ROUTER_CONT_NAME"' 2>/dev/null || true - ' -fi + # Clean up vLLM external router container on node 0 + if [[ "$ENGINE" == "vllm-disagg" && "$ROUTER_TYPE" == "vllm-router" ]]; then + srun --nodes=1 --ntasks=1 --nodelist="$MASTER_NODE" bash -c ' + '"$DOCKER_CMD"' rm -f '"$ROUTER_CONT_NAME"' 2>/dev/null || true + ' + fi +fi \ No newline at end of file diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 3c92422be..5c441a793 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -2,722 +2,18 @@ # Dual-Engine Disaggregated Server 
Dispatcher # ============================================================================= # Dispatches to the engine-specific server launcher based on ENGINE env var. -# ENGINE=sglang (default) -> server_sglang.sh (SGLang + MoRI) -# ENGINE=vllm -> server_vllm.sh (vLLM + Nixl/MoRI-IO) +# ENGINE=sglang-disagg (default) -> server_sglang.sh (SGLang + MoRI) +# ENGINE=vllm-disagg -> server_vllm.sh (vLLM + Nixl/MoRI-IO) # ============================================================================= -ENGINE="${ENGINE:-sglang}" +ENGINE="${ENGINE:-sglang-disagg}" WS_PATH="${WS_PATH:-${SGLANG_WS_PATH:-${VLLM_WS_PATH:-$(dirname "${BASH_SOURCE[0]}")}}}" export WS_PATH ENGINE echo "[DISPATCHER] ENGINE=$ENGINE WS_PATH=$WS_PATH" -if [[ "$ENGINE" == "vllm" ]]; then +if [[ "$ENGINE" == "vllm-disagg" ]]; then source "$WS_PATH/server_vllm.sh" else source "$WS_PATH/server_sglang.sh" fi -<<<<<<< HEAD - -# ============================================================================= -# Model-Specific Configuration from YAML -# ============================================================================= -MODELS_YAML="${SGLANG_WS_PATH}/models.yaml" - -if [[ ! -f "$MODELS_YAML" ]]; then - echo "ERROR: models.yaml not found at $MODELS_YAML" - exit 1 -fi - -# Load model config via inline Python (PyYAML is available in SGLang containers) -# Formula evaluation (e.g. "SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK * TP * xP") -# is done here in Python to avoid bash glob-expanding the * characters. 
-eval "$(python3 -c " -import yaml, sys, os - -config_path = '${MODELS_YAML}' -model_name = '${MODEL_NAME}' - -with open(config_path) as f: - models = yaml.safe_load(f) - -if model_name not in models: - print(f'echo \"ERROR: Model {model_name} not in models.yaml\"; exit 1') - sys.exit(0) - -m = models[model_name] - -def eval_formula(val): - \"\"\"Evaluate chunked_prefill_size: if string, resolve variable names from env and compute.\"\"\" - if isinstance(val, (int, float)): - return int(val) - s = str(val) - # Build a namespace from env vars (convert numeric values to int) - ns = {} - for k, v in os.environ.items(): - try: - ns[k] = int(v) - except (ValueError, TypeError): - pass - try: - return int(eval(s, {'__builtins__': {}}, ns)) - except Exception as e: - print(f'echo \"WARNING: Cannot evaluate formula: {s} ({e})\"', file=sys.stderr) - return val - -def parse_range(cuda_range, default_start, default_end): - if '-' in str(cuda_range): - s, e = str(cuda_range).split('-') - return s, e - return str(default_start), str(default_end) - -# Output shell variables -print(f'MODEL_BASE_FLAGS=\"{m.get(\"base_flags\", \"\")}\"') -print(f'MODEL_MTP_FLAGS=\"{m.get(\"mtp_flags\", \"\")}\"') -print(f'MODEL_DP_FLAGS=\"{m.get(\"dp_flags\", \"\")}\"') - -prefill = m.get('prefill', {}) -decode = m.get('decode', {}) - -print(f'PREFILL_MEM_FRACTION_STATIC=\"{prefill.get(\"mem_fraction_static\", 0.8)}\"') -print(f'PREFILL_DISABLE_RADIX_CACHE=\"{prefill.get(\"disable_radix_cache\", True)}\"') - -dp = prefill.get('dp', {}) -no_dp = prefill.get('no_dp', {}) -print(f'PREFILL_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 24)}\"') -print(f'PREFILL_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"') -print(f'PREFILL_CUDA_GRAPH_BS_DP=\"{dp.get(\"cuda_graph_bs\", \"1 2 3\")}\"') -print(f'PREFILL_CONTEXT_LENGTH_DP=\"{dp.get(\"context_length\", \"\")}\"') -print(f'PREFILL_MAX_TOTAL_TOKENS_DP=\"{dp.get(\"max_total_tokens\", \"\")}\"') 
-print(f'PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP=\"{dp.get(\"enable_two_batch_overlap\", False)}\"') -print(f'PREFILL_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"') -print(f'PREFILL_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"') -s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128) -print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"') -print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"') - -print(f'DECODE_MEM_FRACTION_STATIC=\"{decode.get(\"mem_fraction_static\", 0.85)}\"') -print(f'DECODE_PREFILL_ROUND_ROBIN_BALANCE=\"{decode.get(\"prefill_round_robin_balance\", True)}\"') - -dp = decode.get('dp', {}) -ep_only = decode.get('ep_only', {}) -no_dp = decode.get('no_dp', {}) - -# Decode DP config -print(f'DECODE_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 4096)}\"') -print(f'DECODE_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"') -s, e = parse_range(dp.get('cuda_graph_bs_range', '1-160'), 1, 160) -print(f'DECODE_CUDA_GRAPH_BS_DP_START=\"{s}\"') -print(f'DECODE_CUDA_GRAPH_BS_DP_END=\"{e}\"') - -# Decode EP-only config (EP enabled but DP disabled) -print(f'DECODE_MAX_RUNNING_REQUESTS_EP_ONLY=\"{ep_only.get(\"max_running_requests\", 256)}\"') -print(f'DECODE_CHUNKED_PREFILL_SIZE_EP_ONLY=\"{eval_formula(ep_only.get(\"chunked_prefill_size\", 262144))}\"') -s, e = parse_range(ep_only.get('cuda_graph_bs_range', '1-256'), 1, 256) -print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_START=\"{s}\"') -print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_END=\"{e}\"') - -# Decode no-DP config -print(f'DECODE_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"') -print(f'DECODE_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"') -s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128) -print(f'DECODE_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"') -print(f'DECODE_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"') -")" - -echo 
"Loaded model configuration for: $MODEL_NAME" - -# Compute DP-dependent prefill parameters -if [[ "$PREFILL_ENABLE_DP" == "true" ]]; then - prefill_cuda_graph_bs=($PREFILL_CUDA_GRAPH_BS_DP) - prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_DP - prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_DP - prefill_context_length=$PREFILL_CONTEXT_LENGTH_DP - prefill_max_total_tokens=$PREFILL_MAX_TOTAL_TOKENS_DP - prefill_enable_two_batch_overlap=$PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP -else - prefill_cuda_graph_bs=($(seq $PREFILL_CUDA_GRAPH_BS_NO_DP_START $PREFILL_CUDA_GRAPH_BS_NO_DP_END)) - prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_NO_DP - prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_NO_DP - prefill_context_length="" - prefill_max_total_tokens="" - prefill_enable_two_batch_overlap="false" -fi - -# Compute DP-dependent decode parameters (3-way: DP > EP-only > no_dp) -if [[ "$DECODE_ENABLE_DP" == "true" ]]; then - decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_DP_START $DECODE_CUDA_GRAPH_BS_DP_END)) - decode_max_running_requests=$((DECODE_CUDA_GRAPH_BS_DP_END * DECODE_TP_SIZE)) -elif [[ "$DECODE_ENABLE_EP" == "true" ]]; then - decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_EP_ONLY_START $DECODE_CUDA_GRAPH_BS_EP_ONLY_END)) - decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_EP_ONLY -else - decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_NO_DP_START $DECODE_CUDA_GRAPH_BS_NO_DP_END)) - decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP -fi - -# Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS) -PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} " -if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then - 
PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache" -fi -if [[ -n "$prefill_context_length" ]]; then - PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --context-length ${prefill_context_length}" -fi -if [[ -n "$prefill_max_total_tokens" ]]; then - PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --max-total-tokens ${prefill_max_total_tokens}" -fi -if [[ "$prefill_enable_two_batch_overlap" == "True" ]] || [[ "$prefill_enable_two_batch_overlap" == "true" ]]; then - PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --enable-two-batch-overlap" - PREFILL_SDMA_ENV="MORI_ENABLE_SDMA=true" -fi - -DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} " - -if [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "True" ]] || [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "true" ]]; then - DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS --prefill-round-robin-balance" -fi - -if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then - MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) - MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MOE_MAX_INPUT_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) -fi - -# ============================================================================= -# Cluster Topology Configuration -# ============================================================================= -IFS=',' read -ra IP_ARRAY <<< "$IPADDRS" - -# Ceiling division by GPUS_PER_NODE for nodes-per-worker -PREFILL_NODES_PER_WORKER=$(((PREFILL_TP_SIZE + 7) / GPUS_PER_NODE)) -DECODE_NODES_PER_WORKER=$(((DECODE_TP_SIZE + 7) / GPUS_PER_NODE)) -NODE_OFFSET=$((PREFILL_NODES_PER_WORKER * xP)) - -# Build prefill arguments dynamically based on xP -PREFILL_HEADNODE_URLS=() -PREFILL_ARGS="" -for i in $(seq 0 $((xP - 1))); do - prefill_idx=$((i * PREFILL_NODES_PER_WORKER)) - PREFILL_HEADNODE_URLS[$i]="${IP_ARRAY[$prefill_idx]}:${HEADNODE_PORT}" - PREFILL_ARGS="$PREFILL_ARGS --prefill 
http://${IP_ARRAY[$prefill_idx]}:8000" -done - -# Build decode arguments dynamically based on yD -DECODE_HEADNODE_URLS=() -DECODE_ARGS="" -for i in $(seq 0 $((yD - 1))); do - decode_idx=$((i * DECODE_NODES_PER_WORKER + NODE_OFFSET)) - DECODE_HEADNODE_URLS[$i]="${IP_ARRAY[$decode_idx]}:${HEADNODE_PORT}" - DECODE_ARGS="$DECODE_ARGS --decode http://${IP_ARRAY[$decode_idx]}:8000" -done - -echo "Prefill worker headnode list: ${PREFILL_HEADNODE_URLS[@]}" -echo "Decode worker headnode list: ${DECODE_HEADNODE_URLS[@]}" - -# ============================================================================= -# Configuration Builder Functions -# ============================================================================= - -build_server_config() { - local mode="$1" - local model_name="$2" - local tp_size="$3" - local enable_ep="$4" - local enable_dp="$5" - local decode_mtp_size="$6" - - # Calculate EP and DP sizes based on enable flags - local ep_size=1 - local dp_size=1 - - if [[ "$enable_ep" == "true" ]]; then - ep_size=$tp_size - fi - - if [[ "$enable_dp" == "true" ]]; then - dp_size=$tp_size - fi - - # Build parallelism arguments - local parallel_args="--tp-size ${tp_size}" - - if [[ "$enable_ep" == "true" ]]; then - parallel_args="$parallel_args --ep-size ${ep_size}" - fi - - if [[ "$enable_dp" == "true" ]]; then - parallel_args="$parallel_args --dp-size ${dp_size}" - fi - - # Get model-specific configuration from YAML-loaded variables - local base_config="$MODEL_BASE_FLAGS" - local mtp_config="" - local dp_config="" - local specific_config="" - - # MTP config (only if MTP is enabled and mode is decode) - if [ "$decode_mtp_size" -gt 0 ]; then - mtp_config="${MODEL_MTP_FLAGS} --speculative-num-steps ${decode_mtp_size} --speculative-num-draft-tokens $((decode_mtp_size + 1))" - fi - - # DP config (only if DP is enabled) - if [[ "$enable_dp" == "true" ]]; then - dp_config="$MODEL_DP_FLAGS" - fi - - # Mode-specific config - if [[ "$mode" == "prefill" ]]; then - 
specific_config="$PREFILL_MODE_FLAGS" - elif [[ "$mode" == "decode" ]]; then - specific_config="$DECODE_MODE_FLAGS" - fi - - # Combine: parallel args + base config + mtp config (decode only) + dp config + specific config - local full_config="$parallel_args" - if [[ -n "$base_config" ]]; then - full_config="$full_config $base_config" - fi - if [[ -n "$mtp_config" ]] && [[ "$mode" == "decode" ]]; then - full_config="$full_config $mtp_config" - fi - if [[ -n "$dp_config" ]]; then - full_config="$full_config $dp_config" - fi - if [[ -n "$specific_config" ]]; then - full_config="$full_config $specific_config" - fi - - echo "$full_config" -} - -# Build complete server configurations -PREFILL_SERVER_CONFIG=$(build_server_config "prefill" "$MODEL_NAME" "$PREFILL_TP_SIZE" "$PREFILL_ENABLE_EP" "$PREFILL_ENABLE_DP" "$DECODE_MTP_SIZE") -DECODE_SERVER_CONFIG=$(build_server_config "decode" "$MODEL_NAME" "$DECODE_TP_SIZE" "$DECODE_ENABLE_EP" "$DECODE_ENABLE_DP" "$DECODE_MTP_SIZE") - -if [[ -n "$MODEL_NAME" ]]; then - echo "Using model-specific configuration for: $MODEL_NAME" -fi - -if [[ "${EVAL_ONLY:-false}" == "true" ]] || [[ "${RUN_EVAL:-false}" == "true" ]]; then - PREFILL_SERVER_CONFIG=$(echo "$PREFILL_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g') - DECODE_SERVER_CONFIG=$(echo "$DECODE_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g') - unset MORI_MOE_MAX_INPUT_TOKENS_PREFILL - unset MORI_MOE_MAX_INPUT_TOKENS_DECODE - # NOTE: that currently with fp8_combine set, the evals do not pass on InferenceX eval harness - # or on SGLang native harness for high concurrency 4k and gets no where near the golden score of - # 0.95 on even basic GSM8k grade school math as confirmed by @billishyahao from AMD - # and as confirmed by @Oseltamivir. 
This was initally merged with @billishyahao promising - # that an fast follow PR to fix the evals via having quant correction in the fp8 combine -fi - -# ============================================================================= -# Container Synchronization -# ============================================================================= - -echo "Waiting at the container creation barrier on $host_name" -python3 $SGLANG_WS_PATH/sync.py barrier \ - --local-ip ${host_ip} \ - --local-port 5000 \ - --enable-port \ - --node-ips ${IPADDRS} \ - --node-ports 5000 \ - --wait-for-all-ports \ - --timeout 300 - - -# ============================================================================= -# Node Role Assignment and Server Launch -# ============================================================================= - -if [ "$NODE_RANK" -eq 0 ]; then - echo "NODE INFO =======================================" - echo "================================================" - echo "Node List : ${SLURM_JOB_NODELIST}" - echo "Node IPs : ${IPADDRS}" - echo "Model Name : ${MODEL_NAME:-'Not specified'}" - echo "================================================" - - echo "CLUSTER INFO ====================================" - echo "================================================" - echo "${host_name}:${host_ip} is Proxy Node and Prefill Node" - echo "Using prefill config: $PREFILL_SERVER_CONFIG" - echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}" - echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}" - echo "Prefill servers ($((PREFILL_TP_SIZE/GPUS_PER_NODE)) nodes): ${PREFILL_ARGS}" - echo "Decode servers ($((DECODE_TP_SIZE/GPUS_PER_NODE)) nodes): ${DECODE_ARGS}" - echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL}" - echo "Decode env: 
SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} " - echo "Decode env: SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE} " - - echo "================================================" - - # start the head prefill server - PREFILL_MORI_MOE_ENV="" - set -x - if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_PREFILL" ]]; then - PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}" - fi - set +x - PREFILL_CMD="${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ - --model-path $MODEL_DIR/$MODEL_NAME \ - --disaggregation-mode prefill \ - --disaggregation-ib-device ${IBDEVICES} \ - --host 0.0.0.0 \ - --port 8000 \ - --trust-remote-code \ - ${PREFILL_SERVER_CONFIG} " - - if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then - PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[0]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank 0" - fi - - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $PREFILL_CMD" - else - set -x - eval "$PREFILL_CMD" \ - 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log & - set +x - prefill0_pid=$! - fi - - - echo "Waiting for all prefill and decode servers to be up . . ." - - - BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ - --node-ips ${IPADDRS} \ - --node-ports 8000 \ - --wait-for-all-ports \ - --timeout 1800" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $BARRIER_CMD" - else - eval "$BARRIER_CMD" - fi - echo "Congratulations!!! All prefill and decode servers are up . . ." 
- - ROUTER_CMD="python -m sglang_router.launch_router \ - --pd-disaggregation \ - --port 30000 \ - --policy random \ - --prefill-policy random \ - --decode-policy random \ - ${PREFILL_ARGS} \ - ${DECODE_ARGS}" - - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $ROUTER_CMD" - else - ROUTER_LOG_FILE="/tmp/slurm_job-${SLURM_JOB_ID}_proxy_${host_name}.log" - set -x - if [[ "${SGLANG_ROUTER_STDOUT_LOGS:-0}" == "1" ]]; then - eval "$ROUTER_CMD" 2>&1 | tee "$ROUTER_LOG_FILE" & - else - eval "$ROUTER_CMD" >"$ROUTER_LOG_FILE" 2>&1 & - fi - set +x - proxy_pid=$! - - # Wait for router to be ready via health endpoint - HEALTH_BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ - --node-ips ${NODE0_ADDR} \ - --node-ports 30000 \ - --wait-for-all-health \ - --health-endpoint /readiness \ - --timeout 1800" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $HEALTH_BARRIER_CMD" - else - eval "$HEALTH_BARRIER_CMD" - fi - - echo "Router is ready for benchmarking" - fi - - - echo "Ready for benchmarking on ${host_name}:${host_ip}" - - echo "Benchmarking on ${host_name}:${host_ip}" - cd $SGLANG_WS_PATH - - # Export IS_MTP based on whether MTP is enabled - if [ "$DECODE_MTP_SIZE" -gt 0 ]; then - export IS_MTP=true - else - export IS_MTP=false - fi - - # n_prefill n_decode prefill_gpus decode_gpus model_dir model_name log_path isl osl concurrency_list req_rate random_range_ratio num_prompts_multiplier - BENCH_CMD="bash $SGLANG_WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \ - $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \ - ${BENCH_OUTPUT_LEN} "${BENCH_MAX_CONCURRENCY}" ${BENCH_REQUEST_RATE} \ - ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}" - - if [[ "${EVAL_ONLY:-false}" == "true" ]]; then - echo "EVAL_ONLY mode: skipping throughput benchmark" - elif [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $BENCH_CMD" - else - set -x - eval "$BENCH_CMD" - set +x - fi - - # Run evaluation if requested (before 
killing router) - if [[ "${RUN_EVAL:-false}" == "true" ]]; then - echo "Running lm-eval evaluation on Node 0..." - - # Health check: verify the router is still serving before running eval. - # The throughput benchmark may have crashed/exhausted decode workers. - EVAL_HEALTH_OK=false - for _attempt in 1 2 3; do - if curl -sf --max-time 10 "http://0.0.0.0:30000/readiness" >/dev/null 2>&1; then - EVAL_HEALTH_OK=true - break - fi - echo "Eval health check attempt $_attempt failed, retrying in 10s..." - sleep 10 - done - - if [[ "$EVAL_HEALTH_OK" != "true" ]]; then - echo "WARNING: Router health check failed after 3 attempts. Skipping eval." - else - # Must run from repo root so utils/evals/${task}.yaml resolves - pushd /workspace - - # Source eval functions from benchmark_lib.sh - source /workspace/benchmarks/benchmark_lib.sh - - # Use EVAL_CONC from workflow if set, otherwise fall back to max of conc list - if [[ -n "${EVAL_CONC:-}" ]]; then - export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC}" - else - export EVAL_CONCURRENT_REQUESTS=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1) - fi - - # Override eval context length with model's configured context_length - if [[ -n "$prefill_context_length" ]]; then - export EVAL_MAX_MODEL_LEN="$prefill_context_length" - fi - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: run_eval --framework lm-eval --port 30000 (conc=${EVAL_CONCURRENT_REQUESTS}, ctx=${EVAL_MAX_MODEL_LEN:-auto})" - else - # Run lm-eval against the router on port 30000 - run_eval --framework lm-eval --port 30000 - eval_rc=$? 
- - if [[ $eval_rc -ne 0 ]]; then - echo "ERROR: run_eval exited rc=$eval_rc; skipping metadata write and eval artifact staging" >&2 - EVAL_FAILED=1 - else - # Set metadata env vars for append_lm_eval_summary - export TP="${PREFILL_TP_SIZE}" - export CONC="${EVAL_CONCURRENT_REQUESTS}" - export EP_SIZE=1 - [[ "${PREFILL_ENABLE_EP}" == "true" ]] && EP_SIZE="${PREFILL_TP_SIZE}" - export PREFILL_TP="${PREFILL_TP_SIZE}" - export PREFILL_EP=1 - [[ "${PREFILL_ENABLE_EP}" == "true" ]] && PREFILL_EP="${PREFILL_TP_SIZE}" - export PREFILL_NUM_WORKERS="${xP}" - export DECODE_TP="${DECODE_TP_SIZE}" - export DECODE_EP=1 - [[ "${DECODE_ENABLE_EP}" == "true" ]] && DECODE_EP="${DECODE_TP_SIZE}" - export DECODE_NUM_WORKERS="${yD}" - export DP_ATTENTION="${PREFILL_ENABLE_DP}" - export PREFILL_DP_ATTENTION="${PREFILL_ENABLE_DP}" - export DECODE_DP_ATTENTION="${DECODE_ENABLE_DP}" - export ISL="${BENCH_INPUT_LEN}" - export OSL="${BENCH_OUTPUT_LEN}" - # IS_MULTINODE, FRAMEWORK, PRECISION, MODEL_PREFIX, RUNNER_TYPE, - # RESULT_FILENAME are already set via Docker -e flags from job.slurm - - append_lm_eval_summary - # Files (meta_env.json, results*.json, sample*.jsonl) are now in /workspace - - # Copy eval artifacts to run_logs for NFS extraction by runner - EVAL_COPY_DIR="/run_logs/slurm_job-${SLURM_JOB_ID}/eval_results" - mkdir -p "$EVAL_COPY_DIR" - for f in meta_env.json; do - [ -e "/workspace/$f" ] && cp -f "/workspace/$f" "$EVAL_COPY_DIR/" - done - # Use find for glob patterns to avoid "no match" errors - find /workspace -maxdepth 1 -name 'results*.json' -exec cp -f {} "$EVAL_COPY_DIR/" \; - find /workspace -maxdepth 1 -name 'sample*.jsonl' -exec cp -f {} "$EVAL_COPY_DIR/" \; - - echo "Eval completed. 
Artifacts staged in $EVAL_COPY_DIR" - fi - fi - - popd - fi - fi - - # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host) - LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs" - mkdir -p "$LOGS_OUTPUT" - - if [[ "$DRY_RUN" -eq 0 ]]; then - cp -r /run_logs/slurm_job-${SLURM_JOB_ID} "$LOGS_OUTPUT/" - echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}" - fi - - echo "Killing the proxy server and prefill server" - - if [[ "$DRY_RUN" -eq 0 ]]; then - kill $proxy_pid - kill $prefill0_pid - fi - - if [[ "${EVAL_FAILED:-0}" -eq 1 ]]; then - echo "ERROR: eval failed; exiting node-0 with rc=1" - exit 1 - fi - -elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then - echo "${host_name}:${host_ip} is Prefill Node (Model: ${MODEL_NAME:-'default'})" - echo "Using prefill config: $PREFILL_SERVER_CONFIG" - echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}" - - PREFILL_MORI_MOE_ENV="" - set -x - if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_PREFILL" ]]; then - PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}" - fi - set +x - PREFILL_CMD="${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ - --model-path $MODEL_DIR/${MODEL_NAME} \ - --disaggregation-mode prefill \ - --disaggregation-ib-device ${IBDEVICES} \ - --host 0.0.0.0 \ - --port 8000 \ - --trust-remote-code \ - ${PREFILL_SERVER_CONFIG} " - - if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then - rank=$((NODE_RANK % PREFILL_NODES_PER_WORKER)) - prefill_idx=$((NODE_RANK / PREFILL_NODES_PER_WORKER)) - PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[$prefill_idx]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank $rank" - fi - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $PREFILL_CMD" - else - set -x - eval "$PREFILL_CMD" \ - 2>&1 | tee 
/run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log & - set +x - prefill_pid=$! - fi - - echo "Waiting for proxy server to be up..." - BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ - --node-ips ${NODE0_ADDR} \ - --node-ports 30000 \ - --wait-for-all-ports \ - --timeout 1800" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $BARRIER_CMD" - else - eval "$BARRIER_CMD" - fi - - echo "Waiting until proxy server closes..." - WAIT_CMD="python3 $SGLANG_WS_PATH/sync.py wait \ - --remote-ip ${NODE0_ADDR} \ - --remote-port 30000" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $WAIT_CMD" - else - eval "$WAIT_CMD" - fi - - echo "Killing the rank $NODE_RANK prefill server" - - if [[ "$DRY_RUN" -eq 0 ]]; then - kill $prefill_pid - fi - -else - RANK=$((NODE_RANK - xP * PREFILL_NODES_PER_WORKER)) - echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME:-'default'})" - echo "Using decode config: $DECODE_SERVER_CONFIG" - echo "Decode node rank: $RANK" - echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}" - - DECODE_MORI_MOE_ENV="" - set -x - if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_DECODE" ]]; then - DECODE_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE}" - fi - set +x - DECODE_CMD="${DECODE_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \ - --model-path ${MODEL_DIR}/${MODEL_NAME} \ - --disaggregation-mode decode \ - --disaggregation-ib-device ${IBDEVICES} \ - --host 0.0.0.0 \ - --port 8000 \ - --trust-remote-code \ - ${DECODE_SERVER_CONFIG} " - - if [ "$DECODE_NODES_PER_WORKER" -gt 1 ]; then - rank=$((RANK % DECODE_NODES_PER_WORKER)) - decode_idx=$((RANK / DECODE_NODES_PER_WORKER)) - DECODE_CMD="$DECODE_CMD --dist-init-addr ${DECODE_HEADNODE_URLS[$decode_idx]} --nnodes ${DECODE_NODES_PER_WORKER} --node-rank $rank" - fi - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: 
$DECODE_CMD" - else - set -x - eval "$DECODE_CMD" \ - 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log & - - set +x - decode_pid=$! - fi - - - echo "Waiting for proxy server to be up..." - BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ - --node-ips ${NODE0_ADDR} \ - --node-ports 30000 \ - --wait-for-all-ports \ - --timeout 1800" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $BARRIER_CMD" - else - eval "$BARRIER_CMD" - fi - - - echo "Waiting until proxy server closes..." - WAIT_CMD="python3 $SGLANG_WS_PATH/sync.py wait \ - --remote-ip ${NODE0_ADDR} \ - --remote-port 30000" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $WAIT_CMD" - else - eval "$WAIT_CMD" - fi - - echo "Killing the rank $RANK decode server" - if [[ "$DRY_RUN" -eq 0 ]]; then - kill $decode_pid - fi - -fi - -echo "Script completed successfully" -exit 0 -======= ->>>>>>> 766ba4ee (consolidate amd_utils for sglang and vllm) diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh index 6b70014ee..73cad3adc 100755 --- a/benchmarks/multi_node/amd_utils/server_vllm.sh +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -199,29 +199,29 @@ python3 $WS_PATH/sync.py barrier \ # ETCD Server Setup # ============================================================================= -echo "Proceeding to start etcd server on $host_name" -bash ${WS_PATH}/start_etcd.sh > /dev/null 2>&1 & -etcd_pid=$! 
- -echo "Waiting at etcd server barrier on $host_name" -python3 $WS_PATH/sync.py barrier \ - --node-ips ${IPADDRS} \ - --node-ports 2379 \ - --wait-for-all-ports \ - --timeout 300 - -echo "All etcd servers are up : $host_name" -sleep 3 - -echo "etcd endpoint health==================" -etcdctl endpoint health 2>&1 || /usr/local/bin/etcd/etcdctl endpoint health 2>&1 || true -echo "======================================" - -python3 $WS_PATH/sync.py barrier \ - --node-ips ${IPADDRS} \ - --node-ports 2379 \ - --wait-for-all-ports \ - --timeout 300 +# echo "Proceeding to start etcd server on $host_name" +# bash ${WS_PATH}/start_etcd.sh > /dev/null 2>&1 & +# etcd_pid=$! + +# echo "Waiting at etcd server barrier on $host_name" +# python3 $WS_PATH/sync.py barrier \ +# --node-ips ${IPADDRS} \ +# --node-ports 2379 \ +# --wait-for-all-ports \ +# --timeout 300 + +# echo "All etcd servers are up : $host_name" +# sleep 3 + +# echo "etcd endpoint health==================" +# etcdctl endpoint health 2>&1 || /usr/local/bin/etcd/etcdctl endpoint health 2>&1 || true +# echo "======================================" + +# python3 $WS_PATH/sync.py barrier \ +# --node-ips ${IPADDRS} \ +# --node-ports 2379 \ +# --wait-for-all-ports \ +# --timeout 300 # ============================================================================= # Cluster Topology Configuration @@ -343,7 +343,7 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "DRY RUN: $HEALTH_BARRIER_CMD" else eval "$HEALTH_BARRIER_CMD" - echo "MoRI-IO proxy is ready for benchmarking" + echo "${ROUTER_TYPE} is ready for benchmarking" fi echo "Ready for benchmarking on ${host_name}:${host_ip}" @@ -490,9 +490,9 @@ else [[ "$DRY_RUN" -eq 0 ]] && kill $decode_pid 2>/dev/null || true fi -echo "Killing the etcd server" -kill $etcd_pid 2>/dev/null || true -pkill -f etcd 2>/dev/null || true +# echo "Killing the etcd server" +# kill $etcd_pid 2>/dev/null || true +# pkill -f etcd 2>/dev/null || true echo "Script completed successfully" exit 0 diff --git 
a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh index 8c7a9f07a..589399f74 100644 --- a/benchmarks/multi_node/amd_utils/setup_deps.sh +++ b/benchmarks/multi_node/amd_utils/setup_deps.sh @@ -875,11 +875,11 @@ except Exception as e: # Run installers # ============================================================================= -install_ucx -install_rixl -install_etcd -install_libionic -install_mori +# install_ucx +# install_rixl +# install_etcd +# install_libionic +# install_mori install_amd_quark install_mori_proxy_deps patch_mori_fp8_compat diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh index a77462fc5..f6670b5ee 100755 --- a/benchmarks/multi_node/amd_utils/submit.sh +++ b/benchmarks/multi_node/amd_utils/submit.sh @@ -100,7 +100,7 @@ export DOCKER_IMAGE_NAME=$CONTAINER_IMAGE export PROFILER_ARGS=$profiler_args # Engine-specific xP/yD semantics and TP exports -if [[ "$ENGINE" == "vllm" ]]; then +if [[ "$ENGINE" == "vllm-disagg" ]]; then export PROXY_STREAM_IDLE_TIMEOUT=${PROXY_STREAM_IDLE_TIMEOUT:-300} export VLLM_MORIIO_CONNECTOR_READ_MODE=${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} fi From 05d5952c218ed747c5771a676c5c4df775541396 Mon Sep 17 00:00:00 2001 From: Simon Danielsson <70206058+simondanielsson@users.noreply.github.com> Date: Mon, 4 May 2026 12:58:19 +0200 Subject: [PATCH 46/98] [AMD] Bump to nightly vllm and vllm-router images (#1208) --------- Signed-off-by: Simon Danielsson --- .github/configs/amd-master.yaml | 4 +- benchmarks/multi_node/amd_utils/env.sh | 9 +- benchmarks/multi_node/amd_utils/job.slurm | 5 +- .../multi_node/amd_utils/moriio_proxy.py | 327 ------------------ .../amd_utils/patches/minimax_m2.py | 4 +- .../multi_node/amd_utils/server_vllm.sh | 32 +- benchmarks/multi_node/amd_utils/setup_deps.sh | 46 +-- 7 files changed, 43 insertions(+), 384 deletions(-) delete mode 100644 benchmarks/multi_node/amd_utils/moriio_proxy.py diff --git 
a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 23c7c7461..04a08fc41 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1313,7 +1313,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" kimik2.5-fp4-mi355x-vllm-disagg: - image: vllm/vllm-openai-rocm:v0.18.0 + image: vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c model: amd/Kimi-K2.5-MXFP4 model-prefix: kimik2.5 runner: mi355x-disagg @@ -1366,7 +1366,7 @@ kimik2.5-fp4-mi355x-vllm-disagg: - "DECODE_NODES=2" minimaxm2.5-fp8-mi355x-vllm-disagg: - image: vllm/vllm-openai-rocm:v0.18.0 + image: vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi355x-disagg diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 81da415e8..cd4794ed5 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -32,8 +32,13 @@ fi export IBDEVICES # Shared: Auto-detect default network interface (portable across clusters) -export GLOO_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1) -export NCCL_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1) +# Only auto-detect if not already set by the runner/environment +if [[ -z "$GLOO_SOCKET_IFNAME" ]]; then + export GLOO_SOCKET_IFNAME=$(ip route 2>/dev/null | grep '^default' | awk '{print $5}' | head -n 1) +fi +if [[ -z "$NCCL_SOCKET_IFNAME" ]]; then + export NCCL_SOCKET_IFNAME=$(ip route 2>/dev/null | grep '^default' | awk '{print $5}' | head -n 1) +fi set +x diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index b9a83941a..70f501df6 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -297,7 +297,7 @@ SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_') export 
DOCKER_CONT_NAME="container_${ENGINE}_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" # vLLM external router container -VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-ghcr.io/simondanielsson/vllm-router:dev-streaming-cn-cjy}" +VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-vllm/vllm-router:nightly-20260503-e8992ca}" ROUTER_CONT_NAME="router_vllm_${SANITIZED_USER}_${SLURM_JOB_ID}" export RUN_FILE_FULL="$WS_PATH/${RUN_FILE}" @@ -417,6 +417,7 @@ if [[ \"$ENGINE\" == \"vllm-disagg\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \ \"$VLLM_ROUTER_IMAGE\" \ bash -lc \"mkdir -p /run_logs/slurm_job-${SLURM_JOB_ID} && exec vllm-router \ --vllm-pd-disaggregation \ + --kv-connector moriio \ --vllm-discovery-address 0.0.0.0:${PROXY_PING_PORT} \ --port ${ROUTER_PORT} \ --host 0.0.0.0 \ @@ -483,4 +484,4 @@ if [[ "${KEEP_CONTAINERS}" != "1" ]]; then '"$DOCKER_CMD"' rm -f '"$ROUTER_CONT_NAME"' 2>/dev/null || true ' fi -fi \ No newline at end of file +fi diff --git a/benchmarks/multi_node/amd_utils/moriio_proxy.py b/benchmarks/multi_node/amd_utils/moriio_proxy.py deleted file mode 100644 index 7d1e8454b..000000000 --- a/benchmarks/multi_node/amd_utils/moriio_proxy.py +++ /dev/null @@ -1,327 +0,0 @@ -#!/usr/bin/env python3 -# MoRI-IO proxy server for vLLM PD disaggregation. -# -# Based on vLLM's examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py -# with the following adaptations for production multi-node use: -# - Ports configurable via PROXY_HTTP_PORT / PROXY_PING_PORT env vars -# - /health endpoint for sync.py barrier readiness checks -# - Uses stdlib `re` instead of `regex` to avoid extra dep -# -# The proxy performs two roles that vllm-router cannot: -# 1. ZMQ service discovery — prefill/decode workers register their RDMA ports -# 2. 
Request enrichment — injects remote endpoint info into kv_transfer_params - -import asyncio -import copy -import logging -import os -import re -import socket -import threading -import time -import uuid - -import aiohttp -import msgpack -import zmq -from quart import Quart, make_response, request - -logger = logging.getLogger("moriio_proxy") -logger.setLevel(logging.DEBUG) -handler = logging.StreamHandler() -handler.setFormatter(logging.Formatter( - "%(asctime)s %(levelname)s [%(name)s] %(message)s")) -logger.addHandler(handler) - -prefill_instances: list[dict] = [] -decode_instances: list[dict] = [] -request_nums = 0 -app = Quart(__name__) - -STREAM_IDLE_TIMEOUT = int(os.environ.get("PROXY_STREAM_IDLE_TIMEOUT", "300")) - -IP_PORT_PATTERN = re.compile(r"//(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d+)") - -TRANSFER_TYPE = None - - -def _append_whole_dict_unique(target_list, data_dict): - new_filtered = {k: v for k, v in data_dict.items() if k != "index"} - for existed in target_list: - existed_filtered = {k: v for k, v in existed.items() if k != "index"} - if existed_filtered == new_filtered: - return False - logger.info("Registered instance: role=%s addr=%s hs_port=%s notify=%s dp=%s tp=%s", - data_dict.get("role"), data_dict.get("request_address"), - data_dict.get("handshake_port"), data_dict.get("notify_port"), - data_dict.get("dp_size"), data_dict.get("tp_size")) - target_list.append(data_dict) - transfer_mode = data_dict.get("transfer_mode", "unknown") - global TRANSFER_TYPE - - if TRANSFER_TYPE is None: - TRANSFER_TYPE = transfer_mode - logger.info("Transfer mode set to: %s", TRANSFER_TYPE) - elif transfer_mode != TRANSFER_TYPE: - raise ValueError(f"mismatched transfer mode {TRANSFER_TYPE} vs {transfer_mode}") - - return True - - -_list_lock = threading.RLock() - - -def _listen_for_register(hostname, port): - context = zmq.Context() - router_socket = context.socket(zmq.ROUTER) - router_socket.bind(f"tcp://{hostname}:{port}") - poller = zmq.Poller() - 
poller.register(router_socket, zmq.POLLIN) - global prefill_instances - global decode_instances - - while True: - socks = dict(poller.poll()) - if router_socket in socks: - remote_addr, msg = router_socket.recv_multipart() - data = msgpack.loads(msg) - if data["type"] == "HELLO": - pass - elif ( - data["type"] == "register" - and data["role"] == "P" - and data["request_address"] not in prefill_instances - ): - with _list_lock: - _append_whole_dict_unique(prefill_instances, data) - - elif ( - data["type"] == "register" - and data["role"] == "D" - and data["request_address"] not in decode_instances - ): - with _list_lock: - _append_whole_dict_unique(decode_instances, data) - - -def start_service_discovery(hostname, port): - if not hostname: - hostname = socket.gethostname() - if port == 0: - raise ValueError("Port cannot be 0") - - _listener_thread = threading.Thread( - target=_listen_for_register, args=(hostname, port), daemon=True - ) - _listener_thread.start() - logger.info("Service discovery listening on %s:%s", hostname, port) - return _listener_thread - - -async def send_request_to_prefill( - endpoint, req_data, request_id, d_endpoint, dip, dport, selected_prefill_dp_rank -): - req_data_copy = req_data - - req_data_copy["kv_transfer_params"].update( - { - "do_remote_decode": True, - "do_remote_prefill": False, - "remote_handshake_port": d_endpoint["handshake_port"], - "remote_notify_port": d_endpoint["notify_port"], - "remote_engine_id": None, - "remote_block_ids": None, - "remote_host": dip, - "remote_port": dport, - } - ) - req_data_copy["stream"] = False - req_data_copy["max_tokens"] = 1 - if "max_completion_tokens" in req_data_copy: - req_data_copy["max_completion_tokens"] = 1 - if "stream_options" in req_data_copy: - del req_data_copy["stream_options"] - async with aiohttp.ClientSession( - timeout=aiohttp.ClientTimeout(total=6 * 6000 * 6000) - ) as session: - headers = { - "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", - "X-Request-Id": 
request_id, - } - if selected_prefill_dp_rank is not None: - headers["X-data-parallel-rank"] = str(selected_prefill_dp_rank) - async with session.post( - url=endpoint, json=req_data_copy, headers=headers - ) as response: - if response.status == 200: - return await response.json() - else: - raise RuntimeError( - f"Prefill response status={response.status}" - ) - - -async def start_decode_request(endpoint, req_data, request_id): - session = aiohttp.ClientSession( - timeout=aiohttp.ClientTimeout(total=6 * 6000 * 6000) - ) - headers = { - "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", - "X-Request-Id": request_id, - } - response = await session.post(url=endpoint, json=req_data, headers=headers) - return session, response - - -async def stream_decode_response(session, response, request_id): - try: - if response.status == 200: - chunk_iter = response.content.iter_chunked(1024).__aiter__() - while True: - try: - chunk_bytes = await asyncio.wait_for( - chunk_iter.__anext__(), timeout=STREAM_IDLE_TIMEOUT, - ) - yield chunk_bytes - except StopAsyncIteration: - break - except asyncio.TimeoutError: - logger.error( - "Decode stream %s idle for %ds, aborting", - request_id, STREAM_IDLE_TIMEOUT, - ) - break - else: - raise RuntimeError( - f"Decode response status={response.status}" - ) - finally: - await response.release() - await session.close() - - -@app.route("/health", methods=["GET"]) -async def health_check(): - with _list_lock: - p_count = len(prefill_instances) - d_count = len(decode_instances) - return await make_response( - ({"status": "ok", "prefill_instances": p_count, "decode_instances": d_count}, 200) - ) - - -@app.route("/v1/completions", methods=["POST"]) -@app.route("/v1/chat/completions", methods=["POST"]) -async def handle_request(): - try: - with _list_lock: - global request_nums - request_nums += 1 - - def extract_ip_port_fast(url): - match = IP_PORT_PATTERN.search(url) - if not match: - raise ValueError(f"Invalid URL format: {url}") - return 
match.groups() - - req_data = await request.get_json() - request_id = str(uuid.uuid4()) - - if not prefill_instances or not decode_instances: - return await make_response( - ("Service Unavailable: No prefill or decode instances registered.", 503) - ) - - pid = request_nums % len(prefill_instances) - did = request_nums % len(decode_instances) - prefill_instance_endpoint = prefill_instances[pid] - decode_instance_endpoint = decode_instances[did] - - selected_prefill_dp_rank = None - if prefill_instance_endpoint["dp_size"] > 1: - selected_prefill_dp_rank = request_nums % prefill_instance_endpoint["dp_size"] - - dip, dport = extract_ip_port_fast(decode_instance_endpoint["request_address"]) - - req_data_to_prefill = copy.deepcopy(req_data) - req_data_to_prefill["kv_transfer_params"] = {"transfer_id": request_id} - req_data["kv_transfer_params"] = {"transfer_id": request_id} - req_data_to_prefill["kv_transfer_params"]["remote_dp_size"] = ( - decode_instance_endpoint["dp_size"] - ) - req_data_to_prefill["kv_transfer_params"]["remote_tp_size"] = ( - decode_instance_endpoint["tp_size"] - ) - - send_prefill_task = asyncio.create_task( - send_request_to_prefill( - prefill_instance_endpoint["request_address"], - req_data_to_prefill, - request_id, - decode_instance_endpoint, - dip, - dport, - selected_prefill_dp_rank, - ) - ) - ip, port = extract_ip_port_fast(prefill_instance_endpoint["request_address"]) - - req_data["max_tokens"] -= 1 - - req_data["kv_transfer_params"] = { - "transfer_id": request_id, - "do_remote_decode": False, - "do_remote_prefill": True, - "remote_handshake_port": prefill_instance_endpoint["handshake_port"], - "remote_notify_port": prefill_instance_endpoint["notify_port"], - "remote_engine_id": None, - "remote_block_ids": None, - "remote_host": ip, - "remote_port": port, - } - if TRANSFER_TYPE == "READ": - prefill_response = await send_prefill_task - req_data["kv_transfer_params"]["remote_engine_id"] = prefill_response[ - "kv_transfer_params" - 
]["remote_engine_id"] - req_data["kv_transfer_params"]["remote_block_ids"] = prefill_response[ - "kv_transfer_params" - ]["remote_block_ids"] - - req_data["kv_transfer_params"]["remote_dp_size"] = prefill_instance_endpoint[ - "dp_size" - ] - req_data["kv_transfer_params"]["remote_tp_size"] = prefill_instance_endpoint[ - "tp_size" - ] - - if selected_prefill_dp_rank is not None: - req_data["kv_transfer_params"]["remote_dp_rank"] = selected_prefill_dp_rank - - decode_request_task = asyncio.create_task( - start_decode_request( - decode_instance_endpoint["request_address"], req_data, request_id - ) - ) - - session, decode_response = await decode_request_task - stream_generator = stream_decode_response(session, decode_response, request_id) - response = await make_response(stream_generator) - return response - except Exception as e: - logger.exception("Error handling request: %s", e) - return await make_response((f"Internal Server Error: {e!s}", 500)) - - -if __name__ == "__main__": - http_port = int(os.environ.get("PROXY_HTTP_PORT", "30000")) - ping_port = int(os.environ.get("PROXY_PING_PORT", "36367")) - - t = start_service_discovery("0.0.0.0", ping_port) - app.debug = False - app.config["BODY_TIMEOUT"] = 360000 - app.config["RESPONSE_TIMEOUT"] = 360000 - - logger.info("MoRI-IO proxy starting: HTTP=%d, ZMQ=%d", http_port, ping_port) - app.run(host="0.0.0.0", port=http_port) - t.join() diff --git a/benchmarks/multi_node/amd_utils/patches/minimax_m2.py b/benchmarks/multi_node/amd_utils/patches/minimax_m2.py index 8290276fb..ac830eb1f 100644 --- a/benchmarks/multi_node/amd_utils/patches/minimax_m2.py +++ b/benchmarks/multi_node/amd_utils/patches/minimax_m2.py @@ -137,7 +137,6 @@ def __init__( top_k=config.num_experts_per_tok, hidden_size=config.hidden_size, intermediate_size=config.intermediate_size, - reduce_results=False, renormalize=True, scoring_func=getattr(config, "scoring_func", "softmax"), e_score_correction_bias=self.e_score_correction_bias, @@ -185,7 +184,8 @@ 
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: ) final_hidden_states = final_hidden_states[:num_tokens] elif self.tp_size > 1: - final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( + from vllm.distributed.communication_op import tensor_model_parallel_all_reduce + final_hidden_states = tensor_model_parallel_all_reduce( final_hidden_states ) diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh index 73cad3adc..9acb05f54 100755 --- a/benchmarks/multi_node/amd_utils/server_vllm.sh +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -242,7 +242,7 @@ done echo "Prefill node IPs: ${PREFILL_ARGS}" echo "Decode node IPs: ${DECODE_ARGS}" -# MoRI-IO proxy ZMQ registration port (must match moriio_proxy.py PROXY_PING_PORT) +# MoRI-IO proxy ZMQ registration port (must match vllm-router --vllm-discovery-address) PROXY_PING_PORT="${PROXY_PING_PORT:-36367}" # vLLM environment (UCX transport vars are set at the Docker level in job.slurm) @@ -281,26 +281,8 @@ if [ "$NODE_RANK" -eq 0 ]; then setup_vllm_env - # Start MoRI-IO proxy FIRST — workers register via ZMQ on startup - # Skipped when ROUTER_TYPE=vllm-router (external router container started by job.slurm) - if [[ "${ROUTER_TYPE:-vllm-router}" == "moriio" ]]; then - echo "Starting MoRI-IO proxy (HTTP=$ROUTER_PORT, ZMQ=$PROXY_PING_PORT)..." - PROXY_CMD="PROXY_HTTP_PORT=$ROUTER_PORT PROXY_PING_PORT=$PROXY_PING_PORT \ - python3 $WS_PATH/moriio_proxy.py" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $PROXY_CMD" - else - PROXY_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/moriio_proxy_${host_name}.log" - set -x - eval "$PROXY_CMD" > "$PROXY_LOG_FILE" 2>&1 & - set +x - proxy_pid=$! 
- sleep 3 - fi - else - echo "Using external vLLM router (ROUTER_TYPE=${ROUTER_TYPE:-vllm-router})" - fi + # Router is started as an external container by job.slurm (VLLM_ROUTER_IMAGE) + echo "Using external vllm-router container (started by job.slurm on this node)" PREFILL_CMD="vllm serve ${MODEL_PATH} \ --port $SERVER_PORT \ @@ -343,7 +325,7 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "DRY RUN: $HEALTH_BARRIER_CMD" else eval "$HEALTH_BARRIER_CMD" - echo "${ROUTER_TYPE} is ready for benchmarking" + echo "MoRI-IO proxy is ready for benchmarking" fi echo "Ready for benchmarking on ${host_name}:${host_ip}" @@ -375,14 +357,8 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Killing the prefill server" if [[ "$DRY_RUN" -eq 0 ]]; then - if [[ "${ROUTER_TYPE:-vllm-router}" == "moriio" ]]; then - [[ -n "${proxy_pid:-}" ]] && kill $proxy_pid 2>/dev/null || true - fi [[ -n "${prefill_pid:-}" ]] && kill $prefill_pid 2>/dev/null || true sleep 2 - if [[ "${ROUTER_TYPE:-vllm-router}" == "moriio" ]]; then - pkill -f moriio_proxy 2>/dev/null || true - fi pkill -f "vllm serve" 2>/dev/null || true fi diff --git a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh index 589399f74..958cb9808 100644 --- a/benchmarks/multi_node/amd_utils/setup_deps.sh +++ b/benchmarks/multi_node/amd_utils/setup_deps.sh @@ -242,43 +242,48 @@ patch_mori_fp8_compat() { import re, os, sys patched = [] -# 1. 
Patch layer.py: remove multi-line AITER assertion for MoRI +# Patch layer.py: remove AITER requirement assertion(s) for MoRI try: import vllm.model_executor.layers.fused_moe.layer as lm f = lm.__file__ src = open(f).read() - if "Mori needs to be used with aiter" in src: + if "[PATCHED] AITER requirement removed for MoRI-EP + FP8" in src: + print("[SETUP] layer.py MoRI-FP8 patch already applied") + elif "Mori needs to be used with aiter" in src: + # v0.19+: two consecutive assertions inside `if self.moe_config.use_mori_kernels:` new = re.sub( - r"assert self\.rocm_aiter_fmoe_enabled,\s*\([^)]*Mori needs[^)]*\)", + r"assert self\.rocm_aiter_fmoe_enabled,\s*\([^)]*Mori needs[^)]*\)\s*" + r"assert not self\.aiter_fmoe_shared_expert_enabled,\s*\([^)]*\)", "pass # [PATCHED] AITER requirement removed for MoRI-EP + FP8", src, flags=re.DOTALL) + if new == src: + # v0.17.1/v0.18.0: only the first assertion existed + new = re.sub( + r"assert self\.rocm_aiter_fmoe_enabled,\s*\([^)]*Mori needs[^)]*\)", + "pass # [PATCHED] AITER requirement removed for MoRI-EP + FP8", + src, flags=re.DOTALL) if new != src: open(f, "w").write(new) patched.append("layer.py") + else: + print("[SETUP] ERROR: layer.py pattern found but regex had no effect", file=sys.stderr) + sys.exit(1) + else: + print("[SETUP] ERROR: layer.py AITER assertion pattern not found — vLLM API may have changed", file=sys.stderr) + sys.exit(1) except Exception as e: - print(f"[SETUP] WARN patch layer.py: {e}", file=sys.stderr) + print(f"[SETUP] ERROR patch layer.py: {e}", file=sys.stderr) + sys.exit(1) -# 2. 
Patch mori_prepare_finalize.py: remove defer_input_quant restriction -try: - import vllm.model_executor.layers.fused_moe.mori_prepare_finalize as mm - f = mm.__file__ - src = open(f).read() - if "defer_input_quant" in src: - new = re.sub( - r"raise NotImplementedError\([^)]*defer_input_quant[^)]*\)", - "pass # [PATCHED] defer_input_quant check removed for MoRI-EP + FP8", - src) - if new != src: - open(f, "w").write(new) - patched.append("mori_prepare_finalize.py") -except Exception as e: - print(f"[SETUP] WARN patch mori_pf: {e}", file=sys.stderr) +# prepare_finalize/mori.py (v0.19+) already handles defer_input_quant correctly +# (skips FP8 quant when True). No patch needed for that file. +# Added in 0.18.1: https://github.com/vllm-project/vllm/commit/6a9cceb219fcbd6b1eb540ddfdc77ec160f0e209 if patched: print(f"[SETUP] Patched: {chr(44).join(patched)}") else: print("[SETUP] No MoRI-FP8 patches needed") -' +' || exit 1 _SETUP_INSTALLED+=("MoRI-FP8-patch") } @@ -881,7 +886,6 @@ except Exception as e: # install_libionic # install_mori install_amd_quark -install_mori_proxy_deps patch_mori_fp8_compat patch_moriio_save_kv_timeout patch_moriio_transfer_timeout From b621e764d2043fe9ac90fde37abc98949e17df4b Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 12 May 2026 08:33:11 +0000 Subject: [PATCH 47/98] update vllm image and vllm router image --- .github/configs/amd-master.yaml | 2 +- benchmarks/multi_node/amd_utils/job.slurm | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 04a08fc41..8a78072ec 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1313,7 +1313,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" kimik2.5-fp4-mi355x-vllm-disagg: - image: vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c + image: aigmkt/vllm-dev:ainic2 model: amd/Kimi-K2.5-MXFP4 model-prefix: kimik2.5 runner: mi355x-disagg diff --git 
a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 70f501df6..47eed2149 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -297,7 +297,7 @@ SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_') export DOCKER_CONT_NAME="container_${ENGINE}_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" # vLLM external router container -VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-vllm/vllm-router:nightly-20260503-e8992ca}" +VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-vllm/vllm-router:nightly-20260511-e667ebb}" ROUTER_CONT_NAME="router_vllm_${SANITIZED_USER}_${SLURM_JOB_ID}" export RUN_FILE_FULL="$WS_PATH/${RUN_FILE}" From 3f1ce6fdbb004d2fef99df6a8e7701c48f80e303 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 12 May 2026 10:12:22 +0000 Subject: [PATCH 48/98] update the interface prefix for tw cluster Signed-off-by: Theresa Shan --- benchmarks/multi_node/amd_utils/env.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index cd4794ed5..ffdc9682e 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -54,9 +54,9 @@ if [[ "$ENGINE" == "vllm-disagg" ]]; then # ========================================================================= set -x - # UCX_NET_DEVICES: Use the first benic interface for UCX TCP transport + # UCX_NET_DEVICES: Use the first tw-eth interface for UCX TCP transport if [[ -z "$UCX_NET_DEVICES" ]]; then - UCX_NET_DEV=$(ip -o link show 2>/dev/null | awk -F': ' '/benic1p1/{print $2}' | head -1) + UCX_NET_DEV=$(ip -o link show 2>/dev/null | awk -F': ' '/tw-eth/{print $2}' | head -1) if [[ -n "$UCX_NET_DEV" ]]; then export UCX_NET_DEVICES="$UCX_NET_DEV" else From ee52aff4b23d827105f85c773d9032acf019dea7 Mon Sep 17 00:00:00 2001 From: Shan Theresa Date: Wed, 13 May 2026 06:33:57 +0000 Subject: [PATCH 49/98] add deps for ib 
device auto-detection Signed-off-by: Shan Theresa --- benchmarks/multi_node/amd_utils/env.sh | 4 ++ benchmarks/multi_node/amd_utils/setup_deps.sh | 31 ++++++------ benchmarks/multi_node/amd_utils/submit.sh | 49 +++++++++++++++++++ 3 files changed, 68 insertions(+), 16 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index ffdc9682e..e01365503 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -56,7 +56,11 @@ if [[ "$ENGINE" == "vllm-disagg" ]]; then # UCX_NET_DEVICES: Use the first tw-eth interface for UCX TCP transport if [[ -z "$UCX_NET_DEVICES" ]]; then +<<<<<<< Updated upstream UCX_NET_DEV=$(ip -o link show 2>/dev/null | awk -F': ' '/tw-eth/{print $2}' | head -1) +======= + UCX_NET_DEV=$(ip -o link show 2>/dev/null | awk -F': ' '/tw-eth0/{print $2}' | head -1) +>>>>>>> Stashed changes if [[ -n "$UCX_NET_DEV" ]]; then export UCX_NET_DEVICES="$UCX_NET_DEV" else diff --git a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh index 958cb9808..860cecf96 100644 --- a/benchmarks/multi_node/amd_utils/setup_deps.sh +++ b/benchmarks/multi_node/amd_utils/setup_deps.sh @@ -144,28 +144,26 @@ install_libionic() { } # --------------------------------------------------------------------------- -# 5. MoRI-IO proxy deps (Python packages for the MoRI-IO-aware proxy server) -# The proxy replaces vllm-router: it handles both HTTP routing AND the -# MoRI-IO ZMQ registration/request-enrichment protocol. -# Only needed on NODE_RANK=0 (proxy node). +# 5. Container RDMA/net tools +# - ibv_devinfo comes from ibverbs-utils +# - iproute2 provides the `ip` command +# Used for in-container NIC/RDMA validation and routing checks. 
# --------------------------------------------------------------------------- -install_mori_proxy_deps() { - if python3 -c "import quart, aiohttp, msgpack, zmq" 2>/dev/null; then - echo "[SETUP] MoRI-IO proxy Python deps already present" +install_recipe_deps() { + if command -v ibv_devinfo >/dev/null 2>&1 && command -v ip >/dev/null 2>&1; then + echo "[SETUP] Container RDMA/net tools already present" return 0 fi - echo "[SETUP] Installing MoRI-IO proxy Python deps..." - # v0.18.0 ships aiohttp, pyzmq, blinker(distutils); only quart and msgpack - # are missing. --ignore-installed blinker avoids pip's distutils uninstall - # error when quart pulls a newer blinker version. - pip install --quiet --ignore-installed blinker - pip install --quiet quart msgpack + echo "[SETUP] Installing ibv_devinfo + iproute2 in container..." + apt-get update -q -y && apt-get install -q -y \ + ibverbs-utils iproute2 \ + && rm -rf /var/lib/apt/lists/* - if ! python3 -c "import quart, aiohttp, msgpack, zmq" 2>/dev/null; then - echo "[SETUP] ERROR: MoRI-IO proxy deps install failed"; exit 1 + if ! command -v ibv_devinfo >/dev/null 2>&1 || ! 
command -v ip >/dev/null 2>&1; then + echo "[SETUP] ERROR: Failed to install ibv_devinfo/iproute2"; exit 1 fi - _SETUP_INSTALLED+=("mori-proxy-deps") + _SETUP_INSTALLED+=("ibverbs-utils+iproute2") } # --------------------------------------------------------------------------- @@ -885,6 +883,7 @@ except Exception as e: # install_etcd # install_libionic # install_mori +install_recipe_deps install_amd_quark patch_mori_fp8_compat patch_moriio_save_kv_timeout diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh index f6670b5ee..524b00c65 100755 --- a/benchmarks/multi_node/amd_utils/submit.sh +++ b/benchmarks/multi_node/amd_utils/submit.sh @@ -161,6 +161,55 @@ if [[ -n "${SLURM_EXCLUDE_NODES:-}" ]]; then EXCLUDE_OPT=(--exclude "$SLURM_EXCLUDE_NODES") fi +# ============================================================================= +# Reuse existing allocation (skip sbatch) +# ============================================================================= +# When SLURM_REUSE_JOBID is set, run job.slurm directly in the current shell, +# attaching to the existing allocation. Inner `srun` calls pick up the +# allocation via SLURM_JOB_ID; SLURM_OVERLAP=1 lets them share task slots with +# the interactive shell already holding the allocation. +if [[ -n "${SLURM_REUSE_JOBID:-}" ]]; then + REUSE_JID="$SLURM_REUSE_JOBID" + echo "Reusing existing Slurm allocation ${REUSE_JID} (skipping sbatch)" >&2 + + # Resolve allocation's nodelist if not already provided. 
+ ALLOC_NODELIST="${SLURM_JOB_NODELIST:-$(squeue -h -j "$REUSE_JID" -o '%N' 2>/dev/null)}" + if [[ -z "$ALLOC_NODELIST" ]]; then + echo "Error: could not resolve nodelist for job ${REUSE_JID}" >&2 + exit 1 + fi + ALLOC_NNODES=$(scontrol show hostnames "$ALLOC_NODELIST" | wc -l) + if [[ "$ALLOC_NNODES" -lt "$NUM_NODES" ]]; then + echo "Error: allocation ${REUSE_JID} has ${ALLOC_NNODES} nodes, need ${NUM_NODES}" >&2 + exit 1 + fi + + export SLURM_JOB_ID="$REUSE_JID" + export SLURM_JOBID="$REUSE_JID" + export SLURM_JOB_NODELIST="$ALLOC_NODELIST" + export SLURM_NODELIST="$ALLOC_NODELIST" + export SLURM_NNODES="$ALLOC_NNODES" + export SLURM_JOB_NUM_NODES="$ALLOC_NNODES" + export SLURM_NTASKS="$ALLOC_NNODES" + export SLURM_NPROCS="$ALLOC_NNODES" + export SLURM_NTASKS_PER_NODE=1 + export SLURM_TASKS_PER_NODE="1(x${ALLOC_NNODES})" + export SLURM_OVERLAP=1 + export SLURM_SUBMIT_DIR="$(pwd)" + + STDOUT_LOG="${BENCHMARK_LOGS_DIR}/slurm_job-${REUSE_JID}.out" + STDERR_LOG="${BENCHMARK_LOGS_DIR}/slurm_job-${REUSE_JID}.err" + rm -f "$STDOUT_LOG" "$STDERR_LOG" + + nohup bash "$(dirname "$0")/job.slurm" >"$STDOUT_LOG" 2>"$STDERR_LOG" & + INLINE_PID=$! 
+ echo "$INLINE_PID" > "${BENCHMARK_LOGS_DIR}/slurm_job-${REUSE_JID}.pid" + echo "Started job.slurm (pid=${INLINE_PID}); logs: ${STDOUT_LOG}" >&2 + + echo "$REUSE_JID" + exit 0 +fi + # Construct the sbatch command sbatch_cmd=( sbatch From 4abca16f8062b4d8f7d9def12e4c85577f256c69 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Wed, 13 May 2026 10:42:03 +0000 Subject: [PATCH 50/98] update vllm image Signed-off-by: Theresa Shan --- .github/configs/amd-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 8a78072ec..4fe7644c0 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1313,7 +1313,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" kimik2.5-fp4-mi355x-vllm-disagg: - image: aigmkt/vllm-dev:ainic2 + image: ghcr.io/simondanielsson/vllm-dev:ainic-test-hydra model: amd/Kimi-K2.5-MXFP4 model-prefix: kimik2.5 runner: mi355x-disagg From c9e0d0f2083a0b3d9b506bfd62be82213c318043 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Wed, 13 May 2026 13:57:43 +0000 Subject: [PATCH 51/98] fix indentation and add missing finally block in async_request_openai_chat_completions Co-Authored-By: Claude Opus 4 --- utils/bench_serving/backend_request_func.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/utils/bench_serving/backend_request_func.py b/utils/bench_serving/backend_request_func.py index 4c8820f8d..9c4221781 100644 --- a/utils/bench_serving/backend_request_func.py +++ b/utils/bench_serving/backend_request_func.py @@ -421,10 +421,13 @@ async def async_request_openai_chat_completions( output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) + finally: + if _own_session: + await session.close() - if pbar: - pbar.update(1) - return output + if pbar: + pbar.update(1) + return output def get_model(pretrained_model_name_or_path: str) -> str: From 
135dab0bf082694ebb9b7499d9fd6be51a54741b Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Wed, 13 May 2026 13:59:32 +0000 Subject: [PATCH 52/98] fix tw-eth interface detection pattern in env.sh Co-Authored-By: Claude Opus 4 --- benchmarks/multi_node/amd_utils/env.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index e01365503..ffdc9682e 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -56,11 +56,7 @@ if [[ "$ENGINE" == "vllm-disagg" ]]; then # UCX_NET_DEVICES: Use the first tw-eth interface for UCX TCP transport if [[ -z "$UCX_NET_DEVICES" ]]; then -<<<<<<< Updated upstream UCX_NET_DEV=$(ip -o link show 2>/dev/null | awk -F': ' '/tw-eth/{print $2}' | head -1) -======= - UCX_NET_DEV=$(ip -o link show 2>/dev/null | awk -F': ' '/tw-eth0/{print $2}' | head -1) ->>>>>>> Stashed changes if [[ -n "$UCX_NET_DEV" ]]; then export UCX_NET_DEVICES="$UCX_NET_DEV" else From 943d6b673bbe7228827f6c68e9e2a38eace4a804 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Wed, 13 May 2026 14:09:40 +0000 Subject: [PATCH 53/98] fix vllm-disagg config schema: use scenarios.fixed-seq-len Co-Authored-By: Claude Opus 4 --- .github/configs/amd-master.yaml | 178 ++++++++++++++++---------------- 1 file changed, 90 insertions(+), 88 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 4fe7644c0..5f50343f4 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1321,49 +1321,50 @@ kimik2.5-fp4-mi355x-vllm-disagg: framework: vllm-disagg multinode: true disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total - - spec-decoding: "none" - conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - 
"VLLM_MORIIO_CONNECTOR_READ_MODE=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - - isl: 8192 - osl: 1024 - search-space: - - spec-decoding: "none" - conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" minimaxm2.5-fp8-mi355x-vllm-disagg: image: vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c @@ -1374,51 +1375,52 @@ minimaxm2.5-fp8-mi355x-vllm-disagg: framework: vllm-disagg multinode: true disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total - # Prefill also needs EP=8: MiniMax M2.5 expert intermediate_size=1536, - # TP8 shards to 192 which is not divisible by FP8 block_n=128. 
- - spec-decoding: "none" - conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - - isl: 8192 - osl: 1024 - search-space: - - spec-decoding: "none" - conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total + # Prefill also needs EP=8: MiniMax M2.5 expert intermediate_size=1536, + # TP8 shards to 192 which is not divisible by FP8 block_n=128. + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" dsr1-fp4-mi355x-sglang-disagg: image: lmsysorg/sglang-rocm:v0.5.10.post1-rocm720-mi35x-20260501 From b8277b9c4273431513169d161eff1185bd86a30c Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Wed, 13 May 2026 15:10:04 +0000 Subject: [PATCH 54/98] fix vllm-disagg routing to multi_node benchmark subdir Co-Authored-By: Claude Opus 4 --- 
runners/launch_mi355x-amds.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index 2f700d4e7..e5243d6fd 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -56,7 +56,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then trap 'sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true' EXIT SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_mi355x_${FRAMEWORK}.sh" - if [[ "$FRAMEWORK" == "sglang-disagg" ]]; then + if [[ "$FRAMEWORK" == "sglang-disagg" ]] || [[ "$FRAMEWORK" == "vllm-disagg" ]]; then BENCHMARK_SUBDIR="multi_node" else BENCHMARK_SUBDIR="single_node" From 1336c34668c0ec4eadc4df7d5ab64f579c72bdaf Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Wed, 13 May 2026 15:51:26 +0000 Subject: [PATCH 55/98] fix result collection to use FRAMEWORK as log directory prefix The inline collect_latest_results.py hardcoded "sglang" as the log directory prefix, causing "No logs directory found" for vllm-disagg runs where bench.sh creates directories named vllm-disagg_isl_X_osl_Y. 
Co-Authored-By: Claude Opus 4 --- runners/launch_mi355x-amds.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index e5243d6fd..f616504af 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -108,12 +108,12 @@ if [[ "$IS_MULTINODE" == "true" ]]; then if [[ "${EVAL_ONLY:-false}" != "true" ]]; then cat > collect_latest_results.py <<'PY' import os, sys -sgl_job_dir, isl, osl, nexp = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]) -for path in sorted([f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}" for name in os.listdir(f"{sgl_job_dir}/logs/") if os.path.isdir(f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]: +sgl_job_dir, isl, osl, nexp, framework = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]), sys.argv[5] +for path in sorted([f"{sgl_job_dir}/logs/{name}/{framework}_isl_{isl}_osl_{osl}" for name in os.listdir(f"{sgl_job_dir}/logs/") if os.path.isdir(f"{sgl_job_dir}/logs/{name}/{framework}_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]: print(path) PY - LOGS_DIR=$(python3 collect_latest_results.py "$BENCHMARK_LOGS_DIR" "$ISL" "$OSL" 1) + LOGS_DIR=$(python3 collect_latest_results.py "$BENCHMARK_LOGS_DIR" "$ISL" "$OSL" 1 "$FRAMEWORK") if [ -z "$LOGS_DIR" ]; then echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" exit 1 From 281f6797d1fe91c5793685292e1fa797a5d8c700 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Thu, 14 May 2026 02:23:11 +0000 Subject: [PATCH 56/98] suppress tokenizer warnings and debug output in bench.sh Co-Authored-By: Claude Opus 4 --- benchmarks/multi_node/amd_utils/bench.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh index aecc29e83..33cc918bf 100755 --- a/benchmarks/multi_node/amd_utils/bench.sh +++ 
b/benchmarks/multi_node/amd_utils/bench.sh @@ -37,6 +37,9 @@ IFS='x' read -r -a chosen_concurrencies <<< "$concurrency_list" ROUTER_PORT="${ROUTER_PORT:-30000}" +export TRANSFORMERS_VERBOSITY=error +export TOKENIZERS_PARALLELISM=false + echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}" profile_folder="${log_path}/${ENGINE}_isl_${chosen_isl}_osl_${chosen_osl}" From b131734a49677461a6ccd647719a910e31fece7d Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Thu, 14 May 2026 02:44:58 +0000 Subject: [PATCH 57/98] fix vllm-disagg deadlock: stop router after rank 0 container exits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The vllm-router runs as a separate container on node 0. After node 0's main container finishes the benchmark and exits, decode nodes remain stuck waiting for the router port to close. The router cleanup in job.slurm can't run until srun completes, but srun can't complete because decode nodes are blocked — deadlock. Fix: skip exec on rank 0 for vllm-disagg so the srun bash script continues after docker exits and can stop the router container, allowing decode nodes to detect the port closure and exit. Co-Authored-By: Claude Opus 4 --- benchmarks/multi_node/amd_utils/job.slurm | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 47eed2149..20ecb6683 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -427,7 +427,16 @@ if [[ \"$ENGINE\" == \"vllm-disagg\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \ --log-level info 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/vllm_router_\$(hostname).log \" fi -exec \$DOCKER_CMD run \ +# Skip exec on vllm-disagg rank 0 so we can stop the router after the main +# container exits. 
Without this, decode nodes block forever waiting for the +# router port to close (the router is a separate container). +MAYBE_EXEC=exec +if [[ \"$ENGINE\" == \"vllm-disagg\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \"\$SLURM_PROCID\" == \"0\" ]]; then + MAYBE_EXEC= + set +e +fi + +\$MAYBE_EXEC \$DOCKER_CMD run \ --init \ --stop-timeout 10 \ --device /dev/dri \ @@ -468,11 +477,11 @@ exec \$DOCKER_CMD run \ '"$RUN_FILE_FULL"' 2>&1 | tee /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"'/server_\$(hostname).log ' +# Only reached when exec was skipped (vllm-disagg rank 0) DOCKER_EXIT_CODE=\$? -if [[ \$DOCKER_EXIT_CODE -ne 0 ]]; then - echo \"ERROR: docker exited rc=\$DOCKER_EXIT_CODE on \$(hostname)\" - exit \$DOCKER_EXIT_CODE -fi +echo \"[rank 0] Main container exited (rc=\$DOCKER_EXIT_CODE). Stopping vllm-router...\" +\$DOCKER_CMD rm -f \"$ROUTER_CONT_NAME\" 2>/dev/null || true +exit \$DOCKER_EXIT_CODE " if [[ "${KEEP_CONTAINERS}" != "1" ]]; then From 53d84e8cedaacfb69f144cfbc3a7dc01b084f2b2 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Thu, 14 May 2026 02:57:46 +0000 Subject: [PATCH 58/98] reduce vllm-disagg concurrency sweep to single point for faster iteration Co-Authored-By: Claude Opus 4 --- .github/configs/amd-master.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 5f50343f4..10703a0bc 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1326,9 +1326,9 @@ kimik2.5-fp4-mi355x-vllm-disagg: - isl: 1024 osl: 1024 search-space: - # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total + # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total , 16, 32, 64, 128, 256, 512 - spec-decoding: "none" - conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + conc-list: [ 8 ] prefill: num-worker: 1 tp: 8 @@ -1349,7 +1349,7 @@ kimik2.5-fp4-mi355x-vllm-disagg: osl: 1024 search-space: - spec-decoding: "none" - 
conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + conc-list: [ 8 ] prefill: num-worker: 1 tp: 8 From 416dc14287149179b0f1fad6489f823bc6ce6805 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Thu, 14 May 2026 03:30:18 +0000 Subject: [PATCH 59/98] preserve slurm logs on failure and print stderr inline The EXIT trap deleted benchmark_logs/ before saving artifacts, making it impossible to debug container startup failures. Now the trap always copies slurm .out/.err to the artifact directory and prints the last 100 lines of .err inline in the CI output. Co-Authored-By: Claude Opus 4 --- runners/launch_mi355x-amds.sh | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index f616504af..8ea9f2d5a 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -52,8 +52,24 @@ if [[ "$IS_MULTINODE" == "true" ]]; then sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true # Ensure root-owned files are cleaned up even on early exit to prevent - # EACCES errors when the next GH Actions job checks out on this runner - trap 'sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true' EXIT + # EACCES errors when the next GH Actions job checks out on this runner. + # Always preserve slurm logs as CI artifacts for debugging. 
+ cleanup_and_save_logs() { + if [[ -n "${GITHUB_ACTIONS:-}" && -n "${JOB_ID:-}" ]]; then + local art_dir="$GITHUB_WORKSPACE/benchmark_artifacts" + mkdir -p "$art_dir" + cp -r "$BENCHMARK_LOGS_DIR"/slurm_job-${JOB_ID}.{out,err} "$art_dir/" 2>/dev/null || true + fi + # Print .err inline so failures are visible in CI output + local err_file="$BENCHMARK_LOGS_DIR/slurm_job-${JOB_ID:-unknown}.err" + if [[ -s "$err_file" ]]; then + echo "=== Slurm job stderr ===" + tail -100 "$err_file" + echo "========================" + fi + sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true + } + trap cleanup_and_save_logs EXIT SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_mi355x_${FRAMEWORK}.sh" if [[ "$FRAMEWORK" == "sglang-disagg" ]] || [[ "$FRAMEWORK" == "vllm-disagg" ]]; then @@ -162,16 +178,7 @@ PY sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true - # Upload logs as artifact if running in GitHub Actions - if [[ -n "${GITHUB_ACTIONS:-}" ]]; then - ARTIFACT_DIR="$GITHUB_WORKSPACE/benchmark_artifacts" - mkdir -p "$ARTIFACT_DIR" - cp -r "$BENCHMARK_LOGS_DIR"/slurm_job-${JOB_ID}.{out,err} "$ARTIFACT_DIR/" 2>/dev/null || true - echo "Logs copied to $ARTIFACT_DIR for artifact upload" - fi - - # Clean up root-owned files to prevent EACCES on GH Actions checkout cleanup - sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true + # Log preservation and cleanup handled by EXIT trap (cleanup_and_save_logs) else From f2e9cdb9b87e7a7bf5586a705d693e4793b211ca Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Thu, 14 May 2026 09:16:42 +0000 Subject: [PATCH 60/98] enable set -x around docker privilege detection for CI debugging Co-Authored-By: Claude Opus 4 --- benchmarks/multi_node/amd_utils/job.slurm | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 20ecb6683..8d904044a 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -86,12 +86,14 @@ 
PROXY_PING_PORT="${PROXY_PING_PORT:-36367}" # Docker privilege detection # ============================================================================= # Detect on the batch host. Per-node detection happens inside srun below. +set -x if docker ps &>/dev/null; then DOCKER_CMD="docker" else DOCKER_CMD="sudo docker" fi export DOCKER_CMD +set +x # ============================================================================= # Model Path Resolution From ee980da933351886e20f1c1149803eb056bda9fc Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Thu, 14 May 2026 10:16:43 +0000 Subject: [PATCH 61/98] fix docker detection: test on compute node, not batch host The batch host has docker socket permissions but the compute nodes do not, causing "permission denied" on all srun tasks. Move the detection after SELECTED_NODES is known and probe via srun. Co-Authored-By: Claude Opus 4 --- benchmarks/multi_node/amd_utils/job.slurm | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 8d904044a..1da4b4890 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -82,19 +82,6 @@ ROUTER_TYPE="${ROUTER_TYPE:-vllm-router}" ROUTER_PORT="${ROUTER_PORT:-30000}" PROXY_PING_PORT="${PROXY_PING_PORT:-36367}" -# ============================================================================= -# Docker privilege detection -# ============================================================================= -# Detect on the batch host. Per-node detection happens inside srun below. 
-set -x -if docker ps &>/dev/null; then - DOCKER_CMD="docker" -else - DOCKER_CMD="sudo docker" -fi -export DOCKER_CMD -set +x - # ============================================================================= # Model Path Resolution # ============================================================================= @@ -212,6 +199,16 @@ FULL_NODELIST=$(scontrol show hostnames "$SLURM_JOB_NODELIST") SELECTED_NODES=$(echo "$FULL_NODELIST" | head -n $NUM_NODES) SELECTED_NODELIST_STR=$(echo "$SELECTED_NODES" | tr '\n' ',' | sed 's/,$//') +# Docker privilege detection — test on a compute node, not the batch host. +FIRST_NODE=$(echo "$SELECTED_NODES" | head -1) +if srun --nodelist="$FIRST_NODE" -N1 -n1 --overlap bash -c 'docker ps &>/dev/null'; then + DOCKER_CMD="docker" +else + DOCKER_CMD="sudo docker" +fi +export DOCKER_CMD +echo "[docker-detect] DOCKER_CMD=$DOCKER_CMD (tested on $FIRST_NODE)" + # Update SLURM environment variables export SLURM_NNODES=$NUM_NODES export SLURM_NTASKS=$NUM_NODES From da68e5f6443e8a7198b7f03e577b269b27209e72 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Thu, 14 May 2026 10:50:01 +0000 Subject: [PATCH 62/98] fix docker detection: per-node probe since group membership varies Export DOCKER_CMD_DETECT as a shell snippet that each srun participant evaluates locally, instead of testing a single node and assuming all nodes have the same docker socket permissions. 
Co-Authored-By: Claude Opus 4 --- benchmarks/multi_node/amd_utils/job.slurm | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 1da4b4890..22b1ebcb3 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -199,15 +199,9 @@ FULL_NODELIST=$(scontrol show hostnames "$SLURM_JOB_NODELIST") SELECTED_NODES=$(echo "$FULL_NODELIST" | head -n $NUM_NODES) SELECTED_NODELIST_STR=$(echo "$SELECTED_NODES" | tr '\n' ',' | sed 's/,$//') -# Docker privilege detection — test on a compute node, not the batch host. -FIRST_NODE=$(echo "$SELECTED_NODES" | head -1) -if srun --nodelist="$FIRST_NODE" -N1 -n1 --overlap bash -c 'docker ps &>/dev/null'; then - DOCKER_CMD="docker" -else - DOCKER_CMD="sudo docker" -fi -export DOCKER_CMD -echo "[docker-detect] DOCKER_CMD=$DOCKER_CMD (tested on $FIRST_NODE)" +# Docker privilege detection — evaluated per-node since group membership varies. +# Exported as a snippet so every srun participant resolves it locally. 
+export DOCKER_CMD_DETECT='if docker ps &>/dev/null 2>&1; then DOCKER_CMD=docker; else DOCKER_CMD="sudo docker"; fi' # Update SLURM environment variables export SLURM_NNODES=$NUM_NODES @@ -402,6 +396,10 @@ set -euo pipefail echo \"Rank \$SLURM_PROCID on \$(hostname)\" +# Per-node docker privilege detection +eval \"\$DOCKER_CMD_DETECT\" +echo \"[docker-detect] rank \$SLURM_PROCID: DOCKER_CMD=\$DOCKER_CMD\" + # Pre-clean (idempotent) \$DOCKER_CMD ps -aq --filter \"$CONT_FILTER\" | xargs -r \$DOCKER_CMD rm -f || true \$DOCKER_CMD ps -aq | xargs -r \$DOCKER_CMD stop || true @@ -484,12 +482,12 @@ exit \$DOCKER_EXIT_CODE " if [[ "${KEEP_CONTAINERS}" != "1" ]]; then - srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c '$DOCKER_CMD rm -f '"$DOCKER_CONT_NAME"' 2>/dev/null || true' + srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c 'eval "$DOCKER_CMD_DETECT"; $DOCKER_CMD rm -f '"$DOCKER_CONT_NAME"' 2>/dev/null || true' # Clean up vLLM external router container on node 0 if [[ "$ENGINE" == "vllm-disagg" && "$ROUTER_TYPE" == "vllm-router" ]]; then srun --nodes=1 --ntasks=1 --nodelist="$MASTER_NODE" bash -c ' - '"$DOCKER_CMD"' rm -f '"$ROUTER_CONT_NAME"' 2>/dev/null || true + eval "$DOCKER_CMD_DETECT"; $DOCKER_CMD rm -f '"$ROUTER_CONT_NAME"' 2>/dev/null || true ' fi fi From 63717ad43fac1076f9361612cf689e1e43fc286d Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Thu, 14 May 2026 14:19:27 +0000 Subject: [PATCH 63/98] add vllm-disagg changelog entries and update kimi conc-list - Add perf-changelog entries for kimik2.5-fp4-mi355x-vllm-disagg and minimaxm2.5-fp8-mi355x-vllm-disagg to trigger CI benchmarks - Update kimi 1k1k conc-list from [8] to [16] - Comment out kimi 8k1k config until eval pipeline is wired up Co-Authored-By: Claude Opus 4 --- .github/configs/amd-master.yaml | 44 ++++++++++++++++----------------- perf-changelog.yaml | 10 ++++++++ 2 files changed, 32 insertions(+), 22 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml 
index 10703a0bc..f7142ca79 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1326,9 +1326,9 @@ kimik2.5-fp4-mi355x-vllm-disagg: - isl: 1024 osl: 1024 search-space: - # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total , 16, 32, 64, 128, 256, 512 + # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total - spec-decoding: "none" - conc-list: [ 8 ] + conc-list: [ 16 ] prefill: num-worker: 1 tp: 8 @@ -1345,26 +1345,26 @@ kimik2.5-fp4-mi355x-vllm-disagg: additional-settings: - "DECODE_NODES=2" - - isl: 8192 - osl: 1024 - search-space: - - spec-decoding: "none" - conc-list: [ 8 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" + # - isl: 8192 + # osl: 1024 + # search-space: + # - spec-decoding: "none" + # conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: false + # additional-settings: + # - "PREFILL_NODES=1" + # - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + # decode: + # num-worker: 2 + # tp: 8 + # ep: 8 + # dp-attn: false + # additional-settings: + # - "DECODE_NODES=2" minimaxm2.5-fp8-mi355x-vllm-disagg: image: vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 9c4910b13..cbd37a057 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2934,3 +2934,13 @@ description: - "Update SGLang ROCm image from v0.5.11/v0.5.10rc0 to v0.5.12-rocm720-mi35x-20260517" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1440 + +- config-keys: + - kimik2.5-fp4-mi355x-vllm-disagg + description: + - "Add vLLM disaggregated prefill-decode benchmark for Kimi-K2.5-MXFP4 on MI355X" + +- config-keys: + - minimaxm2.5-fp8-mi355x-vllm-disagg + description: + - "Add vLLM 
disaggregated prefill-decode benchmark for MiniMax-M2.5 on MI355X" From e8f8cada190c1345cb6166b0da73c9319f25d84a Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Thu, 14 May 2026 14:50:15 +0000 Subject: [PATCH 64/98] switch vllm-disagg to 8k1k config to trigger multi-node eval Comment out 1k1k config and enable 8k1k with conc-list [16] so mark_eval_entries picks it up for the eval pipeline. Co-Authored-By: Claude Opus 4 --- .github/configs/amd-master.yaml | 46 ++++++++++++++++----------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index f7142ca79..65f6f9d93 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1323,31 +1323,10 @@ kimik2.5-fp4-mi355x-vllm-disagg: disagg: true scenarios: fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total - - spec-decoding: "none" - conc-list: [ 16 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - # - isl: 8192 + # - isl: 1024 # osl: 1024 # search-space: + # # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total # - spec-decoding: "none" # conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] # prefill: @@ -1366,6 +1345,27 @@ kimik2.5-fp4-mi355x-vllm-disagg: # additional-settings: # - "DECODE_NODES=2" + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 16 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + minimaxm2.5-fp8-mi355x-vllm-disagg: image: 
vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c model: MiniMaxAI/MiniMax-M2.5 From 980ffd0b320693d25d89e04f6114ce18aeed918a Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Fri, 15 May 2026 02:52:17 +0000 Subject: [PATCH 65/98] add multi-node eval feature Signed-off-by: Theresa Shan --- .../multi_node/amd_utils/server_sglang.sh | 209 +++++++++++++++--- .../multi_node/amd_utils/server_vllm.sh | 84 ++++++- 2 files changed, 255 insertions(+), 38 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh index 53ca29cc5..b410bc978 100755 --- a/benchmarks/multi_node/amd_utils/server_sglang.sh +++ b/benchmarks/multi_node/amd_utils/server_sglang.sh @@ -43,7 +43,7 @@ GPUS_PER_NODE="${GPUS_PER_NODE:-8}" # ============================================================================= # Dependencies and Environment Setup # ============================================================================= -source $WS_PATH/env.sh +source $SGLANG_WS_PATH/env.sh host_ip=$(ip route get 1.1.1.1 | awk '/src/ {print $7}') host_name=$(hostname) @@ -62,7 +62,7 @@ fi # ============================================================================= # Model-Specific Configuration from YAML # ============================================================================= -MODELS_YAML="${WS_PATH}/models.yaml" +MODELS_YAML="${SGLANG_WS_PATH}/models.yaml" if [[ ! 
-f "$MODELS_YAML" ]]; then echo "ERROR: models.yaml not found at $MODELS_YAML" @@ -127,6 +127,9 @@ no_dp = prefill.get('no_dp', {}) print(f'PREFILL_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 24)}\"') print(f'PREFILL_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"') print(f'PREFILL_CUDA_GRAPH_BS_DP=\"{dp.get(\"cuda_graph_bs\", \"1 2 3\")}\"') +print(f'PREFILL_CONTEXT_LENGTH_DP=\"{dp.get(\"context_length\", \"\")}\"') +print(f'PREFILL_MAX_TOTAL_TOKENS_DP=\"{dp.get(\"max_total_tokens\", \"\")}\"') +print(f'PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP=\"{dp.get(\"enable_two_batch_overlap\", False)}\"') print(f'PREFILL_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"') print(f'PREFILL_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"') s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128) @@ -169,10 +172,16 @@ if [[ "$PREFILL_ENABLE_DP" == "true" ]]; then prefill_cuda_graph_bs=($PREFILL_CUDA_GRAPH_BS_DP) prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_DP prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_DP + prefill_context_length=$PREFILL_CONTEXT_LENGTH_DP + prefill_max_total_tokens=$PREFILL_MAX_TOTAL_TOKENS_DP + prefill_enable_two_batch_overlap=$PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP else prefill_cuda_graph_bs=($(seq $PREFILL_CUDA_GRAPH_BS_NO_DP_START $PREFILL_CUDA_GRAPH_BS_NO_DP_END)) prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_NO_DP prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_NO_DP + prefill_context_length="" + prefill_max_total_tokens="" + prefill_enable_two_batch_overlap="false" fi # Compute DP-dependent decode parameters (3-way: DP > EP-only > no_dp) @@ -187,29 +196,31 @@ else decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP fi -# Use Decode configuration to configure different TP/DP size between P and D -PREFILL_DECODE_DIFFERENT_TP="" -if [[ "$PREFILL_ENABLE_DP" != 
"$DECODE_ENABLE_DP" ]]; then - if [[ "$DECODE_ENABLE_DP" == "true" ]]; then - PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp ${DECODE_TP_SIZE}" - else - PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp 1" - fi -fi - # Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS) -PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} ${PREFILL_DECODE_DIFFERENT_TP}" +PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} " if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache" fi +if [[ -n "$prefill_context_length" ]]; then + PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --context-length ${prefill_context_length}" +fi +if [[ -n "$prefill_max_total_tokens" ]]; then + PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --max-total-tokens ${prefill_max_total_tokens}" +fi +if [[ "$prefill_enable_two_batch_overlap" == "True" ]] || [[ "$prefill_enable_two_batch_overlap" == "true" ]]; then + PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --enable-two-batch-overlap" + PREFILL_SDMA_ENV="MORI_ENABLE_SDMA=true" +fi + +DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} " -DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]}" if [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "True" ]] 
|| [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "true" ]]; then DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS --prefill-round-robin-balance" fi if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) + MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MOE_MAX_INPUT_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) fi # ============================================================================= @@ -327,12 +338,24 @@ if [[ -n "$MODEL_NAME" ]]; then echo "Using model-specific configuration for: $MODEL_NAME" fi +if [[ "${EVAL_ONLY:-false}" == "true" ]] || [[ "${RUN_EVAL:-false}" == "true" ]]; then + PREFILL_SERVER_CONFIG=$(echo "$PREFILL_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g') + DECODE_SERVER_CONFIG=$(echo "$DECODE_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g') + unset MORI_MOE_MAX_INPUT_TOKENS_PREFILL + unset MORI_MOE_MAX_INPUT_TOKENS_DECODE + # NOTE: that currently with fp8_combine set, the evals do not pass on InferenceX eval harness + # or on SGLang native harness for high concurrency 4k and gets no where near the golden score of + # 0.95 on even basic GSM8k grade school math as confirmed by @billishyahao from AMD + # and as confirmed by @Oseltamivir. 
This was initally merged with @billishyahao promising + # that an fast follow PR to fix the evals via having quant correction in the fp8 combine +fi + # ============================================================================= # Container Synchronization # ============================================================================= echo "Waiting at the container creation barrier on $host_name" -python3 $WS_PATH/sync.py barrier \ +python3 $SGLANG_WS_PATH/sync.py barrier \ --local-ip ${host_ip} \ --local-port 5000 \ --enable-port \ @@ -362,20 +385,27 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}" echo "Prefill servers ($((PREFILL_TP_SIZE/GPUS_PER_NODE)) nodes): ${PREFILL_ARGS}" echo "Decode servers ($((DECODE_TP_SIZE/GPUS_PER_NODE)) nodes): ${DECODE_ARGS}" - echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK: ${MORI_MAX_DISPATCH_TOKENS_PREFILL}" - echo "Decode env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE}" + echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL}" + echo "Decode env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} " + echo "Decode env: SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE} " + echo "================================================" # start the head prefill server - PREFILL_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ + PREFILL_MORI_MOE_ENV="" + set -x + if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_PREFILL" ]]; then + PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}" + fi + set +x + PREFILL_CMD="${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ 
--model-path $MODEL_DIR/$MODEL_NAME \ --disaggregation-mode prefill \ --disaggregation-ib-device ${IBDEVICES} \ --host 0.0.0.0 \ --port 8000 \ --trust-remote-code \ - ${PREFILL_SERVER_CONFIG} \ - --log-level-http warning" + ${PREFILL_SERVER_CONFIG} " if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[0]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank 0" @@ -396,7 +426,7 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Waiting for all prefill and decode servers to be up . . ." - BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ --node-ips ${IPADDRS} \ --node-ports 8000 \ --wait-for-all-ports \ @@ -433,7 +463,7 @@ if [ "$NODE_RANK" -eq 0 ]; then proxy_pid=$! # Wait for router to be ready via health endpoint - HEALTH_BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + HEALTH_BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ --node-ips ${NODE0_ADDR} \ --node-ports 30000 \ --wait-for-all-health \ @@ -453,7 +483,7 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Ready for benchmarking on ${host_name}:${host_ip}" echo "Benchmarking on ${host_name}:${host_ip}" - cd $WS_PATH + cd $SGLANG_WS_PATH # Export IS_MTP based on whether MTP is enabled if [ "$DECODE_MTP_SIZE" -gt 0 ]; then @@ -463,12 +493,14 @@ if [ "$NODE_RANK" -eq 0 ]; then fi # n_prefill n_decode prefill_gpus decode_gpus model_dir model_name log_path isl osl concurrency_list req_rate random_range_ratio num_prompts_multiplier - BENCH_CMD="bash $WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \ + BENCH_CMD="bash $SGLANG_WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \ $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \ ${BENCH_OUTPUT_LEN} "${BENCH_MAX_CONCURRENCY}" ${BENCH_REQUEST_RATE} \ ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}" - if [[ "$DRY_RUN" -eq 1 ]]; then + if [[ "${EVAL_ONLY:-false}" == "true" ]]; 
then + echo "EVAL_ONLY mode: skipping throughput benchmark" + elif [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: $BENCH_CMD" else set -x @@ -476,6 +508,96 @@ if [ "$NODE_RANK" -eq 0 ]; then set +x fi + # Run evaluation if requested (before killing router) + if [[ "${RUN_EVAL:-false}" == "true" ]]; then + echo "Running lm-eval evaluation on Node 0..." + + # Health check: verify the router is still serving before running eval. + # The throughput benchmark may have crashed/exhausted decode workers. + EVAL_HEALTH_OK=false + for _attempt in 1 2 3; do + if curl -sf --max-time 10 "http://0.0.0.0:30000/readiness" >/dev/null 2>&1; then + EVAL_HEALTH_OK=true + break + fi + echo "Eval health check attempt $_attempt failed, retrying in 10s..." + sleep 10 + done + + if [[ "$EVAL_HEALTH_OK" != "true" ]]; then + echo "WARNING: Router health check failed after 3 attempts. Skipping eval." + else + # Must run from repo root so utils/evals/${task}.yaml resolves + pushd /workspace + + # Source eval functions from benchmark_lib.sh + source /workspace/benchmarks/benchmark_lib.sh + + # Use EVAL_CONC from workflow if set, otherwise fall back to max of conc list + if [[ -n "${EVAL_CONC:-}" ]]; then + export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC}" + else + export EVAL_CONCURRENT_REQUESTS=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1) + fi + + # Override eval context length with model's configured context_length + if [[ -n "$prefill_context_length" ]]; then + export EVAL_MAX_MODEL_LEN="$prefill_context_length" + fi + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: run_eval --framework lm-eval --port 30000 (conc=${EVAL_CONCURRENT_REQUESTS}, ctx=${EVAL_MAX_MODEL_LEN:-auto})" + else + # Run lm-eval against the router on port 30000 + run_eval --framework lm-eval --port 30000 + eval_rc=$? 
+ + if [[ $eval_rc -ne 0 ]]; then + echo "ERROR: run_eval exited rc=$eval_rc; skipping metadata write and eval artifact staging" >&2 + EVAL_FAILED=1 + else + # Set metadata env vars for append_lm_eval_summary + export TP="${PREFILL_TP_SIZE}" + export CONC="${EVAL_CONCURRENT_REQUESTS}" + export EP_SIZE=1 + [[ "${PREFILL_ENABLE_EP}" == "true" ]] && EP_SIZE="${PREFILL_TP_SIZE}" + export PREFILL_TP="${PREFILL_TP_SIZE}" + export PREFILL_EP=1 + [[ "${PREFILL_ENABLE_EP}" == "true" ]] && PREFILL_EP="${PREFILL_TP_SIZE}" + export PREFILL_NUM_WORKERS="${xP}" + export DECODE_TP="${DECODE_TP_SIZE}" + export DECODE_EP=1 + [[ "${DECODE_ENABLE_EP}" == "true" ]] && DECODE_EP="${DECODE_TP_SIZE}" + export DECODE_NUM_WORKERS="${yD}" + export DP_ATTENTION="${PREFILL_ENABLE_DP}" + export PREFILL_DP_ATTENTION="${PREFILL_ENABLE_DP}" + export DECODE_DP_ATTENTION="${DECODE_ENABLE_DP}" + export ISL="${BENCH_INPUT_LEN}" + export OSL="${BENCH_OUTPUT_LEN}" + # IS_MULTINODE, FRAMEWORK, PRECISION, MODEL_PREFIX, RUNNER_TYPE, + # RESULT_FILENAME are already set via Docker -e flags from job.slurm + + append_lm_eval_summary + # Files (meta_env.json, results*.json, sample*.jsonl) are now in /workspace + + # Copy eval artifacts to run_logs for NFS extraction by runner + EVAL_COPY_DIR="/run_logs/slurm_job-${SLURM_JOB_ID}/eval_results" + mkdir -p "$EVAL_COPY_DIR" + for f in meta_env.json; do + [ -e "/workspace/$f" ] && cp -f "/workspace/$f" "$EVAL_COPY_DIR/" + done + # Use find for glob patterns to avoid "no match" errors + find /workspace -maxdepth 1 -name 'results*.json' -exec cp -f {} "$EVAL_COPY_DIR/" \; + find /workspace -maxdepth 1 -name 'sample*.jsonl' -exec cp -f {} "$EVAL_COPY_DIR/" \; + + echo "Eval completed. 
Artifacts staged in $EVAL_COPY_DIR" + fi + fi + + popd + fi + fi + # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host) LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs" mkdir -p "$LOGS_OUTPUT" @@ -492,20 +614,30 @@ if [ "$NODE_RANK" -eq 0 ]; then kill $prefill0_pid fi + if [[ "${EVAL_FAILED:-0}" -eq 1 ]]; then + echo "ERROR: eval failed; exiting node-0 with rc=1" + exit 1 + fi + elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then echo "${host_name}:${host_ip} is Prefill Node (Model: ${MODEL_NAME:-'default'})" echo "Using prefill config: $PREFILL_SERVER_CONFIG" echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}" - PREFILL_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ + PREFILL_MORI_MOE_ENV="" + set -x + if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_PREFILL" ]]; then + PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}" + fi + set +x + PREFILL_CMD="${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ --model-path $MODEL_DIR/${MODEL_NAME} \ --disaggregation-mode prefill \ --disaggregation-ib-device ${IBDEVICES} \ --host 0.0.0.0 \ --port 8000 \ --trust-remote-code \ - ${PREFILL_SERVER_CONFIG} \ - --log-level-http warning" + ${PREFILL_SERVER_CONFIG} " if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then rank=$((NODE_RANK % PREFILL_NODES_PER_WORKER)) @@ -524,7 +656,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then fi echo "Waiting for proxy server to be up..." 
- BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ --node-ips ${NODE0_ADDR} \ --node-ports 30000 \ --wait-for-all-ports \ @@ -537,7 +669,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then fi echo "Waiting until proxy server closes..." - WAIT_CMD="python3 $WS_PATH/sync.py wait \ + WAIT_CMD="python3 $SGLANG_WS_PATH/sync.py wait \ --remote-ip ${NODE0_ADDR} \ --remote-port 30000" @@ -560,15 +692,20 @@ else echo "Decode node rank: $RANK" echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}" - DECODE_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \ + DECODE_MORI_MOE_ENV="" + set -x + if [[ -n "$MORI_MOE_MAX_INPUT_TOKENS_DECODE" ]]; then + DECODE_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE}" + fi + set +x + DECODE_CMD="${DECODE_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \ --model-path ${MODEL_DIR}/${MODEL_NAME} \ --disaggregation-mode decode \ --disaggregation-ib-device ${IBDEVICES} \ --host 0.0.0.0 \ --port 8000 \ --trust-remote-code \ - ${DECODE_SERVER_CONFIG} \ - --log-level-http warning" + ${DECODE_SERVER_CONFIG} " if [ "$DECODE_NODES_PER_WORKER" -gt 1 ]; then rank=$((RANK % DECODE_NODES_PER_WORKER)) @@ -589,7 +726,7 @@ else echo "Waiting for proxy server to be up..." - BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ --node-ips ${NODE0_ADDR} \ --node-ports 30000 \ --wait-for-all-ports \ @@ -603,7 +740,7 @@ else echo "Waiting until proxy server closes..." 
- WAIT_CMD="python3 $WS_PATH/sync.py wait \ + WAIT_CMD="python3 $SGLANG_WS_PATH/sync.py wait \ --remote-ip ${NODE0_ADDR} \ --remote-port 30000" @@ -621,4 +758,4 @@ else fi echo "Script completed successfully" -exit 0 +exit 0 \ No newline at end of file diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh index 9acb05f54..60b0adb92 100755 --- a/benchmarks/multi_node/amd_utils/server_vllm.sh +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -338,7 +338,9 @@ if [ "$NODE_RANK" -eq 0 ]; then ${BENCH_OUTPUT_LEN} \"${BENCH_MAX_CONCURRENCY}\" ${BENCH_REQUEST_RATE} \ ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}" - if [[ "$DRY_RUN" -eq 1 ]]; then + if [[ "${EVAL_ONLY:-false}" == "true" ]]; then + echo "EVAL_ONLY mode: skipping throughput benchmark" + elif [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: $BENCH_CMD" else set -x @@ -346,7 +348,80 @@ if [ "$NODE_RANK" -eq 0 ]; then set +x fi - # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host) + # Run evaluation if requested (before killing router) + if [[ "${RUN_EVAL:-false}" == "true" ]]; then + echo "Running lm-eval evaluation on Node 0..." + + EVAL_HEALTH_OK=false + for _attempt in 1 2 3; do + if curl -sf --max-time 10 "http://0.0.0.0:${ROUTER_PORT}/health" >/dev/null 2>&1; then + EVAL_HEALTH_OK=true + break + fi + echo "Eval health check attempt $_attempt failed, retrying in 10s..." + sleep 10 + done + + if [[ "$EVAL_HEALTH_OK" != "true" ]]; then + echo "WARNING: Router health check failed after 3 attempts. Skipping eval." 
+ else + pushd /workspace + + source /workspace/benchmarks/benchmark_lib.sh + + if [[ -n "${EVAL_CONC:-}" ]]; then + export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC}" + else + export EVAL_CONCURRENT_REQUESTS=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1) + fi + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: run_eval --framework lm-eval --port $ROUTER_PORT (conc=${EVAL_CONCURRENT_REQUESTS}, ctx=${EVAL_MAX_MODEL_LEN:-auto})" + else + run_eval --framework lm-eval --port "$ROUTER_PORT" + eval_rc=$? + + if [[ $eval_rc -ne 0 ]]; then + echo "ERROR: run_eval exited rc=$eval_rc; skipping metadata write and eval artifact staging" >&2 + EVAL_FAILED=1 + else + export TP="${PREFILL_TP_SIZE}" + export CONC="${EVAL_CONCURRENT_REQUESTS}" + export EP_SIZE=1 + [[ "${PREFILL_ENABLE_EP}" == "true" ]] && EP_SIZE="${PREFILL_TP_SIZE}" + export PREFILL_TP="${PREFILL_TP_SIZE}" + export PREFILL_EP=1 + [[ "${PREFILL_ENABLE_EP}" == "true" ]] && PREFILL_EP="${PREFILL_TP_SIZE}" + export PREFILL_NUM_WORKERS="${xP}" + export DECODE_TP="${DECODE_TP_SIZE}" + export DECODE_EP=1 + [[ "${DECODE_ENABLE_EP}" == "true" ]] && DECODE_EP="${DECODE_TP_SIZE}" + export DECODE_NUM_WORKERS="${yD}" + export DP_ATTENTION="${PREFILL_ENABLE_DP}" + export PREFILL_DP_ATTENTION="${PREFILL_ENABLE_DP}" + export DECODE_DP_ATTENTION="${DECODE_ENABLE_DP}" + export ISL="${BENCH_INPUT_LEN}" + export OSL="${BENCH_OUTPUT_LEN}" + + append_lm_eval_summary + + EVAL_COPY_DIR="/run_logs/slurm_job-${SLURM_JOB_ID}/eval_results" + mkdir -p "$EVAL_COPY_DIR" + for f in meta_env.json; do + [ -e "/workspace/$f" ] && cp -f "/workspace/$f" "$EVAL_COPY_DIR/" + done + find /workspace -maxdepth 1 -name 'results*.json' -exec cp -f {} "$EVAL_COPY_DIR/" \; + find /workspace -maxdepth 1 -name 'sample*.jsonl' -exec cp -f {} "$EVAL_COPY_DIR/" \; + + echo "Eval completed. 
Artifacts staged in $EVAL_COPY_DIR" + fi + fi + + popd + fi + fi + + # Copy benchmark/eval results to BENCHMARK_LOGS_DIR (mounted from host) LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs" mkdir -p "$LOGS_OUTPUT" @@ -362,6 +437,11 @@ if [ "$NODE_RANK" -eq 0 ]; then pkill -f "vllm serve" 2>/dev/null || true fi + if [[ "${EVAL_FAILED:-0}" -eq 1 ]]; then + echo "ERROR: eval failed; exiting node-0 with rc=1" + exit 1 + fi + elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then echo "${host_name}:${host_ip} is Additional Prefill Node (Model: ${MODEL_NAME})" echo "Using prefill config: $PREFILL_SERVER_CONFIG" From d3aa76cff82bd3baee0bf70933d9542ee4cafe18 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Fri, 15 May 2026 02:53:02 +0000 Subject: [PATCH 66/98] remove start_etcd.sh Signed-off-by: Theresa Shan --- benchmarks/multi_node/amd_utils/start_etcd.sh | 47 ------------------- 1 file changed, 47 deletions(-) delete mode 100755 benchmarks/multi_node/amd_utils/start_etcd.sh diff --git a/benchmarks/multi_node/amd_utils/start_etcd.sh b/benchmarks/multi_node/amd_utils/start_etcd.sh deleted file mode 100755 index 46bbd2964..000000000 --- a/benchmarks/multi_node/amd_utils/start_etcd.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash -set -x - -IPADDRS="${IPADDRS:-localhost}" - -# Use management network IP (matching what the Slurm script resolved) -host_ip=$(ip route get 1.1.1.1 2>/dev/null | sed -n 's/.*src \([^ ]*\).*/\1/p') -if [[ -z "$host_ip" ]]; then - host_ip=$(hostname -I | awk '{print $1}') -fi - -IFS=',' read -ra ADDR <<< "$IPADDRS" - -# Determine node name based on position in the IPADDRS list -index=0 -for ip in "${ADDR[@]}"; do - if [[ "$ip" == "$host_ip" ]]; then - break - fi - index=$((index + 1)) -done -node_name="etcd-$((index+1))" - -# Build initial cluster string -initial_cluster="" -for i in "${!ADDR[@]}"; do - peer_name="etcd-$((i+1))" - initial_cluster+="$peer_name=http://${ADDR[i]}:2380" - if [[ $i -lt $((${#ADDR[@]} - 1)) ]]; then - 
initial_cluster+="," - fi -done - -mkdir -p /var/lib/etcd -rm -rf /var/lib/etcd/* - -/usr/local/bin/etcd/etcd \ - --name "$node_name" \ - --data-dir /var/lib/etcd \ - --initial-advertise-peer-urls http://$host_ip:2380 \ - --listen-peer-urls http://0.0.0.0:2380 \ - --listen-client-urls http://0.0.0.0:2379 \ - --advertise-client-urls http://$host_ip:2379 \ - --initial-cluster-token etcd-cluster-1 \ - --initial-cluster "$initial_cluster" \ - --initial-cluster-state new \ - 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/etcd_NODE${NODE_RANK}.log From 8d730c98f52c232bb99feeb095467e7bc3b1dc9f Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Fri, 15 May 2026 03:03:23 +0000 Subject: [PATCH 67/98] change decode to 1, easier for testing Signed-off-by: Theresa Shan --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 65f6f9d93..bcb3af801 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1359,12 +1359,12 @@ kimik2.5-fp4-mi355x-vllm-disagg: - "PREFILL_NODES=1" - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" decode: - num-worker: 2 + num-worker: 1 tp: 8 ep: 8 dp-attn: false additional-settings: - - "DECODE_NODES=2" + - "DECODE_NODES=1" minimaxm2.5-fp8-mi355x-vllm-disagg: image: vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c From ed80d6f171e5d2b9880edb53e24c5ff0a9e3dacb Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Fri, 15 May 2026 06:49:13 +0000 Subject: [PATCH 68/98] add --served-model-name to vllm serve commands and wire up eval Set --served-model-name on all prefill/decode vllm serve commands so the model name matches what run_lm_eval sends in API requests. Also add eval pipeline support (health check, run_eval, artifact staging) mirroring server_sglang.sh. 
Co-Authored-By: Claude Opus 4 --- benchmarks/multi_node/amd_utils/server_vllm.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh index 60b0adb92..35da4ad27 100755 --- a/benchmarks/multi_node/amd_utils/server_vllm.sh +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -284,7 +284,9 @@ if [ "$NODE_RANK" -eq 0 ]; then # Router is started as an external container by job.slurm (VLLM_ROUTER_IMAGE) echo "Using external vllm-router container (started by job.slurm on this node)" + SERVED_MODEL="${MODEL:-${MODEL_NAME}}" PREFILL_CMD="vllm serve ${MODEL_PATH} \ + --served-model-name ${SERVED_MODEL} \ --port $SERVER_PORT \ --trust-remote-code \ --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ @@ -448,7 +450,9 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then setup_vllm_env + SERVED_MODEL="${MODEL:-${MODEL_NAME}}" PREFILL_CMD="vllm serve ${MODEL_PATH} \ + --served-model-name ${SERVED_MODEL} \ --port $SERVER_PORT \ --trust-remote-code \ --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ @@ -502,7 +506,9 @@ else echo "[DECODE_ENV] $env_pair" done + SERVED_MODEL="${MODEL:-${MODEL_NAME}}" DECODE_CMD="vllm serve ${MODEL_PATH} \ + --served-model-name ${SERVED_MODEL} \ --port $SERVER_PORT \ --trust-remote-code \ --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_consumer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ From 
54ba6df6cc88329adab8ef249f8bb4f919733b35 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Fri, 15 May 2026 08:31:41 +0000 Subject: [PATCH 69/98] fix model name consistency between vllm serve and bench client bench.sh now uses MODEL_NAME for vllm-disagg to match --served-model-name, and MODEL_PATH for sglang to match its default. Simplified SERVED_MODEL to use MODEL_NAME directly since MODEL env var is not available inside the container. Co-Authored-By: Claude Opus 4 --- benchmarks/multi_node/amd_utils/bench.sh | 8 +++++++- benchmarks/multi_node/amd_utils/server_vllm.sh | 6 +++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh index 33cc918bf..24dfbf587 100755 --- a/benchmarks/multi_node/amd_utils/bench.sh +++ b/benchmarks/multi_node/amd_utils/bench.sh @@ -20,6 +20,12 @@ decode_gpus=$4 model_path=$5 model_name=$6 MODEL_PATH="${MODEL_PATH:-${model_path}/${model_name}}" +# vllm-disagg uses --served-model-name MODEL_NAME; sglang defaults to MODEL_PATH +if [[ "$ENGINE" == "vllm-disagg" ]]; then + BENCH_MODEL="${MODEL_NAME:-${MODEL_PATH}}" +else + BENCH_MODEL="${MODEL_PATH}" +fi log_path=$7 chosen_isl=${8:-1024} @@ -80,7 +86,7 @@ for max_concurrency in "${chosen_concurrencies[@]}"; do run_benchmark_serving \ --bench-serving-dir "$REPO_ROOT" \ - --model "$MODEL_PATH" \ + --model "$BENCH_MODEL" \ --port "$ROUTER_PORT" \ --backend openai \ --input-len "$chosen_isl" \ diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh index 35da4ad27..ecab81656 100755 --- a/benchmarks/multi_node/amd_utils/server_vllm.sh +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -284,7 +284,7 @@ if [ "$NODE_RANK" -eq 0 ]; then # Router is started as an external container by job.slurm (VLLM_ROUTER_IMAGE) echo "Using external vllm-router container (started by job.slurm on this node)" - SERVED_MODEL="${MODEL:-${MODEL_NAME}}" + 
SERVED_MODEL="${MODEL_NAME}" PREFILL_CMD="vllm serve ${MODEL_PATH} \ --served-model-name ${SERVED_MODEL} \ --port $SERVER_PORT \ @@ -450,7 +450,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then setup_vllm_env - SERVED_MODEL="${MODEL:-${MODEL_NAME}}" + SERVED_MODEL="${MODEL_NAME}" PREFILL_CMD="vllm serve ${MODEL_PATH} \ --served-model-name ${SERVED_MODEL} \ --port $SERVER_PORT \ @@ -506,7 +506,7 @@ else echo "[DECODE_ENV] $env_pair" done - SERVED_MODEL="${MODEL:-${MODEL_NAME}}" + SERVED_MODEL="${MODEL_NAME}" DECODE_CMD="vllm serve ${MODEL_PATH} \ --served-model-name ${SERVED_MODEL} \ --port $SERVER_PORT \ From b0f116ec426dae0df94594574ae2b2f57d058279 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Fri, 15 May 2026 09:28:13 +0000 Subject: [PATCH 70/98] add token patch to bench for vllm Signed-off-by: Theresa Shan --- benchmarks/multi_node/amd_utils/bench.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh index 24dfbf587..554db8b91 100755 --- a/benchmarks/multi_node/amd_utils/bench.sh +++ b/benchmarks/multi_node/amd_utils/bench.sh @@ -77,7 +77,7 @@ for max_concurrency in "${chosen_concurrencies[@]}"; do # Engine-specific extra flags extra_flags="" if [[ "$ENGINE" == "vllm-disagg" ]]; then - extra_flags="--trust-remote-code" + extra_flags="--trust-remote-code --tokenizer $MODEL_PATH" else if [ "$IS_MTP" = "true" ]; then extra_flags="--use-chat-template" From 4e3d87cf090d4a0b41c70706548ea331c661ab01 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Fri, 15 May 2026 09:50:34 +0000 Subject: [PATCH 71/98] add --tokenizer passthrough to run_benchmark_serving MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit benchmark_lib.sh rejected unknown flags — add --tokenizer support so vllm-disagg bench can resolve the tokenizer from the local model path instead of attempting an HF download with the short model name. 
Co-Authored-By: Claude Opus 4 --- benchmarks/benchmark_lib.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 677df68c1..9394be682 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -208,6 +208,7 @@ run_benchmark_serving() { local dsv4=false local trust_remote_code=false local server_pid="" + local tokenizer="" while [[ $# -gt 0 ]]; do case $1 in @@ -276,6 +277,10 @@ run_benchmark_serving() { server_pid="$2" shift 2 ;; + --tokenizer) + tokenizer="$2" + shift 2 + ;; *) echo "Unknown parameter: $1" return 1 @@ -383,6 +388,10 @@ run_benchmark_serving() { benchmark_cmd+=(--trust-remote-code) fi + if [[ -n "$tokenizer" ]]; then + benchmark_cmd+=(--tokenizer "$tokenizer") + fi + # Run benchmark with optional server monitoring set -x if [[ -n "$server_pid" ]]; then From a99c4f6d78a8c659b5f9a6e40bc20b82ec32a021 Mon Sep 17 00:00:00 2001 From: Shan Theresa Date: Fri, 15 May 2026 10:38:14 +0000 Subject: [PATCH 72/98] update vllm image for kimi2.5 and Minimax disagg. 
Signed-off-by: Shan Theresa --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index bcb3af801..ed6bce0df 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1313,7 +1313,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" kimik2.5-fp4-mi355x-vllm-disagg: - image: ghcr.io/simondanielsson/vllm-dev:ainic-test-hydra + image: vllm/vllm-openai-rocm:nightly-bf610c2f56764e1b30bc6065f4ceace3d6e59036 model: amd/Kimi-K2.5-MXFP4 model-prefix: kimik2.5 runner: mi355x-disagg @@ -1367,7 +1367,7 @@ kimik2.5-fp4-mi355x-vllm-disagg: - "DECODE_NODES=1" minimaxm2.5-fp8-mi355x-vllm-disagg: - image: vllm/vllm-openai-rocm:nightly-100c7b65e7579c8caf4ee0b04a6410b2796b905c + image: vllm/vllm-openai-rocm:nightly-bf610c2f56764e1b30bc6065f4ceace3d6e59036 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi355x-disagg From 8a24fa6a78e171cb414776d360d2f234a60dd4e6 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Mon, 18 May 2026 15:52:08 +0800 Subject: [PATCH 73/98] Update setup_deps.sh --- benchmarks/multi_node/amd_utils/setup_deps.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh index 860cecf96..c65412bac 100644 --- a/benchmarks/multi_node/amd_utils/setup_deps.sh +++ b/benchmarks/multi_node/amd_utils/setup_deps.sh @@ -885,7 +885,7 @@ except Exception as e: # install_mori install_recipe_deps install_amd_quark -patch_mori_fp8_compat +# patch_mori_fp8_compat patch_moriio_save_kv_timeout patch_moriio_transfer_timeout patch_moriio_load_kv_timeout From 196791950cdfcdf2a24847da4ac2bba4db30593b Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Mon, 18 May 2026 23:27:34 +0800 Subject: [PATCH 74/98] Update amd-master.yaml restore the kimi k2.5 settings --- .github/configs/amd-master.yaml | 48 
++++++++++++++++----------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index ed6bce0df..29722e41b 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1323,33 +1323,33 @@ kimik2.5-fp4-mi355x-vllm-disagg: disagg: true scenarios: fixed-seq-len: - # - isl: 1024 - # osl: 1024 - # search-space: - # # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total - # - spec-decoding: "none" - # conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 1 - # dp-attn: false - # additional-settings: - # - "PREFILL_NODES=1" - # - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" - # decode: - # num-worker: 2 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "DECODE_NODES=2" + - isl: 1024 + osl: 1024 + search-space: + # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" - isl: 8192 osl: 1024 search-space: - spec-decoding: "none" - conc-list: [ 16 ] + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] prefill: num-worker: 1 tp: 8 @@ -1359,12 +1359,12 @@ kimik2.5-fp4-mi355x-vllm-disagg: - "PREFILL_NODES=1" - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" decode: - num-worker: 1 + num-worker: 2 tp: 8 ep: 8 dp-attn: false additional-settings: - - "DECODE_NODES=1" + - "DECODE_NODES=2" minimaxm2.5-fp8-mi355x-vllm-disagg: image: vllm/vllm-openai-rocm:nightly-bf610c2f56764e1b30bc6065f4ceace3d6e59036 From 79874528fc71cc9610029d071e1ef15b30d461e7 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 19 May 2026 14:48:09 +0000 Subject: [PATCH 75/98] update req rate for vllm. 
Signed-off-by: Theresa Shan --- benchmarks/multi_node/amd_utils/bench.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh index 554db8b91..05384f435 100755 --- a/benchmarks/multi_node/amd_utils/bench.sh +++ b/benchmarks/multi_node/amd_utils/bench.sh @@ -31,7 +31,7 @@ log_path=$7 chosen_isl=${8:-1024} chosen_osl=${9:-1024} concurrency_list=${10:-"512x1"} -if [[ "$ENGINE" == "vllm" ]]; then +if [[ "$ENGINE" == "vllm-disagg" ]]; then chosen_req_rate=${11:-inf} else chosen_req_rate=${11:-1} From b9df2a0d261c00dfe4eb1970b8af104f6ac6a434 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 19 May 2026 15:20:28 +0000 Subject: [PATCH 76/98] make the sglang env consistent with upstream Signed-off-by: Theresa Shan --- benchmarks/multi_node/amd_utils/env.sh | 55 +++++++++++++++++--------- 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index ffdc9682e..aa69d0e46 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -119,41 +119,52 @@ else # ========================================================================= export SGLANG_USE_AITER=1 - export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=1200 - export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=1200 + + export SGLANG_MORI_DISPATCH_DTYPE=auto + export SGLANG_MORI_FP8_COMB=true + export SGLANG_MORI_QP_PER_TRANSFER=4 + export SGLANG_MORI_NUM_WORKERS=4 + export MORI_IO_SQ_BACKOFF_TIMEOUT_US=50000 + + export MORI_IO_QP_MAX_SEND_WR=16384 + export MORI_IO_QP_MAX_CQE=32768 + export MORI_IO_QP_MAX_SGE=4 + + export MORI_IO_TC_DISABLE=0 + + export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=3600 + export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=3600 # Disable allocating memory in one pass export MORI_SHMEM_MODE=ISOLATION - export SGLANG_MORI_FP8_DISP=True - if [[ "$MODEL_NAME" == *mxfp4* ]]; then - export 
SGLANG_MORI_FP8_DISP=False - fi + # Enable spec v2 + export SGLANG_ENABLE_SPEC_V2=1 + export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1 - export SGLANG_MORI_FP4_DISP=False - export SGLANG_MORI_FP8_COMB=False + export SGLANG_LOG_MS=true + export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32 - # Per-role dispatch token limits (prefill uses higher throughput, decode uses lower) - export MORI_MAX_DISPATCH_TOKENS_PREFILL=16384 - if [[ "$MODEL_NAME" == *mxfp4* ]]; then - export MORI_MAX_DISPATCH_TOKENS_PREFILL=12288 - fi - export MORI_MAX_DISPATCH_TOKENS_DECODE=160 + export MORI_MAX_DISPATCH_TOKENS_PREFILL=8192 + export MORI_MAX_DISPATCH_TOKENS_DECODE=512 + + export MORI_MOE_MAX_INPUT_TOKENS_PREFILL=32768 + export MORI_MOE_MAX_INPUT_TOKENS_DECODE=2703 # set MTP size=1 when EP16 export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2)) export MORI_EP_LAUNCH_CONFIG_MODE=AUTO - export MORI_IO_QP_MAX_SEND_WR=16384 - export MORI_IO_QP_MAX_CQE=32768 - export MORI_IO_QP_MAX_SGE=4 export MORI_APP_LOG_LEVEL=INFO - # Router logging control + # Router logging control: + # 0 (default) keeps noisy per-request access logs out of stdout while still logging to file. + # 1 mirrors router logs to stdout via tee (useful for live debugging). 
export SGLANG_ROUTER_STDOUT_LOGS="${SGLANG_ROUTER_STDOUT_LOGS:-0}" # QoS/DSCP configuration + # Priority order: 1) Set by runner, 2) Detect via nicctl, 3) Detect from hostname if [[ -n "$MORI_RDMA_TC" ]]; then echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC (set by runner or environment)" elif command -v nicctl &> /dev/null; then @@ -166,17 +177,21 @@ $1 == "DSCP" && $2 == ":" && $NF == p { if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then TC=$(( 4 * ND_DSCP )) export MORI_RDMA_SL=$ND_PRIO + export MORI_IO_SL=$ND_PRIO export MORI_RDMA_TC=$TC - echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL" + export MORI_IO_TC=$TC + echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL, MORI_IO_TC=$MORI_IO_TC, MORI_IO_SL=$MORI_IO_SL" else echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection." # Fall back to hostname-based detection NODENAME=$(hostname -s) if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then export MORI_RDMA_TC=96 + export MORI_IO_TC=96 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" elif [[ $NODENAME == mia1* ]]; then export MORI_RDMA_TC=104 + export MORI_IO_TC=104 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" else echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration." @@ -187,9 +202,11 @@ $1 == "DSCP" && $2 == ":" && $NF == p { NODENAME=$(hostname -s) if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then export MORI_RDMA_TC=96 + export MORI_IO_TC=96 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" elif [[ $NODENAME == mia1* ]]; then export MORI_RDMA_TC=104 + export MORI_IO_TC=104 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" else echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration." 
From 7925efd9966b0c3d753c63c2446dc2748b996bd1 Mon Sep 17 00:00:00 2001 From: Theresa Shan Date: Tue, 19 May 2026 15:31:32 +0000 Subject: [PATCH 77/98] node blacklist Signed-off-by: Theresa Shan --- benchmarks/multi_node/amd_utils/submit.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh index 524b00c65..fa3d65418 100755 --- a/benchmarks/multi_node/amd_utils/submit.sh +++ b/benchmarks/multi_node/amd_utils/submit.sh @@ -157,6 +157,7 @@ fi # Optional: exclude specific nodes (e.g. nodes with broken Docker sockets). # Set SLURM_EXCLUDE_NODES env var to a comma-separated list of hostnames. EXCLUDE_OPT=() +SLURM_EXCLUDE_NODES="${SLURM_EXCLUDE_NODES:-mia1-p01-g11,mia1-p01-g12,mia1-p01-g15}" if [[ -n "${SLURM_EXCLUDE_NODES:-}" ]]; then EXCLUDE_OPT=(--exclude "$SLURM_EXCLUDE_NODES") fi From abdbff6aacfc922b050aa62c31eecdf23c4ed2c1 Mon Sep 17 00:00:00 2001 From: simondanielsson Date: Tue, 19 May 2026 18:14:25 +0200 Subject: [PATCH 78/98] fix: re-add MORI_IO_TC envvars Signed-off-by: simondanielsson --- benchmarks/multi_node/amd_utils/env.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 27f5d2749..aa69d0e46 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -202,9 +202,11 @@ $1 == "DSCP" && $2 == ":" && $NF == p { NODENAME=$(hostname -s) if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then export MORI_RDMA_TC=96 + export MORI_IO_TC=96 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" elif [[ $NODENAME == mia1* ]]; then export MORI_RDMA_TC=104 + export MORI_IO_TC=104 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" else echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration." 
From 28ae46beeb7f0e58bea70274d9073a77e1b478e8 Mon Sep 17 00:00:00 2001 From: simondanielsson Date: Wed, 20 May 2026 09:24:47 +0200 Subject: [PATCH 79/98] fix: add excluded nodes in MI325 cluster Signed-off-by: simondanielsson --- runners/launch_mi325x-amds.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/runners/launch_mi325x-amds.sh b/runners/launch_mi325x-amds.sh index 8aa480992..636e3e75e 100644 --- a/runners/launch_mi325x-amds.sh +++ b/runners/launch_mi325x-amds.sh @@ -28,7 +28,8 @@ scancel_sync() { # chi-mi325x-pod1-121: enroot-aufs2ovlfs setcap fails on this node's NFS-backed # squash dir; container image import never completes # (root-caused via #1467/#1468/#1469 sweep failures). -export SLURM_EXCLUDE_NODES="${SLURM_EXCLUDE_NODES:-}" +export SLURM_EXCLUDE_NODES="${SLURM_EXCLUDE_NODES:-chi-mi325x-pod1-021.ord.vultr.cpe.ice.amd.com,chi-mi325x-pod1-027.ord.vultr.cpe.ice.amd.com,chi-mi325x-pod1-028.ord.vultr.cpe.ice.amd.com,chi-mi325x-pod1-030.ord.vultr.cpe.ice.amd.com,chi-mi325x-pod1-121.ord.vultr.cpe.ice.amd.com}" + if [[ "$IS_MULTINODE" == "true" ]]; then set -x From 80a5b37646a2807c4060cecbffa03afaba679549 Mon Sep 17 00:00:00 2001 From: simondanielsson Date: Wed, 20 May 2026 10:14:28 +0200 Subject: [PATCH 80/98] fix: update conc list and use 2p1d for 8k/1k high conc Signed-off-by: simondanielsson --- .github/configs/amd-master.yaml | 50 ++++++++++++++++++++++++++++++--- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 4200e962e..1911b2587 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1327,7 +1327,7 @@ minimaxm2.5-fp8-mi300x-vllm-disagg: osl: 1024 search-space: - spec-decoding: "none" - conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + conc-list: [ 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] prefill: num-worker: 1 tp: 8 @@ -1347,8 +1347,29 @@ minimaxm2.5-fp8-mi300x-vllm-disagg: - isl: 8192 osl: 1024 search-space: 
+ # Top of curve: 2P1D - spec-decoding: "none" - conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + conc-list: [256, 512, 1024, 2048 ] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "PREFILL_NODES=2" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + + # Bottom of curve: 1P2D + - spec-decoding: "none" + conc-list: [8, 16, 32, 64, 128] prefill: num-worker: 1 tp: 8 @@ -1381,7 +1402,7 @@ minimaxm2.5-fp8-mi325x-vllm-disagg: osl: 1024 search-space: - spec-decoding: "none" - conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + conc-list: [ 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] prefill: num-worker: 1 tp: 8 @@ -1401,8 +1422,29 @@ minimaxm2.5-fp8-mi325x-vllm-disagg: - isl: 8192 osl: 1024 search-space: + # Top of curve: 2P1D - spec-decoding: "none" - conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + conc-list: [256, 512, 1024, 2048 ] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "PREFILL_NODES=2" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + + # Bottom of curve: 1P2D + - spec-decoding: "none" + conc-list: [8, 16, 32, 64, 128] prefill: num-worker: 1 tp: 8 From d383560c389323068bc4ef9374f9c15baa8a66c8 Mon Sep 17 00:00:00 2001 From: simondanielsson Date: Wed, 20 May 2026 10:17:07 +0200 Subject: [PATCH 81/98] fix: set MORI-related envvars for vllm same as sgl Signed-off-by: simondanielsson --- benchmarks/multi_node/amd_utils/env.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index aa69d0e46..50001766f 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -54,6 +54,14 @@ if [[ "$ENGINE" == 
"vllm-disagg" ]]; then # ========================================================================= set -x + export VLLM_MORIIO_QP_PER_TRANSFER=4 + export VLLM_MORIIO_NUM_WORKERS=4 + export MORI_IO_SQ_BACKOFF_TIMEOUT_US=50000 + export MORI_IO_QP_MAX_SEND_WR=16384 + export MORI_IO_QP_MAX_CQE=32768 + export MORI_IO_QP_MAX_SGE=4 + export MORI_IO_TC_DISABLE=0 + # UCX_NET_DEVICES: Use the first tw-eth interface for UCX TCP transport if [[ -z "$UCX_NET_DEVICES" ]]; then UCX_NET_DEV=$(ip -o link show 2>/dev/null | awk -F': ' '/tw-eth/{print $2}' | head -1) From 3d962a7273b869765ebd526d2a522dad04e64d48 Mon Sep 17 00:00:00 2001 From: simondanielsson Date: Wed, 20 May 2026 10:25:19 +0200 Subject: [PATCH 82/98] fix: update excluded node now when more are down Signed-off-by: simondanielsson --- runners/launch_mi300x-amds.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/runners/launch_mi300x-amds.sh b/runners/launch_mi300x-amds.sh index a24422fa8..329e314b5 100644 --- a/runners/launch_mi300x-amds.sh +++ b/runners/launch_mi300x-amds.sh @@ -25,9 +25,9 @@ scancel_sync() { } # Pin to the known-good mi300x nodes; others are unavailable: -# chi-mi300x-033, chi-mi300x-037: down (Not responding) -# chi-mi300x-049: drained (persistent /nvme_home disk-full) -export SLURM_EXCLUDE_NODES="${SLURM_EXCLUDE_NODES:-chi-mi300x-033.ord.vultr.cpe.ice.amd.com,chi-mi300x-035.ord.vultr.cpe.ice.amd.com,chi-mi300x-037.ord.vultr.cpe.ice.amd.com,chi-mi300x-049.ord.vultr.cpe.ice.amd.com}" +# chi-mi300x-033-037: down* +# chi-mi300x-049: down +export SLURM_EXCLUDE_NODES="${SLURM_EXCLUDE_NODES:-chi-mi300x-033.ord.vultr.cpe.ice.amd.com,chi-mi300x-034.ord.vultr.cpe.ice.amd.com,chi-mi300x-035.ord.vultr.cpe.ice.amd.com,chi-mi300x-036.ord.vultr.cpe.ice.amd.com,chi-mi300x-037.ord.vultr.cpe.ice.amd.com,chi-mi300x-049.ord.vultr.cpe.ice.amd.com}" if [[ "$IS_MULTINODE" == "true" ]]; then set -x From 2b99dcb167788b26f8411dd31c4ce168de54478c Mon Sep 17 00:00:00 2001 From: simondanielsson Date:
Wed, 20 May 2026 22:09:12 +0200 Subject: [PATCH 83/98] fix: update excluded mi300 nodes Signed-off-by: simondanielsson --- runners/launch_mi300x-amds.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runners/launch_mi300x-amds.sh b/runners/launch_mi300x-amds.sh index 329e314b5..87a5764b4 100644 --- a/runners/launch_mi300x-amds.sh +++ b/runners/launch_mi300x-amds.sh @@ -25,9 +25,9 @@ scancel_sync() { } # Pin to the known-good mi300x nodes; others are unavailable: -# chi-mi300x-033-037: down* +# chi-mi300x-033,037: down* # chi-mi300x-049: down -export SLURM_EXCLUDE_NODES="${SLURM_EXCLUDE_NODES:-chi-mi300x-033.ord.vultr.cpe.ice.amd.com,chi-mi300x-034.ord.vultr.cpe.ice.amd.com,chi-mi300x-035.ord.vultr.cpe.ice.amd.com,chi-mi300x-036.ord.vultr.cpe.ice.amd.com,chi-mi300x-037.ord.vultr.cpe.ice.amd.com,chi-mi300x-049.ord.vultr.cpe.ice.amd.com}" +export SLURM_EXCLUDE_NODES="${SLURM_EXCLUDE_NODES:-chi-mi300x-033.ord.vultr.cpe.ice.amd.com,chi-mi300x-037.ord.vultr.cpe.ice.amd.com,chi-mi300x-049.ord.vultr.cpe.ice.amd.com}" if [[ "$IS_MULTINODE" == "true" ]]; then set -x From bbea3a7a817490bff0fb2b8239004f08200d8ee3 Mon Sep 17 00:00:00 2001 From: simondanielsson Date: Wed, 20 May 2026 22:12:01 +0200 Subject: [PATCH 84/98] fix: remove sudo from rm commands in mi325 runner Signed-off-by: simondanielsson --- runners/launch_mi325x-amds.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/runners/launch_mi325x-amds.sh b/runners/launch_mi325x-amds.sh index 636e3e75e..1c9b5f2c9 100644 --- a/runners/launch_mi325x-amds.sh +++ b/runners/launch_mi325x-amds.sh @@ -51,7 +51,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$GITHUB_WORKSPACE/benchmark_logs}" mkdir -p "$BENCHMARK_LOGS_DIR" - sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true + rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true cleanup_and_save_logs() { if [[ -n "${GITHUB_ACTIONS:-}" && -n "${JOB_ID:-}" ]]; then @@ -65,7 +65,7 @@ 
if [[ "$IS_MULTINODE" == "true" ]]; then tail -100 "$err_file" echo "========================" fi - sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true + rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true } trap cleanup_and_save_logs EXIT @@ -154,7 +154,7 @@ PY set -x echo "Canceled the slurm job $JOB_ID" - sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true + rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true else From ae7dca472b9465bd9145c22251ee05c43f157a0a Mon Sep 17 00:00:00 2001 From: simondanielsson Date: Wed, 20 May 2026 22:22:08 +0200 Subject: [PATCH 85/98] fix: update mi325 model path Signed-off-by: simondanielsson --- runners/launch_mi325x-amds.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_mi325x-amds.sh b/runners/launch_mi325x-amds.sh index 1c9b5f2c9..1e0b25ed5 100644 --- a/runners/launch_mi325x-amds.sh +++ b/runners/launch_mi325x-amds.sh @@ -39,7 +39,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then export SLURM_JOB_NAME="benchmark-${FRAMEWORK}.job" export MODEL_NAME=${MODEL##*/} - export MODEL_PATH="/nfsdata" + export MODEL_PATH="/nfsdata/sa/gharunner/gharunners/hf-hub-cache" export IBDEVICES="rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7" export MORI_RDMA_TC=104 From a7ae751f73207267cf18527a53570dcb2186042f Mon Sep 17 00:00:00 2001 From: simondanielsson Date: Wed, 20 May 2026 22:34:33 +0200 Subject: [PATCH 86/98] fix: use a more random port than 5000 for initial container creation barrier Signed-off-by: simondanielsson --- benchmarks/multi_node/amd_utils/server_vllm.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh index ecab81656..4f07b1257 100755 --- a/benchmarks/multi_node/amd_utils/server_vllm.sh +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -185,13 +185,14 @@ echo "DECODE_SERVER_CONFIG (after TP/EP/DP): $DECODE_SERVER_CONFIG" # Container Synchronization # 
============================================================================= -echo "Waiting at the container creation barrier on $host_name" +BARRIER_PORT="${BARRIER_PORT:-36380}" +echo "Waiting at the container creation barrier on $host_name (port $BARRIER_PORT)" python3 $WS_PATH/sync.py barrier \ --local-ip ${host_ip} \ - --local-port 5000 \ + --local-port ${BARRIER_PORT} \ --enable-port \ --node-ips ${IPADDRS} \ - --node-ports 5000 \ + --node-ports ${BARRIER_PORT} \ --wait-for-all-ports \ --timeout 600 From 97c34d15973d265048366466f611af4426248d49 Mon Sep 17 00:00:00 2001 From: simondanielsson Date: Thu, 21 May 2026 11:07:56 +0200 Subject: [PATCH 87/98] fix: add backup docker command if docker and sudo docker does not work Signed-off-by: simondanielsson --- benchmarks/multi_node/amd_utils/job.slurm | 29 ++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 22b1ebcb3..51e53cd1e 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -201,7 +201,34 @@ SELECTED_NODELIST_STR=$(echo "$SELECTED_NODES" | tr '\n' ',' | sed 's/,$//') # Docker privilege detection — evaluated per-node since group membership varies. # Exported as a snippet so every srun participant resolves it locally. -export DOCKER_CMD_DETECT='if docker ps &>/dev/null 2>&1; then DOCKER_CMD=docker; else DOCKER_CMD="sudo docker"; fi' +# +# Middle branch (sg-docker shim) is a workaround for stale supplementary groups +# on long-running GHA runners: the kernel-level group list is frozen at runner +# start time and predates the gharunner→docker group add, even though NSS now +# lists gharunner in docker. sg(1) is setuid root, so it can set the docker GID +# per invocation; we wrap docker in a PATH shim so xargs/etc. also work. +# Remove this branch once the runners are restarted. 
+export DOCKER_CMD_DETECT=' +if docker ps &>/dev/null 2>&1; then + DOCKER_CMD=docker +elif command -v sg >/dev/null 2>&1 && sg docker -c "docker ps" &>/dev/null 2>&1; then + SHIM_DIR="/tmp/docker-sg-shim-$$" + mkdir -p "$SHIM_DIR" + cat >"$SHIM_DIR/docker" <<"SHIM_EOF" +#!/bin/bash +args="" +for a in "$@"; do args+=$(printf "%q " "$a"); done +exec sg docker -c "/usr/bin/docker $args" +SHIM_EOF + chmod +x "$SHIM_DIR/docker" + export PATH="$SHIM_DIR:$PATH" + DOCKER_CMD=docker + echo "[docker-detect] using sg-docker shim on $(hostname)" >&2 +else + out=$(docker ps 2>&1) + echo "[docker-detect] docker ps failed on $(hostname) as $(id -un): $out" >&2 + DOCKER_CMD="sudo docker" +fi' # Update SLURM environment variables export SLURM_NNODES=$NUM_NODES From 95ac360e5d486e6cedd0a93974fb0bd76b6907e7 Mon Sep 17 00:00:00 2001 From: simondanielsson Date: Thu, 21 May 2026 11:23:26 +0200 Subject: [PATCH 88/98] fix: docker backup fix Signed-off-by: simondanielsson --- benchmarks/multi_node/amd_utils/job.slurm | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 51e53cd1e..acea89882 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -216,9 +216,12 @@ elif command -v sg >/dev/null 2>&1 && sg docker -c "docker ps" &>/dev/null 2>&1; mkdir -p "$SHIM_DIR" cat >"$SHIM_DIR/docker" <<"SHIM_EOF" #!/bin/bash -args="" -for a in "$@"; do args+=$(printf "%q " "$a"); done -exec sg docker -c "/usr/bin/docker $args" +F=$(mktemp) +printf "%s\0" "$@" > "$F" +sg docker -c "xargs -0 -a $F /usr/bin/docker" +rc=$? 
+rm -f "$F" +exit $rc SHIM_EOF chmod +x "$SHIM_DIR/docker" export PATH="$SHIM_DIR:$PATH" From 1c7e81c191a1af3540f5f967cc74401ac7a9eec9 Mon Sep 17 00:00:00 2001 From: simondanielsson Date: Thu, 21 May 2026 13:36:41 +0200 Subject: [PATCH 89/98] fix: remove manual install of older libbnxt-re version Signed-off-by: simondanielsson --- benchmarks/multi_node/amd_utils/setup_deps.sh | 50 ------------------- 1 file changed, 50 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh index 308acf3ac..159b3e16b 100644 --- a/benchmarks/multi_node/amd_utils/setup_deps.sh +++ b/benchmarks/multi_node/amd_utils/setup_deps.sh @@ -228,60 +228,10 @@ install_amd_quark() { _SETUP_INSTALLED+=("amd-quark") } -# --------------------------------------------------------------------------- -# 8. Broadcom bnxt RDMA userspace libraries (libbnxt_re verbs provider) -# Required on nodes with Broadcom Thor2 NICs (bcm5760x) when the base -# image does not ship the bnxt_re verbs provider. -# --------------------------------------------------------------------------- -install_bnxt_rdma() { - local existing - existing=$(find /usr/local/lib /usr/lib64 /usr/lib -name "libbnxt_re-rdmav*.so" 2>/dev/null) - if [[ -n "$existing" ]]; then - echo "[SETUP] Existing bnxt RDMA libraries found (will override):" - for lib in $existing; do - echo " $lib ($(ls -l "$lib" 2>/dev/null | awk '{print $5, $6, $7, $8}'))" - done - fi - - echo "[SETUP] Installing bnxt RDMA build dependencies..." - apt-get update -q -y && apt-get install -q -y \ - wget unzip autoconf automake libtool pkg-config \ - libibverbs-dev librdmacm-dev ibverbs-utils \ - && rm -rf /var/lib/apt/lists/* - - echo "[SETUP] Downloading and building Broadcom bnxt RDMA userspace libraries..." 
- ( - set -e - cd /tmp - wget -q https://docs.broadcom.com/docs-and-downloads/ethernet-network-adapters/NXE/Thor2/GCA1/bcm5760x_230.2.52.0a.zip - unzip -q bcm5760x_230.2.52.0a.zip - cd bcm5760x_230.2.52.0a/drivers_linux/bnxt_rocelib/ - results=$(find . -name "libbnxt*.tar.gz") - tar -xf $results - untar_dir=$(find . -maxdepth 1 -type d -name "libbnxt*" ! -name "*.tar.gz" | head -n 1) - cd "$untar_dir" - sh autogen.sh - ./configure - make -j"$(nproc)" - find /usr/lib64/ /usr/lib -name "libbnxt_re-rdmav*.so" -exec mv {} {}.inbox \; 2>/dev/null || true - make install all - echo /usr/local/lib >> /etc/ld.so.conf - ldconfig - cp -f bnxt_re.driver /etc/libibverbs.d/ - ) - rm -rf /tmp/bcm5760x_230.2.52.0a /tmp/bcm5760x_230.2.52.0a.zip - - if ! ibv_devices 2>/dev/null; then - echo "[SETUP] WARN: ibv_devices failed after bnxt install (may be OK if no Broadcom NIC on this node)" - fi - _SETUP_INSTALLED+=("bnxt-rdma") -} - # ============================================================================= # Run installers # ============================================================================= -install_bnxt_rdma install_recipe_deps install_amd_quark From e6ab6860f4d8c7989d427cf2ba76c5f8e88d968a Mon Sep 17 00:00:00 2001 From: haic0 Date: Sun, 28 Jun 2026 14:01:29 +0000 Subject: [PATCH 90/98] fix: preserve mi300x multinode launch diagnostics Signed-off-by: haic0 Co-authored-by: Cursor --- .../workflows/benchmark-multinode-tmpl.yml | 4 +- runners/launch_mi300x-amds.sh | 80 +++++++++++++++++-- 2 files changed, 77 insertions(+), 7 deletions(-) diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index f901b1ff7..5880de49b 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -271,7 +271,9 @@ jobs: uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: multinode_server_logs_${{ env.RESULT_FILENAME }} - path: 
multinode_server_logs.tar.gz + path: | + multinode_server_logs.tar.gz + benchmark_artifacts/ if-no-files-found: ignore - name: Upload agentic aggregated result diff --git a/runners/launch_mi300x-amds.sh b/runners/launch_mi300x-amds.sh index 87a5764b4..4075e6c71 100644 --- a/runners/launch_mi300x-amds.sh +++ b/runners/launch_mi300x-amds.sh @@ -51,11 +51,59 @@ if [[ "$IS_MULTINODE" == "true" ]]; then mkdir -p "$BENCHMARK_LOGS_DIR" sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true - cleanup_and_save_logs() { - if [[ -n "${GITHUB_ACTIONS:-}" && -n "${JOB_ID:-}" ]]; then - local art_dir="$GITHUB_WORKSPACE/benchmark_artifacts" - mkdir -p "$art_dir" + save_multinode_diagnostics() { + local art_dir="$GITHUB_WORKSPACE/benchmark_artifacts" + mkdir -p "$art_dir" + + cp -r "$BENCHMARK_LOGS_DIR"/submit_*.log "$art_dir/" 2>/dev/null || true + if [[ -n "${JOB_ID:-}" ]]; then cp -r "$BENCHMARK_LOGS_DIR"/slurm_job-${JOB_ID}.{out,err} "$art_dir/" 2>/dev/null || true + scontrol show job "$JOB_ID" > "$art_dir/scontrol_job_${JOB_ID}.txt" 2>&1 || true + sacct -j "$JOB_ID" --format=JobID,JobName,State,ExitCode,Elapsed,NodeList%80 > "$art_dir/sacct_job_${JOB_ID}.txt" 2>&1 || true + fi + + squeue -u "$USER" > "$art_dir/squeue_${USER}.txt" 2>&1 || true + { + echo "RUNNER_NAME=${RUNNER_NAME:-}" + echo "RUNNER_TYPE=${RUNNER_TYPE:-}" + echo "SLURM_ACCOUNT=${SLURM_ACCOUNT:-}" + echo "SLURM_PARTITION=${SLURM_PARTITION:-}" + echo "SLURM_EXCLUDE_NODES=${SLURM_EXCLUDE_NODES:-}" + echo "SCRIPT_NAME=${SCRIPT_NAME:-}" + echo "BENCHMARK_SUBDIR=${BENCHMARK_SUBDIR:-}" + echo "BENCHMARK_LOGS_DIR=${BENCHMARK_LOGS_DIR:-}" + echo "MODEL=${MODEL:-}" + echo "MODEL_NAME=${MODEL_NAME:-}" + echo "MODEL_PATH=${MODEL_PATH:-}" + echo "FRAMEWORK=${FRAMEWORK:-}" + echo "PRECISION=${PRECISION:-}" + echo "ISL=${ISL:-}" + echo "OSL=${OSL:-}" + echo "CONC_LIST=${CONC_LIST:-}" + echo "PREFILL_NODES=${PREFILL_NODES:-}" + echo "PREFILL_NUM_WORKERS=${PREFILL_NUM_WORKERS:-}" + echo "PREFILL_TP=${PREFILL_TP:-}" + echo 
"PREFILL_EP=${PREFILL_EP:-}" + echo "PREFILL_DP_ATTN=${PREFILL_DP_ATTN:-}" + echo "DECODE_NODES=${DECODE_NODES:-}" + echo "DECODE_NUM_WORKERS=${DECODE_NUM_WORKERS:-}" + echo "DECODE_TP=${DECODE_TP:-}" + echo "DECODE_EP=${DECODE_EP:-}" + echo "DECODE_DP_ATTN=${DECODE_DP_ATTN:-}" + echo "RUN_EVAL=${RUN_EVAL:-}" + echo "EVAL_ONLY=${EVAL_ONLY:-}" + echo "EVAL_CONC=${EVAL_CONC:-}" + echo "RESULT_FILENAME=${RESULT_FILENAME:-}" + } > "$art_dir/launcher_env.txt" 2>&1 || true + + if compgen -G "$art_dir/*" > /dev/null; then + tar -czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$art_dir" . 2>/dev/null || true + fi + } + + cleanup_and_save_logs() { + if [[ -n "${GITHUB_ACTIONS:-}" ]]; then + save_multinode_diagnostics fi local err_file="$BENCHMARK_LOGS_DIR/slurm_job-${JOB_ID:-unknown}.err" if [[ -s "$err_file" ]]; then @@ -73,7 +121,26 @@ if [[ "$IS_MULTINODE" == "true" ]]; then else BENCHMARK_SUBDIR="single_node" fi - JOB_ID=$(bash "benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME}") + SUBMIT_LOG="$BENCHMARK_LOGS_DIR/submit_${SCRIPT_NAME%.sh}.log" + bash "benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME}" > "$SUBMIT_LOG" 2>&1 + SUBMIT_RC=$? + cat "$SUBMIT_LOG" + JOB_ID=$(tail -n 1 "$SUBMIT_LOG" || true) + if [[ "$SUBMIT_RC" -ne 0 ]]; then + echo "ERROR: Failed to submit multi-node job via benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME}" + echo "=== Submit log ===" + cat "$SUBMIT_LOG" || true + echo "==================" + exit 1 + fi + + if [[ ! "$JOB_ID" =~ ^[0-9]+$ ]]; then + echo "ERROR: Expected numeric Slurm job id, got '$JOB_ID'" + echo "=== Submit log ===" + cat "$SUBMIT_LOG" || true + echo "==================" + exit 1 + fi LOG_FILE="$BENCHMARK_LOGS_DIR/slurm_job-${JOB_ID}.out" @@ -82,7 +149,8 @@ if [[ "$IS_MULTINODE" == "true" ]]; then while ! ls "$LOG_FILE" &>/dev/null; do if ! 
squeue -u "$USER" --noheader --format='%i' | grep -q "$JOB_ID"; then echo "ERROR: Job $JOB_ID failed before creating log file" - scontrol show job "$JOB_ID" + scontrol show job "$JOB_ID" || true + save_multinode_diagnostics exit 1 fi sleep 5 From 650edf3267d5656bf5564fc10121ba556fd032ac Mon Sep 17 00:00:00 2001 From: haic0 Date: Sun, 28 Jun 2026 14:33:49 +0000 Subject: [PATCH 91/98] fix: use valid mi300x slurm excludes Signed-off-by: haic0 Co-authored-by: Cursor --- runners/launch_mi300x-amds.sh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/runners/launch_mi300x-amds.sh b/runners/launch_mi300x-amds.sh index 4075e6c71..c7088bc8e 100644 --- a/runners/launch_mi300x-amds.sh +++ b/runners/launch_mi300x-amds.sh @@ -27,7 +27,7 @@ scancel_sync() { # Pin to the known-good mi300x nodes; others are unavailable: # chi-mi300x-033,037: down* # chi-mi300x-049: down -export SLURM_EXCLUDE_NODES="${SLURM_EXCLUDE_NODES:-chi-mi300x-033.ord.vultr.cpe.ice.amd.com,chi-mi300x-037.ord.vultr.cpe.ice.amd.com,chi-mi300x-049.ord.vultr.cpe.ice.amd.com}" +export SLURM_EXCLUDE_NODES="${SLURM_EXCLUDE_NODES:-chi-mi300x-033,chi-mi300x-037,chi-mi300x-049}" if [[ "$IS_MULTINODE" == "true" ]]; then set -x @@ -56,7 +56,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then mkdir -p "$art_dir" cp -r "$BENCHMARK_LOGS_DIR"/submit_*.log "$art_dir/" 2>/dev/null || true - if [[ -n "${JOB_ID:-}" ]]; then + if [[ "${JOB_ID:-}" =~ ^[0-9]+$ ]]; then cp -r "$BENCHMARK_LOGS_DIR"/slurm_job-${JOB_ID}.{out,err} "$art_dir/" 2>/dev/null || true scontrol show job "$JOB_ID" > "$art_dir/scontrol_job_${JOB_ID}.txt" 2>&1 || true sacct -j "$JOB_ID" --format=JobID,JobName,State,ExitCode,Elapsed,NodeList%80 > "$art_dir/sacct_job_${JOB_ID}.txt" 2>&1 || true @@ -106,6 +106,9 @@ if [[ "$IS_MULTINODE" == "true" ]]; then save_multinode_diagnostics fi local err_file="$BENCHMARK_LOGS_DIR/slurm_job-${JOB_ID:-unknown}.err" + if [[ ! 
"${JOB_ID:-}" =~ ^[0-9]+$ ]]; then + err_file="$BENCHMARK_LOGS_DIR/slurm_job-unknown.err" + fi if [[ -s "$err_file" ]]; then echo "=== Slurm job stderr ===" tail -100 "$err_file" @@ -125,7 +128,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then bash "benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME}" > "$SUBMIT_LOG" 2>&1 SUBMIT_RC=$? cat "$SUBMIT_LOG" - JOB_ID=$(tail -n 1 "$SUBMIT_LOG" || true) + JOB_ID=$(grep -E '^[0-9]+$' "$SUBMIT_LOG" | tail -n 1 || true) if [[ "$SUBMIT_RC" -ne 0 ]]; then echo "ERROR: Failed to submit multi-node job via benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME}" echo "=== Submit log ===" From aae9b4146b1ce2dcb0cdfff56c96ee5577b8611f Mon Sep 17 00:00:00 2001 From: haic0 Date: Mon, 29 Jun 2026 02:05:27 +0000 Subject: [PATCH 92/98] fix: choose mi300x nodes with workspace access Signed-off-by: haic0 Co-authored-by: Cursor --- benchmarks/multi_node/amd_utils/job.slurm | 15 ++++++-- runners/launch_mi300x-amds.sh | 44 +++++++++++++++++++++++ 2 files changed, 56 insertions(+), 3 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index acea89882..9b43e1a8e 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -227,10 +227,19 @@ SHIM_EOF export PATH="$SHIM_DIR:$PATH" DOCKER_CMD=docker echo "[docker-detect] using sg-docker shim on $(hostname)" >&2 +elif command -v sudo >/dev/null 2>&1 && sudo -n docker ps &>/dev/null 2>&1; then + DOCKER_CMD="sudo -n docker" + echo "[docker-detect] using passwordless sudo docker on $(hostname)" >&2 else - out=$(docker ps 2>&1) - echo "[docker-detect] docker ps failed on $(hostname) as $(id -un): $out" >&2 - DOCKER_CMD="sudo docker" + out=$(docker ps 2>&1 || true) + echo "[docker-detect] no usable docker command on $(hostname) as $(id -un)" >&2 + echo "[docker-detect] id: $(id)" >&2 + echo "[docker-detect] groups: $(groups 2>/dev/null || true)" >&2 + ls -l /var/run/docker.sock >&2 2>/dev/null || true + echo 
"[docker-detect] docker ps: $out" >&2 + echo "[docker-detect] sg docker: $(command -v sg >/dev/null 2>&1 && sg docker -c "docker ps" 2>&1 || true)" >&2 + echo "[docker-detect] sudo docker: $(command -v sudo >/dev/null 2>&1 && sudo -n docker ps 2>&1 || true)" >&2 + exit 1 fi' # Update SLURM environment variables diff --git a/runners/launch_mi300x-amds.sh b/runners/launch_mi300x-amds.sh index c7088bc8e..46d87c24a 100644 --- a/runners/launch_mi300x-amds.sh +++ b/runners/launch_mi300x-amds.sh @@ -69,6 +69,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then echo "SLURM_ACCOUNT=${SLURM_ACCOUNT:-}" echo "SLURM_PARTITION=${SLURM_PARTITION:-}" echo "SLURM_EXCLUDE_NODES=${SLURM_EXCLUDE_NODES:-}" + echo "NODELIST=${NODELIST:-}" echo "SCRIPT_NAME=${SCRIPT_NAME:-}" echo "BENCHMARK_SUBDIR=${BENCHMARK_SUBDIR:-}" echo "BENCHMARK_LOGS_DIR=${BENCHMARK_LOGS_DIR:-}" @@ -124,6 +125,49 @@ if [[ "$IS_MULTINODE" == "true" ]]; then else BENCHMARK_SUBDIR="single_node" fi + + if [[ -z "${NODELIST:-}" ]]; then + NUM_NODES_REQUIRED=$((PREFILL_NODES + DECODE_NODES)) + SUBMIT_HOST=$(hostname -s) + SELECTED_NODES=("$SUBMIT_HOST") + + echo "Building NODELIST with submit host first: ${SUBMIT_HOST}" + while IFS= read -r candidate; do + [[ -n "$candidate" ]] || continue + [[ "$candidate" == "$SUBMIT_HOST" ]] && continue + if [[ ",${SLURM_EXCLUDE_NODES}," == *",${candidate},"* ]]; then + continue + fi + + # Every selected node must see the GitHub workspace and log path: + # job.slurm mounts DI_REPO_DIR from this path, and Slurm may pick + # any selected node as BatchHost for stdout/stderr creation. 
+ if timeout 20s srun --nodes=1 --ntasks=1 --time=00:02:00 --partition="$SLURM_PARTITION" --nodelist="$candidate" \ + bash -lc "test -d '$GITHUB_WORKSPACE' && mkdir -p '$BENCHMARK_LOGS_DIR' && test -d '$BENCHMARK_LOGS_DIR'" >/dev/null 2>&1; then + SELECTED_NODES+=("$candidate") + echo "Added NODELIST candidate with workspace/log access: $candidate" + else + echo "Skipping NODELIST candidate without workspace/log access: $candidate" + fi + + if [[ "${#SELECTED_NODES[@]}" -ge "$NUM_NODES_REQUIRED" ]]; then + break + fi + done < <(sinfo -h -N -p "$SLURM_PARTITION" -t idle -o "%N" | sort -u) + + if [[ "${#SELECTED_NODES[@]}" -ne "$NUM_NODES_REQUIRED" ]]; then + echo "ERROR: Need ${NUM_NODES_REQUIRED} nodes for multinode job but found ${#SELECTED_NODES[@]} with workspace access." >&2 + echo "Selected nodes so far: ${SELECTED_NODES[*]}" >&2 + exit 1 + fi + + NODELIST=$(IFS=,; echo "${SELECTED_NODES[*]}") + export NODELIST + echo "Using generated NODELIST=${NODELIST}" + else + echo "Using caller-provided NODELIST=${NODELIST}" + fi + SUBMIT_LOG="$BENCHMARK_LOGS_DIR/submit_${SCRIPT_NAME%.sh}.log" bash "benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME}" > "$SUBMIT_LOG" 2>&1 SUBMIT_RC=$? From e49e6b50688bce1ca347e1e0fd1b8d9e899b8446 Mon Sep 17 00:00:00 2001 From: haic0 Date: Mon, 29 Jun 2026 02:34:14 +0000 Subject: [PATCH 93/98] fix: stage mi300x multinode workspace Signed-off-by: haic0 Co-authored-by: Cursor --- runners/launch_mi300x-amds.sh | 36 ++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/runners/launch_mi300x-amds.sh b/runners/launch_mi300x-amds.sh index 46d87c24a..d9394bdf3 100644 --- a/runners/launch_mi300x-amds.sh +++ b/runners/launch_mi300x-amds.sh @@ -139,15 +139,12 @@ if [[ "$IS_MULTINODE" == "true" ]]; then continue fi - # Every selected node must see the GitHub workspace and log path: - # job.slurm mounts DI_REPO_DIR from this path, and Slurm may pick - # any selected node as BatchHost for stdout/stderr creation. 
if timeout 20s srun --nodes=1 --ntasks=1 --time=00:02:00 --partition="$SLURM_PARTITION" --nodelist="$candidate" \ - bash -lc "test -d '$GITHUB_WORKSPACE' && mkdir -p '$BENCHMARK_LOGS_DIR' && test -d '$BENCHMARK_LOGS_DIR'" >/dev/null 2>&1; then + bash -lc "test -d /tmp && test -w /tmp" >/dev/null 2>&1; then SELECTED_NODES+=("$candidate") - echo "Added NODELIST candidate with workspace/log access: $candidate" + echo "Added NODELIST candidate with writable /tmp: $candidate" else - echo "Skipping NODELIST candidate without workspace/log access: $candidate" + echo "Skipping NODELIST candidate without writable /tmp: $candidate" fi if [[ "${#SELECTED_NODES[@]}" -ge "$NUM_NODES_REQUIRED" ]]; then @@ -166,10 +163,35 @@ if [[ "$IS_MULTINODE" == "true" ]]; then echo "Using generated NODELIST=${NODELIST}" else echo "Using caller-provided NODELIST=${NODELIST}" + IFS=',' read -r -a SELECTED_NODES <<< "$NODELIST" fi + SANITIZED_RUNNER=$(printf '%s' "${RUNNER_NAME:-runner}" | tr -c 'a-zA-Z0-9_.-' '_') + STAGED_WORKSPACE="/tmp/inferencex-${USER}-${GITHUB_RUN_ID:-manual}-${SANITIZED_RUNNER}" + JOB_BENCHMARK_LOGS_DIR="${STAGED_WORKSPACE}/benchmark_logs" + + for node in "${SELECTED_NODES[@]}"; do + echo "Staging workspace to ${node}:${STAGED_WORKSPACE}" + tar \ + --exclude='./benchmark_logs' \ + --exclude='./benchmark_artifacts' \ + --exclude='./multinode_server_logs.tar.gz' \ + --exclude='./.git' \ + -C "$GITHUB_WORKSPACE" -cf - . 
| \ + timeout 120s srun --nodes=1 --ntasks=1 --time=00:05:00 --partition="$SLURM_PARTITION" --nodelist="$node" \ + bash -lc "rm -rf '$STAGED_WORKSPACE' && mkdir -p '$STAGED_WORKSPACE' '$JOB_BENCHMARK_LOGS_DIR' && tar -C '$STAGED_WORKSPACE' -xf - && test -f '$STAGED_WORKSPACE/benchmarks/multi_node/amd_utils/job.slurm' && test -d '$JOB_BENCHMARK_LOGS_DIR'" + stage_status=("${PIPESTATUS[@]}") + if [[ "${stage_status[0]}" -ne 0 || "${stage_status[1]}" -ne 0 ]]; then + echo "ERROR: Failed to stage workspace on ${node}" >&2 + exit 1 + fi + done + + BENCHMARK_LOGS_DIR="$JOB_BENCHMARK_LOGS_DIR" + export BENCHMARK_LOGS_DIR + SUBMIT_LOG="$BENCHMARK_LOGS_DIR/submit_${SCRIPT_NAME%.sh}.log" - bash "benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME}" > "$SUBMIT_LOG" 2>&1 + GITHUB_WORKSPACE="$STAGED_WORKSPACE" bash "$STAGED_WORKSPACE/benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME}" > "$SUBMIT_LOG" 2>&1 SUBMIT_RC=$? cat "$SUBMIT_LOG" JOB_ID=$(grep -E '^[0-9]+$' "$SUBMIT_LOG" | tail -n 1 || true) From 8e0bf531ac07667c8e9e14dfbd6ce7ef7e8115d3 Mon Sep 17 00:00:00 2001 From: haic0 Date: Mon, 29 Jun 2026 02:56:39 +0000 Subject: [PATCH 94/98] fix: align mi300x multinode cleanup Signed-off-by: haic0 Co-authored-by: Cursor --- runners/launch_mi300x-amds.sh | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/runners/launch_mi300x-amds.sh b/runners/launch_mi300x-amds.sh index d9394bdf3..bd217bdb7 100644 --- a/runners/launch_mi300x-amds.sh +++ b/runners/launch_mi300x-amds.sh @@ -117,7 +117,11 @@ if [[ "$IS_MULTINODE" == "true" ]]; then fi sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true } - trap cleanup_and_save_logs EXIT + if [[ "${KEEP_LOGS:-0}" == "1" ]]; then + trap '' EXIT + else + trap cleanup_and_save_logs EXIT + fi SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_mi300x_${FRAMEWORK}.sh" if [[ "$FRAMEWORK" == "sglang-disagg" ]] || [[ "$FRAMEWORK" == "vllm-disagg" ]]; then @@ -150,10 +154,10 @@ if [[ "$IS_MULTINODE" == "true" ]]; then if [[ 
"${#SELECTED_NODES[@]}" -ge "$NUM_NODES_REQUIRED" ]]; then break fi - done < <(sinfo -h -N -p "$SLURM_PARTITION" -t idle -o "%N" | sort -u) + done < <(sinfo -h -N -p "$SLURM_PARTITION" -t idle,mix,alloc -o "%N" | sort -u) if [[ "${#SELECTED_NODES[@]}" -ne "$NUM_NODES_REQUIRED" ]]; then - echo "ERROR: Need ${NUM_NODES_REQUIRED} nodes for multinode job but found ${#SELECTED_NODES[@]} with workspace access." >&2 + echo "ERROR: Need ${NUM_NODES_REQUIRED} nodes for multinode job but found ${#SELECTED_NODES[@]} usable nodes with writable /tmp for staging." >&2 echo "Selected nodes so far: ${SELECTED_NODES[*]}" >&2 exit 1 fi @@ -243,8 +247,15 @@ if [[ "$IS_MULTINODE" == "true" ]]; then if [[ "${EVAL_ONLY:-false}" != "true" ]]; then cat > collect_latest_results.py <<'PY' import os, sys -sgl_job_dir, isl, osl, nexp, framework = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]), sys.argv[5] -for path in sorted([f"{sgl_job_dir}/logs/{name}/{framework}_isl_{isl}_osl_{osl}" for name in os.listdir(f"{sgl_job_dir}/logs/") if os.path.isdir(f"{sgl_job_dir}/logs/{name}/{framework}_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]: +job_dir, isl, osl, nexp, framework = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]), sys.argv[5] +logs_root = f"{job_dir}/logs/" +candidates = [] +if os.path.isdir(logs_root): + for name in os.listdir(logs_root): + subdir = f"{logs_root}{name}/{framework}_isl_{isl}_osl_{osl}" + if os.path.isdir(subdir): + candidates.append(subdir) +for path in sorted(candidates, key=os.path.getmtime, reverse=True)[:nexp]: print(path) PY From 39716d7b2151cbb6ada8daafa3cd935f5cd4d1b5 Mon Sep 17 00:00:00 2001 From: haic0 Date: Mon, 29 Jun 2026 03:37:45 +0000 Subject: [PATCH 95/98] fix: wait for mi300x staging nodes Signed-off-by: haic0 Co-authored-by: Cursor --- runners/launch_mi300x-amds.sh | 64 ++++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 24 deletions(-) diff --git a/runners/launch_mi300x-amds.sh 
b/runners/launch_mi300x-amds.sh index bd217bdb7..b206d97c7 100644 --- a/runners/launch_mi300x-amds.sh +++ b/runners/launch_mi300x-amds.sh @@ -133,34 +133,50 @@ if [[ "$IS_MULTINODE" == "true" ]]; then if [[ -z "${NODELIST:-}" ]]; then NUM_NODES_REQUIRED=$((PREFILL_NODES + DECODE_NODES)) SUBMIT_HOST=$(hostname -s) - SELECTED_NODES=("$SUBMIT_HOST") - - echo "Building NODELIST with submit host first: ${SUBMIT_HOST}" - while IFS= read -r candidate; do - [[ -n "$candidate" ]] || continue - [[ "$candidate" == "$SUBMIT_HOST" ]] && continue - if [[ ",${SLURM_EXCLUDE_NODES}," == *",${candidate},"* ]]; then - continue - fi - - if timeout 20s srun --nodes=1 --ntasks=1 --time=00:02:00 --partition="$SLURM_PARTITION" --nodelist="$candidate" \ - bash -lc "test -d /tmp && test -w /tmp" >/dev/null 2>&1; then - SELECTED_NODES+=("$candidate") - echo "Added NODELIST candidate with writable /tmp: $candidate" - else - echo "Skipping NODELIST candidate without writable /tmp: $candidate" + NODELIST_DISCOVERY_TIMEOUT="${NODELIST_DISCOVERY_TIMEOUT:-900}" + NODELIST_DISCOVERY_INTERVAL="${NODELIST_DISCOVERY_INTERVAL:-30}" + discovery_start=$(date +%s) + + while true; do + SELECTED_NODES=("$SUBMIT_HOST") + echo "Building NODELIST with submit host first: ${SUBMIT_HOST}" + + while IFS= read -r candidate; do + [[ -n "$candidate" ]] || continue + [[ "$candidate" == "$SUBMIT_HOST" ]] && continue + if [[ ",${SLURM_EXCLUDE_NODES}," == *",${candidate},"* ]]; then + continue + fi + + if timeout 20s srun --nodes=1 --ntasks=1 --time=00:02:00 --partition="$SLURM_PARTITION" --nodelist="$candidate" \ + bash -lc "test -d /tmp && test -w /tmp" >/dev/null 2>&1; then + SELECTED_NODES+=("$candidate") + echo "Added NODELIST candidate with writable /tmp: $candidate" + else + echo "Skipping NODELIST candidate without writable /tmp: $candidate" + fi + + if [[ "${#SELECTED_NODES[@]}" -ge "$NUM_NODES_REQUIRED" ]]; then + break + fi + done < <(sinfo -h -N -p "$SLURM_PARTITION" -t idle,mix,alloc -o "%N" | sort -u) + + 
if [[ "${#SELECTED_NODES[@]}" -eq "$NUM_NODES_REQUIRED" ]]; then + break fi - if [[ "${#SELECTED_NODES[@]}" -ge "$NUM_NODES_REQUIRED" ]]; then - break + now=$(date +%s) + elapsed=$((now - discovery_start)) + if (( elapsed >= NODELIST_DISCOVERY_TIMEOUT )); then + echo "ERROR: Need ${NUM_NODES_REQUIRED} nodes for multinode job but found ${#SELECTED_NODES[@]} usable nodes with writable /tmp for staging after ${elapsed}s." >&2 + echo "Selected nodes so far: ${SELECTED_NODES[*]}" >&2 + sinfo -N -p "$SLURM_PARTITION" -o "%N %T" >&2 || true + exit 1 fi - done < <(sinfo -h -N -p "$SLURM_PARTITION" -t idle,mix,alloc -o "%N" | sort -u) - if [[ "${#SELECTED_NODES[@]}" -ne "$NUM_NODES_REQUIRED" ]]; then - echo "ERROR: Need ${NUM_NODES_REQUIRED} nodes for multinode job but found ${#SELECTED_NODES[@]} usable nodes with writable /tmp for staging." >&2 - echo "Selected nodes so far: ${SELECTED_NODES[*]}" >&2 - exit 1 - fi + echo "Only found ${#SELECTED_NODES[@]}/${NUM_NODES_REQUIRED} usable nodes; retrying in ${NODELIST_DISCOVERY_INTERVAL}s..." 
+ sleep "$NODELIST_DISCOVERY_INTERVAL" + done NODELIST=$(IFS=,; echo "${SELECTED_NODES[*]}") export NODELIST From ce883b4024582b00327ac7bd53007a701b57650c Mon Sep 17 00:00:00 2001 From: haic0 Date: Mon, 29 Jun 2026 03:57:59 +0000 Subject: [PATCH 96/98] fix: probe all mi300x nodes for staging Signed-off-by: haic0 Co-authored-by: Cursor --- runners/launch_mi300x-amds.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/runners/launch_mi300x-amds.sh b/runners/launch_mi300x-amds.sh index b206d97c7..b79560da5 100644 --- a/runners/launch_mi300x-amds.sh +++ b/runners/launch_mi300x-amds.sh @@ -135,16 +135,18 @@ if [[ "$IS_MULTINODE" == "true" ]]; then SUBMIT_HOST=$(hostname -s) NODELIST_DISCOVERY_TIMEOUT="${NODELIST_DISCOVERY_TIMEOUT:-900}" NODELIST_DISCOVERY_INTERVAL="${NODELIST_DISCOVERY_INTERVAL:-30}" + MI300X_NODE_INVENTORY="${MI300X_NODE_INVENTORY:-chi-mi300x-034 chi-mi300x-035 chi-mi300x-036 chi-mi300x-043 chi-mi300x-049 chi-mi300x-054 chi-mi300x-057 chi-mi300x-058 chi-mi300x-121}" discovery_start=$(date +%s) while true; do SELECTED_NODES=("$SUBMIT_HOST") echo "Building NODELIST with submit host first: ${SUBMIT_HOST}" - while IFS= read -r candidate; do + for candidate in $MI300X_NODE_INVENTORY; do [[ -n "$candidate" ]] || continue [[ "$candidate" == "$SUBMIT_HOST" ]] && continue if [[ ",${SLURM_EXCLUDE_NODES}," == *",${candidate},"* ]]; then + echo "Skipping excluded NODELIST candidate: $candidate" continue fi @@ -159,7 +161,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then if [[ "${#SELECTED_NODES[@]}" -ge "$NUM_NODES_REQUIRED" ]]; then break fi - done < <(sinfo -h -N -p "$SLURM_PARTITION" -t idle,mix,alloc -o "%N" | sort -u) + done if [[ "${#SELECTED_NODES[@]}" -eq "$NUM_NODES_REQUIRED" ]]; then break @@ -170,6 +172,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then if (( elapsed >= NODELIST_DISCOVERY_TIMEOUT )); then echo "ERROR: Need ${NUM_NODES_REQUIRED} nodes for multinode job but found ${#SELECTED_NODES[@]} usable nodes with writable /tmp for 
staging after ${elapsed}s." >&2 echo "Selected nodes so far: ${SELECTED_NODES[*]}" >&2 + echo "MI300X node inventory checked: ${MI300X_NODE_INVENTORY}" >&2 sinfo -N -p "$SLURM_PARTITION" -o "%N %T" >&2 || true exit 1 fi From dfabd359e663e9215f3c8d26622b17c66737d9c7 Mon Sep 17 00:00:00 2001 From: haic0 Date: Mon, 29 Jun 2026 08:49:04 +0000 Subject: [PATCH 97/98] fix: update mi300x vllm router image Signed-off-by: haic0 Co-authored-by: Cursor --- benchmarks/multi_node/amd_utils/job.slurm | 2 +- runners/launch_mi300x-amds.sh | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 9b43e1a8e..3d2d9a1fd 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -329,7 +329,7 @@ SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_') export DOCKER_CONT_NAME="container_${ENGINE}_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" # vLLM external router container -VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-vllm/vllm-router:nightly-20260511-e667ebb}" +VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-vllm/vllm-router:nightly-20260629-e667ebb}" ROUTER_CONT_NAME="router_vllm_${SANITIZED_USER}_${SLURM_JOB_ID}" export RUN_FILE_FULL="$WS_PATH/${RUN_FILE}" diff --git a/runners/launch_mi300x-amds.sh b/runners/launch_mi300x-amds.sh index b79560da5..b79b5067b 100644 --- a/runners/launch_mi300x-amds.sh +++ b/runners/launch_mi300x-amds.sh @@ -40,6 +40,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then export MODEL_PATH="/raid/hf-hub-cache" export IBDEVICES="bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7" export MORI_RDMA_TC=104 + export VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-vllm/vllm-router:nightly-20260629-e667ebb}" export MODEL_DIR="$MODEL_PATH" export GPUS_PER_NODE=8 From 7df461227872b207c44003d78c7d951f79934ca6 Mon Sep 17 00:00:00 2001 From: haic0 Date: Mon, 29 Jun 2026 09:48:49 +0000 Subject: [PATCH 98/98] fix: reuse 
mi300x slurm allocation Signed-off-by: haic0 Co-authored-by: Cursor --- runners/launch_mi300x-amds.sh | 247 +++++++++++++++++++++++++--------- 1 file changed, 187 insertions(+), 60 deletions(-) diff --git a/runners/launch_mi300x-amds.sh b/runners/launch_mi300x-amds.sh index b79b5067b..ee5aa4b94 100644 --- a/runners/launch_mi300x-amds.sh +++ b/runners/launch_mi300x-amds.sh @@ -107,6 +107,10 @@ if [[ "$IS_MULTINODE" == "true" ]]; then if [[ -n "${GITHUB_ACTIONS:-}" ]]; then save_multinode_diagnostics fi + if [[ "${STAGING_ALLOCATION_ID:-}" =~ ^[0-9]+$ ]] && [[ -n "$(squeue -j "$STAGING_ALLOCATION_ID" --noheader 2>/dev/null)" ]]; then + scancel "$STAGING_ALLOCATION_ID" || true + fi + [[ -n "${WORKSPACE_ARCHIVE:-}" ]] && rm -f "$WORKSPACE_ARCHIVE" 2>/dev/null || true local err_file="$BENCHMARK_LOGS_DIR/slurm_job-${JOB_ID:-unknown}.err" if [[ ! "${JOB_ID:-}" =~ ^[0-9]+$ ]]; then err_file="$BENCHMARK_LOGS_DIR/slurm_job-unknown.err" @@ -131,81 +135,180 @@ if [[ "$IS_MULTINODE" == "true" ]]; then BENCHMARK_SUBDIR="single_node" fi + NUM_NODES_REQUIRED=$((PREFILL_NODES + DECODE_NODES)) + SUBMIT_HOST=$(hostname -s) + SANITIZED_RUNNER=$(printf '%s' "${RUNNER_NAME:-runner}" | tr -c 'a-zA-Z0-9_.-' '_') + STAGED_WORKSPACE="/tmp/inferencex-${USER}-${GITHUB_RUN_ID:-manual}-${SANITIZED_RUNNER}" + JOB_BENCHMARK_LOGS_DIR="${STAGED_WORKSPACE}/benchmark_logs" + WORKSPACE_ARCHIVE="${RUNNER_TEMP:-/tmp}/inferencex-workspace-${USER}-${GITHUB_RUN_ID:-manual}-${SANITIZED_RUNNER}.tar" + + NODELIST_DISCOVERY_TIMEOUT="${NODELIST_DISCOVERY_TIMEOUT:-900}" + NODELIST_DISCOVERY_INTERVAL="${NODELIST_DISCOVERY_INTERVAL:-30}" + NODELIST_ALLOCATION_TIMEOUT="${NODELIST_ALLOCATION_TIMEOUT:-120}" + NODELIST_ALLOC_IMMEDIATE="${NODELIST_ALLOC_IMMEDIATE:-${NODELIST_PROBE_IMMEDIATE:-30}}" + STAGING_TIMEOUT_SECONDS="${STAGING_TIMEOUT_SECONDS:-600}" + STAGING_RETRIES="${STAGING_RETRIES:-3}" + STAGING_RETRY_DELAY="${STAGING_RETRY_DELAY:-20}" + MI300X_ALLOCATION_TIME="${MI300X_ALLOCATION_TIME:-08:30:00}" + 
MI300X_NODE_INVENTORY="${MI300X_NODE_INVENTORY:-chi-mi300x-034 chi-mi300x-035 chi-mi300x-036 chi-mi300x-043 chi-mi300x-049 chi-mi300x-054 chi-mi300x-057 chi-mi300x-058 chi-mi300x-121}" + + echo "Creating reusable workspace archive at ${WORKSPACE_ARCHIVE}" + rm -f "$WORKSPACE_ARCHIVE" + tar \ + --exclude='./benchmark_logs' \ + --exclude='./benchmark_artifacts' \ + --exclude='./multinode_server_logs.tar.gz' \ + --exclude='./.git' \ + -C "$GITHUB_WORKSPACE" -cf "$WORKSPACE_ARCHIVE" . + + allocate_nodes() { + local requested_nodelist=$1 + local alloc_output + local alloc_id + local errexit_was_set=0 + + [[ $- == *e* ]] && errexit_was_set=1 + + echo "Requesting exclusive MI300X allocation from NODELIST=${requested_nodelist}" + set +e + alloc_output=$(timeout "${NODELIST_ALLOCATION_TIMEOUT}s" salloc \ + --immediate="$NODELIST_ALLOC_IMMEDIATE" \ + --partition="$SLURM_PARTITION" \ + --account="$SLURM_ACCOUNT" \ + --exclusive \ + --gres="gpu:${GPUS_PER_NODE}" \ + -N "$NUM_NODES_REQUIRED" \ + -n "$NUM_NODES_REQUIRED" \ + --nodelist="$requested_nodelist" \ + --time="$MI300X_ALLOCATION_TIME" \ + --no-shell \ + --job-name="$RUNNER_NAME" 2>&1) + local alloc_rc=$? + if [[ "$errexit_was_set" -eq 1 ]]; then + set -e + else + set +e + fi + printf '%s\n' "$alloc_output" + + if [[ "$alloc_rc" -ne 0 ]]; then + return 1 + fi + + alloc_id=$(awk '/Granted job allocation/ {print $NF}' <<< "$alloc_output" | tail -n 1) + if [[ ! 
"$alloc_id" =~ ^[0-9]+$ ]]; then + echo "ERROR: Failed to parse allocation id from salloc output" >&2 + return 1 + fi + + STAGING_ALLOCATION_ID="$alloc_id" + export STAGING_ALLOCATION_ID + return 0 + } + + stage_workspace_local() { + echo "Staging workspace locally on ${SUBMIT_HOST}:${STAGED_WORKSPACE}" + rm -rf "$STAGED_WORKSPACE" + mkdir -p "$STAGED_WORKSPACE" "$JOB_BENCHMARK_LOGS_DIR" + tar -C "$STAGED_WORKSPACE" -xf "$WORKSPACE_ARCHIVE" + test -f "$STAGED_WORKSPACE/benchmarks/multi_node/amd_utils/job.slurm" + test -d "$JOB_BENCHMARK_LOGS_DIR" + } + + stage_workspace_remote_once() { + local node=$1 + timeout "${STAGING_TIMEOUT_SECONDS}s" srun --jobid="$STAGING_ALLOCATION_ID" --overlap \ + --nodes=1 --ntasks=1 --nodelist="$node" \ + bash -lc "rm -rf '$STAGED_WORKSPACE' && mkdir -p '$STAGED_WORKSPACE' '$JOB_BENCHMARK_LOGS_DIR' && tar -C '$STAGED_WORKSPACE' -xf - && test -f '$STAGED_WORKSPACE/benchmarks/multi_node/amd_utils/job.slurm' && test -d '$JOB_BENCHMARK_LOGS_DIR'" \ + < "$WORKSPACE_ARCHIVE" + } + + stage_workspace_remote() { + local node=$1 + local attempt + + for ((attempt = 1; attempt <= STAGING_RETRIES; attempt++)); do + echo "Staging workspace to ${node}:${STAGED_WORKSPACE} (attempt ${attempt}/${STAGING_RETRIES})" + if stage_workspace_remote_once "$node"; then + return 0 + fi + echo "WARNING: Failed to stage workspace on ${node} (attempt ${attempt}/${STAGING_RETRIES})" >&2 + sleep "$STAGING_RETRY_DELAY" + done + + return 1 + } + if [[ -z "${NODELIST:-}" ]]; then - NUM_NODES_REQUIRED=$((PREFILL_NODES + DECODE_NODES)) - SUBMIT_HOST=$(hostname -s) - NODELIST_DISCOVERY_TIMEOUT="${NODELIST_DISCOVERY_TIMEOUT:-900}" - NODELIST_DISCOVERY_INTERVAL="${NODELIST_DISCOVERY_INTERVAL:-30}" - MI300X_NODE_INVENTORY="${MI300X_NODE_INVENTORY:-chi-mi300x-034 chi-mi300x-035 chi-mi300x-036 chi-mi300x-043 chi-mi300x-049 chi-mi300x-054 chi-mi300x-057 chi-mi300x-058 chi-mi300x-121}" + CANDIDATE_NODES=() + for candidate in $MI300X_NODE_INVENTORY; do + [[ -n "$candidate" ]] || 
continue + if [[ ",${SLURM_EXCLUDE_NODES}," == *",${candidate},"* ]]; then + echo "Skipping excluded MI300X allocation candidate: $candidate" + continue + fi + CANDIDATE_NODES+=("$candidate") + done + + if [[ "${#CANDIDATE_NODES[@]}" -lt "$NUM_NODES_REQUIRED" ]]; then + echo "ERROR: Need ${NUM_NODES_REQUIRED} MI300X nodes but only ${#CANDIDATE_NODES[@]} candidates remain after exclusions." >&2 + echo "MI300X node inventory checked: ${MI300X_NODE_INVENTORY}" >&2 + echo "SLURM_EXCLUDE_NODES=${SLURM_EXCLUDE_NODES:-}" >&2 + exit 1 + fi + + REQUESTED_NODELIST=$(IFS=,; echo "${CANDIDATE_NODES[*]}") discovery_start=$(date +%s) while true; do - SELECTED_NODES=("$SUBMIT_HOST") - echo "Building NODELIST with submit host first: ${SUBMIT_HOST}" - - for candidate in $MI300X_NODE_INVENTORY; do - [[ -n "$candidate" ]] || continue - [[ "$candidate" == "$SUBMIT_HOST" ]] && continue - if [[ ",${SLURM_EXCLUDE_NODES}," == *",${candidate},"* ]]; then - echo "Skipping excluded NODELIST candidate: $candidate" - continue - fi - - if timeout 20s srun --nodes=1 --ntasks=1 --time=00:02:00 --partition="$SLURM_PARTITION" --nodelist="$candidate" \ - bash -lc "test -d /tmp && test -w /tmp" >/dev/null 2>&1; then - SELECTED_NODES+=("$candidate") - echo "Added NODELIST candidate with writable /tmp: $candidate" - else - echo "Skipping NODELIST candidate without writable /tmp: $candidate" - fi - - if [[ "${#SELECTED_NODES[@]}" -ge "$NUM_NODES_REQUIRED" ]]; then - break - fi - done - - if [[ "${#SELECTED_NODES[@]}" -eq "$NUM_NODES_REQUIRED" ]]; then + if allocate_nodes "$REQUESTED_NODELIST"; then break fi now=$(date +%s) elapsed=$((now - discovery_start)) if (( elapsed >= NODELIST_DISCOVERY_TIMEOUT )); then - echo "ERROR: Need ${NUM_NODES_REQUIRED} nodes for multinode job but found ${#SELECTED_NODES[@]} usable nodes with writable /tmp for staging after ${elapsed}s." 
>&2 - echo "Selected nodes so far: ${SELECTED_NODES[*]}" >&2 + echo "ERROR: Failed to allocate ${NUM_NODES_REQUIRED} exclusive MI300X nodes after ${elapsed}s." >&2 + echo "Requested NODELIST=${REQUESTED_NODELIST}" >&2 echo "MI300X node inventory checked: ${MI300X_NODE_INVENTORY}" >&2 sinfo -N -p "$SLURM_PARTITION" -o "%N %T" >&2 || true exit 1 fi - echo "Only found ${#SELECTED_NODES[@]}/${NUM_NODES_REQUIRED} usable nodes; retrying in ${NODELIST_DISCOVERY_INTERVAL}s..." + echo "Allocation not available yet; retrying in ${NODELIST_DISCOVERY_INTERVAL}s..." sleep "$NODELIST_DISCOVERY_INTERVAL" done - NODELIST=$(IFS=,; echo "${SELECTED_NODES[*]}") - export NODELIST - echo "Using generated NODELIST=${NODELIST}" else echo "Using caller-provided NODELIST=${NODELIST}" IFS=',' read -r -a SELECTED_NODES <<< "$NODELIST" + if ! allocate_nodes "$NODELIST"; then + echo "ERROR: Failed to allocate caller-provided NODELIST=${NODELIST}" >&2 + exit 1 + fi fi - SANITIZED_RUNNER=$(printf '%s' "${RUNNER_NAME:-runner}" | tr -c 'a-zA-Z0-9_.-' '_') - STAGED_WORKSPACE="/tmp/inferencex-${USER}-${GITHUB_RUN_ID:-manual}-${SANITIZED_RUNNER}" - JOB_BENCHMARK_LOGS_DIR="${STAGED_WORKSPACE}/benchmark_logs" + ALLOC_NODELIST=$(squeue -h -j "$STAGING_ALLOCATION_ID" -o '%N') + if [[ -z "$ALLOC_NODELIST" ]]; then + echo "ERROR: Failed to resolve nodelist for allocation ${STAGING_ALLOCATION_ID}" >&2 + exit 1 + fi + mapfile -t SELECTED_NODES < <(scontrol show hostnames "$ALLOC_NODELIST") + NODELIST=$(IFS=,; echo "${SELECTED_NODES[*]}") + echo "Using allocated NODELIST=${NODELIST} from allocation ${STAGING_ALLOCATION_ID}" + + export NODELIST + export SLURM_REUSE_JOBID="$STAGING_ALLOCATION_ID" + export SLURM_JOB_NODELIST="$ALLOC_NODELIST" + export SLURM_NODELIST="$ALLOC_NODELIST" + stage_workspace_local for node in "${SELECTED_NODES[@]}"; do - echo "Staging workspace to ${node}:${STAGED_WORKSPACE}" - tar \ - --exclude='./benchmark_logs' \ - --exclude='./benchmark_artifacts' \ - 
--exclude='./multinode_server_logs.tar.gz' \ - --exclude='./.git' \ - -C "$GITHUB_WORKSPACE" -cf - . | \ - timeout 120s srun --nodes=1 --ntasks=1 --time=00:05:00 --partition="$SLURM_PARTITION" --nodelist="$node" \ - bash -lc "rm -rf '$STAGED_WORKSPACE' && mkdir -p '$STAGED_WORKSPACE' '$JOB_BENCHMARK_LOGS_DIR' && tar -C '$STAGED_WORKSPACE' -xf - && test -f '$STAGED_WORKSPACE/benchmarks/multi_node/amd_utils/job.slurm' && test -d '$JOB_BENCHMARK_LOGS_DIR'" - stage_status=("${PIPESTATUS[@]}") - if [[ "${stage_status[0]}" -ne 0 || "${stage_status[1]}" -ne 0 ]]; then + if [[ "$node" == "$SUBMIT_HOST" ]]; then + echo "Skipping remote staging for local allocation node ${node}" + continue + fi + if ! stage_workspace_remote "$node"; then echo "ERROR: Failed to stage workspace on ${node}" >&2 exit 1 fi @@ -236,26 +339,50 @@ if [[ "$IS_MULTINODE" == "true" ]]; then fi LOG_FILE="$BENCHMARK_LOGS_DIR/slurm_job-${JOB_ID}.out" + REUSE_JOB_PID_FILE="$BENCHMARK_LOGS_DIR/slurm_job-${JOB_ID}.pid" sleep 10 while ! ls "$LOG_FILE" &>/dev/null; do - if ! squeue -u "$USER" --noheader --format='%i' | grep -q "$JOB_ID"; then - echo "ERROR: Job $JOB_ID failed before creating log file" - scontrol show job "$JOB_ID" || true - save_multinode_diagnostics - exit 1 + if [[ -n "${SLURM_REUSE_JOBID:-}" ]]; then + if [[ ! -f "$REUSE_JOB_PID_FILE" ]]; then + echo "ERROR: Missing reused job pid file $REUSE_JOB_PID_FILE before log file was created" + save_multinode_diagnostics + exit 1 + fi + REUSE_JOB_PID=$(<"$REUSE_JOB_PID_FILE") + if [[ ! "$REUSE_JOB_PID" =~ ^[0-9]+$ ]] || ! kill -0 "$REUSE_JOB_PID" 2>/dev/null; then + echo "ERROR: Reused job $JOB_ID exited before creating log file" + save_multinode_diagnostics + exit 1 + fi + else + if ! 
squeue -u "$USER" --noheader --format='%i' | grep -q "$JOB_ID"; then + echo "ERROR: Job $JOB_ID failed before creating log file" + scontrol show job "$JOB_ID" || true + save_multinode_diagnostics + exit 1 + fi fi sleep 5 done set +x - ( - while squeue -u $USER --noheader --format='%i' | grep -q "$JOB_ID"; do - sleep 10 - done - ) & + if [[ -n "${SLURM_REUSE_JOBID:-}" ]]; then + REUSE_JOB_PID=$(<"$REUSE_JOB_PID_FILE") + ( + while kill -0 "$REUSE_JOB_PID" 2>/dev/null; do + sleep 10 + done + ) & + else + ( + while squeue -u $USER --noheader --format='%i' | grep -q "$JOB_ID"; do + sleep 10 + done + ) & + fi POLL_PID=$! tail -F -s 2 -n+1 "$LOG_FILE" --pid=$POLL_PID 2>/dev/null