From 9bd6542194da608c4bc5c356bbf04078755c9116 Mon Sep 17 00:00:00 2001
From: aoshen02
Date: Wed, 17 Jun 2026 05:32:41 +0000
Subject: [PATCH] fix vllm prefix caching defaults

---
 tests/utils/test_vllm_engine.py         | 19 +++++++++++++++++++
 vime/backends/vllm_utils/vllm_engine.py |  3 +++
 2 files changed, 22 insertions(+)

diff --git a/tests/utils/test_vllm_engine.py b/tests/utils/test_vllm_engine.py
index e6f5400e..994b8b11 100644
--- a/tests/utils/test_vllm_engine.py
+++ b/tests/utils/test_vllm_engine.py
@@ -227,6 +227,25 @@ def test_build_vllm_cmd_adds_sleep_mode_only_for_offload_rollout(vllm_args):
     assert vllm_args.vllm_enable_sleep_mode is True
 
 
+@pytest.mark.unit
+def test_build_vllm_cmd_enables_prefix_cache_by_default(vllm_args):
+    server_args = mod._compute_server_args(vllm_args, rank=0, dist_init_addr=None, host="127.0.0.1", port=8000)
+
+    cmd, _ = mod.build_vllm_cmd_and_env(server_args)
+
+    assert "--enable-prefix-caching" in cmd
+
+
+@pytest.mark.unit
+def test_build_vllm_cmd_honors_explicit_prefix_cache_disable(vllm_args):
+    vllm_args.vllm_enable_prefix_caching = False
+    server_args = mod._compute_server_args(vllm_args, rank=0, dist_init_addr=None, host="127.0.0.1", port=8000)
+
+    cmd, _ = mod.build_vllm_cmd_and_env(server_args)
+
+    assert "--enable-prefix-caching" not in cmd
+
+
 @pytest.mark.unit
 def test_build_vllm_cmd_does_not_infer_sleep_mode_from_colocate(vllm_args):
     vllm_args.colocate = True
diff --git a/vime/backends/vllm_utils/vllm_engine.py b/vime/backends/vllm_utils/vllm_engine.py
index ce4790f8..39e32ca9 100644
--- a/vime/backends/vllm_utils/vllm_engine.py
+++ b/vime/backends/vllm_utils/vllm_engine.py
@@ -400,6 +400,9 @@ def build_vllm_cmd_and_env(server_args: dict[str, Any]) -> tuple[list[str], dict
     if getattr(args, "use_rollout_routing_replay", False):
         cmd += ["--enable-return-routed-experts"]
 
+    if getattr(args, "vllm_enable_prefix_caching", True) is not False:
+        cmd += ["--enable-prefix-caching"]
+
     # gpu_memory_utilization: no vime-forced default. In colocate, training and rollout do not
     # occupy the GPU simultaneously (sleep/offload cycles), so vLLM's own default is fine. A user
     # value passed via --vllm-gpu-memory-utilization is auto-forwarded by _forward_vllm_cli_args.