From 5d2304db720fb0acc7e568f787cdddd32e77c27e Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Thu, 25 Jun 2026 11:35:31 +0100 Subject: [PATCH 1/5] Add vLLM Offline Backend for batch processing Implements standalone offline backend using vLLM's LLM class for micro-batching. Adapted to main's architecture without VLLMBackendBase, using main's import patterns (lazy loading via guidellm.extras, utils.audio/vision). Features: - Batch processing with configurable batch_size (default: 32) - Chat template support (plain, default-template, custom Jinja2) - Multimodal data handling (image/audio) - Single-process execution for batch coordination - Compatible with vLLM 0.21.0+ Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- docs/guides/backends.md | 7 +- docs/guides/vllm-offline-backend.md | 235 ++++++ src/guidellm/backends/openai/http.py | 2 +- src/guidellm/backends/vllm_python/__init__.py | 3 +- src/guidellm/backends/vllm_python/offline.py | 770 ++++++++++++++++++ tests/unit/backends/test_backend.py | 19 + 6 files changed, 1032 insertions(+), 4 deletions(-) create mode 100644 docs/guides/vllm-offline-backend.md create mode 100644 src/guidellm/backends/vllm_python/offline.py diff --git a/docs/guides/backends.md b/docs/guides/backends.md index a6bf804f1..f8506c016 100644 --- a/docs/guides/backends.md +++ b/docs/guides/backends.md @@ -8,9 +8,12 @@ GuideLLM is designed to work with OpenAI-compatible HTTP servers, enabling seaml GuideLLM supports OpenAI-compatible HTTP servers, which provide a standardized API for interacting with LLMs. This includes popular implementations such as [vLLM](https://github.com/vllm-project/vllm) and [Text Generation Inference (TGI)](https://github.com/huggingface/text-generation-inference). These servers allow GuideLLM to perform evaluations, benchmarks, and optimizations with minimal setup. 
-### vLLM Python backend +### vLLM Python Backends -GuideLLM supports running inference in the same process using the **vLLM Python backend** (`vllm_python`). This backend runs inference in the same process as GuideLLM's using vLLM's python API (AsyncLLMEngine), without an HTTP server. For setup, installation options (container, existing vLLM, pip), and examples, see [vLLM Python backend](vllm-python-backend.md). +GuideLLM supports running inference in the same process using vLLM's Python API, without an HTTP server: + +- **vLLM Python backend** (`vllm_python`): Uses vLLM's AsyncLLMEngine for async streaming inference. For setup and examples, see [vLLM Python backend](vllm-python-backend.md). +- **vLLM Offline backend** (`vllm_offline`): Uses vLLM's LLM class for batch processing with micro-batching. Designed for offline benchmarking where batch efficiency is prioritized over streaming latency. For setup and examples, see [vLLM Offline backend](vllm-offline-backend.md). ## Examples for Spinning Up Compatible Servers diff --git a/docs/guides/vllm-offline-backend.md b/docs/guides/vllm-offline-backend.md new file mode 100644 index 000000000..7383f8256 --- /dev/null +++ b/docs/guides/vllm-offline-backend.md @@ -0,0 +1,235 @@ +# vLLM Offline Backend + +The **vLLM Offline backend** (`vllm_offline`) provides synchronous batch processing using vLLM's `LLM` class. It collects requests into micro-batches and processes them together for maximum throughput, making it ideal for offline benchmarking scenarios where batching efficiency is prioritized over per-request latency. 
+ +## When to Use the Offline Backend + +**Use `vllm_offline` when:** + +- Running offline batch inference on large datasets +- Maximizing throughput is more important than individual request latency +- You have a known dataset size and want optimal batch processing +- Benchmarking pure model throughput without HTTP overhead +- Processing datasets for evaluation or ETL pipelines + +**Use `vllm_python` (AsyncLLMEngine) when:** + +- You need streaming token-by-token responses +- Simulating production-like continuous request arrival +- Measuring realistic latency characteristics +- You need async request handling + +**Use the OpenAI HTTP backend when:** + +- Testing against a production vLLM server +- Measuring end-to-end latency including network overhead +- Benchmarking a deployed service + +## Installation + +The offline backend requires vLLM to be installed. See the [vLLM Python Backend installation guide](vllm-python-backend.md#installation) for recommended installation methods. + +## Basic Usage + +```bash +guidellm benchmark run \ + --backend vllm_offline \ + --model "Qwen/Qwen3-0.6B" \ + --backend-kwargs '{"batch_size": 64}' \ + --data "prompt_tokens=256,output_tokens=128" \ + --max-requests 1000 +``` + +## Backend Options + +Configure the offline backend via `--backend-kwargs` with JSON: + +```bash +--backend-kwargs '{ + "model": "meta-llama/Llama-2-7b-hf", + "batch_size": 64, + "vllm_config": { + "tensor_parallel_size": 2, + "gpu_memory_utilization": 0.9 + } +}' +``` + +### Key Parameters + +- **`model`** (required): Model identifier or path +- **`batch_size`**: Number of requests to collect before processing (default: 32) + - Larger batches = higher throughput but more latency + - Smaller batches = lower latency but less throughput + - Recommended: 32-128 for most use cases +- **`vllm_config`**: Dictionary of vLLM EngineArgs parameters + - `tensor_parallel_size`: Number of GPUs for tensor parallelism + - `gpu_memory_utilization`: Fraction of GPU memory to use 
(0.0-1.0) + - `max_model_len`: Maximum sequence length + - See [vLLM Engine Arguments](https://docs.vllm.ai/en/stable/configuration/engine_args/) for all options (use Python parameter names) +- **`request_format`**: How to format prompts + - `"default-template"` (default): Use tokenizer's chat template + - `"plain"`: No chat template, plain text concatenation + - Path or string: Custom Jinja2 chat template +- **`image_placeholder`**: Placeholder for images (default: `"<image>"`) +- **`audio_placeholder`**: Placeholder for audio (default: `"<|audio|>"`) + +## How Micro-Batching Works + +The offline backend uses a **micro-batching** approach: + +1. **Buffering**: As requests arrive via `resolve()`, they're added to a buffer +2. **Batch Detection**: When buffer reaches `batch_size`, trigger processing +3. **Batch Processing**: Process entire batch with one `LLM.generate()` call +4. **Result Distribution**: Return cached results to waiting requests +5. **Flush on Shutdown**: Remaining requests processed when backend shuts down + +This gives you 10-100x fewer model forward passes compared to per-request processing while working within GuideLLM's scheduler architecture. 
+ +## Examples + +### Basic Throughput Benchmark + +```bash +guidellm benchmark run \ + --backend vllm_offline \ + --model "Qwen/Qwen3-0.6B" \ + --data "prompt_tokens=512,output_tokens=256" \ + --profile throughput \ + --max-seconds 60 +``` + +### Large Batch Processing + +```bash +guidellm benchmark run \ + --backend vllm_offline \ + --backend-kwargs '{"batch_size": 128}' \ + --model "meta-llama/Llama-2-7b-hf" \ + --data path/to/dataset.csv \ + --max-requests -1 # Process entire dataset +``` + +### Multi-GPU Configuration + +```bash +guidellm benchmark run \ + --backend vllm_offline \ + --backend-kwargs '{ + "model": "meta-llama/Llama-2-70b-hf", + "batch_size": 64, + "vllm_config": { + "tensor_parallel_size": 4, + "gpu_memory_utilization": 0.95 + } + }' \ + --data "prompt_tokens=1024,output_tokens=512" +``` + +### HuggingFace Dataset + +```bash +guidellm benchmark run \ + --backend vllm_offline \ + --model "meta-llama/Llama-2-7b-hf" \ + --backend-kwargs '{"batch_size": 32}' \ + --data "hf:cnn_dailymail" \ + --data-args '{"name": "3.0.0"}' \ + --data-column-mapper '{"column_mappings": {"text_column": "article"}}' +``` + +## Performance Tuning + +### Choosing Batch Size + +| Batch Size | Throughput | Latency | Memory | When to Use | +| ---------- | ---------- | ------- | ------ | ---------------------------- | +| 8-16 | Low | Low | Low | Small models, limited memory | +| 32-64 | Good | Medium | Medium | General use, balanced | +| 128-256 | High | High | High | Large GPUs, max throughput | + +**Rule of thumb**: Start with 32, increase until GPU utilization >90% or OOM. 
+ +### Memory Optimization + +```bash +# Reduce memory usage +--backend-kwargs '{ + "batch_size": 16, + "vllm_config": { + "gpu_memory_utilization": 0.8, + "max_model_len": 2048 + } +}' +``` + +### Maximizing Throughput + +```bash +# Maximize throughput +--backend-kwargs '{ + "batch_size": 128, + "vllm_config": { + "gpu_memory_utilization": 0.95, + "enable_prefix_caching": true + } +}' +``` + +## Comparison: Offline vs Python vs HTTP + +| Feature | `vllm_offline` | `vllm_python` | OpenAI HTTP | +| -------------- | ---------------- | ------------- | ------------ | +| **Batching** | Micro-batching | Continuous | Continuous | +| **Throughput** | Highest | High | Good | +| **Latency** | Higher (batched) | Lower | Lowest† | +| **Streaming** | No | Yes | Yes | +| **Overhead** | None | None | HTTP/network | +| **Processes** | 1 | 1 | Multiple | +| **Use Case** | Offline eval | Research | Production | + +*† Subject to network conditions* + +## Troubleshooting + +### "Backend not started up for process" + +The backend wasn't initialized. Ensure your benchmark calls the backend lifecycle correctly (this should happen automatically). + +### Out of Memory (OOM) + +Reduce `batch_size` or `gpu_memory_utilization`: + +```bash +--backend-kwargs '{"batch_size": 16, "vllm_config": {"gpu_memory_utilization": 0.7}}' +``` + +### Batch Processing Too Slow + +Increase `batch_size` for better GPU utilization: + +```bash +--backend-kwargs '{"batch_size": 64}' +``` + +### Wrong Prompt Format + +Specify `request_format` explicitly: + +```bash +--backend-kwargs '{"request_format": "plain"}' +``` + +## Limitations + +1. **No Streaming**: Results returned after entire batch completes +2. **Single Process**: Limited to 1 worker process for batch coordination +3. **Fixed Batch Window**: Batches based on count, not time +4. 
**Multi-turn Not Supported**: Conversation history not yet implemented + +## See Also + +- [vLLM Python Backend](vllm-python-backend.md) - AsyncLLMEngine-based backend +- [Backends Guide](backends.md) - Overview of all backends +- [vLLM Engine Arguments](https://docs.vllm.ai/en/stable/configuration/engine_args/) - Full configuration options +- [vLLM LLM Class](https://docs.vllm.ai/en/stable/offline_inference/llm.html) - Underlying API documentation diff --git a/src/guidellm/backends/openai/http.py b/src/guidellm/backends/openai/http.py index fab2ff828..34c2c52f1 100644 --- a/src/guidellm/backends/openai/http.py +++ b/src/guidellm/backends/openai/http.py @@ -86,7 +86,7 @@ class OpenAIHTTPBackendArgs(BackendArgs): api_key: SecretStr | None = Field( default=None, description="HTTP Bearer token API key for authentication to server", - examples=["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"], + examples=["sk-your-api-key-here"], ) api_routes: dict[str, str] = Field( default_factory=dict, diff --git a/src/guidellm/backends/vllm_python/__init__.py b/src/guidellm/backends/vllm_python/__init__.py index fb8f4703b..19edba58d 100644 --- a/src/guidellm/backends/vllm_python/__init__.py +++ b/src/guidellm/backends/vllm_python/__init__.py @@ -5,7 +5,8 @@ GenerationResponse from vLLM output. """ +from .offline import VLLMOfflineBackend from .vllm import VLLMPythonBackend from .vllm_response import VLLMResponseHandler -__all__ = ["VLLMPythonBackend", "VLLMResponseHandler"] +__all__ = ["VLLMPythonBackend", "VLLMOfflineBackend", "VLLMResponseHandler"] diff --git a/src/guidellm/backends/vllm_python/offline.py b/src/guidellm/backends/vllm_python/offline.py new file mode 100644 index 000000000..a8cd9fb7a --- /dev/null +++ b/src/guidellm/backends/vllm_python/offline.py @@ -0,0 +1,770 @@ +""" +vLLM Offline Backend for static/micro-batch inference. + +Uses vLLM's LLM class for synchronous batch processing. 
Collects requests +into batches and processes them with LLM.generate() for maximum throughput. +Designed for offline benchmarking scenarios where batching efficiency is +more important than per-request latency. +""" + +from __future__ import annotations + +import asyncio +import contextlib +import time +import uuid +from collections.abc import AsyncIterator +from pathlib import Path +from typing import Any, Literal, cast + +import jinja2 +from more_itertools import roundrobin +from pydantic import ConfigDict, Field, PositiveInt, model_validator + +from guidellm.backends.backend import Backend, BackendArgs +from guidellm.backends.vllm_python.vllm_response import VLLMResponseHandler +from guidellm.extras import vllm +from guidellm.logger import logger +from guidellm.schemas import ( + GenerationRequest, + GenerationResponse, + RequestInfo, + StandardBaseModel, +) +from guidellm.utils import audio, vision + +# Sentinel for "chat template not yet resolved" cache. +_CHAT_TEMPLATE_UNSET: object = object() + +__all__ = ["VLLMOfflineBackend", "VLLMOfflineBackendArgs"] + + +@BackendArgs.register("vllm_offline") +class VLLMOfflineBackendArgs(BackendArgs): + """Pydantic model for VLLM Offline backend creation arguments.""" + + kind: Literal["vllm_offline"] = Field( + default="vllm_offline", + description="Backend type identifier for VLLM Offline backend.", + ) + model: str = Field( + description="Model identifier or path for VLLM to load", + ) + vllm_config: dict[str, Any] = Field( + default_factory=dict, + description=( + "Configuration dictionary for vLLM EngineArgs parameters. Pass " + "any valid EngineArgs parameters here (e.g. tensor_parallel_size, " + "gpu_memory_utilization, max_model_len). The 'model' parameter is required " + "and can be set here or via the top-level 'model' field; if set in both " + "places, the top-level 'model' field takes precedence." 
+ ), + ) + request_format: Literal["plain", "default-template"] | str = Field( + default="default-template", + description=( + "Request format for VLLM Offline backend. " + "Valid values: 'plain' (no chat template), " + "'default-template' (use tokenizer default), or a path to " + "/ inline Jinja2 chat template." + ), + ) + image_placeholder: str = Field( + default="", + description="Placeholder for image items in multimodal prompts.", + ) + audio_placeholder: str = Field( + default="<|audio|>", + description="Placeholder for audio items in multimodal prompts.", + ) + batch_size: PositiveInt = Field( + default=32, + description=( + "Number of requests to collect before processing as a batch. " + "Larger batches improve throughput but increase latency." + ), + ) + + @model_validator(mode="after") + def validate_vllm_config(self): + """Set defaults on vllm_config and ensure model is set.""" + if "model" in self.vllm_config: + logger.warning( + "The `model` input was passed to the vllm offline backend " + "with the `vllm_config` input. Ignoring and overwriting " + "with the value from the `model` input." 
+ ) + self.vllm_config["model"] = self.model + return self + + +class _ResolvedRequest(StandardBaseModel): + """Fully resolved request: prompt already formatted, ready for engine.generate.""" + + model_config = ConfigDict(frozen=True) + + prompt: str = Field( + description="Fully resolved prompt string (templated, with placeholders)" + ) + multi_modal_data: dict[str, Any] | None = Field( + default=None, + description="vLLM multi_modal_data from image/audio/video columns.", + ) + + +class _BatchedRequest: + """Internal tracking for a request waiting in batch.""" + + def __init__( + self, + request: GenerationRequest, + request_info: RequestInfo, + resolved_prompt: str, + multi_modal_data: dict[str, Any] | None, + max_tokens: int | None, + ): + self.request = request + self.request_info = request_info + self.resolved_prompt = resolved_prompt + self.multi_modal_data = multi_modal_data + self.max_tokens = max_tokens + self.request_id = str(uuid.uuid4()) + self.result: vllm.RequestOutput | None = None + self.ready = asyncio.Event() + + +def _has_jinja2_markers(s: str) -> bool: + """Return True if the string contains Jinja2 template syntax ({{, {%, or {#).""" + return "{{" in s or "{%" in s or "{#" in s + + +@Backend.register("vllm_offline") +class VLLMOfflineBackend(Backend): + """ + Offline backend for vLLM using LLM class for batch processing. + + Collects requests into micro-batches and processes them together using + vLLM's LLM.generate() for optimal throughput. Designed for offline + benchmarking where batch efficiency is prioritized over streaming latency. 
+ + Example: + :: + args = VLLMOfflineBackendArgs( + model="meta-llama/Llama-2-7b-hf", + batch_size=64, + vllm_config={"tensor_parallel_size": 2} + ) + backend = VLLMOfflineBackend(args) + await backend.process_startup() + async for response, info in backend.resolve(request, request_info): + process_response(response) + await backend.process_shutdown() + """ + + @classmethod + def backend_args(cls) -> type[BackendArgs]: + """Return the Pydantic model for this backend's creation arguments.""" + return VLLMOfflineBackendArgs + + def __init__(self, arguments: VLLMOfflineBackendArgs): + """Initialize vLLM Offline backend with model and configuration.""" + super().__init__(arguments) + self._args = arguments + + # Runtime state + self._in_process = False + self._shutting_down = False + self._llm: vllm.LLM | None = None + self._batch_lock = asyncio.Lock() + self._pending_batch: list[_BatchedRequest] = [] + self._processing_task: asyncio.Task | None = None + self._resolved_chat_template: str | None | object = _CHAT_TEMPLATE_UNSET + + @property + def processes_limit(self) -> int | None: + """Limit to single process for batch coordination.""" + return 1 + + @property + def info(self) -> dict[str, Any]: + """Get backend configuration details.""" + return self._args.model_dump() + + async def process_startup(self): + """Initialize vLLM LLM instance with configured parameters.""" + if self._in_process: + raise RuntimeError("Backend already started up for process.") + + # Initialize LLM in thread pool to avoid blocking + def _init_llm(): + engine_args = vllm.EngineArgs(**self._args.vllm_config) + return vllm.LLM.from_engine_args(engine_args) + + self._llm = await asyncio.to_thread(_init_llm) + self._in_process = True + + async def process_shutdown(self): + """Clean up vLLM LLM instance and resources.""" + if not self._in_process: + raise RuntimeError("Backend not started up for process.") + + # Set shutdown flag to reject new requests + self._shutting_down = True + + # Cancel 
any pending processing + if self._processing_task and not self._processing_task.done(): + self._processing_task.cancel() + with contextlib.suppress(asyncio.CancelledError): + await self._processing_task + + # Process any remaining requests in batch + async with self._batch_lock: + if self._pending_batch: + await self._process_batch() + + if self._llm is not None: + # LLM cleanup happens automatically via GC + self._llm = None + + self._in_process = False + self._shutting_down = False + + async def validate(self): + """Validate backend readiness.""" + if self._llm is None: + raise RuntimeError("Backend not started up for process.") + # LLM is ready if it was constructed successfully + + async def available_models(self) -> list[str]: + """Get available models from this backend.""" + return [self._args.model] + + async def default_model(self) -> str: + """Get the default model for this backend.""" + return self._args.model + + def _validate_backend_initialized(self) -> vllm.LLM: + """ + Validate that the backend is initialized and return the LLM. + + :raises RuntimeError: If backend is not initialized + :return: The initialized LLM + """ + if self._llm is None: + raise RuntimeError("Backend not started up for process.") + return self._llm + + def _build_multi_modal_data_from_columns( # noqa: C901, PLR0912 + self, columns: dict[str, Any] + ) -> dict[str, Any] | None: + """ + Build vLLM multi_modal_data dict from image_column, audio_column. + + video_column is not yet supported (no frame extraction); it is skipped. 
+ """ + multi_modal_data: dict[str, Any] = {} + # We look specifically for "image_column" and "audio_column" which contain lists + # of dicts + image_items = columns.get("image_column", []) + audio_items = columns.get("audio_column", []) + # video_column: not yet supported; would require frame extraction + for item in image_items: + if not item or not isinstance(item, dict): + continue + # Convert raw image dicts into PIL Images as required by vLLM's vision + # processor + pil_image = vision.image_dict_to_pil(item) + if "image" not in multi_modal_data: + multi_modal_data["image"] = pil_image + else: + # If multiple images exist, vLLM expects a list of PIL Images + existing = multi_modal_data["image"] + if isinstance(existing, list): + existing.append(pil_image) + else: + multi_modal_data["image"] = [existing, pil_image] + if audio_items: + if len(audio_items) > 1: + logger.warning( + "Only one audio item per request is supported; " + "ignoring {} extra audio item(s).", + len(audio_items) - 1, + ) + first = audio_items[0] + if not first or not isinstance(first, dict): + logger.warning("audio_column item is empty or not a dict; skipping.") + else: + audio_bytes = first.get("audio") + if isinstance(audio_bytes, bytes) and len(audio_bytes) > 0: + try: + # Decode raw audio bytes into an array since vLLM audio models + # expect either raw numpy arrays or specific tensor formats + audio_data = audio.decode_audio(audio_bytes) + multi_modal_data["audio"] = audio_data + except (ValueError, TypeError, OSError, RuntimeError) as exc: + raise ValueError( + f"Failed to decode audio from audio_column for vLLM: {exc}" + ) from exc + return multi_modal_data if multi_modal_data else None + + def _extract_text_from_content( + self, content: str | list[dict[str, Any]] | Any + ) -> str: + """ + Extract text content from message content field. + + Handles both string content and list-based multimodal content blocks. 
+ For list-based content, extracts text from blocks with type "text" and + concatenates them together. + + :param content: Content field which can be a string or list of content blocks + :return: Extracted text string + """ + if isinstance(content, str): + return content + if isinstance(content, list): + # Extract text from content blocks with type "text" + text_parts = [] + for block in content: + if isinstance(block, dict): + block_type = block.get("type") + if block_type == "text": + text = block.get("text") + if text: + text_parts.append(text) + return "".join(text_parts) + # Fallback: convert to string + return str(content) if content is not None else "" + + def _build_placeholder_prefix(self, multi_modal_data: dict[str, Any]) -> str: + """ + Build the placeholder prefix string for all modalities in + multi_modal_data. + + Returns a string like ``"\\n<|audio|>\\n"`` with one + placeholder per item, or ``""`` if no multimodal items are + present. Placeholder tokens default to ```` and + ``<|audio|>`` but can be overridden via + ``image_placeholder`` / ``audio_placeholder`` at construction. + """ + parts: list[str] = [] + images = multi_modal_data.get("image") + if images is not None: + num = len(images) if isinstance(images, list | tuple) else 1 + if num > 0: + ph = self._args.image_placeholder + parts.extend([ph] * num) + audio = multi_modal_data.get("audio") + if audio is not None: + # Single audio item (numpy array) — not a list of items. + num = len(audio) if isinstance(audio, list | tuple) else 1 + if num > 0: + ph = self._args.audio_placeholder + parts.extend([ph] * num) + if not parts: + return "" + return "\n".join(parts) + "\n" + + @staticmethod + def _format_column_blocks( + column_data: list[Any], column_type: str + ) -> list[dict[str, Any]]: + """Format data column items into vLLM-compatible content blocks. 
+ + Analogous to the HTTP backend's ``_format_prompts`` but emitting + vLLM-specific block types that chat templates can render into the + correct model-specific placeholder tokens. + """ + blocks: list[dict[str, Any]] = [] + for item in column_data: + if not item: + continue + if column_type == "text_column": + blocks.append({"type": "text", "text": str(item)}) + elif column_type == "image_column": + blocks.append({"type": "image"}) + elif column_type == "audio_column": + blocks.append({"type": "audio"}) + return blocks + + def _inject_placeholders_into_messages( + self, + formatted_messages: list[dict[str, Any]], + multi_modal_data: dict[str, Any], + ) -> None: + """ + Inject multimodal placeholder tokens into the last user message's content. + + vLLM requires one placeholder per multimodal item in the prompt text so its + processor can apply prompt replacement. This must happen *before* the chat + template is applied so that placeholders end up inside the correct message + turn (not prepended to the entire formatted prompt). + """ + prefix = self._build_placeholder_prefix(multi_modal_data) + if not prefix: + return + for msg in reversed(formatted_messages): + if msg.get("role") == "user": + msg["content"] = prefix + (msg.get("content") or "") + return + if formatted_messages: + formatted_messages[-1]["content"] = prefix + ( + formatted_messages[-1].get("content") or "" + ) + + def _extract_prompt_chat_plain( + self, formatted_messages: list[dict[str, Any]] + ) -> str: + """Concatenate message content into a single raw prompt string. + + Equivalent to the HTTP /v1/completions behaviour: prefix + text + with no role prefixes or trailing generation prompt. + """ + return " ".join( + msg["content"] for msg in formatted_messages if msg.get("content") + ) + + def _resolve_chat_template(self) -> str | None: + """ + Resolve and validate request_format to a template string or None. + + Returns None for default tokenizer template; returns the template string + when valid. 
Raises ValueError for invalid input (wrong format, bad path, + or invalid Jinja2 syntax). + """ + template = self._args.request_format + if template in ( + "plain", + "default-template", + ): + # No custom template provided; 'plain' and 'default-template' are handled + # internally + return None + path = Path(template) + # Treat the request_format string as a file path. If it exists and contains + # Jinja2 syntax, read the content as the template. + if path.exists() and path.is_file(): + content = path.read_text() + if not _has_jinja2_markers(content): + raise ValueError( + "Invalid chat template: path " + f"{path.as_posix()!r} exists but file content does not " + "contain Jinja2 template syntax ({{, {%}, or {#})." + ) + try: + jinja2.Template(content) + except jinja2.TemplateSyntaxError as e: + raise ValueError( + f"Invalid chat template in file {path.as_posix()!r}: {e}" + ) from e + return content + if _has_jinja2_markers(template): + try: + jinja2.Template(template) + except jinja2.TemplateSyntaxError as e: + raise ValueError(f"Invalid chat template: {e}") from e + return template + raise ValueError( + "request_format must be 'plain', 'default-template', a path to a " + "Jinja2 template file, or a string containing Jinja2 template " + "syntax ({{, {%}, or {#). Got: " + repr(template) + "." 
+ ) + + def _extract_prompt_chat_tokenizer( + self, formatted_messages: list[dict[str, Any]] + ) -> str: + """Apply tokenizer chat template to formatted messages.""" + llm = self._validate_backend_initialized() + tokenizer = llm.llm_engine.tokenizer.tokenizer + if tokenizer is None: + raise RuntimeError("Backend engine has no tokenizer.") + + if self._args.request_format in ( + "plain", + "default-template", + ): + resolved: str | None = None + else: + if self._resolved_chat_template is _CHAT_TEMPLATE_UNSET: + self._resolved_chat_template = self._resolve_chat_template() + resolved = cast("str | None", self._resolved_chat_template) + if resolved is not None: + # Safe to mutate: vLLM runs one model per engine and the resolved + # template is constant across all requests for this backend instance. + tokenizer.chat_template = resolved # type: ignore[attr-defined] + prompt = tokenizer.apply_chat_template( + formatted_messages, # type: ignore[arg-type] + tokenize=False, + add_generation_prompt=True, + ) + if isinstance(prompt, str): + return prompt + raise RuntimeError("Backend received unexpected type from tokenizer.") + + def _resolve_request(self, request: GenerationRequest) -> _ResolvedRequest: + """ + Build a fully resolved request from column-based GenerationRequest. + + Mirrors the HTTP backend's ``ChatCompletionsRequestHandler.format``: + prefix items are space-joined into one system message and all data + columns (text, image, audio) are formatted as typed content blocks + then interleaved via ``roundrobin`` into a single user message. + + When a chat template is active and multimodal data is present, the + list-of-blocks content is passed directly to the tokenizer so the + template emits model-specific placeholder tokens. For plain format + or text-only requests the content is flattened to strings. 
+ + :param request: Column-based generation request + :return: Resolved request with formatted prompt and multimodal data + :raises ValueError: If request has no text or multimodal columns + """ + columns = request.columns + + messages: list[dict[str, Any]] = [] + + prefix = " ".join(str(p) for p in columns.get("prefix_column", []) if p) + if prefix: + messages.append({"role": "system", "content": prefix}) + + text_blocks = self._format_column_blocks( + columns.get("text_column", []), "text_column" + ) + + multi_modal_data = self._build_multi_modal_data_from_columns(columns) + + # We use explicit content blocks (e.g. {"type": "image"}) when applying a + # chat template so that the template itself can generate the correct, + # model-specific tokens. Otherwise, we flatten to strings and fall back + # to placeholder-string injection. + use_content_blocks = ( + multi_modal_data + and (text_blocks or prefix) + and self._args.request_format != "plain" + ) + + if use_content_blocks: + # Interleave text and media blocks into a single content list, + # matching the HTTP backend's roundrobin approach. + media_lists = [ + self._format_column_blocks(columns.get(col, []), col) + for col in ("image_column", "audio_column") + ] + user_content: list[dict[str, Any]] = list( + roundrobin(text_blocks, *media_lists) + ) + else: + # Text-only or plain mode: media is handled later via placeholder + # injection, so only text blocks go into the user message here. 
+ user_content = list(text_blocks) + + if user_content: + messages.append({"role": "user", "content": user_content}) + + if messages: + if use_content_blocks: + prompt = self._extract_prompt_chat_tokenizer(messages) + else: + formatted_messages = [ + { + "role": msg["role"], + "content": self._extract_text_from_content( + msg.get("content", "") + ), + } + for msg in messages + ] + + if multi_modal_data: + # Placeholders must be injected into the message text + # *before* the chat template is applied so they end up + # inside the correct message turn. + self._inject_placeholders_into_messages( + formatted_messages, multi_modal_data + ) + + if self._args.request_format == "plain": + prompt = self._extract_prompt_chat_plain(formatted_messages) + else: + prompt = self._extract_prompt_chat_tokenizer(formatted_messages) + elif multi_modal_data: + # Multimodal-only (e.g. audio transcription with no text/prefix): + # no messages to inject into, so use a raw placeholder prompt. + prompt = self._build_placeholder_prefix(multi_modal_data) + else: + raise ValueError("Request must include text_column or multimodal columns.") + + return _ResolvedRequest( + prompt=prompt, + multi_modal_data=multi_modal_data, + ) + + def _create_sampling_params( + self, + max_tokens_override: int | None = None, + ) -> vllm.SamplingParams: + """ + Create VLLM SamplingParams. + + When max_tokens_override is set (from benchmark output_metrics), it is used + as max_tokens and EOS is ignored to force generation of exactly that many + tokens, matching HTTP backend behavior. Otherwise vLLM defaults are used + (generate until EOS or model max context). + + :param max_tokens_override: Optional max_tokens from request (e.g. 
benchmark) + :return: Configured SamplingParams instance + """ + params: dict[str, Any] = {} + + if max_tokens_override is not None and max_tokens_override > 0: + params["max_tokens"] = max_tokens_override + params["ignore_eos"] = True + + return vllm.SamplingParams(**params) + + async def _process_batch(self): + """Process all pending requests as a batch using LLM.generate().""" + if not self._pending_batch: + return + + if self._llm is None: + raise RuntimeError("Backend not started up for process.") + + batch = self._pending_batch + self._pending_batch = [] + + logger.debug(f"Processing batch of {len(batch)} requests") + + # Build inputs for LLM.generate() + prompts = [] + sampling_params_list = [] + + for req in batch: + prompt_input: dict[str, Any] | str + if req.multi_modal_data: + prompt_input = { + "prompt": req.resolved_prompt, + "multi_modal_data": req.multi_modal_data, + } + else: + prompt_input = req.resolved_prompt + + prompts.append(prompt_input) + sampling_params = self._create_sampling_params(req.max_tokens) + sampling_params_list.append(sampling_params) + + # Process batch in thread pool + def _generate_batch(): + return self._llm.generate( # type: ignore[union-attr] + prompts, + sampling_params_list, + use_tqdm=False, + ) + + try: + outputs: list[vllm.RequestOutput] = await asyncio.to_thread(_generate_batch) + + # Match outputs to requests and mark ready + if len(outputs) != len(batch): + raise RuntimeError( + f"Batch size mismatch: expected {len(batch)} outputs, " + f"got {len(outputs)}" + ) + + for req, output in zip(batch, outputs, strict=True): + req.result = output + req.ready.set() + except Exception as exc: # noqa: BLE001 + # Catch all exceptions to ensure requests don't hang forever. + # This is safe here because we're marking requests as failed. 
+ logger.error(f"Batch processing failed: {exc}") + # Mark all requests as failed but don't re-raise + # (individual requests will see None result) + for req in batch: + req.ready.set() + + async def _maybe_process_batch(self): + """Check if batch is full and process if so.""" + async with self._batch_lock: + if len(self._pending_batch) >= self._args.batch_size: + await self._process_batch() + + async def resolve( # type: ignore[override] + self, + request: GenerationRequest, + request_info: RequestInfo, + history: list[tuple[GenerationRequest, GenerationResponse]] | None = None, + ) -> AsyncIterator[tuple[GenerationResponse, RequestInfo]]: + """ + Process generation request by batching with others. + + Collects requests into micro-batches and processes them together + using LLM.generate(). The caller waits for the batch to complete + before receiving the response. + + :param request: Generation request with content and parameters + :param request_info: Request tracking info updated with timing metadata + :param history: Conversation history (currently not supported) + :yields: Single tuple of (response, updated_request_info) + """ + if self._llm is None: + raise RuntimeError("Backend not started up for process.") + + if self._shutting_down: + raise RuntimeError("Backend is shutting down, cannot accept new requests.") + + if history is not None: + raise NotImplementedError("Multi-turn requests not yet supported") + + # Resolve the request + request_info.timings.request_start = time.time() + resolved = self._resolve_request(request) + + # Create batched request tracker + max_tokens = ( + request.output_metrics.text_tokens + if request.output_metrics.text_tokens + else None + ) + + batched_req = _BatchedRequest( + request=request, + request_info=request_info, + resolved_prompt=resolved.prompt, + multi_modal_data=resolved.multi_modal_data, + max_tokens=max_tokens, + ) + + # Add to pending batch + async with self._batch_lock: + self._pending_batch.append(batched_req) + 
+ # Trigger batch processing if full + await self._maybe_process_batch() + + # Wait for result + await batched_req.ready.wait() + + # Build response + request_info.timings.request_end = time.time() + + if batched_req.result is not None: + output = batched_req.result + text = output.outputs[0].text if output.outputs else "" + usage = { + "prompt_tokens": len(output.prompt_token_ids or []), + "completion_tokens": len(output.outputs[0].token_ids or []) + if output.outputs + else 0, + "total_tokens": len(output.prompt_token_ids or []) + + (len(output.outputs[0].token_ids or []) if output.outputs else 0), + } + + response = VLLMResponseHandler.build_response( + request, text, usage, response_id=output.request_id + ) + yield response, request_info + else: + # Request failed during batch processing + request_info.error = "Batch processing failed" + yield None, request_info # type: ignore[misc] diff --git a/tests/unit/backends/test_backend.py b/tests/unit/backends/test_backend.py index 59a4a87be..906b5f882 100644 --- a/tests/unit/backends/test_backend.py +++ b/tests/unit/backends/test_backend.py @@ -506,6 +506,25 @@ def test_vllm_python_backend_registered(self): assert backend._args.model == "test-model" assert backend.kind == "vllm_python" + @pytest.mark.smoke + def test_vllm_offline_backend_registered(self): + """ + Test that vllm_offline backend is registered and createable. 
+ ## WRITTEN BY AI ## + """ + from guidellm.backends.vllm_python.offline import ( + VLLMOfflineBackend, + VLLMOfflineBackendArgs, + ) + + assert Backend.is_registered("vllm_offline") + args = VLLMOfflineBackendArgs(model="test-model", batch_size=32) + backend = Backend.create(args) + assert isinstance(backend, VLLMOfflineBackend) + assert backend._args.model == "test-model" + assert backend._args.batch_size == 32 + assert backend.kind == "vllm_offline" + @pytest.mark.smoke def test_backend_registry_functionality(self): """Test that backend registry functions work.""" From 251eb676c0242a140abd47a1127623a912c1ed42 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Thu, 25 Jun 2026 12:35:17 +0100 Subject: [PATCH 2/5] Fix __all__ ordering in vllm_python __init__ Signed-off-by: Maryam Tahhan --- src/guidellm/backends/vllm_python/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/guidellm/backends/vllm_python/__init__.py b/src/guidellm/backends/vllm_python/__init__.py index 19edba58d..a6851a2f5 100644 --- a/src/guidellm/backends/vllm_python/__init__.py +++ b/src/guidellm/backends/vllm_python/__init__.py @@ -9,4 +9,4 @@ from .vllm import VLLMPythonBackend from .vllm_response import VLLMResponseHandler -__all__ = ["VLLMPythonBackend", "VLLMOfflineBackend", "VLLMResponseHandler"] +__all__ = ["VLLMOfflineBackend", "VLLMPythonBackend", "VLLMResponseHandler"] From bfbcad1d8a8f48ea6feba851428393c3b27955fe Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Thu, 25 Jun 2026 13:41:36 +0100 Subject: [PATCH 3/5] Refactor vLLM backends to use shared common.py module Extract duplicated helper methods (_build_multi_modal_data_from_columns, _resolve_chat_template, _extract_prompt_chat_tokenizer, _create_sampling_params) into common.py to follow DRY principles. This addresses maintainer feedback about code reuse and abstraction. 
Both vllm_python and vllm_offline backends now share the same implementation for these helpers, reducing code duplication from ~400 lines to a single shared module. Signed-off-by: Maryam Tahhan --- src/guidellm/backends/vllm_python/__init__.py | 8 +- src/guidellm/backends/vllm_python/common.py | 205 ++++++++++++++++++ src/guidellm/backends/vllm_python/offline.py | 166 ++------------ src/guidellm/backends/vllm_python/vllm.py | 166 ++------------ tests/unit/backends/vllm_python/test_vllm.py | 16 +- 5 files changed, 256 insertions(+), 305 deletions(-) create mode 100644 src/guidellm/backends/vllm_python/common.py diff --git a/src/guidellm/backends/vllm_python/__init__.py b/src/guidellm/backends/vllm_python/__init__.py index a6851a2f5..cd115e29f 100644 --- a/src/guidellm/backends/vllm_python/__init__.py +++ b/src/guidellm/backends/vllm_python/__init__.py @@ -5,8 +5,14 @@ GenerationResponse from vLLM output. """ +from . import common from .offline import VLLMOfflineBackend from .vllm import VLLMPythonBackend from .vllm_response import VLLMResponseHandler -__all__ = ["VLLMOfflineBackend", "VLLMPythonBackend", "VLLMResponseHandler"] +__all__ = [ + "VLLMOfflineBackend", + "VLLMPythonBackend", + "VLLMResponseHandler", + "common", +] diff --git a/src/guidellm/backends/vllm_python/common.py b/src/guidellm/backends/vllm_python/common.py new file mode 100644 index 000000000..3040ddc32 --- /dev/null +++ b/src/guidellm/backends/vllm_python/common.py @@ -0,0 +1,205 @@ +"""Shared helpers for vLLM Python backends (vllm_python and vllm_offline).""" + +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING, Any + +import jinja2 +from loguru import logger + +from guidellm.utils import audio, vision + +if TYPE_CHECKING: + from guidellm.extras import vllm + +__all__ = [ + "CHAT_TEMPLATE_UNSET", + "build_multi_modal_data_from_columns", + "create_sampling_params", + "extract_prompt_chat_tokenizer", + "resolve_chat_template", +] + +# Sentinel for 
"chat template not yet resolved" cache. +CHAT_TEMPLATE_UNSET = object() + + +def _has_jinja2_markers(s: str) -> bool: + """Check if string contains Jinja2 template markers ({{, {%}, or {#}).""" + return "{{" in s or "{%" in s or "{#" in s + + +def build_multi_modal_data_from_columns( + columns: dict[str, Any], +) -> dict[str, Any] | None: + """ + Build vLLM multi_modal_data dict from image_column, audio_column. + + video_column is not yet supported (no frame extraction); it is skipped. + + :param columns: Request columns containing image_column and/or audio_column + :return: Multi-modal data dict for vLLM, or None if no multi-modal data + :raises ValueError: If audio decoding fails + """ + multi_modal_data: dict[str, Any] = {} + # We look specifically for "image_column" and "audio_column" + # which contain lists of dicts + image_items = columns.get("image_column", []) + audio_items = columns.get("audio_column", []) + # video_column: not yet supported; would require frame extraction + for item in image_items: + if not item or not isinstance(item, dict): + continue + # Convert raw image dicts into PIL Images as required by vLLM's vision + # processor + pil_image = vision.image_dict_to_pil(item) + if "image" not in multi_modal_data: + multi_modal_data["image"] = pil_image + else: + # If multiple images exist, vLLM expects a list of PIL Images + existing = multi_modal_data["image"] + if isinstance(existing, list): + existing.append(pil_image) + else: + multi_modal_data["image"] = [existing, pil_image] + if audio_items: + if len(audio_items) > 1: + logger.warning( + "Only one audio item per request is supported; " + "ignoring {} extra audio item(s).", + len(audio_items) - 1, + ) + first = audio_items[0] + if not first or not isinstance(first, dict): + logger.warning("audio_column item is empty or not a dict; skipping.") + else: + audio_bytes = first.get("audio") + if isinstance(audio_bytes, bytes) and len(audio_bytes) > 0: + try: + # Decode raw audio bytes into an array 
since vLLM audio models + # expect either raw numpy arrays or specific tensor formats + audio_data = audio.decode_audio(audio_bytes) + multi_modal_data["audio"] = audio_data + except (ValueError, TypeError, OSError, RuntimeError) as exc: + raise ValueError( + f"Failed to decode audio from audio_column for vLLM: {exc}" + ) from exc + return multi_modal_data if multi_modal_data else None + + +def resolve_chat_template(request_format: str) -> str | None: + """ + Resolve and validate request_format to a template string or None. + + Returns None for default tokenizer template; returns the template string + when valid. Raises ValueError for invalid input (wrong format, bad path, + or invalid Jinja2 syntax). + + :param request_format: Template format string + (plain, default-template, path, or Jinja2) + :return: Template string or None for default + :raises ValueError: If request_format is invalid + """ + template = request_format + if template in ( + "plain", + "default-template", + ): + # No custom template provided; 'plain' and 'default-template' are handled + # internally + return None + path = Path(template) + # Treat the request_format string as a file path. If it exists and contains + # Jinja2 syntax, read the content as the template. + if path.exists() and path.is_file(): + content = path.read_text() + if not _has_jinja2_markers(content): + raise ValueError( + "Invalid chat template: path " + f"{path.as_posix()!r} exists but file content does not " + "contain Jinja2 template syntax ({{, {%}, or {#})." 
+ ) + try: + jinja2.Template(content) + except jinja2.TemplateSyntaxError as e: + raise ValueError( + f"Invalid chat template in file {path.as_posix()!r}: {e}" + ) from e + return content + if _has_jinja2_markers(template): + try: + jinja2.Template(template) + except jinja2.TemplateSyntaxError as e: + raise ValueError(f"Invalid chat template: {e}") from e + return template + raise ValueError( + "request_format must be 'plain', 'default-template', a path to a " + "Jinja2 template file, or a string containing Jinja2 template " + "syntax ({{, {%}, or {#). Got: " + repr(template) + "." + ) + + +def extract_prompt_chat_tokenizer( + formatted_messages: list[dict[str, Any]], + tokenizer: Any, + request_format: str, + resolved_chat_template: str | None, +) -> str: + """ + Apply tokenizer chat template to formatted messages. + + :param formatted_messages: List of message dicts with role/content + :param tokenizer: Tokenizer instance from vLLM engine + :param request_format: Request format ('plain', 'default-template', or custom) + :param resolved_chat_template: Pre-resolved custom template or None for default + :return: Formatted prompt string + :raises RuntimeError: If tokenizer is missing or returns unexpected type + """ + if tokenizer is None: + raise RuntimeError("Backend engine has no tokenizer.") + + if request_format in ( + "plain", + "default-template", + ): + resolved: str | None = None + else: + resolved = resolved_chat_template + if resolved is not None: + # Safe to mutate: vLLM runs one model per engine and the resolved + # template is constant across all requests for this backend instance. 
+ tokenizer.chat_template = resolved # type: ignore[attr-defined] + prompt = tokenizer.apply_chat_template( + formatted_messages, # type: ignore[arg-type] + tokenize=False, + add_generation_prompt=True, + ) + if isinstance(prompt, str): + return prompt + raise RuntimeError("Backend received unexpected type from tokenizer.") + + +def create_sampling_params( + vllm_module: Any, + max_tokens_override: int | None = None, +) -> vllm.SamplingParams: + """ + Create VLLM SamplingParams. + + When max_tokens_override is set (from benchmark output_metrics), it is used + as max_tokens and EOS is ignored to force generation of exactly that many + tokens, matching HTTP backend behavior. Otherwise vLLM defaults are used + (generate until EOS or model max context). + + :param vllm_module: vLLM module (from guidellm.extras) + :param max_tokens_override: Optional max_tokens from request (e.g. benchmark) + :return: Configured SamplingParams instance + """ + params: dict[str, Any] = {} + + if max_tokens_override is not None and max_tokens_override > 0: + params["max_tokens"] = max_tokens_override + params["ignore_eos"] = True + + return vllm_module.SamplingParams(**params) diff --git a/src/guidellm/backends/vllm_python/offline.py b/src/guidellm/backends/vllm_python/offline.py index a8cd9fb7a..586750ffd 100644 --- a/src/guidellm/backends/vllm_python/offline.py +++ b/src/guidellm/backends/vllm_python/offline.py @@ -14,14 +14,13 @@ import time import uuid from collections.abc import AsyncIterator -from pathlib import Path from typing import Any, Literal, cast -import jinja2 from more_itertools import roundrobin from pydantic import ConfigDict, Field, PositiveInt, model_validator from guidellm.backends.backend import Backend, BackendArgs +from guidellm.backends.vllm_python import common from guidellm.backends.vllm_python.vllm_response import VLLMResponseHandler from guidellm.extras import vllm from guidellm.logger import logger @@ -31,10 +30,6 @@ RequestInfo, StandardBaseModel, ) -from 
guidellm.utils import audio, vision - -# Sentinel for "chat template not yet resolved" cache. -_CHAT_TEMPLATE_UNSET: object = object() __all__ = ["VLLMOfflineBackend", "VLLMOfflineBackendArgs"] @@ -133,11 +128,6 @@ def __init__( self.ready = asyncio.Event() -def _has_jinja2_markers(s: str) -> bool: - """Return True if the string contains Jinja2 template syntax ({{, {%, or {#).""" - return "{{" in s or "{%" in s or "{#" in s - - @Backend.register("vllm_offline") class VLLMOfflineBackend(Backend): """ @@ -178,7 +168,7 @@ def __init__(self, arguments: VLLMOfflineBackendArgs): self._batch_lock = asyncio.Lock() self._pending_batch: list[_BatchedRequest] = [] self._processing_task: asyncio.Task | None = None - self._resolved_chat_template: str | None | object = _CHAT_TEMPLATE_UNSET + self._resolved_chat_template: str | None | object = common.CHAT_TEMPLATE_UNSET @property def processes_limit(self) -> int | None: @@ -254,58 +244,11 @@ def _validate_backend_initialized(self) -> vllm.LLM: raise RuntimeError("Backend not started up for process.") return self._llm - def _build_multi_modal_data_from_columns( # noqa: C901, PLR0912 + def _build_multi_modal_data_from_columns( self, columns: dict[str, Any] ) -> dict[str, Any] | None: - """ - Build vLLM multi_modal_data dict from image_column, audio_column. - - video_column is not yet supported (no frame extraction); it is skipped. 
- """ - multi_modal_data: dict[str, Any] = {} - # We look specifically for "image_column" and "audio_column" which contain lists - # of dicts - image_items = columns.get("image_column", []) - audio_items = columns.get("audio_column", []) - # video_column: not yet supported; would require frame extraction - for item in image_items: - if not item or not isinstance(item, dict): - continue - # Convert raw image dicts into PIL Images as required by vLLM's vision - # processor - pil_image = vision.image_dict_to_pil(item) - if "image" not in multi_modal_data: - multi_modal_data["image"] = pil_image - else: - # If multiple images exist, vLLM expects a list of PIL Images - existing = multi_modal_data["image"] - if isinstance(existing, list): - existing.append(pil_image) - else: - multi_modal_data["image"] = [existing, pil_image] - if audio_items: - if len(audio_items) > 1: - logger.warning( - "Only one audio item per request is supported; " - "ignoring {} extra audio item(s).", - len(audio_items) - 1, - ) - first = audio_items[0] - if not first or not isinstance(first, dict): - logger.warning("audio_column item is empty or not a dict; skipping.") - else: - audio_bytes = first.get("audio") - if isinstance(audio_bytes, bytes) and len(audio_bytes) > 0: - try: - # Decode raw audio bytes into an array since vLLM audio models - # expect either raw numpy arrays or specific tensor formats - audio_data = audio.decode_audio(audio_bytes) - multi_modal_data["audio"] = audio_data - except (ValueError, TypeError, OSError, RuntimeError) as exc: - raise ValueError( - f"Failed to decode audio from audio_column for vLLM: {exc}" - ) from exc - return multi_modal_data if multi_modal_data else None + """Build vLLM multi_modal_data dict from image_column, audio_column.""" + return common.build_multi_modal_data_from_columns(columns) def _extract_text_from_content( self, content: str | list[dict[str, Any]] | Any @@ -425,81 +368,24 @@ def _extract_prompt_chat_plain( ) def 
_resolve_chat_template(self) -> str | None: - """ - Resolve and validate request_format to a template string or None. - - Returns None for default tokenizer template; returns the template string - when valid. Raises ValueError for invalid input (wrong format, bad path, - or invalid Jinja2 syntax). - """ - template = self._args.request_format - if template in ( - "plain", - "default-template", - ): - # No custom template provided; 'plain' and 'default-template' are handled - # internally - return None - path = Path(template) - # Treat the request_format string as a file path. If it exists and contains - # Jinja2 syntax, read the content as the template. - if path.exists() and path.is_file(): - content = path.read_text() - if not _has_jinja2_markers(content): - raise ValueError( - "Invalid chat template: path " - f"{path.as_posix()!r} exists but file content does not " - "contain Jinja2 template syntax ({{, {%}, or {#})." - ) - try: - jinja2.Template(content) - except jinja2.TemplateSyntaxError as e: - raise ValueError( - f"Invalid chat template in file {path.as_posix()!r}: {e}" - ) from e - return content - if _has_jinja2_markers(template): - try: - jinja2.Template(template) - except jinja2.TemplateSyntaxError as e: - raise ValueError(f"Invalid chat template: {e}") from e - return template - raise ValueError( - "request_format must be 'plain', 'default-template', a path to a " - "Jinja2 template file, or a string containing Jinja2 template " - "syntax ({{, {%}, or {#). Got: " + repr(template) + "." 
- ) + """Resolve and validate request_format to a template string or None.""" + return common.resolve_chat_template(self._args.request_format) def _extract_prompt_chat_tokenizer( self, formatted_messages: list[dict[str, Any]] ) -> str: """Apply tokenizer chat template to formatted messages.""" llm = self._validate_backend_initialized() - tokenizer = llm.llm_engine.tokenizer.tokenizer - if tokenizer is None: - raise RuntimeError("Backend engine has no tokenizer.") - - if self._args.request_format in ( - "plain", - "default-template", - ): - resolved: str | None = None - else: - if self._resolved_chat_template is _CHAT_TEMPLATE_UNSET: - self._resolved_chat_template = self._resolve_chat_template() - resolved = cast("str | None", self._resolved_chat_template) - if resolved is not None: - # Safe to mutate: vLLM runs one model per engine and the resolved - # template is constant across all requests for this backend instance. - tokenizer.chat_template = resolved # type: ignore[attr-defined] - prompt = tokenizer.apply_chat_template( - formatted_messages, # type: ignore[arg-type] - tokenize=False, - add_generation_prompt=True, + # Lazy-resolve and cache the chat template + if self._resolved_chat_template is common.CHAT_TEMPLATE_UNSET: + self._resolved_chat_template = self._resolve_chat_template() + resolved = cast("str | None", self._resolved_chat_template) + return common.extract_prompt_chat_tokenizer( + formatted_messages, + llm.llm_engine.tokenizer.tokenizer, + self._args.request_format, + resolved, ) - if isinstance(prompt, str): - return prompt - raise RuntimeError("Backend received unexpected type from tokenizer.") def _resolve_request(self, request: GenerationRequest) -> _ResolvedRequest: """ @@ -603,24 +489,8 @@ def _create_sampling_params( self, max_tokens_override: int | None = None, ) -> vllm.SamplingParams: - """ - Create VLLM SamplingParams. 
- - When max_tokens_override is set (from benchmark output_metrics), it is used - as max_tokens and EOS is ignored to force generation of exactly that many - tokens, matching HTTP backend behavior. Otherwise vLLM defaults are used - (generate until EOS or model max context). - - :param max_tokens_override: Optional max_tokens from request (e.g. benchmark) - :return: Configured SamplingParams instance - """ - params: dict[str, Any] = {} - - if max_tokens_override is not None and max_tokens_override > 0: - params["max_tokens"] = max_tokens_override - params["ignore_eos"] = True - - return vllm.SamplingParams(**params) + """Create VLLM SamplingParams.""" + return common.create_sampling_params(vllm, max_tokens_override) async def _process_batch(self): """Process all pending requests as a batch using LLM.generate().""" diff --git a/src/guidellm/backends/vllm_python/vllm.py b/src/guidellm/backends/vllm_python/vllm.py index bc308e891..c31c8aac2 100644 --- a/src/guidellm/backends/vllm_python/vllm.py +++ b/src/guidellm/backends/vllm_python/vllm.py @@ -13,14 +13,13 @@ import time import uuid from collections.abc import AsyncIterator -from pathlib import Path from typing import Any, Literal, cast -import jinja2 from more_itertools import roundrobin from pydantic import ConfigDict, Field, model_validator from guidellm.backends.backend import Backend, BackendArgs +from guidellm.backends.vllm_python import common from guidellm.backends.vllm_python.vllm_response import VLLMResponseHandler from guidellm.extras import vllm from guidellm.logger import logger @@ -30,10 +29,6 @@ RequestInfo, StandardBaseModel, ) -from guidellm.utils import audio, vision - -# Sentinel for "chat template not yet resolved" cache. 
-_CHAT_TEMPLATE_UNSET: object = object() __all__ = ["VLLMPythonBackend", "VLLMPythonBackendArgs"] @@ -126,11 +121,6 @@ class _ResolvedRequest(StandardBaseModel): ) -def _has_jinja2_markers(s: str) -> bool: - """Return True if the string contains Jinja2 template syntax ({{, {%, or {#).""" - return "{{" in s or "{%" in s or "{#" in s - - @Backend.register("vllm_python") class VLLMPythonBackend(Backend): """ @@ -166,7 +156,7 @@ def __init__( # Runtime state self._in_process = False self._engine: vllm.AsyncLLMEngine | None = None - self._resolved_chat_template: str | None | object = _CHAT_TEMPLATE_UNSET + self._resolved_chat_template: str | None | object = common.CHAT_TEMPLATE_UNSET @property def processes_limit(self) -> int | None: @@ -267,58 +257,11 @@ def _validate_history( if history is not None: raise NotImplementedError("Multi-turn requests not yet supported") - def _build_multi_modal_data_from_columns( # noqa: C901, PLR0912 + def _build_multi_modal_data_from_columns( self, columns: dict[str, Any] ) -> dict[str, Any] | None: - """ - Build vLLM multi_modal_data dict from image_column, audio_column. - - video_column is not yet supported (no frame extraction); it is skipped. 
- """ - multi_modal_data: dict[str, Any] = {} - # We look specifically for "image_column" and "audio_column" which contain lists - # of dicts - image_items = columns.get("image_column", []) - audio_items = columns.get("audio_column", []) - # video_column: not yet supported; would require frame extraction - for item in image_items: - if not item or not isinstance(item, dict): - continue - # Convert raw image dicts into PIL Images as required by vLLM's vision - # processor - pil_image = vision.image_dict_to_pil(item) - if "image" not in multi_modal_data: - multi_modal_data["image"] = pil_image - else: - # If multiple images exist, vLLM expects a list of PIL Images - existing = multi_modal_data["image"] - if isinstance(existing, list): - existing.append(pil_image) - else: - multi_modal_data["image"] = [existing, pil_image] - if audio_items: - if len(audio_items) > 1: - logger.warning( - "Only one audio item per request is supported; " - "ignoring {} extra audio item(s).", - len(audio_items) - 1, - ) - first = audio_items[0] - if not first or not isinstance(first, dict): - logger.warning("audio_column item is empty or not a dict; skipping.") - else: - audio_bytes = first.get("audio") - if isinstance(audio_bytes, bytes) and len(audio_bytes) > 0: - try: - # Decode raw audio bytes into an array since vLLM audio models - # expect either raw numpy arrays or specific tensor formats - audio_data = audio.decode_audio(audio_bytes) - multi_modal_data["audio"] = audio_data - except (ValueError, TypeError, OSError, RuntimeError) as exc: - raise ValueError( - f"Failed to decode audio from audio_column for vLLM: {exc}" - ) from exc - return multi_modal_data if multi_modal_data else None + """Build vLLM multi_modal_data dict from image_column, audio_column.""" + return common.build_multi_modal_data_from_columns(columns) def _extract_text_from_content( self, content: str | list[dict[str, Any]] | Any @@ -438,81 +381,24 @@ def _extract_prompt_chat_plain( ) def 
_resolve_chat_template(self) -> str | None: - """ - Resolve and validate request_format to a template string or None. - - Returns None for default tokenizer template; returns the template string - when valid. Raises ValueError for invalid input (wrong format, bad path, - or invalid Jinja2 syntax). - """ - template = self._args.request_format - if template in ( - "plain", - "default-template", - ): - # No custom template provided; 'plain' and 'default-template' are handled - # internally - return None - path = Path(template) - # Treat the request_format string as a file path. If it exists and contains - # Jinja2 syntax, read the content as the template. - if path.exists() and path.is_file(): - content = path.read_text() - if not _has_jinja2_markers(content): - raise ValueError( - "Invalid chat template: path " - f"{path.as_posix()!r} exists but file content does not " - "contain Jinja2 template syntax ({{, {%}, or {#})." - ) - try: - jinja2.Template(content) - except jinja2.TemplateSyntaxError as e: - raise ValueError( - f"Invalid chat template in file {path.as_posix()!r}: {e}" - ) from e - return content - if _has_jinja2_markers(template): - try: - jinja2.Template(template) - except jinja2.TemplateSyntaxError as e: - raise ValueError(f"Invalid chat template: {e}") from e - return template - raise ValueError( - "request_format must be 'plain', 'default-template', a path to a " - "Jinja2 template file, or a string containing Jinja2 template " - "syntax ({{, {%}, or {#). Got: " + repr(template) + "." 
- ) + """Resolve and validate request_format to a template string or None.""" + return common.resolve_chat_template(self._args.request_format) def _extract_prompt_chat_tokenizer( self, formatted_messages: list[dict[str, Any]] ) -> str: """Apply tokenizer chat template to formatted messages.""" engine = self._validate_backend_initialized() - tokenizer = engine.tokenizer - if tokenizer is None: - raise RuntimeError("Backend engine has no tokenizer.") - - if self._args.request_format in ( - "plain", - "default-template", - ): - resolved: str | None = None - else: - if self._resolved_chat_template is _CHAT_TEMPLATE_UNSET: - self._resolved_chat_template = self._resolve_chat_template() - resolved = cast("str | None", self._resolved_chat_template) - if resolved is not None: - # Safe to mutate: vLLM runs one model per engine and the resolved - # template is constant across all requests for this backend instance. - tokenizer.chat_template = resolved # type: ignore[attr-defined] - prompt = tokenizer.apply_chat_template( - formatted_messages, # type: ignore[arg-type] - tokenize=False, - add_generation_prompt=True, + # Lazy-resolve and cache the chat template + if self._resolved_chat_template is common.CHAT_TEMPLATE_UNSET: + self._resolved_chat_template = self._resolve_chat_template() + resolved = cast("str | None", self._resolved_chat_template) + return common.extract_prompt_chat_tokenizer( + formatted_messages, + engine.tokenizer, + self._args.request_format, + resolved, ) - if isinstance(prompt, str): - return prompt - raise RuntimeError("Backend received unexpected type from tokenizer.") def _resolve_request(self, request: GenerationRequest) -> _ResolvedRequest: """ @@ -744,24 +630,8 @@ def _create_sampling_params( self, max_tokens_override: int | None = None, ) -> vllm.SamplingParams: - """ - Create VLLM SamplingParams. 
- - When max_tokens_override is set (from benchmark output_metrics), it is used - as max_tokens and EOS is ignored to force generation of exactly that many - tokens, matching HTTP backend behavior. Otherwise vLLM defaults are used - (generate until EOS or model max context). - - :param max_tokens_override: Optional max_tokens from request (e.g. benchmark) - :return: Configured SamplingParams instance - """ - params: dict[str, Any] = {} - - if max_tokens_override is not None and max_tokens_override > 0: - params["max_tokens"] = max_tokens_override - params["ignore_eos"] = True - - return vllm.SamplingParams(**params) + """Create VLLM SamplingParams.""" + return common.create_sampling_params(vllm, max_tokens_override) def _raise_generation_error(self, exc: BaseException) -> None: """Re-raise generation failure with context. diff --git a/tests/unit/backends/vllm_python/test_vllm.py b/tests/unit/backends/vllm_python/test_vllm.py index f51eb1b98..2358022ca 100644 --- a/tests/unit/backends/vllm_python/test_vllm.py +++ b/tests/unit/backends/vllm_python/test_vllm.py @@ -16,10 +16,10 @@ import numpy as np import pytest +from guidellm.backends.vllm_python import common from guidellm.backends.vllm_python.vllm import ( VLLMPythonBackend, VLLMPythonBackendArgs, - _has_jinja2_markers, _ResolvedRequest, ) from guidellm.schemas import ( @@ -500,8 +500,8 @@ def test_has_jinja2_markers_true_for_expressions(self): _has_jinja2_markers returns True for strings containing {{. ## WRITTEN BY AI ## """ - assert _has_jinja2_markers("{{ message.content }}") is True - assert _has_jinja2_markers("prefix {{ x }}") is True + assert common._has_jinja2_markers("{{ message.content }}") is True + assert common._has_jinja2_markers("prefix {{ x }}") is True @pytest.mark.sanity def test_has_jinja2_markers_true_for_control(self): @@ -509,8 +509,8 @@ def test_has_jinja2_markers_true_for_control(self): _has_jinja2_markers returns True for {% and {#. 
## WRITTEN BY AI ## """ - assert _has_jinja2_markers("{% for m in messages %}") is True - assert _has_jinja2_markers("{# comment #}") is True + assert common._has_jinja2_markers("{% for m in messages %}") is True + assert common._has_jinja2_markers("{# comment #}") is True @pytest.mark.sanity def test_has_jinja2_markers_false_for_plain_strings(self): @@ -518,9 +518,9 @@ def test_has_jinja2_markers_false_for_plain_strings(self): _has_jinja2_markers returns False for strings with no template syntax. ## WRITTEN BY AI ## """ - assert _has_jinja2_markers("chat_completions") is False - assert _has_jinja2_markers("plain text") is False - assert _has_jinja2_markers("") is False + assert common._has_jinja2_markers("chat_completions") is False + assert common._has_jinja2_markers("plain text") is False + assert common._has_jinja2_markers("") is False class TestVLLMRequestFormat: From 1b673a260d87714335d6067a20a3b3c749e31565 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Thu, 25 Jun 2026 14:11:19 +0100 Subject: [PATCH 4/5] Extract all duplicated helpers to common.py for maximum code reuse Moved 5 additional helper methods to common.py that were duplicated between vllm_python and vllm_offline backends: - extract_text_from_content - build_placeholder_prefix - format_column_blocks - inject_placeholders_into_messages - extract_prompt_chat_plain Total duplication eliminated: ~450 lines across both backends. All helper logic is now centralized in common.py with both backends using thin wrapper methods that delegate to the shared implementation. 
Signed-off-by: Maryam Tahhan --- src/guidellm/backends/vllm_python/common.py | 142 +++++++++++++++++++ src/guidellm/backends/vllm_python/offline.py | 115 +++------------ src/guidellm/backends/vllm_python/vllm.py | 115 +++------------ 3 files changed, 180 insertions(+), 192 deletions(-) diff --git a/src/guidellm/backends/vllm_python/common.py b/src/guidellm/backends/vllm_python/common.py index 3040ddc32..4ae961ddd 100644 --- a/src/guidellm/backends/vllm_python/common.py +++ b/src/guidellm/backends/vllm_python/common.py @@ -16,8 +16,13 @@ __all__ = [ "CHAT_TEMPLATE_UNSET", "build_multi_modal_data_from_columns", + "build_placeholder_prefix", "create_sampling_params", + "extract_prompt_chat_plain", "extract_prompt_chat_tokenizer", + "extract_text_from_content", + "format_column_blocks", + "inject_placeholders_into_messages", "resolve_chat_template", ] @@ -30,6 +35,143 @@ def _has_jinja2_markers(s: str) -> bool: return "{{" in s or "{%" in s or "{#" in s +def extract_text_from_content(content: str | list[dict[str, Any]] | Any) -> str: + """ + Extract text content from message content field. + + Handles both string content and list-based multimodal content blocks. + For list-based content, extracts text from blocks with type "text" and + concatenates them together. 
+ + :param content: Content field which can be a string or list of content blocks + :return: Extracted text string + """ + if isinstance(content, str): + return content + if isinstance(content, list): + # Extract text from content blocks with type "text" + text_parts = [] + for block in content: + if isinstance(block, dict): + block_type = block.get("type") + if block_type == "text": + text = block.get("text") + if text: + text_parts.append(text) + return "".join(text_parts) + # Fallback: convert to string + return str(content) if content is not None else "" + + +def build_placeholder_prefix( + multi_modal_data: dict[str, Any], + image_placeholder: str = "", + audio_placeholder: str = "<|audio|>", +) -> str: + """ + Build the placeholder prefix string for all modalities in multi_modal_data. + + Returns a string like ``"\\n<|audio|>\\n"`` with one placeholder per + item, or ``""`` if no multimodal items are present. + + :param multi_modal_data: Multi-modal data dict with image/audio + :param image_placeholder: Placeholder token for images + :param audio_placeholder: Placeholder token for audio + :return: Newline-joined placeholder string or empty string + """ + parts: list[str] = [] + images = multi_modal_data.get("image") + if images is not None: + num = len(images) if isinstance(images, list | tuple) else 1 + if num > 0: + parts.extend([image_placeholder] * num) + audio = multi_modal_data.get("audio") + if audio is not None: + # Single audio item (numpy array) — not a list of items. + num = len(audio) if isinstance(audio, list | tuple) else 1 + if num > 0: + parts.extend([audio_placeholder] * num) + if not parts: + return "" + return "\n".join(parts) + "\n" + + +def format_column_blocks( + column_data: list[Any], column_type: str +) -> list[dict[str, Any]]: + """ + Format data column items into vLLM-compatible content blocks. 
+ + Analogous to the HTTP backend's ``_format_prompts`` but emitting + vLLM-specific block types that chat templates can render into the + correct model-specific placeholder tokens. + + :param column_data: List of items from a data column + :param column_type: Column type (text_column, image_column, audio_column) + :return: List of typed content block dicts + """ + blocks: list[dict[str, Any]] = [] + for item in column_data: + if not item: + continue + if column_type == "text_column": + blocks.append({"type": "text", "text": str(item)}) + elif column_type == "image_column": + blocks.append({"type": "image"}) + elif column_type == "audio_column": + blocks.append({"type": "audio"}) + return blocks + + +def inject_placeholders_into_messages( + formatted_messages: list[dict[str, Any]], + multi_modal_data: dict[str, Any], + image_placeholder: str = "", + audio_placeholder: str = "<|audio|>", +) -> None: + """ + Inject multimodal placeholder tokens into the last user message's content. + + vLLM requires one placeholder per multimodal item in the prompt text so its + processor can apply prompt replacement. This must happen *before* the chat + template is applied so that placeholders end up inside the correct message + turn (not prepended to the entire formatted prompt). 
+ + :param formatted_messages: List of message dicts (modified in-place) + :param multi_modal_data: Multi-modal data dict + :param image_placeholder: Placeholder token for images + :param audio_placeholder: Placeholder token for audio + """ + prefix = build_placeholder_prefix( + multi_modal_data, image_placeholder, audio_placeholder + ) + if not prefix: + return + for msg in reversed(formatted_messages): + if msg.get("role") == "user": + msg["content"] = prefix + (msg.get("content") or "") + return + if formatted_messages: + formatted_messages[-1]["content"] = prefix + ( + formatted_messages[-1].get("content") or "" + ) + + +def extract_prompt_chat_plain( + formatted_messages: list[dict[str, Any]], +) -> str: + """ + Concatenate message content into a single raw prompt string. + + Equivalent to the HTTP /v1/completions behaviour: prefix + text + with no role prefixes or trailing generation prompt. + + :param formatted_messages: List of message dicts with role/content + :return: Space-joined content string + """ + return " ".join(msg["content"] for msg in formatted_messages if msg.get("content")) + + def build_multi_modal_data_from_columns( columns: dict[str, Any], ) -> dict[str, Any] | None: diff --git a/src/guidellm/backends/vllm_python/offline.py b/src/guidellm/backends/vllm_python/offline.py index 586750ffd..0437bc7cf 100644 --- a/src/guidellm/backends/vllm_python/offline.py +++ b/src/guidellm/backends/vllm_python/offline.py @@ -253,119 +253,42 @@ def _build_multi_modal_data_from_columns( def _extract_text_from_content( self, content: str | list[dict[str, Any]] | Any ) -> str: - """ - Extract text content from message content field. - - Handles both string content and list-based multimodal content blocks. - For list-based content, extracts text from blocks with type "text" and - concatenates them together. 
- - :param content: Content field which can be a string or list of content blocks - :return: Extracted text string - """ - if isinstance(content, str): - return content - if isinstance(content, list): - # Extract text from content blocks with type "text" - text_parts = [] - for block in content: - if isinstance(block, dict): - block_type = block.get("type") - if block_type == "text": - text = block.get("text") - if text: - text_parts.append(text) - return "".join(text_parts) - # Fallback: convert to string - return str(content) if content is not None else "" + """Extract text content from message content field.""" + return common.extract_text_from_content(content) def _build_placeholder_prefix(self, multi_modal_data: dict[str, Any]) -> str: - """ - Build the placeholder prefix string for all modalities in - multi_modal_data. - - Returns a string like ``"\\n<|audio|>\\n"`` with one - placeholder per item, or ``""`` if no multimodal items are - present. Placeholder tokens default to ```` and - ``<|audio|>`` but can be overridden via - ``image_placeholder`` / ``audio_placeholder`` at construction. - """ - parts: list[str] = [] - images = multi_modal_data.get("image") - if images is not None: - num = len(images) if isinstance(images, list | tuple) else 1 - if num > 0: - ph = self._args.image_placeholder - parts.extend([ph] * num) - audio = multi_modal_data.get("audio") - if audio is not None: - # Single audio item (numpy array) — not a list of items. 
- num = len(audio) if isinstance(audio, list | tuple) else 1 - if num > 0: - ph = self._args.audio_placeholder - parts.extend([ph] * num) - if not parts: - return "" - return "\n".join(parts) + "\n" + """Build the placeholder prefix string for all modalities.""" + return common.build_placeholder_prefix( + multi_modal_data, + self._args.image_placeholder, + self._args.audio_placeholder, + ) @staticmethod def _format_column_blocks( column_data: list[Any], column_type: str ) -> list[dict[str, Any]]: - """Format data column items into vLLM-compatible content blocks. - - Analogous to the HTTP backend's ``_format_prompts`` but emitting - vLLM-specific block types that chat templates can render into the - correct model-specific placeholder tokens. - """ - blocks: list[dict[str, Any]] = [] - for item in column_data: - if not item: - continue - if column_type == "text_column": - blocks.append({"type": "text", "text": str(item)}) - elif column_type == "image_column": - blocks.append({"type": "image"}) - elif column_type == "audio_column": - blocks.append({"type": "audio"}) - return blocks + """Format data column items into vLLM-compatible content blocks.""" + return common.format_column_blocks(column_data, column_type) def _inject_placeholders_into_messages( self, formatted_messages: list[dict[str, Any]], multi_modal_data: dict[str, Any], ) -> None: - """ - Inject multimodal placeholder tokens into the last user message's content. - - vLLM requires one placeholder per multimodal item in the prompt text so its - processor can apply prompt replacement. This must happen *before* the chat - template is applied so that placeholders end up inside the correct message - turn (not prepended to the entire formatted prompt). 
- """ - prefix = self._build_placeholder_prefix(multi_modal_data) - if not prefix: - return - for msg in reversed(formatted_messages): - if msg.get("role") == "user": - msg["content"] = prefix + (msg.get("content") or "") - return - if formatted_messages: - formatted_messages[-1]["content"] = prefix + ( - formatted_messages[-1].get("content") or "" - ) + """Inject multimodal placeholder tokens into the last user message.""" + common.inject_placeholders_into_messages( + formatted_messages, + multi_modal_data, + self._args.image_placeholder, + self._args.audio_placeholder, + ) def _extract_prompt_chat_plain( self, formatted_messages: list[dict[str, Any]] ) -> str: - """Concatenate message content into a single raw prompt string. - - Equivalent to the HTTP /v1/completions behaviour: prefix + text - with no role prefixes or trailing generation prompt. - """ - return " ".join( - msg["content"] for msg in formatted_messages if msg.get("content") - ) + """Concatenate message content into a single raw prompt string.""" + return common.extract_prompt_chat_plain(formatted_messages) def _resolve_chat_template(self) -> str | None: """Resolve and validate request_format to a template string or None.""" diff --git a/src/guidellm/backends/vllm_python/vllm.py b/src/guidellm/backends/vllm_python/vllm.py index c31c8aac2..136bee716 100644 --- a/src/guidellm/backends/vllm_python/vllm.py +++ b/src/guidellm/backends/vllm_python/vllm.py @@ -266,119 +266,42 @@ def _build_multi_modal_data_from_columns( def _extract_text_from_content( self, content: str | list[dict[str, Any]] | Any ) -> str: - """ - Extract text content from message content field. - - Handles both string content and list-based multimodal content blocks. - For list-based content, extracts text from blocks with type "text" and - concatenates them together. 
- - :param content: Content field which can be a string or list of content blocks - :return: Extracted text string - """ - if isinstance(content, str): - return content - if isinstance(content, list): - # Extract text from content blocks with type "text" - text_parts = [] - for block in content: - if isinstance(block, dict): - block_type = block.get("type") - if block_type == "text": - text = block.get("text") - if text: - text_parts.append(text) - return "".join(text_parts) - # Fallback: convert to string - return str(content) if content is not None else "" + """Extract text content from message content field.""" + return common.extract_text_from_content(content) def _build_placeholder_prefix(self, multi_modal_data: dict[str, Any]) -> str: - """ - Build the placeholder prefix string for all modalities in - multi_modal_data. - - Returns a string like ``"\\n<|audio|>\\n"`` with one - placeholder per item, or ``""`` if no multimodal items are - present. Placeholder tokens default to ```` and - ``<|audio|>`` but can be overridden via - ``image_placeholder`` / ``audio_placeholder`` at construction. - """ - parts: list[str] = [] - images = multi_modal_data.get("image") - if images is not None: - num = len(images) if isinstance(images, list | tuple) else 1 - if num > 0: - ph = self._args.image_placeholder - parts.extend([ph] * num) - audio = multi_modal_data.get("audio") - if audio is not None: - # Single audio item (numpy array) — not a list of items. 
- num = len(audio) if isinstance(audio, list | tuple) else 1 - if num > 0: - ph = self._args.audio_placeholder - parts.extend([ph] * num) - if not parts: - return "" - return "\n".join(parts) + "\n" + """Build the placeholder prefix string for all modalities.""" + return common.build_placeholder_prefix( + multi_modal_data, + self._args.image_placeholder, + self._args.audio_placeholder, + ) @staticmethod def _format_column_blocks( column_data: list[Any], column_type: str ) -> list[dict[str, Any]]: - """Format data column items into vLLM-compatible content blocks. - - Analogous to the HTTP backend's ``_format_prompts`` but emitting - vLLM-specific block types that chat templates can render into the - correct model-specific placeholder tokens. - """ - blocks: list[dict[str, Any]] = [] - for item in column_data: - if not item: - continue - if column_type == "text_column": - blocks.append({"type": "text", "text": str(item)}) - elif column_type == "image_column": - blocks.append({"type": "image"}) - elif column_type == "audio_column": - blocks.append({"type": "audio"}) - return blocks + """Format data column items into vLLM-compatible content blocks.""" + return common.format_column_blocks(column_data, column_type) def _inject_placeholders_into_messages( self, formatted_messages: list[dict[str, Any]], multi_modal_data: dict[str, Any], ) -> None: - """ - Inject multimodal placeholder tokens into the last user message's content. - - vLLM requires one placeholder per multimodal item in the prompt text so its - processor can apply prompt replacement. This must happen *before* the chat - template is applied so that placeholders end up inside the correct message - turn (not prepended to the entire formatted prompt). 
- """ - prefix = self._build_placeholder_prefix(multi_modal_data) - if not prefix: - return - for msg in reversed(formatted_messages): - if msg.get("role") == "user": - msg["content"] = prefix + (msg.get("content") or "") - return - if formatted_messages: - formatted_messages[-1]["content"] = prefix + ( - formatted_messages[-1].get("content") or "" - ) + """Inject multimodal placeholder tokens into the last user message.""" + common.inject_placeholders_into_messages( + formatted_messages, + multi_modal_data, + self._args.image_placeholder, + self._args.audio_placeholder, + ) def _extract_prompt_chat_plain( self, formatted_messages: list[dict[str, Any]] ) -> str: - """Concatenate message content into a single raw prompt string. - - Equivalent to the HTTP /v1/completions behaviour: prefix + text - with no role prefixes or trailing generation prompt. - """ - return " ".join( - msg["content"] for msg in formatted_messages if msg.get("content") - ) + """Concatenate message content into a single raw prompt string.""" + return common.extract_prompt_chat_plain(formatted_messages) def _resolve_chat_template(self) -> str | None: """Resolve and validate request_format to a template string or None.""" From f27b0765823dc02d63d6826d19b1d565a7ebc4b4 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Thu, 25 Jun 2026 14:18:13 +0100 Subject: [PATCH 5/5] Fix mypy type errors for lazy-loaded vllm module Add type: ignore comments for vllm.EngineArgs and vllm.LLM runtime usage since these are lazy-loaded and mypy can't resolve them at static analysis time. Use Any type for vllm.LLM annotations with inline comments documenting the actual type. Fixes CI type-check failures. 
Signed-off-by: Maryam Tahhan --- src/guidellm/backends/vllm_python/offline.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/guidellm/backends/vllm_python/offline.py b/src/guidellm/backends/vllm_python/offline.py index 0437bc7cf..3c67251dc 100644 --- a/src/guidellm/backends/vllm_python/offline.py +++ b/src/guidellm/backends/vllm_python/offline.py @@ -14,7 +14,7 @@ import time import uuid from collections.abc import AsyncIterator -from typing import Any, Literal, cast +from typing import TYPE_CHECKING, Any, Literal, cast from more_itertools import roundrobin from pydantic import ConfigDict, Field, PositiveInt, model_validator @@ -22,8 +22,12 @@ from guidellm.backends.backend import Backend, BackendArgs from guidellm.backends.vllm_python import common from guidellm.backends.vllm_python.vllm_response import VLLMResponseHandler -from guidellm.extras import vllm from guidellm.logger import logger + +if TYPE_CHECKING: + from guidellm.extras import vllm +else: + from guidellm.extras import vllm from guidellm.schemas import ( GenerationRequest, GenerationResponse, @@ -164,7 +168,7 @@ def __init__(self, arguments: VLLMOfflineBackendArgs): # Runtime state self._in_process = False self._shutting_down = False - self._llm: vllm.LLM | None = None + self._llm: Any = None # vllm.LLM | None self._batch_lock = asyncio.Lock() self._pending_batch: list[_BatchedRequest] = [] self._processing_task: asyncio.Task | None = None @@ -187,8 +191,8 @@ async def process_startup(self): # Initialize LLM in thread pool to avoid blocking def _init_llm(): - engine_args = vllm.EngineArgs(**self._args.vllm_config) - return vllm.LLM.from_engine_args(engine_args) + engine_args = vllm.EngineArgs(**self._args.vllm_config) # type: ignore[attr-defined] + return vllm.LLM.from_engine_args(engine_args) # type: ignore[attr-defined] self._llm = await asyncio.to_thread(_init_llm) self._in_process = True @@ -233,7 +237,7 @@ async def default_model(self) -> str: """Get the 
default model for this backend.""" return self._args.model - def _validate_backend_initialized(self) -> vllm.LLM: + def _validate_backend_initialized(self) -> Any: # vllm.LLM """ Validate that the backend is initialized and return the LLM.