From 5d2304db720fb0acc7e568f787cdddd32e77c27e Mon Sep 17 00:00:00 2001
From: Maryam Tahhan <mtahhan@redhat.com>
Date: Thu, 25 Jun 2026 11:35:31 +0100
Subject: [PATCH 1/5] Add vLLM Offline Backend for batch processing

Implements standalone offline backend using vLLM's LLM class for micro-batching.
Adapted to main's architecture without VLLMBackendBase, using main's import patterns
(lazy loading via guidellm.extras, utils.audio/vision).

Features:
- Batch processing with configurable batch_size (default: 32)
- Chat template support (plain, default-template, custom Jinja2)
- Multimodal data handling (image/audio)
- Single-process execution for batch coordination
- Compatible with vLLM 0.21.0+

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
Signed-off-by: Maryam Tahhan <mtahhan@redhat.com>
---
 docs/guides/backends.md                       |   7 +-
 docs/guides/vllm-offline-backend.md           | 235 ++++++
 src/guidellm/backends/openai/http.py          |   2 +-
 src/guidellm/backends/vllm_python/__init__.py |   3 +-
 src/guidellm/backends/vllm_python/offline.py  | 770 ++++++++++++++++++
 tests/unit/backends/test_backend.py           |  19 +
 6 files changed, 1032 insertions(+), 4 deletions(-)
 create mode 100644 docs/guides/vllm-offline-backend.md
 create mode 100644 src/guidellm/backends/vllm_python/offline.py

diff --git a/docs/guides/backends.md b/docs/guides/backends.md
index a6bf804f1..f8506c016 100644
--- a/docs/guides/backends.md
+++ b/docs/guides/backends.md
@@ -8,9 +8,12 @@ GuideLLM is designed to work with OpenAI-compatible HTTP servers, enabling seaml
 
 GuideLLM supports OpenAI-compatible HTTP servers, which provide a standardized API for interacting with LLMs. This includes popular implementations such as [vLLM](https://github.com/vllm-project/vllm) and [Text Generation Inference (TGI)](https://github.com/huggingface/text-generation-inference). These servers allow GuideLLM to perform evaluations, benchmarks, and optimizations with minimal setup.
 
-### vLLM Python backend
+### vLLM Python Backends
 
-GuideLLM supports running inference in the same process using the **vLLM Python backend** (`vllm_python`). This backend runs inference in the same process as GuideLLM's using vLLM's python API (AsyncLLMEngine), without an HTTP server. For setup, installation options (container, existing vLLM, pip), and examples, see [vLLM Python backend](vllm-python-backend.md).
+GuideLLM supports running inference in the same process using vLLM's Python API, without an HTTP server:
+
+- **vLLM Python backend** (`vllm_python`): Uses vLLM's AsyncLLMEngine for async streaming inference. For setup and examples, see [vLLM Python backend](vllm-python-backend.md).
+- **vLLM Offline backend** (`vllm_offline`): Uses vLLM's LLM class for batch processing with micro-batching. Designed for offline benchmarking where batch efficiency is prioritized over streaming latency. For setup and examples, see [vLLM Offline backend](vllm-offline-backend.md).
 
 ## Examples for Spinning Up Compatible Servers
 
diff --git a/docs/guides/vllm-offline-backend.md b/docs/guides/vllm-offline-backend.md
new file mode 100644
index 000000000..7383f8256
--- /dev/null
+++ b/docs/guides/vllm-offline-backend.md
@@ -0,0 +1,235 @@
+# vLLM Offline Backend
+
+The **vLLM Offline backend** (`vllm_offline`) provides synchronous batch processing using vLLM's `LLM` class. It collects requests into micro-batches and processes them together for maximum throughput, making it ideal for offline benchmarking scenarios where batching efficiency is prioritized over per-request latency.
+
+## When to Use the Offline Backend
+
+**Use `vllm_offline` when:**
+
+- Running offline batch inference on large datasets
+- Maximizing throughput is more important than individual request latency
+- You have a known dataset size and want optimal batch processing
+- Benchmarking pure model throughput without HTTP overhead
+- Processing datasets for evaluation or ETL pipelines
+
+**Use `vllm_python` (AsyncLLMEngine) when:**
+
+- You need streaming token-by-token responses
+- Simulating production-like continuous request arrival
+- Measuring realistic latency characteristics
+- You need async request handling
+
+**Use OpenAI HTTP backend when:**
+
+- Testing against a production vLLM server
+- Measuring end-to-end latency including network overhead
+- Benchmarking a deployed service
+
+## Installation
+
+The offline backend requires vLLM to be installed. See the [vLLM Python Backend installation guide](vllm-python-backend.md#installation) for recommended installation methods.
+
+## Basic Usage
+
+```bash
+guidellm benchmark run \
+  --backend vllm_offline \
+  --model "Qwen/Qwen3-0.6B" \
+  --backend-kwargs '{"batch_size": 64}' \
+  --data "prompt_tokens=256,output_tokens=128" \
+  --max-requests 1000
+```
+
+## Backend Options
+
+Configure the offline backend via `--backend-kwargs` with JSON:
+
+```bash
+--backend-kwargs '{
+  "model": "meta-llama/Llama-2-7b-hf",
+  "batch_size": 64,
+  "vllm_config": {
+    "tensor_parallel_size": 2,
+    "gpu_memory_utilization": 0.9
+  }
+}'
+```
+
+### Key Parameters
+
+- **`model`** (required): Model identifier or path
+- **`batch_size`**: Number of requests to collect before processing (default: 32)
+  - Larger batches = higher throughput but more latency
+  - Smaller batches = lower latency but less throughput
+  - Recommended: 32-128 for most use cases
+- **`vllm_config`**: Dictionary of vLLM EngineArgs parameters
+  - `tensor_parallel_size`: Number of GPUs for tensor parallelism
+  - `gpu_memory_utilization`: Fraction of GPU memory to use (0.0-1.0)
+  - `max_model_len`: Maximum sequence length
+  - See [vLLM Engine Arguments](https://docs.vllm.ai/en/stable/configuration/engine_args/) for all options (use Python parameter names)
+- **`request_format`**: How to format prompts
+  - `"default-template"` (default): Use tokenizer's chat template
+  - `"plain"`: No chat template, plain text concatenation
+  - Path or string: Custom Jinja2 chat template
+- **`image_placeholder`**: Placeholder for images (default: `"<image>"`)
+- **`audio_placeholder`**: Placeholder for audio (default: `"<|audio|>"`)
+
+## How Micro-Batching Works
+
+The offline backend uses a **micro-batching** approach:
+
+1. **Buffering**: As requests arrive via `resolve()`, they're added to a buffer
+2. **Batch Detection**: When buffer reaches `batch_size`, trigger processing
+3. **Batch Processing**: Process entire batch with one `LLM.generate()` call
+4. **Result Distribution**: Return cached results to waiting requests
+5. **Flush on Shutdown**: Remaining requests processed when backend shuts down
+
+This reduces the number of `LLM.generate()` calls by up to a factor of `batch_size` compared to per-request processing, while still working within GuideLLM's scheduler architecture.
+
+## Examples
+
+### Basic Throughput Benchmark
+
+```bash
+guidellm benchmark run \
+  --backend vllm_offline \
+  --model "Qwen/Qwen3-0.6B" \
+  --data "prompt_tokens=512,output_tokens=256" \
+  --profile throughput \
+  --max-seconds 60
+```
+
+### Large Batch Processing
+
+```bash
+guidellm benchmark run \
+  --backend vllm_offline \
+  --backend-kwargs '{"batch_size": 128}' \
+  --model "meta-llama/Llama-2-7b-hf" \
+  --data path/to/dataset.csv \
+  --max-requests -1  # Process entire dataset
+```
+
+### Multi-GPU Configuration
+
+```bash
+guidellm benchmark run \
+  --backend vllm_offline \
+  --backend-kwargs '{
+    "model": "meta-llama/Llama-2-70b-hf",
+    "batch_size": 64,
+    "vllm_config": {
+      "tensor_parallel_size": 4,
+      "gpu_memory_utilization": 0.95
+    }
+  }' \
+  --data "prompt_tokens=1024,output_tokens=512"
+```
+
+### HuggingFace Dataset
+
+```bash
+guidellm benchmark run \
+  --backend vllm_offline \
+  --model "meta-llama/Llama-2-7b-hf" \
+  --backend-kwargs '{"batch_size": 32}' \
+  --data "hf:cnn_dailymail" \
+  --data-args '{"name": "3.0.0"}' \
+  --data-column-mapper '{"column_mappings": {"text_column": "article"}}'
+```
+
+## Performance Tuning
+
+### Choosing Batch Size
+
+| Batch Size | Throughput | Latency | Memory | When to Use                  |
+| ---------- | ---------- | ------- | ------ | ---------------------------- |
+| 8-16       | Low        | Low     | Low    | Small models, limited memory |
+| 32-64      | Good       | Medium  | Medium | General use, balanced        |
+| 128-256    | High       | High    | High   | Large GPUs, max throughput   |
+
+**Rule of thumb**: Start with 32, increase until GPU utilization >90% or OOM.
+
+### Memory Optimization
+
+```bash
+# Reduce memory usage
+--backend-kwargs '{
+  "batch_size": 16,
+  "vllm_config": {
+    "gpu_memory_utilization": 0.8,
+    "max_model_len": 2048
+  }
+}'
+```
+
+### Maximizing Throughput
+
+```bash
+# Maximize throughput
+--backend-kwargs '{
+  "batch_size": 128,
+  "vllm_config": {
+    "gpu_memory_utilization": 0.95,
+    "enable_prefix_caching": true
+  }
+}'
+```
+
+## Comparison: Offline vs Python vs HTTP
+
+| Feature        | `vllm_offline`   | `vllm_python` | OpenAI HTTP  |
+| -------------- | ---------------- | ------------- | ------------ |
+| **Batching**   | Micro-batching   | Continuous    | Continuous   |
+| **Throughput** | Highest          | High          | Good         |
+| **Latency**    | Higher (batched) | Lower         | Lowest†      |
+| **Streaming**  | No               | Yes           | Yes          |
+| **Overhead**   | None             | None          | HTTP/network |
+| **Processes**  | 1                | 1             | Multiple     |
+| **Use Case**   | Offline eval     | Research      | Production   |
+
+*† Subject to network conditions*
+
+## Troubleshooting
+
+### "Backend not started up for process"
+
+The backend wasn't initialized. Ensure your benchmark calls the backend lifecycle correctly (this should happen automatically).
+
+### Out of Memory (OOM)
+
+Reduce `batch_size` or `gpu_memory_utilization`:
+
+```bash
+--backend-kwargs '{"batch_size": 16, "vllm_config": {"gpu_memory_utilization": 0.7}}'
+```
+
+### Batch Processing Too Slow
+
+Increase `batch_size` for better GPU utilization:
+
+```bash
+--backend-kwargs '{"batch_size": 64}'
+```
+
+### Wrong Prompt Format
+
+Specify `request_format` explicitly:
+
+```bash
+--backend-kwargs '{"request_format": "plain"}'
+```
+
+## Limitations
+
+1. **No Streaming**: Results returned after entire batch completes
+2. **Single Process**: Limited to 1 worker process for batch coordination
+3. **Fixed Batch Window**: Batches based on count, not time
+4. **Multi-turn Not Supported**: Conversation history not yet implemented
+
+## See Also
+
+- [vLLM Python Backend](vllm-python-backend.md) - AsyncLLMEngine-based backend
+- [Backends Guide](backends.md) - Overview of all backends
+- [vLLM Engine Arguments](https://docs.vllm.ai/en/stable/configuration/engine_args/) - Full configuration options
+- [vLLM LLM Class](https://docs.vllm.ai/en/stable/offline_inference/llm.html) - Underlying API documentation
diff --git a/src/guidellm/backends/openai/http.py b/src/guidellm/backends/openai/http.py
index fab2ff828..34c2c52f1 100644
--- a/src/guidellm/backends/openai/http.py
+++ b/src/guidellm/backends/openai/http.py
@@ -86,7 +86,7 @@ class OpenAIHTTPBackendArgs(BackendArgs):
     api_key: SecretStr | None = Field(
         default=None,
         description="HTTP Bearer token API key for authentication to server",
-        examples=["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"],
+        examples=["sk-your-api-key-here"],
     )
     api_routes: dict[str, str] = Field(
         default_factory=dict,
diff --git a/src/guidellm/backends/vllm_python/__init__.py b/src/guidellm/backends/vllm_python/__init__.py
index fb8f4703b..19edba58d 100644
--- a/src/guidellm/backends/vllm_python/__init__.py
+++ b/src/guidellm/backends/vllm_python/__init__.py
@@ -5,7 +5,8 @@
 GenerationResponse from vLLM output.
 """
 
+from .offline import VLLMOfflineBackend
 from .vllm import VLLMPythonBackend
 from .vllm_response import VLLMResponseHandler
 
-__all__ = ["VLLMPythonBackend", "VLLMResponseHandler"]
+__all__ = ["VLLMPythonBackend", "VLLMOfflineBackend", "VLLMResponseHandler"]
diff --git a/src/guidellm/backends/vllm_python/offline.py b/src/guidellm/backends/vllm_python/offline.py
new file mode 100644
index 000000000..a8cd9fb7a
--- /dev/null
+++ b/src/guidellm/backends/vllm_python/offline.py
@@ -0,0 +1,770 @@
+"""
+vLLM Offline Backend for static/micro-batch inference.
+
+Uses vLLM's LLM class for synchronous batch processing. Collects requests
+into batches and processes them with LLM.generate() for maximum throughput.
+Designed for offline benchmarking scenarios where batching efficiency is
+more important than per-request latency.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import contextlib
+import time
+import uuid
+from collections.abc import AsyncIterator
+from pathlib import Path
+from typing import Any, Literal, cast
+
+import jinja2
+from more_itertools import roundrobin
+from pydantic import ConfigDict, Field, PositiveInt, model_validator
+
+from guidellm.backends.backend import Backend, BackendArgs
+from guidellm.backends.vllm_python.vllm_response import VLLMResponseHandler
+from guidellm.extras import vllm
+from guidellm.logger import logger
+from guidellm.schemas import (
+    GenerationRequest,
+    GenerationResponse,
+    RequestInfo,
+    StandardBaseModel,
+)
+from guidellm.utils import audio, vision
+
+# Sentinel for "chat template not yet resolved" cache.
+_CHAT_TEMPLATE_UNSET: object = object()
+
+__all__ = ["VLLMOfflineBackend", "VLLMOfflineBackendArgs"]
+
+
+@BackendArgs.register("vllm_offline")
+class VLLMOfflineBackendArgs(BackendArgs):
+    """Pydantic model holding the creation arguments of the vLLM Offline backend."""
+
+    kind: Literal["vllm_offline"] = Field(
+        default="vllm_offline",
+        description="Backend type identifier for VLLM Offline backend.",
+    )
+    model: str = Field(
+        description="Model identifier or path for VLLM to load",
+    )
+    vllm_config: dict[str, Any] = Field(
+        default_factory=dict,
+        description=(
+            "Configuration dictionary for vLLM EngineArgs parameters. Pass "
+            "any valid EngineArgs parameters here (e.g. tensor_parallel_size, "
+            "gpu_memory_utilization, max_model_len). The 'model' parameter is required "
+            "and can be set here or via the top-level 'model' field; if set in both "
+            "places, the top-level 'model' field takes precedence."
+        ),
+    )
+    request_format: Literal["plain", "default-template"] | str = Field(
+        default="default-template",
+        description=(
+            "Request format for VLLM Offline backend. "
+            "Valid values: 'plain' (no chat template), "
+            "'default-template' (use tokenizer default), or a path to "
+            "/ inline Jinja2 chat template."
+        ),
+    )
+    image_placeholder: str = Field(
+        default="<image>",
+        description="Placeholder for image items in multimodal prompts.",
+    )
+    audio_placeholder: str = Field(
+        default="<|audio|>",
+        description="Placeholder for audio items in multimodal prompts.",
+    )
+    batch_size: PositiveInt = Field(
+        default=32,
+        description=(
+            "Number of requests to collect before processing as a batch. "
+            "Larger batches improve throughput but increase latency."
+        ),
+    )
+
+    @model_validator(mode="after")
+    def validate_vllm_config(self) -> VLLMOfflineBackendArgs:
+        """Copy `model` into vllm_config, warning if it overrides a value."""
+        if "model" in self.vllm_config:
+            logger.warning(
+                "The `model` input was passed to the vllm offline backend "
+                "with the `vllm_config` input. Ignoring and overwriting "
+                "with the value from the `model` input."
+            )
+        self.vllm_config["model"] = self.model
+        return self
+
+
+class _ResolvedRequest(StandardBaseModel):
+    """Immutable (frozen) request whose prompt is already fully formatted."""
+
+    model_config = ConfigDict(frozen=True)
+
+    prompt: str = Field(
+        description="Fully resolved prompt string (templated, with placeholders)"
+    )
+    multi_modal_data: dict[str, Any] | None = Field(
+        default=None,
+        description="vLLM multi_modal_data from image/audio/video columns.",
+    )
+
+
+class _BatchedRequest:
+    """Mutable per-request record: inputs, a generated id, result slot, and event."""
+
+    def __init__(
+        self,
+        request: GenerationRequest,
+        request_info: RequestInfo,
+        resolved_prompt: str,
+        multi_modal_data: dict[str, Any] | None,
+        max_tokens: int | None,
+    ):
+        self.request = request
+        self.request_info = request_info
+        self.resolved_prompt = resolved_prompt
+        self.multi_modal_data = multi_modal_data
+        self.max_tokens = max_tokens
+        self.request_id = str(uuid.uuid4())  # random id generated per instance
+        self.result: vllm.RequestOutput | None = None  # starts empty
+        self.ready = asyncio.Event()  # NOTE(review): presumably set with result
+
+
+def _has_jinja2_markers(s: str) -> bool:
+    """Tell whether *s* contains any Jinja2 opening delimiter: {{, {%, or {#."""
+    return any(marker in s for marker in ("{{", "{%", "{#"))
+
+
+@Backend.register("vllm_offline")
+class VLLMOfflineBackend(Backend):
+    """
+    Offline backend for vLLM using LLM class for batch processing.
+
+    Collects requests into micro-batches and processes them together using
+    vLLM's LLM.generate() for optimal throughput. Designed for offline
+    benchmarking where batch efficiency is prioritized over streaming latency.
+
+    Example:
+    ::
+        args = VLLMOfflineBackendArgs(
+            model="meta-llama/Llama-2-7b-hf",
+            batch_size=64,
+            vllm_config={"tensor_parallel_size": 2}
+        )
+        backend = VLLMOfflineBackend(args)
+        await backend.process_startup()
+        async for response, info in backend.resolve(request, request_info):
+            process_response(response)
+        await backend.process_shutdown()
+    """
+
+    @classmethod
+    def backend_args(cls) -> type[BackendArgs]:
+        """Return the argument model class, VLLMOfflineBackendArgs."""
+        return VLLMOfflineBackendArgs
+
+    def __init__(self, arguments: VLLMOfflineBackendArgs):
+        """Store the arguments and set up empty runtime state (no LLM is loaded)."""
+        super().__init__(arguments)
+        self._args = arguments
+
+        # Runtime state; the chat template cache starts as the "unset" sentinel
+        self._in_process = False  # toggled by process_startup / process_shutdown
+        self._shutting_down = False  # True only while process_shutdown runs
+        self._llm: vllm.LLM | None = None  # created in process_startup
+        self._batch_lock = asyncio.Lock()  # held while flushing on shutdown
+        self._pending_batch: list[_BatchedRequest] = []
+        self._processing_task: asyncio.Task | None = None
+        self._resolved_chat_template: str | None | object = _CHAT_TEMPLATE_UNSET
+
+    @property
+    def processes_limit(self) -> int | None:
+        """Always 1: batching is coordinated inside a single process."""
+        return 1
+
+    @property
+    def info(self) -> dict[str, Any]:
+        """Return the backend arguments as a plain dict (``model_dump()``)."""
+        return self._args.model_dump()
+
+    async def process_startup(self):
+        """Construct the vLLM ``LLM`` engine; raises RuntimeError if already up."""
+        if self._in_process:
+            raise RuntimeError("Backend already started up for process.")
+
+        def _build_engine():
+            # Executed in a worker thread so the event loop stays responsive.
+            config = self._args.vllm_config
+            return vllm.LLM.from_engine_args(vllm.EngineArgs(**config))
+
+        self._llm = await asyncio.to_thread(_build_engine)
+        self._in_process = True
+
+    async def process_shutdown(self):
+        """Flush pending requests, then release the LLM (even if the flush fails)."""
+        if not self._in_process:
+            raise RuntimeError("Backend not started up for process.")
+
+        # Set shutdown flag to reject new requests
+        self._shutting_down = True
+        try:
+            # Cancel any in-flight processing task and wait for it to unwind
+            if self._processing_task and not self._processing_task.done():
+                self._processing_task.cancel()
+                with contextlib.suppress(asyncio.CancelledError):
+                    await self._processing_task
+
+            # Process any remaining requests in batch
+            async with self._batch_lock:
+                if self._pending_batch:
+                    await self._process_batch()
+        finally:
+            # Always reset state: without this, an exception from the final
+            # flush left the backend marked as started with the LLM still held.
+            # LLM cleanup happens automatically via GC once dereferenced.
+            self._llm = None
+            self._in_process = False
+            self._shutting_down = False
+
+    async def validate(self):
+        """Validate readiness; raises RuntimeError if no LLM has been constructed."""
+        if self._llm is None:
+            raise RuntimeError("Backend not started up for process.")
+        # No further probing: a constructed LLM is considered ready.
+
+    async def available_models(self) -> list[str]:
+        """Return a one-element list containing the configured model."""
+        return [self._args.model]
+
+    async def default_model(self) -> str:
+        """Return the configured model identifier."""
+        return self._args.model
+
+    def _validate_backend_initialized(self) -> vllm.LLM:
+        """Return the live LLM, failing fast when startup has not happened.
+
+        :raises RuntimeError: If backend is not initialized
+        :return: The initialized LLM
+        """
+        llm = self._llm
+        if llm is None:
+            raise RuntimeError("Backend not started up for process.")
+        return llm
+
+    def _build_multi_modal_data_from_columns(  # noqa: C901, PLR0912
+        self, columns: dict[str, Any]
+    ) -> dict[str, Any] | None:
+        """
+        Assemble the vLLM ``multi_modal_data`` mapping for one request.
+
+        Reads ``image_column`` / ``audio_column`` (lists of dicts); ``video_column``
+        is skipped because frame extraction is not implemented.
+
+        :param columns: Request columns keyed by column name
+        :return: Mapping with "image" and/or "audio" entries, or None if empty
+        :raises ValueError: If the audio bytes cannot be decoded
+        """
+        image_items = columns.get("image_column", [])
+        audio_items = columns.get("audio_column", [])
+
+        # vLLM's vision processor needs PIL Images, so convert every usable dict.
+        pil_images = [
+            vision.image_dict_to_pil(entry)
+            for entry in image_items
+            if entry and isinstance(entry, dict)
+        ]
+        result: dict[str, Any] = {}
+        if len(pil_images) == 1:
+            # A lone image is passed bare; several are passed as a list.
+            result["image"] = pil_images[0]
+        elif pil_images:
+            result["image"] = pil_images
+
+        if not audio_items:
+            return result or None
+        extra = len(audio_items) - 1
+        if extra > 0:
+            logger.warning(
+                "Only one audio item per request is supported; "
+                "ignoring {} extra audio item(s).",
+                extra,
+            )
+        head = audio_items[0]
+        if not head or not isinstance(head, dict):
+            logger.warning("audio_column item is empty or not a dict; skipping.")
+            return result or None
+        raw = head.get("audio")
+        if isinstance(raw, bytes) and len(raw) > 0:
+            try:
+                # vLLM audio models take decoded audio, not the encoded bytes.
+                result["audio"] = audio.decode_audio(raw)
+            except (ValueError, TypeError, OSError, RuntimeError) as exc:
+                raise ValueError(
+                    f"Failed to decode audio from audio_column for vLLM: {exc}"
+                ) from exc
+        return result or None
+
+    def _extract_text_from_content(
+        self, content: str | list[dict[str, Any]] | Any
+    ) -> str:
+        """
+        Reduce a message ``content`` field to plain text.
+
+        Strings pass through unchanged. For a list of content blocks, the
+        non-empty ``text`` values of dict blocks whose ``type`` is "text" are
+        concatenated with no separator; all other blocks are ignored. Any
+        other value is stringified, with ``None`` becoming "".
+
+        :param content: Content field which can be a string or list of content blocks
+        :return: Extracted text string
+        """
+        if isinstance(content, str):
+            return content
+        if not isinstance(content, list):
+            # Fallback: convert to string
+            return "" if content is None else str(content)
+
+        # Keep only the truthy text payloads of "text"-typed dict blocks.
+        candidates = (
+            block.get("text")
+            for block in content
+            if isinstance(block, dict) and block.get("type") == "text"
+        )
+        texts = [text for text in candidates if text]
+        return "".join(texts)
+
+    def _build_placeholder_prefix(self, multi_modal_data: dict[str, Any]) -> str:
+        """
+        Render one placeholder line per multimodal item.
+
+        Image placeholders come first, then audio, each followed by a
+        newline, e.g. ``"<image>\\n<|audio|>\\n"``. A list or tuple value
+        contributes one placeholder per element; any other non-None value
+        counts as a single item. Returns ``""`` when nothing is present.
+        The tokens are read from ``image_placeholder`` /
+        ``audio_placeholder`` on the backend arguments.
+        """
+
+        def _count(value: Any) -> int:
+            # A bare (non-sequence) value, e.g. one decoded clip, is one item.
+            if value is None:
+                return 0
+            return len(value) if isinstance(value, list | tuple) else 1
+
+        tokens: list[str] = []
+        # Images are emitted before audio.
+        image_count = _count(multi_modal_data.get("image"))
+        if image_count > 0:
+            tokens += [self._args.image_placeholder] * image_count
+        audio_count = _count(multi_modal_data.get("audio"))
+        if audio_count > 0:
+            tokens += [self._args.audio_placeholder] * audio_count
+
+        return "".join(f"{token}\n" for token in tokens)
+
+    @staticmethod
+    def _format_column_blocks(
+        column_data: list[Any], column_type: str
+    ) -> list[dict[str, Any]]:
+        """Turn the items of one data column into vLLM content blocks.
+
+        Falsy items are dropped. ``text_column`` items become
+        ``{"type": "text", "text": str(item)}``; ``image_column`` and
+        ``audio_column`` items become bare ``{"type": "image"}`` /
+        ``{"type": "audio"}`` markers. Other column types yield no blocks.
+
+        :param column_data: Items of a single column
+        :param column_type: Column name, e.g. "text_column"
+        :return: One content block per kept item, in input order
+        """
+        if column_type == "text_column":
+            return [{"type": "text", "text": str(item)} for item in column_data if item]
+        # Media blocks are bare type markers with no payload attached.
+        marker = {"image_column": "image", "audio_column": "audio"}.get(column_type)
+        keep = marker is not None
+        return [{"type": marker} for item in column_data if item and keep]
+
+    def _inject_placeholders_into_messages(
+        self,
+        formatted_messages: list[dict[str, Any]],
+        multi_modal_data: dict[str, Any],
+    ) -> None:
+        """
+        Prepend multimodal placeholder tokens to one message, in place.
+
+        The placeholder prefix goes in front of the content of the last message
+        whose role is "user", or of the final message when no user message
+        exists. Nothing happens when the prefix or the message list is empty.
+        Doing this before templating keeps the placeholders inside that message
+        turn rather than in front of the whole formatted prompt.
+        """
+        prefix = self._build_placeholder_prefix(multi_modal_data)
+        if not prefix or not formatted_messages:
+            return
+        # Search from the end so the most recent user turn wins; the default
+        # (the final message) is only used when no user turn is found.
+        target = next(
+            (m for m in reversed(formatted_messages) if m.get("role") == "user"),
+            formatted_messages[-1],
+        )
+        target["content"] = prefix + (target.get("content") or "")
+
+    def _extract_prompt_chat_plain(
+        self, formatted_messages: list[dict[str, Any]]
+    ) -> str:
+        """Flatten messages into one raw prompt string.
+
+        The ``content`` of every message that has truthy content is joined with
+        single spaces; no role prefixes or generation prompt are added.
+        """
+        # Messages without content add nothing, not even a separator.
+        kept = [msg["content"] for msg in formatted_messages if msg.get("content")]
+        return " ".join(kept)
+
+    def _resolve_chat_template(self) -> str | None:
+        """
+        Resolve and validate request_format to a template string or None.
+
+        Returns None for 'plain' / 'default-template'. Otherwise request_format is
+        tried as a template file path, then as an inline Jinja2 template; an OSError
+        from the path probe (e.g. name too long) is treated as "not a file".
+
+        :raises ValueError: For a bad path, missing Jinja2 markers, or bad syntax
+        """
+        template = self._args.request_format
+        if template in ("plain", "default-template"):
+            return None
+        path = Path(template)
+        try:
+            is_template_file = path.is_file()
+        except (OSError, ValueError):
+            # e.g. ENAMETOOLONG when a long inline template is probed as a path
+            is_template_file = False
+        if is_template_file:
+            content = path.read_text(encoding="utf-8")  # not locale-dependent
+            if not _has_jinja2_markers(content):
+                raise ValueError(
+                    "Invalid chat template: path "
+                    f"{path.as_posix()!r} exists but file content does not "
+                    "contain Jinja2 template syntax ({{, {%, or {#)."
+                )
+            try:
+                jinja2.Template(content)
+            except jinja2.TemplateSyntaxError as e:
+                raise ValueError(
+                    f"Invalid chat template in file {path.as_posix()!r}: {e}"
+                ) from e
+            return content
+        if _has_jinja2_markers(template):
+            try:
+                jinja2.Template(template)
+            except jinja2.TemplateSyntaxError as e:
+                raise ValueError(f"Invalid chat template: {e}") from e
+            return template
+        raise ValueError(
+            "request_format must be 'plain', 'default-template', a path to a "
+            "Jinja2 template file, or a string containing Jinja2 template "
+            "syntax ({{, {%, or {#). Got: " + repr(template) + "."
+        )
+
+    def _extract_prompt_chat_tokenizer(
+        self, formatted_messages: list[dict[str, Any]]
+    ) -> str:
+        """Render messages to a prompt string with the tokenizer's chat template."""
+        llm = self._validate_backend_initialized()
+        tokenizer = llm.llm_engine.tokenizer.tokenizer  # vLLM-internal path; verify
+        if tokenizer is None:
+            raise RuntimeError("Backend engine has no tokenizer.")
+
+        if self._args.request_format in (
+            "plain",
+            "default-template",
+        ):
+            resolved: str | None = None  # keep the tokenizer's own template
+        else:
+            if self._resolved_chat_template is _CHAT_TEMPLATE_UNSET:
+                self._resolved_chat_template = self._resolve_chat_template()
+            resolved = cast("str | None", self._resolved_chat_template)
+        if resolved is not None:
+            # Safe to mutate: vLLM runs one model per engine and the resolved
+            # template is constant across all requests for this backend instance.
+            tokenizer.chat_template = resolved  # type: ignore[attr-defined]
+        prompt = tokenizer.apply_chat_template(
+            formatted_messages,  # type: ignore[arg-type]
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+        if isinstance(prompt, str):
+            return prompt
+        raise RuntimeError("Backend received unexpected type from tokenizer.")
+
+    def _resolve_request(self, request: GenerationRequest) -> _ResolvedRequest:
+        """
+        Build a fully resolved request from column-based GenerationRequest.
+
+        Mirrors the HTTP backend's ``ChatCompletionsRequestHandler.format``:
+        prefix items are space-joined into one system message and all data
+        columns (text, image, audio) are formatted as typed content blocks
+        then interleaved via ``roundrobin`` into a single user message.
+
+        When a chat template is active and multimodal data is present, the
+        list-of-blocks content is passed directly to the tokenizer so the
+        template emits model-specific placeholder tokens.  For plain format
+        or text-only requests the content is flattened to strings.
+
+        :param request: Column-based generation request
+        :return: Resolved request with formatted prompt and multimodal data
+        :raises ValueError: If request has no text or multimodal columns
+        """
+        columns = request.columns
+
+        messages: list[dict[str, Any]] = []
+
+        prefix = " ".join(str(p) for p in columns.get("prefix_column", []) if p)
+        if prefix:
+            messages.append({"role": "system", "content": prefix})
+
+        text_blocks = self._format_column_blocks(
+            columns.get("text_column", []), "text_column"
+        )
+
+        multi_modal_data = self._build_multi_modal_data_from_columns(columns)
+
+        # We use explicit content blocks (e.g. {"type": "image"}) when applying a
+        # chat template so that the template itself can generate the correct,
+        # model-specific tokens. Otherwise, we flatten to strings and fall back
+        # to placeholder-string injection.
+        use_content_blocks = (
+            multi_modal_data
+            and (text_blocks or prefix)
+            and self._args.request_format != "plain"
+        )
+
+        if use_content_blocks:
+            # Interleave text and media blocks into a single content list,
+            # matching the HTTP backend's roundrobin approach.
+            media_lists = [
+                self._format_column_blocks(columns.get(col, []), col)
+                for col in ("image_column", "audio_column")
+            ]
+            user_content: list[dict[str, Any]] = list(
+                roundrobin(text_blocks, *media_lists)
+            )
+        else:
+            # Text-only or plain mode: media is handled later via placeholder
+            # injection, so only text blocks go into the user message here.
+            user_content = list(text_blocks)
+
+        if user_content:
+            messages.append({"role": "user", "content": user_content})
+
+        if messages:
+            if use_content_blocks:
+                prompt = self._extract_prompt_chat_tokenizer(messages)
+            else:
+                formatted_messages = [
+                    {
+                        "role": msg["role"],
+                        "content": self._extract_text_from_content(
+                            msg.get("content", "")
+                        ),
+                    }
+                    for msg in messages
+                ]
+
+                if multi_modal_data:
+                    # Placeholders must be injected into the message text
+                    # *before* the chat template is applied so they end up
+                    # inside the correct message turn.
+                    self._inject_placeholders_into_messages(
+                        formatted_messages, multi_modal_data
+                    )
+
+                if self._args.request_format == "plain":
+                    prompt = self._extract_prompt_chat_plain(formatted_messages)
+                else:
+                    prompt = self._extract_prompt_chat_tokenizer(formatted_messages)
+        elif multi_modal_data:
+            # Multimodal-only (e.g. audio transcription with no text/prefix):
+            # no messages to inject into, so use a raw placeholder prompt.
+            prompt = self._build_placeholder_prefix(multi_modal_data)
+        else:
+            raise ValueError("Request must include text_column or multimodal columns.")
+
+        return _ResolvedRequest(
+            prompt=prompt,
+            multi_modal_data=multi_modal_data,
+        )
+
+    def _create_sampling_params(
+        self,
+        max_tokens_override: int | None = None,
+    ) -> vllm.SamplingParams:
+        """
+        Create VLLM SamplingParams.
+
+        When max_tokens_override is set (from benchmark output_metrics), it is used
+        as max_tokens and EOS is ignored to force generation of exactly that many
+        tokens, matching HTTP backend behavior. Otherwise vLLM defaults are used
+        (generate until EOS or model max context).
+
+        :param max_tokens_override: Optional max_tokens from request (e.g. benchmark)
+        :return: Configured SamplingParams instance
+        """
+        params: dict[str, Any] = {}
+
+        if max_tokens_override is not None and max_tokens_override > 0:
+            params["max_tokens"] = max_tokens_override
+            params["ignore_eos"] = True
+
+        return vllm.SamplingParams(**params)
+
+    async def _process_batch(self):
+        """Process all pending requests as a batch using LLM.generate()."""
+        if not self._pending_batch:
+            return
+
+        if self._llm is None:
+            raise RuntimeError("Backend not started up for process.")
+
+        batch = self._pending_batch
+        self._pending_batch = []
+
+        logger.debug(f"Processing batch of {len(batch)} requests")
+
+        # Build inputs for LLM.generate()
+        prompts = []
+        sampling_params_list = []
+
+        for req in batch:
+            prompt_input: dict[str, Any] | str
+            if req.multi_modal_data:
+                prompt_input = {
+                    "prompt": req.resolved_prompt,
+                    "multi_modal_data": req.multi_modal_data,
+                }
+            else:
+                prompt_input = req.resolved_prompt
+
+            prompts.append(prompt_input)
+            sampling_params = self._create_sampling_params(req.max_tokens)
+            sampling_params_list.append(sampling_params)
+
+        # Process batch in thread pool
+        def _generate_batch():
+            return self._llm.generate(  # type: ignore[union-attr]
+                prompts,
+                sampling_params_list,
+                use_tqdm=False,
+            )
+
+        try:
+            outputs: list[vllm.RequestOutput] = await asyncio.to_thread(_generate_batch)
+
+            # Match outputs to requests and mark ready
+            if len(outputs) != len(batch):
+                raise RuntimeError(
+                    f"Batch size mismatch: expected {len(batch)} outputs, "
+                    f"got {len(outputs)}"
+                )
+
+            for req, output in zip(batch, outputs, strict=True):
+                req.result = output
+                req.ready.set()
+        except Exception as exc:  # noqa: BLE001
+            # Catch all exceptions to ensure requests don't hang forever.
+            # This is safe here because we're marking requests as failed.
+            logger.error(f"Batch processing failed: {exc}")
+            # Mark all requests as failed but don't re-raise
+            # (individual requests will see None result)
+            for req in batch:
+                req.ready.set()
+
+    async def _maybe_process_batch(self):
+        """Check if batch is full and process if so."""
+        async with self._batch_lock:
+            if len(self._pending_batch) >= self._args.batch_size:
+                await self._process_batch()
+
+    async def resolve(  # type: ignore[override]
+        self,
+        request: GenerationRequest,
+        request_info: RequestInfo,
+        history: list[tuple[GenerationRequest, GenerationResponse]] | None = None,
+    ) -> AsyncIterator[tuple[GenerationResponse, RequestInfo]]:
+        """
+        Process generation request by batching with others.
+
+        Collects requests into micro-batches and processes them together
+        using LLM.generate(). The caller waits for the batch to complete
+        before receiving the response.
+
+        :param request: Generation request with content and parameters
+        :param request_info: Request tracking info updated with timing metadata
+        :param history: Conversation history (currently not supported)
+        :yields: Single tuple of (response, updated_request_info)
+        """
+        if self._llm is None:
+            raise RuntimeError("Backend not started up for process.")
+
+        if self._shutting_down:
+            raise RuntimeError("Backend is shutting down, cannot accept new requests.")
+
+        if history is not None:
+            raise NotImplementedError("Multi-turn requests not yet supported")
+
+        # Resolve the request
+        request_info.timings.request_start = time.time()
+        resolved = self._resolve_request(request)
+
+        # Create batched request tracker
+        max_tokens = (
+            request.output_metrics.text_tokens
+            if request.output_metrics.text_tokens
+            else None
+        )
+
+        batched_req = _BatchedRequest(
+            request=request,
+            request_info=request_info,
+            resolved_prompt=resolved.prompt,
+            multi_modal_data=resolved.multi_modal_data,
+            max_tokens=max_tokens,
+        )
+
+        # Add to pending batch
+        async with self._batch_lock:
+            self._pending_batch.append(batched_req)
+
+        # Trigger batch processing if full
+        await self._maybe_process_batch()
+
+        # Wait for result
+        await batched_req.ready.wait()
+
+        # Build response
+        request_info.timings.request_end = time.time()
+
+        if batched_req.result is not None:
+            output = batched_req.result
+            text = output.outputs[0].text if output.outputs else ""
+            usage = {
+                "prompt_tokens": len(output.prompt_token_ids or []),
+                "completion_tokens": len(output.outputs[0].token_ids or [])
+                if output.outputs
+                else 0,
+                "total_tokens": len(output.prompt_token_ids or [])
+                + (len(output.outputs[0].token_ids or []) if output.outputs else 0),
+            }
+
+            response = VLLMResponseHandler.build_response(
+                request, text, usage, response_id=output.request_id
+            )
+            yield response, request_info
+        else:
+            # Request failed during batch processing
+            request_info.error = "Batch processing failed"
+            yield None, request_info  # type: ignore[misc]
diff --git a/tests/unit/backends/test_backend.py b/tests/unit/backends/test_backend.py
index 59a4a87be..906b5f882 100644
--- a/tests/unit/backends/test_backend.py
+++ b/tests/unit/backends/test_backend.py
@@ -506,6 +506,25 @@ def test_vllm_python_backend_registered(self):
         assert backend._args.model == "test-model"
         assert backend.kind == "vllm_python"
 
+    @pytest.mark.smoke
+    def test_vllm_offline_backend_registered(self):
+        """
+        Test that vllm_offline backend is registered and createable.
+        ## WRITTEN BY AI ##
+        """
+        from guidellm.backends.vllm_python.offline import (
+            VLLMOfflineBackend,
+            VLLMOfflineBackendArgs,
+        )
+
+        assert Backend.is_registered("vllm_offline")
+        args = VLLMOfflineBackendArgs(model="test-model", batch_size=32)
+        backend = Backend.create(args)
+        assert isinstance(backend, VLLMOfflineBackend)
+        assert backend._args.model == "test-model"
+        assert backend._args.batch_size == 32
+        assert backend.kind == "vllm_offline"
+
     @pytest.mark.smoke
     def test_backend_registry_functionality(self):
         """Test that backend registry functions work."""

From 251eb676c0242a140abd47a1127623a912c1ed42 Mon Sep 17 00:00:00 2001
From: Maryam Tahhan <mtahhan@redhat.com>
Date: Thu, 25 Jun 2026 12:35:17 +0100
Subject: [PATCH 2/5] Fix __all__ ordering in vllm_python __init__

Signed-off-by: Maryam Tahhan <mtahhan@redhat.com>
---
 src/guidellm/backends/vllm_python/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/guidellm/backends/vllm_python/__init__.py b/src/guidellm/backends/vllm_python/__init__.py
index 19edba58d..a6851a2f5 100644
--- a/src/guidellm/backends/vllm_python/__init__.py
+++ b/src/guidellm/backends/vllm_python/__init__.py
@@ -9,4 +9,4 @@
 from .vllm import VLLMPythonBackend
 from .vllm_response import VLLMResponseHandler
 
-__all__ = ["VLLMPythonBackend", "VLLMOfflineBackend", "VLLMResponseHandler"]
+__all__ = ["VLLMOfflineBackend", "VLLMPythonBackend", "VLLMResponseHandler"]

From bfbcad1d8a8f48ea6feba851428393c3b27955fe Mon Sep 17 00:00:00 2001
From: Maryam Tahhan <mtahhan@redhat.com>
Date: Thu, 25 Jun 2026 13:41:36 +0100
Subject: [PATCH 3/5] Refactor vLLM backends to use shared common.py module

Extract duplicated helper methods (_build_multi_modal_data_from_columns,
_resolve_chat_template, _extract_prompt_chat_tokenizer, _create_sampling_params)
into common.py to follow DRY principles.

This addresses maintainer feedback about code reuse and abstraction.
Both vllm_python and vllm_offline backends now share the same implementation
for these helpers, reducing code duplication from ~400 lines to a single
shared module.

Signed-off-by: Maryam Tahhan <mtahhan@redhat.com>
---
 src/guidellm/backends/vllm_python/__init__.py |   8 +-
 src/guidellm/backends/vllm_python/common.py   | 205 ++++++++++++++++++
 src/guidellm/backends/vllm_python/offline.py  | 166 ++------------
 src/guidellm/backends/vllm_python/vllm.py     | 166 ++------------
 tests/unit/backends/vllm_python/test_vllm.py  |  16 +-
 5 files changed, 256 insertions(+), 305 deletions(-)
 create mode 100644 src/guidellm/backends/vllm_python/common.py

diff --git a/src/guidellm/backends/vllm_python/__init__.py b/src/guidellm/backends/vllm_python/__init__.py
index a6851a2f5..cd115e29f 100644
--- a/src/guidellm/backends/vllm_python/__init__.py
+++ b/src/guidellm/backends/vllm_python/__init__.py
@@ -5,8 +5,14 @@
 GenerationResponse from vLLM output.
 """
 
+from . import common
 from .offline import VLLMOfflineBackend
 from .vllm import VLLMPythonBackend
 from .vllm_response import VLLMResponseHandler
 
-__all__ = ["VLLMOfflineBackend", "VLLMPythonBackend", "VLLMResponseHandler"]
+__all__ = [
+    "VLLMOfflineBackend",
+    "VLLMPythonBackend",
+    "VLLMResponseHandler",
+    "common",
+]
diff --git a/src/guidellm/backends/vllm_python/common.py b/src/guidellm/backends/vllm_python/common.py
new file mode 100644
index 000000000..3040ddc32
--- /dev/null
+++ b/src/guidellm/backends/vllm_python/common.py
@@ -0,0 +1,205 @@
+"""Shared helpers for vLLM Python backends (vllm_python and vllm_offline)."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+import jinja2
+from loguru import logger
+
+from guidellm.utils import audio, vision
+
+if TYPE_CHECKING:
+    from guidellm.extras import vllm
+
+__all__ = [
+    "CHAT_TEMPLATE_UNSET",
+    "build_multi_modal_data_from_columns",
+    "create_sampling_params",
+    "extract_prompt_chat_tokenizer",
+    "resolve_chat_template",
+]
+
+# Sentinel for "chat template not yet resolved" cache.
+CHAT_TEMPLATE_UNSET = object()
+
+
+def _has_jinja2_markers(s: str) -> bool:
+    """Check if string contains Jinja2 template markers ({{, {%, or {#)."""
+    return "{{" in s or "{%" in s or "{#" in s
+
+
+def build_multi_modal_data_from_columns(
+    columns: dict[str, Any],
+) -> dict[str, Any] | None:
+    """
+    Build vLLM multi_modal_data dict from image_column, audio_column.
+
+    video_column is not yet supported (no frame extraction); it is skipped.
+
+    :param columns: Request columns containing image_column and/or audio_column
+    :return: Multi-modal data dict for vLLM, or None if no multi-modal data
+    :raises ValueError: If audio decoding fails
+    """
+    multi_modal_data: dict[str, Any] = {}
+    # We look specifically for "image_column" and "audio_column"
+    # which contain lists of dicts
+    image_items = columns.get("image_column", [])
+    audio_items = columns.get("audio_column", [])
+    # video_column: not yet supported; would require frame extraction
+    for item in image_items:
+        if not item or not isinstance(item, dict):
+            continue
+        # Convert raw image dicts into PIL Images as required by vLLM's vision
+        # processor
+        pil_image = vision.image_dict_to_pil(item)
+        if "image" not in multi_modal_data:
+            multi_modal_data["image"] = pil_image
+        else:
+            # If multiple images exist, vLLM expects a list of PIL Images
+            existing = multi_modal_data["image"]
+            if isinstance(existing, list):
+                existing.append(pil_image)
+            else:
+                multi_modal_data["image"] = [existing, pil_image]
+    if audio_items:
+        if len(audio_items) > 1:
+            logger.warning(
+                "Only one audio item per request is supported; "
+                "ignoring {} extra audio item(s).",
+                len(audio_items) - 1,
+            )
+        first = audio_items[0]
+        if not first or not isinstance(first, dict):
+            logger.warning("audio_column item is empty or not a dict; skipping.")
+        else:
+            audio_bytes = first.get("audio")
+            if isinstance(audio_bytes, bytes) and len(audio_bytes) > 0:
+                try:
+                    # Decode raw audio bytes into an array since vLLM audio models
+                    # expect either raw numpy arrays or specific tensor formats
+                    audio_data = audio.decode_audio(audio_bytes)
+                    multi_modal_data["audio"] = audio_data
+                except (ValueError, TypeError, OSError, RuntimeError) as exc:
+                    raise ValueError(
+                        f"Failed to decode audio from audio_column for vLLM: {exc}"
+                    ) from exc
+    return multi_modal_data if multi_modal_data else None
+
+
+def resolve_chat_template(request_format: str) -> str | None:
+    """
+    Resolve and validate request_format to a template string or None.
+
+    Returns None for default tokenizer template; returns the template string
+    when valid. Raises ValueError for invalid input (wrong format, bad path,
+    or invalid Jinja2 syntax).
+
+    :param request_format: Template format string
+        (plain, default-template, path, or Jinja2)
+    :return: Template string or None for default
+    :raises ValueError: If request_format is invalid
+    """
+    template = request_format
+    if template in ("plain", "default-template"):
+        # Built-in formats carry no custom template.
+        return None
+    path = Path(template)
+    try:
+        # An inline template may be too long to be a file name; is_file() can
+        # then raise OSError (ENAMETOOLONG) rather than return False.
+        is_template_file = path.is_file()
+    except (OSError, ValueError):
+        is_template_file = False
+    if is_template_file:
+        content = path.read_text(encoding="utf-8")
+        if not _has_jinja2_markers(content):
+            raise ValueError(
+                "Invalid chat template: path "
+                f"{path.as_posix()!r} exists but file content does not "
+                "contain Jinja2 template syntax ({{, {%, or {#)."
+            )
+        try:
+            jinja2.Template(content)
+        except jinja2.TemplateSyntaxError as e:
+            raise ValueError(
+                f"Invalid chat template in file {path.as_posix()!r}: {e}"
+            ) from e
+        return content
+    if _has_jinja2_markers(template):
+        try:
+            jinja2.Template(template)
+        except jinja2.TemplateSyntaxError as e:
+            raise ValueError(f"Invalid chat template: {e}") from e
+        return template
+    raise ValueError(
+        "request_format must be 'plain', 'default-template', a path to a "
+        "Jinja2 template file, or a string containing Jinja2 template "
+        "syntax ({{, {%, or {#). Got: " + repr(template) + "."
+    )
+
+
+def extract_prompt_chat_tokenizer(
+    formatted_messages: list[dict[str, Any]],
+    tokenizer: Any,
+    request_format: str,
+    resolved_chat_template: str | None,
+) -> str:
+    """
+    Apply tokenizer chat template to formatted messages.
+
+    :param formatted_messages: List of message dicts with role/content
+    :param tokenizer: Tokenizer instance from vLLM engine
+    :param request_format: Request format ('plain', 'default-template', or custom)
+    :param resolved_chat_template: Pre-resolved custom template or None for default
+    :return: Formatted prompt string
+    :raises RuntimeError: If tokenizer is missing or returns unexpected type
+    """
+    if tokenizer is None:
+        raise RuntimeError("Backend engine has no tokenizer.")
+
+    if request_format in (
+        "plain",
+        "default-template",
+    ):
+        resolved: str | None = None
+    else:
+        resolved = resolved_chat_template
+    if resolved is not None:
+        # Safe to mutate: vLLM runs one model per engine and the resolved
+        # template is constant across all requests for this backend instance.
+        tokenizer.chat_template = resolved  # type: ignore[attr-defined]
+    prompt = tokenizer.apply_chat_template(
+        formatted_messages,  # type: ignore[arg-type]
+        tokenize=False,
+        add_generation_prompt=True,
+    )
+    if isinstance(prompt, str):
+        return prompt
+    raise RuntimeError("Backend received unexpected type from tokenizer.")
+
+
+def create_sampling_params(
+    vllm_module: Any,
+    max_tokens_override: int | None = None,
+) -> vllm.SamplingParams:
+    """
+    Create VLLM SamplingParams.
+
+    When max_tokens_override is set (from benchmark output_metrics), it is used
+    as max_tokens and EOS is ignored to force generation of exactly that many
+    tokens, matching HTTP backend behavior. Otherwise vLLM defaults are used
+    (generate until EOS or model max context).
+
+    :param vllm_module: vLLM module (from guidellm.extras)
+    :param max_tokens_override: Optional max_tokens from request (e.g. benchmark)
+    :return: Configured SamplingParams instance
+    """
+    params: dict[str, Any] = {}
+
+    if max_tokens_override is not None and max_tokens_override > 0:
+        params["max_tokens"] = max_tokens_override
+        params["ignore_eos"] = True
+
+    return vllm_module.SamplingParams(**params)
diff --git a/src/guidellm/backends/vllm_python/offline.py b/src/guidellm/backends/vllm_python/offline.py
index a8cd9fb7a..586750ffd 100644
--- a/src/guidellm/backends/vllm_python/offline.py
+++ b/src/guidellm/backends/vllm_python/offline.py
@@ -14,14 +14,13 @@
 import time
 import uuid
 from collections.abc import AsyncIterator
-from pathlib import Path
 from typing import Any, Literal, cast
 
-import jinja2
 from more_itertools import roundrobin
 from pydantic import ConfigDict, Field, PositiveInt, model_validator
 
 from guidellm.backends.backend import Backend, BackendArgs
+from guidellm.backends.vllm_python import common
 from guidellm.backends.vllm_python.vllm_response import VLLMResponseHandler
 from guidellm.extras import vllm
 from guidellm.logger import logger
@@ -31,10 +30,6 @@
     RequestInfo,
     StandardBaseModel,
 )
-from guidellm.utils import audio, vision
-
-# Sentinel for "chat template not yet resolved" cache.
-_CHAT_TEMPLATE_UNSET: object = object()
 
 __all__ = ["VLLMOfflineBackend", "VLLMOfflineBackendArgs"]
 
@@ -133,11 +128,6 @@ def __init__(
         self.ready = asyncio.Event()
 
 
-def _has_jinja2_markers(s: str) -> bool:
-    """Return True if the string contains Jinja2 template syntax ({{, {%, or {#)."""
-    return "{{" in s or "{%" in s or "{#" in s
-
-
 @Backend.register("vllm_offline")
 class VLLMOfflineBackend(Backend):
     """
@@ -178,7 +168,7 @@ def __init__(self, arguments: VLLMOfflineBackendArgs):
         self._batch_lock = asyncio.Lock()
         self._pending_batch: list[_BatchedRequest] = []
         self._processing_task: asyncio.Task | None = None
-        self._resolved_chat_template: str | None | object = _CHAT_TEMPLATE_UNSET
+        self._resolved_chat_template: str | None | object = common.CHAT_TEMPLATE_UNSET
 
     @property
     def processes_limit(self) -> int | None:
@@ -254,58 +244,11 @@ def _validate_backend_initialized(self) -> vllm.LLM:
             raise RuntimeError("Backend not started up for process.")
         return self._llm
 
-    def _build_multi_modal_data_from_columns(  # noqa: C901, PLR0912
+    def _build_multi_modal_data_from_columns(
         self, columns: dict[str, Any]
     ) -> dict[str, Any] | None:
-        """
-        Build vLLM multi_modal_data dict from image_column, audio_column.
-
-        video_column is not yet supported (no frame extraction); it is skipped.
-        """
-        multi_modal_data: dict[str, Any] = {}
-        # We look specifically for "image_column" and "audio_column" which contain lists
-        # of dicts
-        image_items = columns.get("image_column", [])
-        audio_items = columns.get("audio_column", [])
-        # video_column: not yet supported; would require frame extraction
-        for item in image_items:
-            if not item or not isinstance(item, dict):
-                continue
-            # Convert raw image dicts into PIL Images as required by vLLM's vision
-            # processor
-            pil_image = vision.image_dict_to_pil(item)
-            if "image" not in multi_modal_data:
-                multi_modal_data["image"] = pil_image
-            else:
-                # If multiple images exist, vLLM expects a list of PIL Images
-                existing = multi_modal_data["image"]
-                if isinstance(existing, list):
-                    existing.append(pil_image)
-                else:
-                    multi_modal_data["image"] = [existing, pil_image]
-        if audio_items:
-            if len(audio_items) > 1:
-                logger.warning(
-                    "Only one audio item per request is supported; "
-                    "ignoring {} extra audio item(s).",
-                    len(audio_items) - 1,
-                )
-            first = audio_items[0]
-            if not first or not isinstance(first, dict):
-                logger.warning("audio_column item is empty or not a dict; skipping.")
-            else:
-                audio_bytes = first.get("audio")
-                if isinstance(audio_bytes, bytes) and len(audio_bytes) > 0:
-                    try:
-                        # Decode raw audio bytes into an array since vLLM audio models
-                        # expect either raw numpy arrays or specific tensor formats
-                        audio_data = audio.decode_audio(audio_bytes)
-                        multi_modal_data["audio"] = audio_data
-                    except (ValueError, TypeError, OSError, RuntimeError) as exc:
-                        raise ValueError(
-                            f"Failed to decode audio from audio_column for vLLM: {exc}"
-                        ) from exc
-        return multi_modal_data if multi_modal_data else None
+        """Build vLLM multi_modal_data dict from image_column, audio_column."""
+        return common.build_multi_modal_data_from_columns(columns)
 
     def _extract_text_from_content(
         self, content: str | list[dict[str, Any]] | Any
@@ -425,81 +368,24 @@ def _extract_prompt_chat_plain(
         )
 
     def _resolve_chat_template(self) -> str | None:
-        """
-        Resolve and validate request_format to a template string or None.
-
-        Returns None for default tokenizer template; returns the template string
-        when valid. Raises ValueError for invalid input (wrong format, bad path,
-        or invalid Jinja2 syntax).
-        """
-        template = self._args.request_format
-        if template in (
-            "plain",
-            "default-template",
-        ):
-            # No custom template provided; 'plain' and 'default-template' are handled
-            # internally
-            return None
-        path = Path(template)
-        # Treat the request_format string as a file path. If it exists and contains
-        # Jinja2 syntax, read the content as the template.
-        if path.exists() and path.is_file():
-            content = path.read_text()
-            if not _has_jinja2_markers(content):
-                raise ValueError(
-                    "Invalid chat template: path "
-                    f"{path.as_posix()!r} exists but file content does not "
-                    "contain Jinja2 template syntax ({{, {%}, or {#})."
-                )
-            try:
-                jinja2.Template(content)
-            except jinja2.TemplateSyntaxError as e:
-                raise ValueError(
-                    f"Invalid chat template in file {path.as_posix()!r}: {e}"
-                ) from e
-            return content
-        if _has_jinja2_markers(template):
-            try:
-                jinja2.Template(template)
-            except jinja2.TemplateSyntaxError as e:
-                raise ValueError(f"Invalid chat template: {e}") from e
-            return template
-        raise ValueError(
-            "request_format must be 'plain', 'default-template', a path to a "
-            "Jinja2 template file, or a string containing Jinja2 template "
-            "syntax ({{, {%}, or {#). Got: " + repr(template) + "."
-        )
+        """Resolve and validate request_format to a template string or None."""
+        return common.resolve_chat_template(self._args.request_format)
 
     def _extract_prompt_chat_tokenizer(
         self, formatted_messages: list[dict[str, Any]]
     ) -> str:
         """Apply tokenizer chat template to formatted messages."""
         llm = self._validate_backend_initialized()
-        tokenizer = llm.llm_engine.tokenizer.tokenizer
-        if tokenizer is None:
-            raise RuntimeError("Backend engine has no tokenizer.")
-
-        if self._args.request_format in (
-            "plain",
-            "default-template",
-        ):
-            resolved: str | None = None
-        else:
-            if self._resolved_chat_template is _CHAT_TEMPLATE_UNSET:
-                self._resolved_chat_template = self._resolve_chat_template()
-            resolved = cast("str | None", self._resolved_chat_template)
-        if resolved is not None:
-            # Safe to mutate: vLLM runs one model per engine and the resolved
-            # template is constant across all requests for this backend instance.
-            tokenizer.chat_template = resolved  # type: ignore[attr-defined]
-        prompt = tokenizer.apply_chat_template(
-            formatted_messages,  # type: ignore[arg-type]
-            tokenize=False,
-            add_generation_prompt=True,
+        # Lazy-resolve and cache the chat template
+        if self._resolved_chat_template is common.CHAT_TEMPLATE_UNSET:
+            self._resolved_chat_template = self._resolve_chat_template()
+        resolved = cast("str | None", self._resolved_chat_template)
+        return common.extract_prompt_chat_tokenizer(
+            formatted_messages,
+            llm.llm_engine.tokenizer.tokenizer,
+            self._args.request_format,
+            resolved,
         )
-        if isinstance(prompt, str):
-            return prompt
-        raise RuntimeError("Backend received unexpected type from tokenizer.")
 
     def _resolve_request(self, request: GenerationRequest) -> _ResolvedRequest:
         """
@@ -603,24 +489,8 @@ def _create_sampling_params(
         self,
         max_tokens_override: int | None = None,
     ) -> vllm.SamplingParams:
-        """
-        Create VLLM SamplingParams.
-
-        When max_tokens_override is set (from benchmark output_metrics), it is used
-        as max_tokens and EOS is ignored to force generation of exactly that many
-        tokens, matching HTTP backend behavior. Otherwise vLLM defaults are used
-        (generate until EOS or model max context).
-
-        :param max_tokens_override: Optional max_tokens from request (e.g. benchmark)
-        :return: Configured SamplingParams instance
-        """
-        params: dict[str, Any] = {}
-
-        if max_tokens_override is not None and max_tokens_override > 0:
-            params["max_tokens"] = max_tokens_override
-            params["ignore_eos"] = True
-
-        return vllm.SamplingParams(**params)
+        """Create VLLM SamplingParams."""
+        return common.create_sampling_params(vllm, max_tokens_override)
 
     async def _process_batch(self):
         """Process all pending requests as a batch using LLM.generate()."""
diff --git a/src/guidellm/backends/vllm_python/vllm.py b/src/guidellm/backends/vllm_python/vllm.py
index bc308e891..c31c8aac2 100644
--- a/src/guidellm/backends/vllm_python/vllm.py
+++ b/src/guidellm/backends/vllm_python/vllm.py
@@ -13,14 +13,13 @@
 import time
 import uuid
 from collections.abc import AsyncIterator
-from pathlib import Path
 from typing import Any, Literal, cast
 
-import jinja2
 from more_itertools import roundrobin
 from pydantic import ConfigDict, Field, model_validator
 
 from guidellm.backends.backend import Backend, BackendArgs
+from guidellm.backends.vllm_python import common
 from guidellm.backends.vllm_python.vllm_response import VLLMResponseHandler
 from guidellm.extras import vllm
 from guidellm.logger import logger
@@ -30,10 +29,6 @@
     RequestInfo,
     StandardBaseModel,
 )
-from guidellm.utils import audio, vision
-
-# Sentinel for "chat template not yet resolved" cache.
-_CHAT_TEMPLATE_UNSET: object = object()
 
 __all__ = ["VLLMPythonBackend", "VLLMPythonBackendArgs"]
 
@@ -126,11 +121,6 @@ class _ResolvedRequest(StandardBaseModel):
     )
 
 
-def _has_jinja2_markers(s: str) -> bool:
-    """Return True if the string contains Jinja2 template syntax ({{, {%, or {#)."""
-    return "{{" in s or "{%" in s or "{#" in s
-
-
 @Backend.register("vllm_python")
 class VLLMPythonBackend(Backend):
     """
@@ -166,7 +156,7 @@ def __init__(
         # Runtime state
         self._in_process = False
         self._engine: vllm.AsyncLLMEngine | None = None
-        self._resolved_chat_template: str | None | object = _CHAT_TEMPLATE_UNSET
+        self._resolved_chat_template: str | None | object = common.CHAT_TEMPLATE_UNSET
 
     @property
     def processes_limit(self) -> int | None:
@@ -267,58 +257,11 @@ def _validate_history(
         if history is not None:
             raise NotImplementedError("Multi-turn requests not yet supported")
 
-    def _build_multi_modal_data_from_columns(  # noqa: C901, PLR0912
+    def _build_multi_modal_data_from_columns(
         self, columns: dict[str, Any]
     ) -> dict[str, Any] | None:
-        """
-        Build vLLM multi_modal_data dict from image_column, audio_column.
-
-        video_column is not yet supported (no frame extraction); it is skipped.
-        """
-        multi_modal_data: dict[str, Any] = {}
-        # We look specifically for "image_column" and "audio_column" which contain lists
-        # of dicts
-        image_items = columns.get("image_column", [])
-        audio_items = columns.get("audio_column", [])
-        # video_column: not yet supported; would require frame extraction
-        for item in image_items:
-            if not item or not isinstance(item, dict):
-                continue
-            # Convert raw image dicts into PIL Images as required by vLLM's vision
-            # processor
-            pil_image = vision.image_dict_to_pil(item)
-            if "image" not in multi_modal_data:
-                multi_modal_data["image"] = pil_image
-            else:
-                # If multiple images exist, vLLM expects a list of PIL Images
-                existing = multi_modal_data["image"]
-                if isinstance(existing, list):
-                    existing.append(pil_image)
-                else:
-                    multi_modal_data["image"] = [existing, pil_image]
-        if audio_items:
-            if len(audio_items) > 1:
-                logger.warning(
-                    "Only one audio item per request is supported; "
-                    "ignoring {} extra audio item(s).",
-                    len(audio_items) - 1,
-                )
-            first = audio_items[0]
-            if not first or not isinstance(first, dict):
-                logger.warning("audio_column item is empty or not a dict; skipping.")
-            else:
-                audio_bytes = first.get("audio")
-                if isinstance(audio_bytes, bytes) and len(audio_bytes) > 0:
-                    try:
-                        # Decode raw audio bytes into an array since vLLM audio models
-                        # expect either raw numpy arrays or specific tensor formats
-                        audio_data = audio.decode_audio(audio_bytes)
-                        multi_modal_data["audio"] = audio_data
-                    except (ValueError, TypeError, OSError, RuntimeError) as exc:
-                        raise ValueError(
-                            f"Failed to decode audio from audio_column for vLLM: {exc}"
-                        ) from exc
-        return multi_modal_data if multi_modal_data else None
+        """Build vLLM multi_modal_data dict from image_column, audio_column."""
+        return common.build_multi_modal_data_from_columns(columns)
 
     def _extract_text_from_content(
         self, content: str | list[dict[str, Any]] | Any
@@ -438,81 +381,24 @@ def _extract_prompt_chat_plain(
         )
 
     def _resolve_chat_template(self) -> str | None:
-        """
-        Resolve and validate request_format to a template string or None.
-
-        Returns None for default tokenizer template; returns the template string
-        when valid. Raises ValueError for invalid input (wrong format, bad path,
-        or invalid Jinja2 syntax).
-        """
-        template = self._args.request_format
-        if template in (
-            "plain",
-            "default-template",
-        ):
-            # No custom template provided; 'plain' and 'default-template' are handled
-            # internally
-            return None
-        path = Path(template)
-        # Treat the request_format string as a file path. If it exists and contains
-        # Jinja2 syntax, read the content as the template.
-        if path.exists() and path.is_file():
-            content = path.read_text()
-            if not _has_jinja2_markers(content):
-                raise ValueError(
-                    "Invalid chat template: path "
-                    f"{path.as_posix()!r} exists but file content does not "
-                    "contain Jinja2 template syntax ({{, {%}, or {#})."
-                )
-            try:
-                jinja2.Template(content)
-            except jinja2.TemplateSyntaxError as e:
-                raise ValueError(
-                    f"Invalid chat template in file {path.as_posix()!r}: {e}"
-                ) from e
-            return content
-        if _has_jinja2_markers(template):
-            try:
-                jinja2.Template(template)
-            except jinja2.TemplateSyntaxError as e:
-                raise ValueError(f"Invalid chat template: {e}") from e
-            return template
-        raise ValueError(
-            "request_format must be 'plain', 'default-template', a path to a "
-            "Jinja2 template file, or a string containing Jinja2 template "
-            "syntax ({{, {%}, or {#). Got: " + repr(template) + "."
-        )
+        """Resolve and validate request_format to a template string or None."""
+        return common.resolve_chat_template(self._args.request_format)
 
     def _extract_prompt_chat_tokenizer(
         self, formatted_messages: list[dict[str, Any]]
     ) -> str:
         """Apply tokenizer chat template to formatted messages."""
         engine = self._validate_backend_initialized()
-        tokenizer = engine.tokenizer
-        if tokenizer is None:
-            raise RuntimeError("Backend engine has no tokenizer.")
-
-        if self._args.request_format in (
-            "plain",
-            "default-template",
-        ):
-            resolved: str | None = None
-        else:
-            if self._resolved_chat_template is _CHAT_TEMPLATE_UNSET:
-                self._resolved_chat_template = self._resolve_chat_template()
-            resolved = cast("str | None", self._resolved_chat_template)
-        if resolved is not None:
-            # Safe to mutate: vLLM runs one model per engine and the resolved
-            # template is constant across all requests for this backend instance.
-            tokenizer.chat_template = resolved  # type: ignore[attr-defined]
-        prompt = tokenizer.apply_chat_template(
-            formatted_messages,  # type: ignore[arg-type]
-            tokenize=False,
-            add_generation_prompt=True,
+        # Lazy-resolve and cache the chat template
+        if self._resolved_chat_template is common.CHAT_TEMPLATE_UNSET:
+            self._resolved_chat_template = self._resolve_chat_template()
+        resolved = cast("str | None", self._resolved_chat_template)
+        return common.extract_prompt_chat_tokenizer(
+            formatted_messages,
+            engine.tokenizer,
+            self._args.request_format,
+            resolved,
         )
-        if isinstance(prompt, str):
-            return prompt
-        raise RuntimeError("Backend received unexpected type from tokenizer.")
 
     def _resolve_request(self, request: GenerationRequest) -> _ResolvedRequest:
         """
@@ -744,24 +630,8 @@ def _create_sampling_params(
         self,
         max_tokens_override: int | None = None,
     ) -> vllm.SamplingParams:
-        """
-        Create VLLM SamplingParams.
-
-        When max_tokens_override is set (from benchmark output_metrics), it is used
-        as max_tokens and EOS is ignored to force generation of exactly that many
-        tokens, matching HTTP backend behavior. Otherwise vLLM defaults are used
-        (generate until EOS or model max context).
-
-        :param max_tokens_override: Optional max_tokens from request (e.g. benchmark)
-        :return: Configured SamplingParams instance
-        """
-        params: dict[str, Any] = {}
-
-        if max_tokens_override is not None and max_tokens_override > 0:
-            params["max_tokens"] = max_tokens_override
-            params["ignore_eos"] = True
-
-        return vllm.SamplingParams(**params)
+        """Create VLLM SamplingParams."""
+        return common.create_sampling_params(vllm, max_tokens_override)
 
     def _raise_generation_error(self, exc: BaseException) -> None:
         """Re-raise generation failure with context.
diff --git a/tests/unit/backends/vllm_python/test_vllm.py b/tests/unit/backends/vllm_python/test_vllm.py
index f51eb1b98..2358022ca 100644
--- a/tests/unit/backends/vllm_python/test_vllm.py
+++ b/tests/unit/backends/vllm_python/test_vllm.py
@@ -16,10 +16,10 @@
 import numpy as np
 import pytest
 
+from guidellm.backends.vllm_python import common
 from guidellm.backends.vllm_python.vllm import (
     VLLMPythonBackend,
     VLLMPythonBackendArgs,
-    _has_jinja2_markers,
     _ResolvedRequest,
 )
 from guidellm.schemas import (
@@ -500,8 +500,8 @@ def test_has_jinja2_markers_true_for_expressions(self):
         _has_jinja2_markers returns True for strings containing {{.
         ## WRITTEN BY AI ##
         """
-        assert _has_jinja2_markers("{{ message.content }}") is True
-        assert _has_jinja2_markers("prefix {{ x }}") is True
+        assert common._has_jinja2_markers("{{ message.content }}") is True
+        assert common._has_jinja2_markers("prefix {{ x }}") is True
 
     @pytest.mark.sanity
     def test_has_jinja2_markers_true_for_control(self):
@@ -509,8 +509,8 @@ def test_has_jinja2_markers_true_for_control(self):
         _has_jinja2_markers returns True for {% and {#.
         ## WRITTEN BY AI ##
         """
-        assert _has_jinja2_markers("{% for m in messages %}") is True
-        assert _has_jinja2_markers("{# comment #}") is True
+        assert common._has_jinja2_markers("{% for m in messages %}") is True
+        assert common._has_jinja2_markers("{# comment #}") is True
 
     @pytest.mark.sanity
     def test_has_jinja2_markers_false_for_plain_strings(self):
@@ -518,9 +518,9 @@ def test_has_jinja2_markers_false_for_plain_strings(self):
         _has_jinja2_markers returns False for strings with no template syntax.
         ## WRITTEN BY AI ##
         """
-        assert _has_jinja2_markers("chat_completions") is False
-        assert _has_jinja2_markers("plain text") is False
-        assert _has_jinja2_markers("") is False
+        assert common._has_jinja2_markers("chat_completions") is False
+        assert common._has_jinja2_markers("plain text") is False
+        assert common._has_jinja2_markers("") is False
 
 
 class TestVLLMRequestFormat:

From 1b673a260d87714335d6067a20a3b3c749e31565 Mon Sep 17 00:00:00 2001
From: Maryam Tahhan <mtahhan@redhat.com>
Date: Thu, 25 Jun 2026 14:11:19 +0100
Subject: [PATCH 4/5] Extract all duplicated helpers to common.py for maximum
 code reuse

Moved 5 additional helper methods to common.py that were duplicated between
vllm_python and vllm_offline backends:
- extract_text_from_content
- build_placeholder_prefix
- format_column_blocks
- inject_placeholders_into_messages
- extract_prompt_chat_plain

Total duplication eliminated: ~450 lines across both backends.

All helper logic is now centralized in common.py with both backends using thin
wrapper methods that delegate to the shared implementation.

Signed-off-by: Maryam Tahhan <mtahhan@redhat.com>
---
 src/guidellm/backends/vllm_python/common.py  | 142 +++++++++++++++++++
 src/guidellm/backends/vllm_python/offline.py | 115 +++------------
 src/guidellm/backends/vllm_python/vllm.py    | 115 +++------------
 3 files changed, 180 insertions(+), 192 deletions(-)

diff --git a/src/guidellm/backends/vllm_python/common.py b/src/guidellm/backends/vllm_python/common.py
index 3040ddc32..4ae961ddd 100644
--- a/src/guidellm/backends/vllm_python/common.py
+++ b/src/guidellm/backends/vllm_python/common.py
@@ -16,8 +16,13 @@
 __all__ = [
     "CHAT_TEMPLATE_UNSET",
     "build_multi_modal_data_from_columns",
+    "build_placeholder_prefix",
     "create_sampling_params",
+    "extract_prompt_chat_plain",
     "extract_prompt_chat_tokenizer",
+    "extract_text_from_content",
+    "format_column_blocks",
+    "inject_placeholders_into_messages",
     "resolve_chat_template",
 ]
 
@@ -30,6 +35,143 @@ def _has_jinja2_markers(s: str) -> bool:
     return "{{" in s or "{%" in s or "{#" in s
 
 
+def extract_text_from_content(content: str | list[dict[str, Any]] | Any) -> str:
+    """
+    Reduce a message ``content`` field to its plain text.
+
+    A string is returned unchanged. A list is treated as multimodal content
+    blocks: the truthy ``text`` values of dict blocks whose ``type`` is
+    ``"text"`` are concatenated with no separator; other entries are ignored.
+    Any other value is stringified, except ``None`` which yields ``""``.
+
+    :param content: Content field which can be a string or list of content blocks
+    :return: Extracted text string
+    """
+    if isinstance(content, str):
+        return content
+    if not isinstance(content, list):
+        # Fallback for unexpected payloads: stringify, mapping None to ""
+        return "" if content is None else str(content)
+    # Walk the blocks once, keeping the non-empty text of each dict block
+    # tagged "text"; the walrus captures the value that passed the test.
+    return "".join(
+        text
+        for block in content
+        if isinstance(block, dict)
+        and block.get("type") == "text"
+        and (text := block.get("text"))
+    )
+
+
+def build_placeholder_prefix(
+    multi_modal_data: dict[str, Any],
+    image_placeholder: str = "<image>",
+    audio_placeholder: str = "<|audio|>",
+) -> str:
+    """
+    Render one placeholder line per multimodal item, images before audio.
+
+    Produces e.g. ``"<image>\\n<|audio|>\\n"``, or ``""`` when neither
+    modality is present. A ``list``/``tuple`` value contributes one
+    placeholder per element; any other non-``None`` value contributes one.
+
+    :param multi_modal_data: Multi-modal data dict with image/audio
+    :param image_placeholder: Placeholder token for images
+    :param audio_placeholder: Placeholder token for audio
+    :return: Newline-terminated placeholder lines, or an empty string
+    """
+    tokens: list[str] = []
+    for key, token in (
+        ("image", image_placeholder),
+        ("audio", audio_placeholder),
+    ):
+        payload = multi_modal_data.get(key)
+        if payload is None:
+            continue
+        # Anything that is not a list/tuple (such as a single decoded audio
+        # array) is one item, so its own length is never consulted.
+        count = len(payload) if isinstance(payload, list | tuple) else 1
+        tokens.extend([token] * count)
+    # Every placeholder, including the last, is terminated by a newline
+    return "".join(f"{token}\n" for token in tokens)
+
+
+def format_column_blocks(
+    column_data: list[Any], column_type: str
+) -> list[dict[str, Any]]:
+    """
+    Turn the items of one data column into typed content blocks.
+
+    Falsy items are dropped. Each remaining ``text_column`` item becomes a
+    ``{"type": "text", "text": str(item)}`` block; ``image_column`` and
+    ``audio_column`` items become bare ``{"type": "image"}`` and
+    ``{"type": "audio"}`` blocks. Any other column type yields no blocks.
+
+    :param column_data: List of items from a data column
+    :param column_type: Column type (text_column, image_column, audio_column)
+    :return: List of typed content block dicts
+    """
+    # One factory per supported column type; unknown types have none and
+    # therefore produce no blocks.
+    factories = {
+        "text_column": lambda entry: {"type": "text", "text": str(entry)},
+        "image_column": lambda _entry: {"type": "image"},
+        "audio_column": lambda _entry: {"type": "audio"},
+    }
+    factory = factories.get(column_type)
+    # Falsy items are dropped before any block is built
+    return [factory(entry) for entry in column_data if entry and factory is not None]
+
+
+def inject_placeholders_into_messages(
+    formatted_messages: list[dict[str, Any]],
+    multi_modal_data: dict[str, Any],
+    image_placeholder: str = "<image>",
+    audio_placeholder: str = "<|audio|>",
+) -> None:
+    """
+    Prepend multimodal placeholder tokens to one message, in place.
+
+    vLLM needs one placeholder per multimodal item in the prompt text so its
+    processor can apply prompt replacement. The tokens are added before the
+    chat template is applied so they land inside the correct message turn
+    rather than in front of the whole formatted prompt. The target is the
+    most recent ``"user"`` message; without one, the last message is used.
+    An empty message list, or data with no items, leaves everything untouched.
+
+    :param formatted_messages: List of message dicts (modified in-place)
+    :param multi_modal_data: Multi-modal data dict
+    :param image_placeholder: Placeholder token for images
+    :param audio_placeholder: Placeholder token for audio
+    """
+    placeholders = build_placeholder_prefix(
+        multi_modal_data, image_placeholder, audio_placeholder
+    )
+    if not placeholders or not formatted_messages:
+        return
+    # Prefer the latest user turn; otherwise fall back to the last message
+    target = next(
+        (turn for turn in reversed(formatted_messages) if turn.get("role") == "user"),
+        formatted_messages[-1],
+    )
+    target["content"] = placeholders + (target.get("content") or "")
+
+
+def extract_prompt_chat_plain(
+    formatted_messages: list[dict[str, Any]],
+) -> str:
+    """
+    Join the non-empty message contents into a single raw prompt string.
+
+    Mirrors the HTTP /v1/completions behaviour: contents are separated by
+    single spaces, with no role prefixes or trailing generation prompt.
+
+    :param formatted_messages: List of message dicts with role/content
+    :return: Space-joined content string
+    """
+    return " ".join(filter(None, (m.get("content") for m in formatted_messages)))
+
+
 def build_multi_modal_data_from_columns(
     columns: dict[str, Any],
 ) -> dict[str, Any] | None:
diff --git a/src/guidellm/backends/vllm_python/offline.py b/src/guidellm/backends/vllm_python/offline.py
index 586750ffd..0437bc7cf 100644
--- a/src/guidellm/backends/vllm_python/offline.py
+++ b/src/guidellm/backends/vllm_python/offline.py
@@ -253,119 +253,42 @@ def _build_multi_modal_data_from_columns(
     def _extract_text_from_content(
         self, content: str | list[dict[str, Any]] | Any
     ) -> str:
-        """
-        Extract text content from message content field.
-
-        Handles both string content and list-based multimodal content blocks.
-        For list-based content, extracts text from blocks with type "text" and
-        concatenates them together.
-
-        :param content: Content field which can be a string or list of content blocks
-        :return: Extracted text string
-        """
-        if isinstance(content, str):
-            return content
-        if isinstance(content, list):
-            # Extract text from content blocks with type "text"
-            text_parts = []
-            for block in content:
-                if isinstance(block, dict):
-                    block_type = block.get("type")
-                    if block_type == "text":
-                        text = block.get("text")
-                        if text:
-                            text_parts.append(text)
-            return "".join(text_parts)
-        # Fallback: convert to string
-        return str(content) if content is not None else ""
+        """Extract text content from message content field."""
+        return common.extract_text_from_content(content)
 
     def _build_placeholder_prefix(self, multi_modal_data: dict[str, Any]) -> str:
-        """
-        Build the placeholder prefix string for all modalities in
-        multi_modal_data.
-
-        Returns a string like ``"<image>\\n<|audio|>\\n"`` with one
-        placeholder per item, or ``""`` if no multimodal items are
-        present.  Placeholder tokens default to ``<image>`` and
-        ``<|audio|>`` but can be overridden via
-        ``image_placeholder`` / ``audio_placeholder`` at construction.
-        """
-        parts: list[str] = []
-        images = multi_modal_data.get("image")
-        if images is not None:
-            num = len(images) if isinstance(images, list | tuple) else 1
-            if num > 0:
-                ph = self._args.image_placeholder
-                parts.extend([ph] * num)
-        audio = multi_modal_data.get("audio")
-        if audio is not None:
-            # Single audio item (numpy array) — not a list of items.
-            num = len(audio) if isinstance(audio, list | tuple) else 1
-            if num > 0:
-                ph = self._args.audio_placeholder
-                parts.extend([ph] * num)
-        if not parts:
-            return ""
-        return "\n".join(parts) + "\n"
+        """Build the placeholder prefix string for all modalities."""
+        return common.build_placeholder_prefix(
+            multi_modal_data,
+            self._args.image_placeholder,
+            self._args.audio_placeholder,
+        )
 
     @staticmethod
     def _format_column_blocks(
         column_data: list[Any], column_type: str
     ) -> list[dict[str, Any]]:
-        """Format data column items into vLLM-compatible content blocks.
-
-        Analogous to the HTTP backend's ``_format_prompts`` but emitting
-        vLLM-specific block types that chat templates can render into the
-        correct model-specific placeholder tokens.
-        """
-        blocks: list[dict[str, Any]] = []
-        for item in column_data:
-            if not item:
-                continue
-            if column_type == "text_column":
-                blocks.append({"type": "text", "text": str(item)})
-            elif column_type == "image_column":
-                blocks.append({"type": "image"})
-            elif column_type == "audio_column":
-                blocks.append({"type": "audio"})
-        return blocks
+        """Format data column items into vLLM-compatible content blocks."""
+        return common.format_column_blocks(column_data, column_type)
 
     def _inject_placeholders_into_messages(
         self,
         formatted_messages: list[dict[str, Any]],
         multi_modal_data: dict[str, Any],
     ) -> None:
-        """
-        Inject multimodal placeholder tokens into the last user message's content.
-
-        vLLM requires one placeholder per multimodal item in the prompt text so its
-        processor can apply prompt replacement. This must happen *before* the chat
-        template is applied so that placeholders end up inside the correct message
-        turn (not prepended to the entire formatted prompt).
-        """
-        prefix = self._build_placeholder_prefix(multi_modal_data)
-        if not prefix:
-            return
-        for msg in reversed(formatted_messages):
-            if msg.get("role") == "user":
-                msg["content"] = prefix + (msg.get("content") or "")
-                return
-        if formatted_messages:
-            formatted_messages[-1]["content"] = prefix + (
-                formatted_messages[-1].get("content") or ""
-            )
+        """Inject multimodal placeholder tokens into the last user message."""
+        common.inject_placeholders_into_messages(
+            formatted_messages,
+            multi_modal_data,
+            self._args.image_placeholder,
+            self._args.audio_placeholder,
+        )
 
     def _extract_prompt_chat_plain(
         self, formatted_messages: list[dict[str, Any]]
     ) -> str:
-        """Concatenate message content into a single raw prompt string.
-
-        Equivalent to the HTTP /v1/completions behaviour: prefix + text
-        with no role prefixes or trailing generation prompt.
-        """
-        return " ".join(
-            msg["content"] for msg in formatted_messages if msg.get("content")
-        )
+        """Concatenate message content into a single raw prompt string."""
+        return common.extract_prompt_chat_plain(formatted_messages)
 
     def _resolve_chat_template(self) -> str | None:
         """Resolve and validate request_format to a template string or None."""
diff --git a/src/guidellm/backends/vllm_python/vllm.py b/src/guidellm/backends/vllm_python/vllm.py
index c31c8aac2..136bee716 100644
--- a/src/guidellm/backends/vllm_python/vllm.py
+++ b/src/guidellm/backends/vllm_python/vllm.py
@@ -266,119 +266,42 @@ def _build_multi_modal_data_from_columns(
     def _extract_text_from_content(
         self, content: str | list[dict[str, Any]] | Any
     ) -> str:
-        """
-        Extract text content from message content field.
-
-        Handles both string content and list-based multimodal content blocks.
-        For list-based content, extracts text from blocks with type "text" and
-        concatenates them together.
-
-        :param content: Content field which can be a string or list of content blocks
-        :return: Extracted text string
-        """
-        if isinstance(content, str):
-            return content
-        if isinstance(content, list):
-            # Extract text from content blocks with type "text"
-            text_parts = []
-            for block in content:
-                if isinstance(block, dict):
-                    block_type = block.get("type")
-                    if block_type == "text":
-                        text = block.get("text")
-                        if text:
-                            text_parts.append(text)
-            return "".join(text_parts)
-        # Fallback: convert to string
-        return str(content) if content is not None else ""
+        """Extract text content from message content field."""
+        return common.extract_text_from_content(content)
 
     def _build_placeholder_prefix(self, multi_modal_data: dict[str, Any]) -> str:
-        """
-        Build the placeholder prefix string for all modalities in
-        multi_modal_data.
-
-        Returns a string like ``"<image>\\n<|audio|>\\n"`` with one
-        placeholder per item, or ``""`` if no multimodal items are
-        present.  Placeholder tokens default to ``<image>`` and
-        ``<|audio|>`` but can be overridden via
-        ``image_placeholder`` / ``audio_placeholder`` at construction.
-        """
-        parts: list[str] = []
-        images = multi_modal_data.get("image")
-        if images is not None:
-            num = len(images) if isinstance(images, list | tuple) else 1
-            if num > 0:
-                ph = self._args.image_placeholder
-                parts.extend([ph] * num)
-        audio = multi_modal_data.get("audio")
-        if audio is not None:
-            # Single audio item (numpy array) — not a list of items.
-            num = len(audio) if isinstance(audio, list | tuple) else 1
-            if num > 0:
-                ph = self._args.audio_placeholder
-                parts.extend([ph] * num)
-        if not parts:
-            return ""
-        return "\n".join(parts) + "\n"
+        """Build the placeholder prefix string for all modalities."""
+        return common.build_placeholder_prefix(
+            multi_modal_data,
+            self._args.image_placeholder,
+            self._args.audio_placeholder,
+        )
 
     @staticmethod
     def _format_column_blocks(
         column_data: list[Any], column_type: str
     ) -> list[dict[str, Any]]:
-        """Format data column items into vLLM-compatible content blocks.
-
-        Analogous to the HTTP backend's ``_format_prompts`` but emitting
-        vLLM-specific block types that chat templates can render into the
-        correct model-specific placeholder tokens.
-        """
-        blocks: list[dict[str, Any]] = []
-        for item in column_data:
-            if not item:
-                continue
-            if column_type == "text_column":
-                blocks.append({"type": "text", "text": str(item)})
-            elif column_type == "image_column":
-                blocks.append({"type": "image"})
-            elif column_type == "audio_column":
-                blocks.append({"type": "audio"})
-        return blocks
+        """Format data column items into vLLM-compatible content blocks."""
+        return common.format_column_blocks(column_data, column_type)
 
     def _inject_placeholders_into_messages(
         self,
         formatted_messages: list[dict[str, Any]],
         multi_modal_data: dict[str, Any],
     ) -> None:
-        """
-        Inject multimodal placeholder tokens into the last user message's content.
-
-        vLLM requires one placeholder per multimodal item in the prompt text so its
-        processor can apply prompt replacement. This must happen *before* the chat
-        template is applied so that placeholders end up inside the correct message
-        turn (not prepended to the entire formatted prompt).
-        """
-        prefix = self._build_placeholder_prefix(multi_modal_data)
-        if not prefix:
-            return
-        for msg in reversed(formatted_messages):
-            if msg.get("role") == "user":
-                msg["content"] = prefix + (msg.get("content") or "")
-                return
-        if formatted_messages:
-            formatted_messages[-1]["content"] = prefix + (
-                formatted_messages[-1].get("content") or ""
-            )
+        """Inject multimodal placeholder tokens into the last user message."""
+        common.inject_placeholders_into_messages(
+            formatted_messages,
+            multi_modal_data,
+            self._args.image_placeholder,
+            self._args.audio_placeholder,
+        )
 
     def _extract_prompt_chat_plain(
         self, formatted_messages: list[dict[str, Any]]
     ) -> str:
-        """Concatenate message content into a single raw prompt string.
-
-        Equivalent to the HTTP /v1/completions behaviour: prefix + text
-        with no role prefixes or trailing generation prompt.
-        """
-        return " ".join(
-            msg["content"] for msg in formatted_messages if msg.get("content")
-        )
+        """Concatenate message content into a single raw prompt string."""
+        return common.extract_prompt_chat_plain(formatted_messages)
 
     def _resolve_chat_template(self) -> str | None:
         """Resolve and validate request_format to a template string or None."""

From f27b0765823dc02d63d6826d19b1d565a7ebc4b4 Mon Sep 17 00:00:00 2001
From: Maryam Tahhan <mtahhan@redhat.com>
Date: Thu, 25 Jun 2026 14:18:13 +0100
Subject: [PATCH 5/5] Fix mypy type errors for lazy-loaded vllm module

Add type: ignore comments for vllm.EngineArgs and vllm.LLM runtime usage
since these are lazy-loaded and mypy can't resolve them at static analysis time.
Use Any type for vllm.LLM annotations with inline comments documenting the
actual type.

Fixes CI type-check failures.

Signed-off-by: Maryam Tahhan <mtahhan@redhat.com>
---
 src/guidellm/backends/vllm_python/offline.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/guidellm/backends/vllm_python/offline.py b/src/guidellm/backends/vllm_python/offline.py
index 0437bc7cf..3c67251dc 100644
--- a/src/guidellm/backends/vllm_python/offline.py
+++ b/src/guidellm/backends/vllm_python/offline.py
@@ -14,7 +14,7 @@
 import time
 import uuid
 from collections.abc import AsyncIterator
-from typing import Any, Literal, cast
+from typing import TYPE_CHECKING, Any, Literal, cast
 
 from more_itertools import roundrobin
 from pydantic import ConfigDict, Field, PositiveInt, model_validator
@@ -22,8 +22,12 @@
 from guidellm.backends.backend import Backend, BackendArgs
 from guidellm.backends.vllm_python import common
 from guidellm.backends.vllm_python.vllm_response import VLLMResponseHandler
-from guidellm.extras import vllm
 from guidellm.logger import logger
+
+if TYPE_CHECKING:
+    from guidellm.extras import vllm
+else:
+    from guidellm.extras import vllm
 from guidellm.schemas import (
     GenerationRequest,
     GenerationResponse,
@@ -164,7 +168,7 @@ def __init__(self, arguments: VLLMOfflineBackendArgs):
         # Runtime state
         self._in_process = False
         self._shutting_down = False
-        self._llm: vllm.LLM | None = None
+        self._llm: Any = None  # vllm.LLM | None
         self._batch_lock = asyncio.Lock()
         self._pending_batch: list[_BatchedRequest] = []
         self._processing_task: asyncio.Task | None = None
@@ -187,8 +191,8 @@ async def process_startup(self):
 
         # Initialize LLM in thread pool to avoid blocking
         def _init_llm():
-            engine_args = vllm.EngineArgs(**self._args.vllm_config)
-            return vllm.LLM.from_engine_args(engine_args)
+            engine_args = vllm.EngineArgs(**self._args.vllm_config)  # type: ignore[attr-defined]
+            return vllm.LLM.from_engine_args(engine_args)  # type: ignore[attr-defined]
 
         self._llm = await asyncio.to_thread(_init_llm)
         self._in_process = True
@@ -233,7 +237,7 @@ async def default_model(self) -> str:
         """Get the default model for this backend."""
         return self._args.model
 
-    def _validate_backend_initialized(self) -> vllm.LLM:
+    def _validate_backend_initialized(self) -> Any:  # vllm.LLM
         """
         Validate that the backend is initialized and return the LLM.