vllm-project · sjarvie · May 21, 2026
diff --git a/docs/getting-started/benchmark.md b/docs/getting-started/benchmark.md
@@ -55,19 +55,20 @@ GuideLLM offers a wide range of configuration options to customize your benchmar
 
 ### Key Parameters
 
-| Parameter        | Description                                    | Example                                                            |
-| ---------------- | ---------------------------------------------- | ------------------------------------------------------------------ |
-| `--target`       | URL of the OpenAI-compatible server            | `--target "http://localhost:8000"`                                 |
-| `--model`        | Model name to benchmark                        | `--model "Meta-Llama-3.1-8B-Instruct"`                             |
-| `--data`         | Data configuration for benchmarking            | `--data "kind=synthetic_text,prompt_tokens=256,output_tokens=128"` |
-| `--profile`      | Type of benchmark profile to run               | `--profile kind=sweep`                                             |
-| `--rate`         | Request rate or number of benchmarks for sweep | `--rate 10`                                                        |
-| `--random-seed`  | Random seed for reproducibility                | `--random-seed 42`                                                 |
-| `--max-seconds`  | Duration for each benchmark in seconds         | `--max-seconds 30`                                                 |
-| `--max-requests` | Maximum number of requests for each benchmark  | `--max-requests 1000`                                              |
-| `--data-samples` | Maximum number of dataset rows to load         | `--data-samples 1000`                                              |
-| `--output-dir`   | Directory path to save output files            | `--output-dir results/`                                            |
-| `--outputs`      | Output formats to generate                     | `--outputs json csv html`                                          |
+| Parameter              | Description                                        | Example                                                            |
+| ---------------------- | -------------------------------------------------- | ------------------------------------------------------------------ |
+| `--target`             | URL of the OpenAI-compatible server                | `--target "http://localhost:8000"`                                 |
+| `--model`              | Model name to benchmark                            | `--model "Meta-Llama-3.1-8B-Instruct"`                             |
+| `--data`               | Data configuration for benchmarking                | `--data "kind=synthetic_text,prompt_tokens=256,output_tokens=128"` |
+| `--profile`            | Type of benchmark profile to run                   | `--profile kind=sweep`                                             |
+| `--rate`               | Request rate or number of benchmarks for sweep     | `--rate 10`                                                        |
+| `--images-per-request` | Number of images per request for vision benchmarks | `--images-per-request "1,2,5"`                                     |
+| `--random-seed`        | Random seed for reproducibility                    | `--random-seed 42`                                                 |
+| `--max-seconds`        | Duration for each benchmark in seconds             | `--max-seconds 30`                                                 |
+| `--max-requests`       | Maximum number of requests for each benchmark      | `--max-requests 1000`                                              |
+| `--data-samples`       | Maximum number of dataset rows to load             | `--data-samples 1000`                                              |
+| `--output-dir`         | Directory path to save output files                | `--output-dir results/`                                            |
+| `--outputs`            | Output formats to generate                         | `--outputs json csv html`                                          |
 
 ### Random Seed (`--random-seed`)
 
@@ -264,6 +265,55 @@ guidellm benchmark \
   --rate 5
 ```
 
+### Multi-Image Benchmarking
+
+When benchmarking vision-language models with multiple images per request, use `--images-per-request` to measure the latency impact of each image count. This is useful for understanding how time to first token (TTFT) and inter-token latency (ITL) scale as the number of frames or images per request grows:
+
+```bash
+guidellm benchmark \
+  --target "http://localhost:8000" \
+  --data "prompt_tokens=256,output_tokens=128" \
+  --images-per-request 1,2,5 \
+  --profile constant \
+  --rate 10 \
+  --max-seconds 30
+```
+
+This runs three sequential benchmarks (1, 2, and 5 images per request) with synthetic 720p images and outputs comparative latency metrics in the report.
+
+**Single image count:**
+
+```bash
+guidellm benchmark \
+  --target "http://localhost:8000" \
+  --images-per-request 3 \
+  --profile constant \
+  --rate 5
+```
+
+**Programmatic usage:**
+
+```python
+from guidellm.benchmark import MultiImageBenchmark
+
+# Create multi-image benchmark configuration
+bench = MultiImageBenchmark(
+    image_counts=[1, 2, 5],
+    prompt_tokens=256,
+    output_tokens=128,
+)
+
+# Get configs for each image count
+configs = bench.get_configs()  # {1: config, 2: config, 5: config}
+
+# Get image statistics
+for img_count in [1, 2, 5]:
+    stats = bench.get_image_stats(img_count)
+    print(f"{img_count} images: {stats['total_bytes']} bytes total")
+```
+
+**Note:** Multi-image benchmarking requires the vision dependencies (`pip install "guidellm[vision]"`).
+
 ## Output Options
 
 By default, complete results are saved to `benchmarks.json`, `benchmarks.csv`, and `benchmarks.html` in your current directory. Use the `--output-dir` parameter to specify a different location and `--outputs` to control which formats are generated.

diff --git a/src/guidellm/benchmark/__init__.py b/src/guidellm/benchmark/__init__.py
@@ -12,6 +12,7 @@
 
 from .benchmarker import Benchmarker
 from .entrypoints import benchmark_generative_text, reimport_benchmarks_report
+from .multi_image import MultiImageBenchmark, MultiImageBenchmarkResults
 from .outputs import (
     GenerativeBenchmarkerConsole,
     GenerativeBenchmarkerCSV,
@@ -82,6 +83,8 @@
     "GenerativeRequestsAccumulator",
     "GenerativeTextMetricsSummary",
     "GenerativeVideoMetricsSummary",
+    "MultiImageBenchmark",
+    "MultiImageBenchmarkResults",
     "Profile",
     "RunningMetricStats",
     "SchedulerMetrics",

diff --git a/src/guidellm/benchmark/multi_image.py b/src/guidellm/benchmark/multi_image.py
@@ -0,0 +1,131 @@
+"""Programmatic API for multi-image benchmarking."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any
+
+from guidellm.data.deserializers.multi_image import MultiImageDataArgs as MultiImageDatasetConfig
+from guidellm.data.generators.multi_image import generate_synthetic_images
+
+__all__ = ["MultiImageBenchmark", "MultiImageBenchmarkResults"]
+
+
+@dataclass
+class MultiImageBenchmarkResults:
+    """Benchmark results keyed by image count, with mean-latency accessors."""
+
+    results: dict[int, Any]  # {image_count: benchmark_result}
+
+    def _mean_by_count(self, metric: str) -> dict[int, float]:
+        """Map image count to ``requests.stats.<metric>.mean`` where available."""
+        means = {}
+        for img_count, result in self.results.items():
+            req_info = getattr(result, "requests", None)
+            # Results without request stats for this metric are left out.
+            if req_info and hasattr(getattr(req_info, "stats", None), metric):
+                means[img_count] = getattr(req_info.stats, metric).mean
+        return means
+
+    def ttft_by_count(self) -> dict[int, float]:
+        """Mean TTFT (ms) per image count, for results that report it."""
+        return self._mean_by_count("ttft_ms")
+
+    def itl_by_count(self) -> dict[int, float]:
+        """Mean ITL (ms) per image count, for results that report it."""
+        return self._mean_by_count("itl_ms")
+
+
+class MultiImageBenchmark:
+    """
+    Configuration helper for comparing latency across images-per-request counts.
+
+    Example:
+        bench = MultiImageBenchmark(
+            image_counts=[1, 2, 5],
+            prompt_tokens=256,
+            output_tokens=128,
+        )
+        config_dict = bench.get_configs()
+        # Use configs with benchmark runner
+    """
+
+    def __init__(
+        self,
+        image_counts: list[int],
+        prompt_tokens: int = 256,
+        output_tokens: int = 128,
+        image_size: str = "720p",
+        random_seed: int | None = None,
+        **kwargs: Any,
+    ):
+        """
+        Store the settings shared by every per-count configuration.
+
+        Args:
+            image_counts: Image counts to benchmark (e.g., [1, 2, 5]); kept sorted
+            prompt_tokens: Prompt token count passed to each config
+            output_tokens: Output token count passed to each config
+            image_size: Image resolution key (default "720p")
+            random_seed: Seed forwarded to synthetic image generation
+            **kwargs: Extra keyword arguments passed to MultiImageDatasetConfig
+        """
+        self.image_counts = sorted(image_counts)
+        self.prompt_tokens = prompt_tokens
+        self.output_tokens = output_tokens
+        self.image_size = image_size
+        self.random_seed = random_seed
+        self.kwargs = kwargs
+
+    def get_configs(self) -> dict[int, MultiImageDatasetConfig]:
+        """
+        Build one MultiImageDatasetConfig per configured image count.
+
+        Returns:
+            Dict mapping each image count to its MultiImageDatasetConfig
+        """
+        return {
+            count: MultiImageDatasetConfig(
+                prompt_tokens=self.prompt_tokens,
+                output_tokens=self.output_tokens,
+                images_per_request=count,
+                image_size=self.image_size,
+                **self.kwargs,
+            )
+            for count in self.image_counts
+        }
+
+    def generate_images(self, img_count: int) -> tuple[list[dict], int, int]:
+        """
+        Create ``img_count`` synthetic images at the configured resolution.
+
+        Args:
+            img_count: Number of images to generate
+
+        Returns:
+            Tuple of (images_list, total_pixels, total_bytes)
+        """
+        return generate_synthetic_images(
+            seed=self.random_seed,
+            image_size=self.image_size,
+            num_images=img_count,
+        )
+
+    def get_image_stats(self, img_count: int) -> dict[str, int]:
+        """
+        Summarize pixel and byte totals for ``img_count`` synthetic images.
+
+        Args:
+            img_count: Number of images to generate and measure
+
+        Returns:
+            Dict of image_count, total_pixels, total_bytes and floored per-image values
+        """
+        _, pixel_total, byte_total = self.generate_images(img_count)
+        return {
+            "image_count": img_count,
+            "total_pixels": pixel_total,
+            "total_bytes": byte_total,
+            "pixels_per_image": 0 if img_count <= 0 else pixel_total // img_count,
+            "bytes_per_image": 0 if img_count <= 0 else byte_total // img_count,
+        }
diff --git a/src/guidellm/cli/benchmark/run.py b/src/guidellm/cli/benchmark/run.py
@@ -170,6 +170,16 @@
     type=int,
     help="Random seed for reproducibility.",
 )
+@click.option(
+    "--images-per-request",
+    type=str,
+    default=None,
+    help=(
+        "Number of images per request for vision benchmarks. "
+        "Single value (e.g., '2') or comma-separated list (e.g., '1,2,5'). "
+        "When a list is provided, runs sequential benchmarks for each count."
+    ),
+)
 # Output configuration
 @click.option(
     "--output-dir",
@@ -325,6 +335,18 @@ def run(**kwargs):  # noqa: C901, PLR0915, PLR0912
     # Only set CLI args that differ from click defaults
     kwargs = cli_tools.set_if_not_default(ctx, **kwargs)
 
+    # Normalize --images-per-request ("2" or "1,2,5") to an int or a list of ints
+    images_per_request = kwargs.pop("images_per_request", None)
+    if images_per_request is not None:
+        try:  # int() tolerates surrounding whitespace, so "1, 2" parses too
+            parts = [int(part) for part in images_per_request.split(",")]
+        except ValueError as err:
+            raise click.BadParameter(
+                "--images-per-request must be an integer or "
+                f"comma-separated integers, got '{images_per_request}'"
+            ) from err
+        kwargs["images_per_request"] = parts if len(parts) > 1 else parts[0]
+
     # Handle output path remapping
     if (output_path := kwargs.pop("output_path", None)) is not None:
         if kwargs.get("output_dir", None) is not None:

diff --git a/src/guidellm/data/deserializers/__init__.py b/src/guidellm/data/deserializers/__init__.py
@@ -23,6 +23,7 @@
     InMemoryItemListDataArgs,
     InMemoryItemListDatasetDeserializer,
 )
+from .multi_image import MultiImageDataArgs, MultiImageDatasetDeserializer
 from .synthetic import (
     SyntheticTextDataArgs,
     SyntheticTextDataset,
@@ -49,6 +50,8 @@
     "InMemoryItemListDataArgs",
     "InMemoryItemListDatasetDeserializer",
     "JSONFileDatasetDeserializer",
+    "MultiImageDataArgs",
+    "MultiImageDatasetDeserializer",
     "ParquetFileDatasetDeserializer",
     "SyntheticTextDataArgs",
     "SyntheticTextDataset",

diff --git a/src/guidellm/data/deserializers/multi_image.py b/src/guidellm/data/deserializers/multi_image.py
@@ -0,0 +1,82 @@
+"""Multi-image synthetic data deserializer for vision benchmarking."""
+
+from __future__ import annotations
+
+from collections.abc import Callable
+from typing import Literal
+
+from datasets import IterableDataset
+from pydantic import Field, field_validator
+from transformers import PreTrainedTokenizerBase
+
+from guidellm.data.deserializers.deserializer import (
+    DatasetDeserializerFactory,
+)
+from guidellm.data.deserializers.synthetic import (
+    SyntheticTextDataArgs,
+    SyntheticTextDataset,
+    SyntheticTextDatasetDeserializer,
+)
+from guidellm.data.generators.multi_image import ImageSize
+from guidellm.data.schemas import DataArgs
+
+__all__ = [
+    "MultiImageDataArgs",
+    "MultiImageDatasetDeserializer",
+]
+
+_VALID_IMAGE_SIZES = sorted(ImageSize.SIZES.keys())
+
+
+@DataArgs.register("multi_image")
+class MultiImageDataArgs(SyntheticTextDataArgs):
+    """
+    Synthetic-text data args extended with per-request image settings.
+
+    Adds images_per_request (1-10) and image_size, checked against ImageSize.SIZES.
+    """
+
+    kind: Literal["multi_image"] = Field(  # type: ignore[assignment]
+        default="multi_image",
+        description="Type identifier for the multi-image dataset configuration.",
+    )
+    images_per_request: int = Field(
+        default=1,
+        ge=1,
+        le=10,
+        description="Number of images to include per request.",
+    )
+    image_size: str = Field(
+        default="720p",
+        description=(
+            f"Standard image resolution key. Valid values: {_VALID_IMAGE_SIZES}."
+        ),
+    )
+
+    @field_validator("image_size")
+    @classmethod
+    def validate_image_size(cls, value: str) -> str:
+        if value in ImageSize.SIZES:
+            return value
+        raise ValueError(
+            f"Invalid image_size {value!r}. Valid options: {_VALID_IMAGE_SIZES}"
+        )
+
+
+# Keep the old name as an alias for backwards compatibility within this PR.
+MultiImageDatasetConfig = MultiImageDataArgs
+
+
+@DatasetDeserializerFactory.register("multi_image")
+class MultiImageDatasetDeserializer(SyntheticTextDatasetDeserializer):
+    def __call__(
+        self,
+        config: MultiImageDataArgs,
+        processor_factory: Callable[[], PreTrainedTokenizerBase],
+        random_seed: int,
+    ) -> IterableDataset:
+        """Build a SyntheticTextDataset from multi-image data args."""
+        # NOTE(review): no images are built here; confirm a later stage adds them.
+        return SyntheticTextDataset(
+            config=config, processor=processor_factory(), random_seed=random_seed
+        )
diff --git a/src/guidellm/data/generators/__init__.py b/src/guidellm/data/generators/__init__.py
@@ -0,0 +1,7 @@
+"""Data generators for GuideLLM."""
+
+from guidellm.data.generators.multi_image import generate_synthetic_images
+
+__all__ = [
+    "generate_synthetic_images",
+]