From a0510f30e9ab27d9313625ae1135a5d50e9b3f9e Mon Sep 17 00:00:00 2001
From: shanejarvie <shane@specter.co>
Date: Thu, 21 May 2026 15:04:23 -0700
Subject: [PATCH] feat: add multi-image latency benchmarking

Implement multi-image benchmarking for vision-language models to measure
the latency impact of sending multiple frames per request.

Changes:
- MultiImageDataArgs schema (aliased as MultiImageDatasetConfig) for
  datasets with N images per request
- Synthetic JPEG generator (480p-4k presets, 720p default) with base64
  encoding and reproducible seeding
- CLI parameter: --images-per-request (single or comma-separated list)
- MultiImageBenchmark programmatic API for pytest integration
- 15 unit tests covering config validation and image generation
- Documentation with usage examples

The feature enables benchmarking how TTFT and ITL scale with increasing
frame counts, useful for video analysis pipelines.
---
 docs/getting-started/benchmark.md             |  76 ++++++++--
 src/guidellm/benchmark/__init__.py            |   3 +
 src/guidellm/benchmark/multi_image.py         | 131 ++++++++++++++++++
 src/guidellm/cli/benchmark/run.py             |  22 +++
 src/guidellm/data/deserializers/__init__.py   |   3 +
 .../data/deserializers/multi_image.py         |  82 +++++++++++
 src/guidellm/data/generators/__init__.py      |   7 +
 src/guidellm/data/generators/multi_image.py   |  78 +++++++++++
 .../generators/test_multi_image_generator.py  |  81 +++++++++++
 tests/unit/data/test_multi_image_config.py    |  80 +++++++++++
 10 files changed, 550 insertions(+), 13 deletions(-)
 create mode 100644 src/guidellm/benchmark/multi_image.py
 create mode 100644 src/guidellm/data/deserializers/multi_image.py
 create mode 100644 src/guidellm/data/generators/__init__.py
 create mode 100644 src/guidellm/data/generators/multi_image.py
 create mode 100644 tests/unit/data/generators/test_multi_image_generator.py
 create mode 100644 tests/unit/data/test_multi_image_config.py

diff --git a/docs/getting-started/benchmark.md b/docs/getting-started/benchmark.md
index f48150ad0..6ddf62567 100644
--- a/docs/getting-started/benchmark.md
+++ b/docs/getting-started/benchmark.md
@@ -55,19 +55,20 @@ GuideLLM offers a wide range of configuration options to customize your benchmar
 
 ### Key Parameters
 
-| Parameter        | Description                                    | Example                                                            |
-| ---------------- | ---------------------------------------------- | ------------------------------------------------------------------ |
-| `--target`       | URL of the OpenAI-compatible server            | `--target "http://localhost:8000"`                                 |
-| `--model`        | Model name to benchmark                        | `--model "Meta-Llama-3.1-8B-Instruct"`                             |
-| `--data`         | Data configuration for benchmarking            | `--data "kind=synthetic_text,prompt_tokens=256,output_tokens=128"` |
-| `--profile`      | Type of benchmark profile to run               | `--profile kind=sweep`                                             |
-| `--rate`         | Request rate or number of benchmarks for sweep | `--rate 10`                                                        |
-| `--random-seed`  | Random seed for reproducibility                | `--random-seed 42`                                                 |
-| `--max-seconds`  | Duration for each benchmark in seconds         | `--max-seconds 30`                                                 |
-| `--max-requests` | Maximum number of requests for each benchmark  | `--max-requests 1000`                                              |
-| `--data-samples` | Maximum number of dataset rows to load         | `--data-samples 1000`                                              |
-| `--output-dir`   | Directory path to save output files            | `--output-dir results/`                                            |
-| `--outputs`      | Output formats to generate                     | `--outputs json csv html`                                          |
+| Parameter              | Description                                    | Example                                                            |
+| ---------------------- | ---------------------------------------------- | ------------------------------------------------------------------ |
+| `--target`             | URL of the OpenAI-compatible server            | `--target "http://localhost:8000"`                                 |
+| `--model`              | Model name to benchmark                        | `--model "Meta-Llama-3.1-8B-Instruct"`                             |
+| `--data`               | Data configuration for benchmarking            | `--data "kind=synthetic_text,prompt_tokens=256,output_tokens=128"` |
+| `--profile`            | Type of benchmark profile to run               | `--profile kind=sweep`                                             |
+| `--rate`               | Request rate or number of benchmarks for sweep | `--rate 10`                                                        |
+| `--images-per-request` | Images per request for vision benchmarks       | `--images-per-request "1,2,5"`                                     |
+| `--random-seed`        | Random seed for reproducibility                | `--random-seed 42`                                                 |
+| `--max-seconds`        | Duration for each benchmark in seconds         | `--max-seconds 30`                                                 |
+| `--max-requests`       | Maximum number of requests for each benchmark  | `--max-requests 1000`                                              |
+| `--data-samples`       | Maximum number of dataset rows to load         | `--data-samples 1000`                                              |
+| `--output-dir`         | Directory path to save output files            | `--output-dir results/`                                            |
+| `--outputs`            | Output formats to generate                     | `--outputs json csv html`                                          |
 
 ### Random Seed (`--random-seed`)
 
@@ -264,6 +265,55 @@ guidellm benchmark \
   --rate 5
 ```
 
+### Multi-Image Benchmarking
+
+When benchmarking vision-language models with multiple images per request, use `--images-per-request` to measure the latency impact of the image count. Pass a single value or a comma-separated list; this shows how time to first token (TTFT) and inter-token latency (ITL) scale as the number of images (frames) per request grows:
+
+```bash
+guidellm benchmark \
+  --target "http://localhost:8000" \
+  --data "prompt_tokens=256,output_tokens=128" \
+  --images-per-request 1,2,5 \
+  --profile constant \
+  --rate 10 \
+  --max-seconds 30
+```
+
+This runs three sequential benchmarks (1, 2, and 5 images per request) with synthetic 720p images and outputs comparative latency metrics in the report.
+
+**Single image count:**
+
+```bash
+guidellm benchmark \
+  --target "http://localhost:8000" \
+  --images-per-request 3 \
+  --profile constant \
+  --rate 5
+```
+
+**Programmatic usage:**
+
+```python
+from guidellm.benchmark import MultiImageBenchmark
+
+# Create multi-image benchmark configuration
+bench = MultiImageBenchmark(
+    image_counts=[1, 2, 5],
+    prompt_tokens=256,
+    output_tokens=128,
+)
+
+# Get configs for each image count
+configs = bench.get_configs()  # {1: config, 2: config, 5: config}
+
+# Get image statistics
+for img_count in [1, 2, 5]:
+    stats = bench.get_image_stats(img_count)
+    print(f"{img_count} images: {stats['total_bytes']} bytes total")
+```
+
+**Note:** Multi-image benchmarking requires the vision dependencies (`pip install guidellm[vision]`).
+
 ## Output Options
 
 By default, complete results are saved to `benchmarks.json`, `benchmarks.csv`, and `benchmarks.html` in your current directory. Use the `--output-dir` parameter to specify a different location and `--outputs` to control which formats are generated.
diff --git a/src/guidellm/benchmark/__init__.py b/src/guidellm/benchmark/__init__.py
index 4a5eeb54e..a54639d02 100644
--- a/src/guidellm/benchmark/__init__.py
+++ b/src/guidellm/benchmark/__init__.py
@@ -12,6 +12,7 @@
 
 from .benchmarker import Benchmarker
 from .entrypoints import benchmark_generative_text, reimport_benchmarks_report
+from .multi_image import MultiImageBenchmark, MultiImageBenchmarkResults
 from .outputs import (
     GenerativeBenchmarkerConsole,
     GenerativeBenchmarkerCSV,
@@ -82,6 +83,8 @@
     "GenerativeRequestsAccumulator",
     "GenerativeTextMetricsSummary",
     "GenerativeVideoMetricsSummary",
+    "MultiImageBenchmark",
+    "MultiImageBenchmarkResults",
     "Profile",
     "RunningMetricStats",
     "SchedulerMetrics",
diff --git a/src/guidellm/benchmark/multi_image.py b/src/guidellm/benchmark/multi_image.py
new file mode 100644
index 000000000..868228f34
--- /dev/null
+++ b/src/guidellm/benchmark/multi_image.py
@@ -0,0 +1,131 @@
+"""Programmatic API for multi-image benchmarking."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any
+
+from guidellm.data.deserializers.multi_image import MultiImageDataArgs as MultiImageDatasetConfig
+from guidellm.data.generators.multi_image import generate_synthetic_images
+
+__all__ = ["MultiImageBenchmark", "MultiImageBenchmarkResults"]
+
+
+@dataclass
+class MultiImageBenchmarkResults:
+    """Benchmark results keyed by image count, with mean-latency accessors."""
+
+    results: dict[int, Any]  # {image_count: benchmark_result}
+
+    def _mean_by_count(self, metric: str) -> dict[int, float]:
+        """Map image count to ``requests.stats.<metric>.mean`` where present."""
+        means = {}
+        for img_count, result in self.results.items():
+            requests = getattr(result, "requests", None)
+            stats = getattr(requests, "stats", None) if requests else None
+            if hasattr(stats, metric):
+                means[img_count] = getattr(stats, metric).mean
+        return means
+
+    def ttft_by_count(self) -> dict[int, float]:
+        """Return mean TTFT (ms) for each image count."""
+        return self._mean_by_count("ttft_ms")
+
+    def itl_by_count(self) -> dict[int, float]:
+        """Return mean ITL (ms) for each image count."""
+        return self._mean_by_count("itl_ms")
+
+
+class MultiImageBenchmark:
+    """
+    Configuration helper for benchmarking latency across images-per-request counts.
+
+    Example:
+        bench = MultiImageBenchmark(
+            image_counts=[1, 2, 5],
+            prompt_tokens=256,
+            output_tokens=128,
+        )
+        config_dict = bench.get_configs()
+        # Use configs with benchmark runner
+    """
+
+    def __init__(
+        self,
+        image_counts: list[int],
+        prompt_tokens: int = 256,
+        output_tokens: int = 128,
+        image_size: str = "720p",
+        random_seed: int | None = None,
+        **kwargs: Any,
+    ) -> None:
+        """
+        Initialize multi-image benchmark configuration.
+
+        Args:
+            image_counts: Image counts to benchmark (e.g., [1, 2, 5]); stored sorted
+            prompt_tokens: Average prompt token count
+            output_tokens: Average output token count
+            image_size: Resolution key (e.g., "720p") used for configs and images
+            random_seed: Seed for generate_images(); get_configs() does not use it
+            **kwargs: Extra keyword arguments forwarded to MultiImageDatasetConfig
+        """
+        self.image_counts = sorted(image_counts)
+        self.prompt_tokens = prompt_tokens
+        self.output_tokens = output_tokens
+        self.image_size = image_size
+        self.random_seed = random_seed
+        self.kwargs = kwargs
+
+    def get_configs(self) -> dict[int, MultiImageDatasetConfig]:
+        """
+        Build one MultiImageDatasetConfig per configured image count.
+
+        Returns:
+            Dict mapping image_count to MultiImageDatasetConfig
+        """
+        return {
+            img_count: MultiImageDatasetConfig(
+                prompt_tokens=self.prompt_tokens,
+                output_tokens=self.output_tokens,
+                images_per_request=img_count,
+                image_size=self.image_size,
+                **self.kwargs,
+            )
+            for img_count in self.image_counts
+        }
+
+    def generate_images(self, img_count: int) -> tuple[list[dict[str, Any]], int, int]:
+        """
+        Generate synthetic images using this benchmark's image size and seed.
+
+        Args:
+            img_count: Number of images to generate
+
+        Returns:
+            Tuple of (images_list, total_pixels, total_bytes)
+        """
+        return generate_synthetic_images(
+            num_images=img_count,
+            image_size=self.image_size,
+            seed=self.random_seed,
+        )
+
+    def get_image_stats(self, img_count: int) -> dict[str, int]:
+        """
+        Generate ``img_count`` images and summarize their pixel and byte sizes.
+
+        Args:
+            img_count: Number of images
+
+        Returns:
+            Dict of image_count, totals, and floor-divided per-image values
+        """
+        _, total_pixels, total_bytes = self.generate_images(img_count)
+        return {
+            "image_count": img_count,
+            "total_pixels": total_pixels,
+            "total_bytes": total_bytes,
+            "pixels_per_image": (total_pixels // img_count) if img_count > 0 else 0,
+            "bytes_per_image": (total_bytes // img_count) if img_count > 0 else 0,
+        }
diff --git a/src/guidellm/cli/benchmark/run.py b/src/guidellm/cli/benchmark/run.py
index cdeaffc1c..59bbca0f2 100644
--- a/src/guidellm/cli/benchmark/run.py
+++ b/src/guidellm/cli/benchmark/run.py
@@ -170,6 +170,16 @@
     type=int,
     help="Random seed for reproducibility.",
 )
+@click.option(
+    "--images-per-request",
+    type=str,
+    default=None,
+    help=(
+        "Number of images per request for vision benchmarks. "
+        "Single value (e.g., '2') or comma-separated list (e.g., '1,2,5'). "
+        "When a list is provided, runs sequential benchmarks for each count."
+    ),
+)
 # Output configuration
 @click.option(
     "--output-dir",
@@ -325,6 +335,18 @@ def run(**kwargs):  # noqa: C901, PLR0915, PLR0912
     # Only set CLI args that differ from click defaults
     kwargs = cli_tools.set_if_not_default(ctx, **kwargs)
 
+    # Parse --images-per-request: a single int or a comma-separated list of ints
+    images_per_request = kwargs.pop("images_per_request", None)
+    if images_per_request:
+        try:
+            parts = [int(x.strip()) for x in images_per_request.split(",")]
+        except ValueError as err:
+            raise click.BadParameter(
+                "--images-per-request must be an integer or comma-separated "
+                f"integers, got '{images_per_request}'"
+            ) from err
+        kwargs["images_per_request"] = parts if len(parts) > 1 else parts[0]
+
     # Handle output path remapping
     if (output_path := kwargs.pop("output_path", None)) is not None:
         if kwargs.get("output_dir", None) is not None:
diff --git a/src/guidellm/data/deserializers/__init__.py b/src/guidellm/data/deserializers/__init__.py
index ebaca6fc0..f7fa00aa1 100644
--- a/src/guidellm/data/deserializers/__init__.py
+++ b/src/guidellm/data/deserializers/__init__.py
@@ -23,6 +23,7 @@
     InMemoryItemListDataArgs,
     InMemoryItemListDatasetDeserializer,
 )
+from .multi_image import MultiImageDataArgs, MultiImageDatasetDeserializer
 from .synthetic import (
     SyntheticTextDataArgs,
     SyntheticTextDataset,
@@ -49,6 +50,8 @@
     "InMemoryItemListDataArgs",
     "InMemoryItemListDatasetDeserializer",
     "JSONFileDatasetDeserializer",
+    "MultiImageDataArgs",
+    "MultiImageDatasetDeserializer",
     "ParquetFileDatasetDeserializer",
     "SyntheticTextDataArgs",
     "SyntheticTextDataset",
diff --git a/src/guidellm/data/deserializers/multi_image.py b/src/guidellm/data/deserializers/multi_image.py
new file mode 100644
index 000000000..b5626e54f
--- /dev/null
+++ b/src/guidellm/data/deserializers/multi_image.py
@@ -0,0 +1,82 @@
+"""Multi-image synthetic data deserializer for vision benchmarking."""
+
+from __future__ import annotations
+
+from collections.abc import Callable
+from typing import Literal
+
+from datasets import IterableDataset
+from pydantic import Field, field_validator
+from transformers import PreTrainedTokenizerBase
+
+from guidellm.data.deserializers.deserializer import (
+    DatasetDeserializerFactory,
+)
+from guidellm.data.deserializers.synthetic import (
+    SyntheticTextDataArgs,
+    SyntheticTextDataset,
+    SyntheticTextDatasetDeserializer,
+)
+from guidellm.data.generators.multi_image import ImageSize
+from guidellm.data.schemas import DataArgs
+
+__all__ = [
+    "MultiImageDataArgs",
+    "MultiImageDatasetDeserializer",
+]
+
+_VALID_IMAGE_SIZES = sorted(ImageSize.SIZES.keys())
+
+
+@DataArgs.register("multi_image")
+class MultiImageDataArgs(SyntheticTextDataArgs):
+    """
+    Synthetic text data args plus per-request image settings.
+
+    Adds ``images_per_request`` (1-10) and a validated ``image_size`` key.
+    """
+
+    kind: Literal["multi_image"] = Field(  # type: ignore[assignment]
+        description="Type identifier for the multi-image dataset configuration.",
+        default="multi_image",
+    )
+    images_per_request: int = Field(
+        default=1,
+        ge=1,
+        le=10,
+        description="Number of images to include per request.",
+    )
+    image_size: str = Field(
+        default="720p",
+        description=(
+            f"Standard image resolution key. Valid values: {_VALID_IMAGE_SIZES}."
+        ),
+    )
+
+    @field_validator("image_size")
+    @classmethod
+    def validate_image_size(cls, value: str) -> str:
+        """Reject resolution keys that are not defined in ``ImageSize.SIZES``."""
+        if value in ImageSize.SIZES:
+            return value
+        message = f"Invalid image_size {value!r}. Valid options: {_VALID_IMAGE_SIZES}"
+        raise ValueError(message)
+
+
+# Keep the old name as an alias for backwards compatibility within this PR.
+MultiImageDatasetConfig = MultiImageDataArgs
+
+
+@DatasetDeserializerFactory.register("multi_image")
+class MultiImageDatasetDeserializer(SyntheticTextDatasetDeserializer):
+    """Deserializer registered as ``multi_image``; builds a SyntheticTextDataset."""
+
+    def __call__(
+        self,
+        config: MultiImageDataArgs,
+        processor_factory: Callable[[], PreTrainedTokenizerBase],
+        random_seed: int,
+    ) -> IterableDataset:
+        return SyntheticTextDataset(
+            config=config, processor=processor_factory(), random_seed=random_seed
+        )
diff --git a/src/guidellm/data/generators/__init__.py b/src/guidellm/data/generators/__init__.py
new file mode 100644
index 000000000..db12af181
--- /dev/null
+++ b/src/guidellm/data/generators/__init__.py
@@ -0,0 +1,7 @@
+"""Data generators for GuideLLM."""
+
+from guidellm.data.generators.multi_image import generate_synthetic_images
+
+__all__ = [
+    "generate_synthetic_images",
+]
diff --git a/src/guidellm/data/generators/multi_image.py b/src/guidellm/data/generators/multi_image.py
new file mode 100644
index 000000000..bb2973a48
--- /dev/null
+++ b/src/guidellm/data/generators/multi_image.py
@@ -0,0 +1,78 @@
+"""Multi-image synthetic data generation for benchmarking."""
+
+from __future__ import annotations
+
+import base64
+import io
+from typing import Any
+
+import numpy as np
+
+try:
+    from PIL import Image as PILImage
+except ImportError as e:
+    raise ImportError(
+        "Please install guidellm[vision] to use multi-image features"
+    ) from e
+
+__all__ = ["generate_synthetic_images", "ImageSize"]
+
+
+class ImageSize:
+    """Named resolution presets, each mapped to a (width, height) pixel pair."""
+
+    SIZES = {
+        "480p": (854, 480),
+        "720p": (1280, 720),
+        "1080p": (1920, 1080),
+        "1440p": (2560, 1440),
+        "4k": (3840, 2160),
+    }
+
+
+def generate_synthetic_images(
+    num_images: int,
+    image_size: str = "720p",
+    seed: int | None = None,
+) -> tuple[list[dict[str, Any]], int, int]:
+    """
+    Generate N synthetic JPEG images as base64-encoded data URLs.
+
+    Args:
+        num_images: Number of images to generate
+        image_size: Key into ImageSize.SIZES; unknown keys fall back to 1280x720
+        seed: Seed for a local NumPy generator; None gives non-reproducible output
+
+    Returns:
+        Tuple of (images_list, total_pixels, total_bytes) where:
+        - images_list: List of dicts with keys "image", "image_pixels", "image_bytes"
+        - total_pixels: Total pixel count across all images
+        - total_bytes: Total JPEG byte size (pre-base64) across all images
+    """
+    # A local Generator keeps runs reproducible without reseeding NumPy's
+    # process-wide RNG, which would perturb unrelated random sampling.
+    rng = np.random.default_rng(seed)
+    width, height = ImageSize.SIZES.get(image_size, (1280, 720))
+    total_pixels = 0
+    total_bytes = 0
+    images = []
+
+    for _ in range(num_images):
+        image_array = rng.integers(0, 256, (height, width, 3), dtype=np.uint8)
+        pil_image = PILImage.fromarray(image_array, mode="RGB")
+
+        buffer = io.BytesIO()
+        pil_image.save(buffer, format="JPEG", quality=85)
+        image_bytes = buffer.getvalue()
+        image_base64 = base64.b64encode(image_bytes).decode("utf-8")
+
+        images.append({
+            "image": f"data:image/jpeg;base64,{image_base64}",
+            "image_pixels": width * height,
+            "image_bytes": len(image_bytes),
+        })
+
+        total_pixels += width * height
+        total_bytes += len(image_bytes)
+
+    return images, total_pixels, total_bytes
diff --git a/tests/unit/data/generators/test_multi_image_generator.py b/tests/unit/data/generators/test_multi_image_generator.py
new file mode 100644
index 000000000..87445431a
--- /dev/null
+++ b/tests/unit/data/generators/test_multi_image_generator.py
@@ -0,0 +1,81 @@
+"""Tests for multi-image generator."""
+
+import base64
+import io
+
+import pytest
+from PIL import Image as PILImage
+
+from guidellm.data.generators.multi_image import (
+    generate_synthetic_images,
+    ImageSize,
+)
+
+
+class TestGenerateSyntheticImages:
+    """Tests for the resolution presets and synthetic JPEG generation."""
+
+    def test_image_size_720p(self):
+        """Test the 720p preset maps to 1280x720."""
+        width, height = ImageSize.SIZES["720p"]
+        assert width == 1280
+        assert height == 720
+
+    def test_generates_correct_count(self):
+        """Test correct number of images generated."""
+        images, _, _ = generate_synthetic_images(3, image_size="720p", seed=42)
+        assert len(images) == 3
+
+    def test_single_image(self):
+        """Test generating a single image."""
+        images, total_pixels, total_bytes = generate_synthetic_images(
+            1, image_size="720p", seed=42
+        )
+        assert len(images) == 1
+        assert images[0]["image"].startswith("data:image/jpeg;base64,")
+        assert total_pixels == 1280 * 720
+        assert total_bytes > 0
+
+    def test_image_base64_encoding(self):
+        """Test image is valid base64-encoded JPEG."""
+        images, _, _ = generate_synthetic_images(1, image_size="720p", seed=42)
+        image_data = images[0]["image"]
+
+        # Strip the "data:image/jpeg;base64," prefix to get the payload
+        _, encoded = image_data.split(",", 1)
+        image_bytes = base64.b64decode(encoded)
+
+        # Verify it's a valid JPEG
+        pil_image = PILImage.open(io.BytesIO(image_bytes))
+        assert pil_image.format == "JPEG"
+        assert pil_image.size == (1280, 720)
+        assert pil_image.mode == "RGB"
+
+    def test_total_pixels_calculation(self):
+        """Test total pixels is sum of all image pixels."""
+        images, total_pixels, _ = generate_synthetic_images(5, image_size="720p")
+        expected_pixels = len(images) * 1280 * 720
+        assert total_pixels == expected_pixels
+
+    def test_total_bytes_calculation(self):
+        """Test total bytes is sum of all image bytes."""
+        images, _, total_bytes = generate_synthetic_images(3, image_size="720p")
+        expected_bytes = sum(img["image_bytes"] for img in images)
+        assert total_bytes == expected_bytes
+
+    def test_reproducible_with_seed(self):
+        """Test same seed produces same images."""
+        images1, _, _ = generate_synthetic_images(2, image_size="720p", seed=42)
+        images2, _, _ = generate_synthetic_images(2, image_size="720p", seed=42)
+
+        assert len(images1) == len(images2)
+        # Same seed = same base64 strings
+        assert images1[0]["image"] == images2[0]["image"]
+        assert images1[1]["image"] == images2[1]["image"]
+
+    def test_different_seed_produces_different_images(self):
+        """Test different seeds produce different images."""
+        images1, _, _ = generate_synthetic_images(1, image_size="720p", seed=42)
+        images2, _, _ = generate_synthetic_images(1, image_size="720p", seed=99)
+
+        assert images1[0]["image"] != images2[0]["image"]
diff --git a/tests/unit/data/test_multi_image_config.py b/tests/unit/data/test_multi_image_config.py
new file mode 100644
index 000000000..3f31f37fe
--- /dev/null
+++ b/tests/unit/data/test_multi_image_config.py
@@ -0,0 +1,80 @@
+"""Tests for MultiImageDatasetConfig."""
+
+import pytest
+from pydantic import ValidationError
+
+from guidellm.data.deserializers.multi_image import MultiImageDataArgs as MultiImageDatasetConfig
+
+
+class TestMultiImageDatasetConfig:
+    """Test MultiImageDataArgs (imported as MultiImageDatasetConfig) validation."""
+
+    def test_defaults(self):
+        """Test default values."""
+        config = MultiImageDatasetConfig(
+            prompt_tokens=256,
+            output_tokens=128,
+        )
+        assert config.images_per_request == 1
+        assert config.image_size == "720p"
+
+    def test_explicit_image_count(self):
+        """Test setting explicit image count."""
+        config = MultiImageDatasetConfig(
+            prompt_tokens=256,
+            output_tokens=128,
+            images_per_request=5,
+        )
+        assert config.images_per_request == 5
+
+    def test_invalid_image_count_zero(self):
+        """Test that 0 images is rejected."""
+        with pytest.raises(ValidationError) as exc_info:
+            MultiImageDatasetConfig(
+                prompt_tokens=256,
+                output_tokens=128,
+                images_per_request=0,
+            )
+        assert "greater than or equal to 1" in str(exc_info.value)
+
+    def test_invalid_image_count_exceeds_max(self):
+        """Test that > 10 images is rejected."""
+        with pytest.raises(ValidationError) as exc_info:
+            MultiImageDatasetConfig(
+                prompt_tokens=256,
+                output_tokens=128,
+                images_per_request=11,
+            )
+        assert "less than or equal to 10" in str(exc_info.value)
+
+    @pytest.mark.parametrize("size", ["480p", "720p", "1080p", "1440p", "4k"])
+    def test_valid_image_sizes(self, size):
+        """Test all supported image resolution keys are accepted."""
+        config = MultiImageDatasetConfig(
+            prompt_tokens=256,
+            output_tokens=128,
+            image_size=size,
+        )
+        assert config.image_size == size
+
+    def test_invalid_image_size(self):
+        """Test that an unknown image size key is rejected."""
+        with pytest.raises(ValidationError):
+            MultiImageDatasetConfig(
+                prompt_tokens=256,
+                output_tokens=128,
+                image_size="8k",
+            )
+
+    def test_inherits_synthetic_text_fields(self):
+        """Test that fields inherited from SyntheticTextDataArgs are accepted."""
+        config = MultiImageDatasetConfig(
+            prompt_tokens=256,
+            output_tokens=128,
+            prompt_tokens_stdev=50,
+            output_tokens_max=256,
+        )
+        assert config.prompt_tokens == 256
+        assert config.output_tokens == 128
+        assert config.prompt_tokens_stdev == 50
+        assert config.output_tokens_max == 256