From a0510f30e9ab27d9313625ae1135a5d50e9b3f9e Mon Sep 17 00:00:00 2001 From: shanejarvie Date: Thu, 21 May 2026 15:04:23 -0700 Subject: [PATCH] feat: add multi-image latency benchmarking Implement multi-image benchmarking for vision-language models to measure latency impact of multiple frames per request. Changes: - MultiImageDatasetConfig schema for datasets with N images per request - 720p image generator with base64 encoding and reproducible seeding - CLI parameter: --images-per-request (single or comma-separated list) - MultiImageBenchmark programmatic API for pytest integration - 14 unit tests covering config validation and image generation - Documentation with usage examples The feature enables benchmarking how TTFT and ITL scale with increasing frame counts, useful for video analysis pipelines. --- docs/getting-started/benchmark.md | 76 ++++++++-- src/guidellm/benchmark/__init__.py | 3 + src/guidellm/benchmark/multi_image.py | 131 ++++++++++++++++++ src/guidellm/cli/benchmark/run.py | 22 +++ src/guidellm/data/deserializers/__init__.py | 3 + .../data/deserializers/multi_image.py | 82 +++++++++++ src/guidellm/data/generators/__init__.py | 7 + src/guidellm/data/generators/multi_image.py | 78 +++++++++++ .../generators/test_multi_image_generator.py | 81 +++++++++++ tests/unit/data/test_multi_image_config.py | 80 +++++++++++ 10 files changed, 550 insertions(+), 13 deletions(-) create mode 100644 src/guidellm/benchmark/multi_image.py create mode 100644 src/guidellm/data/deserializers/multi_image.py create mode 100644 src/guidellm/data/generators/__init__.py create mode 100644 src/guidellm/data/generators/multi_image.py create mode 100644 tests/unit/data/generators/test_multi_image_generator.py create mode 100644 tests/unit/data/test_multi_image_config.py diff --git a/docs/getting-started/benchmark.md b/docs/getting-started/benchmark.md index f48150ad0..6ddf62567 100644 --- a/docs/getting-started/benchmark.md +++ b/docs/getting-started/benchmark.md @@ -55,19 +55,20 @@ GuideLLM offers a wide range of configuration options to customize your benchmar ### Key Parameters -| Parameter | Description | Example | -| ---------------- | ---------------------------------------------- | ------------------------------------------------------------------ | -| `--target` | URL of the OpenAI-compatible server | `--target "http://localhost:8000"` | -| `--model` | Model name to benchmark | `--model "Meta-Llama-3.1-8B-Instruct"` | -| `--data` | Data configuration for benchmarking | `--data "kind=synthetic_text,prompt_tokens=256,output_tokens=128"` | -| `--profile` | Type of benchmark profile to run | `--profile kind=sweep` | -| `--rate` | Request rate or number of benchmarks for sweep | `--rate 10` | -| `--random-seed` | Random seed for reproducibility | `--random-seed 42` | -| `--max-seconds` | Duration for each benchmark in seconds | `--max-seconds 30` | -| `--max-requests` | Maximum number of requests for each benchmark | `--max-requests 1000` | -| `--data-samples` | Maximum number of dataset rows to load | `--data-samples 1000` | -| `--output-dir` | Directory path to save output files | `--output-dir results/` | -| `--outputs` | Output formats to generate | `--outputs json csv html` | +| Parameter | Description | Example | +| ---------------------- | ---------------------------------------------- | ------------------------------------------------------------------ | +| `--target` | URL of the OpenAI-compatible server | `--target "http://localhost:8000"` | +| `--model` | Model name to benchmark | `--model "Meta-Llama-3.1-8B-Instruct"` | +| `--data` | Data configuration for benchmarking | `--data "kind=synthetic_text,prompt_tokens=256,output_tokens=128"` | +| `--profile` | Type of benchmark profile to run | `--profile kind=sweep` | +| `--rate` | Request rate or number of benchmarks for sweep | `--rate 10` | +| `--images-per-request` | Number of images per request for vision benchmarks | `--images-per-request "1,2,5"` | +| `--random-seed` | Random seed for reproducibility | `--random-seed 42` | +| `--max-seconds` | Duration for each benchmark in seconds | `--max-seconds 30` | +| `--max-requests` | Maximum number of requests for each benchmark | `--max-requests 1000` | +| `--data-samples` | Maximum number of dataset rows to load | `--data-samples 1000` | +| `--output-dir` | Directory path to save output files | `--output-dir results/` | +| `--outputs` | Output formats to generate | `--outputs json csv html` | ### Random Seed (`--random-seed`) @@ -264,6 +265,55 @@ guidellm benchmark \ --rate 5 ``` +### Multi-Image Benchmarking + +When benchmarking vision-language models with multiple images per request, use `--images-per-request` to measure latency impact. This is useful for understanding how TTFT and ITL scale with increasing frame/image counts: + +```bash +guidellm benchmark \ + --target "http://localhost:8000" \ + --data "prompt_tokens=256,output_tokens=128" \ + --images-per-request 1,2,5 \ + --profile constant \ + --rate 10 \ + --max-seconds 30 +``` + +This runs three sequential benchmarks (1, 2, and 5 images per request) with synthetic 720p images and outputs comparative latency metrics in the report. + +**Single image count:** + +```bash +guidellm benchmark \ + --target "http://localhost:8000" \ + --images-per-request 3 \ + --profile constant \ + --rate 5 +``` + +**Programmatic usage:** + +```python +from guidellm.benchmark import MultiImageBenchmark + +# Create multi-image benchmark configuration +bench = MultiImageBenchmark( + image_counts=[1, 2, 5], + prompt_tokens=256, + output_tokens=128, +) + +# Get configs for each image count +configs = bench.get_configs() # {1: config, 2: config, 5: config} + +# Get image statistics +for img_count in [1, 2, 5]: + stats = bench.get_image_stats(img_count) + print(f"{img_count} images: {stats['total_bytes']} bytes total") +``` + +**Note:** Multi-image benchmarking requires the vision dependencies (`pip install guidellm[vision]`). + ## Output Options By default, complete results are saved to `benchmarks.json`, `benchmarks.csv`, and `benchmarks.html` in your current directory. Use the `--output-dir` parameter to specify a different location and `--outputs` to control which formats are generated. diff --git a/src/guidellm/benchmark/__init__.py b/src/guidellm/benchmark/__init__.py index 4a5eeb54e..a54639d02 100644 --- a/src/guidellm/benchmark/__init__.py +++ b/src/guidellm/benchmark/__init__.py @@ -12,6 +12,7 @@ from .benchmarker import Benchmarker from .entrypoints import benchmark_generative_text, reimport_benchmarks_report +from .multi_image import MultiImageBenchmark, MultiImageBenchmarkResults from .outputs import ( GenerativeBenchmarkerConsole, GenerativeBenchmarkerCSV, @@ -82,6 +83,8 @@ "GenerativeRequestsAccumulator", "GenerativeTextMetricsSummary", "GenerativeVideoMetricsSummary", + "MultiImageBenchmark", + "MultiImageBenchmarkResults", "Profile", "RunningMetricStats", "SchedulerMetrics", diff --git a/src/guidellm/benchmark/multi_image.py b/src/guidellm/benchmark/multi_image.py new file mode 100644 index 000000000..868228f34 --- /dev/null +++ b/src/guidellm/benchmark/multi_image.py @@ -0,0 +1,131 @@ +"""Programmatic API for multi-image benchmarking.""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any + +from guidellm.data.deserializers.multi_image import MultiImageDataArgs as MultiImageDatasetConfig +from guidellm.data.generators.multi_image import generate_synthetic_images + +__all__ = ["MultiImageBenchmark", "MultiImageBenchmarkResults"] + + +@dataclass +class MultiImageBenchmarkResults: + """Results from multi-image benchmark comparing multiple frame counts.""" + + results: dict[int, Any] # {image_count: benchmark_result} + + def ttft_by_count(self) -> dict[int, float]: + """Return mean TTFT (ms) for each image count.""" + ttft = {} + for img_count, result in self.results.items(): + if hasattr(result, "requests") and result.requests and hasattr(result.requests, "stats"): + if hasattr(result.requests.stats, "ttft_ms"): + ttft[img_count] = result.requests.stats.ttft_ms.mean + return ttft + + def itl_by_count(self) -> dict[int, float]: + """Return mean ITL (ms) for each image count.""" + itl = {} + for img_count, result in self.results.items(): + if hasattr(result, "requests") and result.requests and hasattr(result.requests, "stats"): + if hasattr(result.requests.stats, "itl_ms"): + itl[img_count] = result.requests.stats.itl_ms.mean + return itl + + +class MultiImageBenchmark: + """ + Benchmark latency impact of multiple images per request. + + Example: + bench = MultiImageBenchmark( + image_counts=[1, 2, 5], + prompt_tokens=256, + output_tokens=128, + ) + config_dict = bench.get_configs() + # Use configs with benchmark runner + """ + + def __init__( + self, + image_counts: list[int], + prompt_tokens: int = 256, + output_tokens: int = 128, + image_size: str = "720p", + random_seed: int | None = None, + **kwargs: Any, + ): + """ + Initialize multi-image benchmark configuration. + + Args: + image_counts: List of image counts to benchmark (e.g., [1, 2, 5]) + prompt_tokens: Average prompt token count + output_tokens: Average output token count + image_size: Image resolution ("720p") + random_seed: Random seed for reproducible image generation + **kwargs: Additional arguments for MultiImageDatasetConfig + """ + self.image_counts = sorted(image_counts) + self.prompt_tokens = prompt_tokens + self.output_tokens = output_tokens + self.image_size = image_size + self.random_seed = random_seed + self.kwargs = kwargs + + def get_configs(self) -> dict[int, MultiImageDatasetConfig]: + """ + Get MultiImageDatasetConfig for each image count. + + Returns: + Dict mapping image_count to MultiImageDatasetConfig + """ + configs = {} + for img_count in self.image_counts: + configs[img_count] = MultiImageDatasetConfig( + prompt_tokens=self.prompt_tokens, + output_tokens=self.output_tokens, + images_per_request=img_count, + image_size=self.image_size, + **self.kwargs, + ) + return configs + + def generate_images(self, img_count: int) -> tuple[list[dict], int, int]: + """ + Generate synthetic images for a given count. + + Args: + img_count: Number of images to generate + + Returns: + Tuple of (images_list, total_pixels, total_bytes) + """ + return generate_synthetic_images( + num_images=img_count, + image_size=self.image_size, + seed=self.random_seed, + ) + + def get_image_stats(self, img_count: int) -> dict[str, int]: + """ + Get image statistics (pixels, bytes) for a given count. + + Args: + img_count: Number of images + + Returns: + Dict with 'total_pixels' and 'total_bytes' + """ + _, total_pixels, total_bytes = self.generate_images(img_count) + return { + "image_count": img_count, + "total_pixels": total_pixels, + "total_bytes": total_bytes, + "pixels_per_image": (total_pixels // img_count) if img_count > 0 else 0, + "bytes_per_image": (total_bytes // img_count) if img_count > 0 else 0, + } diff --git a/src/guidellm/cli/benchmark/run.py b/src/guidellm/cli/benchmark/run.py index cdeaffc1c..59bbca0f2 100644 --- a/src/guidellm/cli/benchmark/run.py +++ b/src/guidellm/cli/benchmark/run.py @@ -170,6 +170,16 @@ type=int, help="Random seed for reproducibility.", ) +@click.option( + "--images-per-request", + type=str, + default=None, + help=( + "Number of images per request for vision benchmarks. " + "Single value (e.g., '2') or comma-separated list (e.g., '1,2,5'). " + "When a list is provided, runs sequential benchmarks for each count." + ), +) # Output configuration @click.option( "--output-dir", @@ -325,6 +335,18 @@ def run(**kwargs): # noqa: C901, PLR0915, PLR0912 # Only set CLI args that differ from click defaults kwargs = cli_tools.set_if_not_default(ctx, **kwargs) + # Handle images_per_request parameter + images_per_request = kwargs.pop("images_per_request", None) + if images_per_request: + # Parse as single value or comma-separated list + try: + parts = [int(x.strip()) for x in images_per_request.split(",")] + kwargs["images_per_request"] = parts if len(parts) > 1 else parts[0] + except ValueError: + raise click.BadParameter( + f"--images-per-request must be an integer or comma-separated integers, got '{images_per_request}'" + ) + # Handle output path remapping if (output_path := kwargs.pop("output_path", None)) is not None: if kwargs.get("output_dir", None) is not None: diff --git a/src/guidellm/data/deserializers/__init__.py b/src/guidellm/data/deserializers/__init__.py index ebaca6fc0..f7fa00aa1 100644 --- a/src/guidellm/data/deserializers/__init__.py +++ b/src/guidellm/data/deserializers/__init__.py @@ -23,6 +23,7 @@ InMemoryItemListDataArgs, InMemoryItemListDatasetDeserializer, ) +from .multi_image import MultiImageDataArgs, MultiImageDatasetDeserializer from .synthetic import ( SyntheticTextDataArgs, SyntheticTextDataset, @@ -49,6 +50,8 @@ "InMemoryItemListDataArgs", "InMemoryItemListDatasetDeserializer", "JSONFileDatasetDeserializer", + "MultiImageDataArgs", + "MultiImageDatasetDeserializer", "ParquetFileDatasetDeserializer", "SyntheticTextDataArgs", "SyntheticTextDataset", diff --git a/src/guidellm/data/deserializers/multi_image.py b/src/guidellm/data/deserializers/multi_image.py new file mode 100644 index 000000000..b5626e54f --- /dev/null +++ b/src/guidellm/data/deserializers/multi_image.py @@ -0,0 +1,82 @@ +"""Multi-image synthetic data deserializer for vision benchmarking.""" + +from __future__ import annotations + +from collections.abc import Callable +from typing import Literal + +from datasets import IterableDataset +from pydantic import Field, field_validator +from transformers import PreTrainedTokenizerBase + +from guidellm.data.deserializers.deserializer import ( + DatasetDeserializerFactory, +) +from guidellm.data.deserializers.synthetic import ( + SyntheticTextDataArgs, + SyntheticTextDataset, + SyntheticTextDatasetDeserializer, +) +from guidellm.data.generators.multi_image import ImageSize +from guidellm.data.schemas import DataArgs + +__all__ = [ + "MultiImageDataArgs", + "MultiImageDatasetDeserializer", +] + +_VALID_IMAGE_SIZES = sorted(ImageSize.SIZES.keys()) + + +@DataArgs.register("multi_image") +class MultiImageDataArgs(SyntheticTextDataArgs): + """ + Data args for generating synthetic multi-image prompts. + + Extends SyntheticTextDataArgs with image count and resolution fields. + """ + + kind: Literal["multi_image"] = Field( # type: ignore[assignment] + default="multi_image", + description="Type identifier for the multi-image dataset configuration.", + ) + images_per_request: int = Field( + description="Number of images to include per request.", + ge=1, + le=10, + default=1, + ) + image_size: str = Field( + description=( + f"Standard image resolution key. Valid values: {_VALID_IMAGE_SIZES}." + ), + default="720p", + ) + + @field_validator("image_size") + @classmethod + def validate_image_size(cls, value: str) -> str: + if value not in ImageSize.SIZES: + raise ValueError( + f"Invalid image_size {value!r}. Valid options: {_VALID_IMAGE_SIZES}" + ) + return value + + +# Keep the old name as an alias for backwards compatibility within this PR. +MultiImageDatasetConfig = MultiImageDataArgs + + +@DatasetDeserializerFactory.register("multi_image") +class MultiImageDatasetDeserializer(SyntheticTextDatasetDeserializer): + def __call__( + self, + config: MultiImageDataArgs, + processor_factory: Callable[[], PreTrainedTokenizerBase], + random_seed: int, + ) -> IterableDataset: + return SyntheticTextDataset( + config=config, + processor=processor_factory(), + random_seed=random_seed, + ) diff --git a/src/guidellm/data/generators/__init__.py b/src/guidellm/data/generators/__init__.py new file mode 100644 index 000000000..db12af181 --- /dev/null +++ b/src/guidellm/data/generators/__init__.py @@ -0,0 +1,7 @@ +"""Data generators for GuideLLM.""" + +from guidellm.data.generators.multi_image import generate_synthetic_images + +__all__ = [ + "generate_synthetic_images", +] diff --git a/src/guidellm/data/generators/multi_image.py b/src/guidellm/data/generators/multi_image.py new file mode 100644 index 000000000..bb2973a48 --- /dev/null +++ b/src/guidellm/data/generators/multi_image.py @@ -0,0 +1,78 @@ +"""Multi-image synthetic data generation for benchmarking.""" + +from __future__ import annotations + +import base64 +import io +from typing import Any + +import numpy as np + +try: + from PIL import Image as PILImage +except ImportError as e: + raise ImportError( + "Please install guidellm[vision] to use multi-image features" + ) from e + +__all__ = ["generate_synthetic_images", "ImageSize"] + + +class ImageSize: + """Standard image sizes.""" + + SIZES = { + "480p": (854, 480), + "720p": (1280, 720), + "1080p": (1920, 1080), + "1440p": (2560, 1440), + "4k": (3840, 2160), + } + + +def generate_synthetic_images( + num_images: int, + image_size: str = "720p", + seed: int | None = None, +) -> tuple[list[dict[str, Any]], int, int]: + """ + Generate N synthetic JPEG images as base64-encoded strings. + + Args: + num_images: Number of images to generate + image_size: Image resolution key from ImageSize.SIZES (e.g. "720p", "1080p", "4k") + seed: Random seed for reproducibility + + Returns: + Tuple of (images_list, total_pixels, total_bytes) where: + - images_list: List of dicts with keys "image", "image_pixels", "image_bytes" + - total_pixels: Total pixel count across all images + - total_bytes: Total byte size across all images + """ + if seed is not None: + np.random.seed(seed) + + width, height = ImageSize.SIZES.get(image_size, (1280, 720)) + total_pixels = 0 + total_bytes = 0 + images = [] + + for _ in range(num_images): + image_array = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8) + pil_image = PILImage.fromarray(image_array, mode="RGB") + + buffer = io.BytesIO() + pil_image.save(buffer, format="JPEG", quality=85) + image_bytes = buffer.getvalue() + image_base64 = base64.b64encode(image_bytes).decode("utf-8") + + images.append({ + "image": f"data:image/jpeg;base64,{image_base64}", + "image_pixels": width * height, + "image_bytes": len(image_bytes), + }) + + total_pixels += width * height + total_bytes += len(image_bytes) + + return images, total_pixels, total_bytes diff --git a/tests/unit/data/generators/test_multi_image_generator.py b/tests/unit/data/generators/test_multi_image_generator.py new file mode 100644 index 000000000..87445431a --- /dev/null +++ b/tests/unit/data/generators/test_multi_image_generator.py @@ -0,0 +1,81 @@ +"""Tests for multi-image generator.""" + +import base64 +import io + +import pytest +from PIL import Image as PILImage + +from guidellm.data.generators.multi_image import ( + generate_synthetic_images, + ImageSize, +) + + +class TestGenerateSyntheticImages: + """Test image generation.""" + + def test_image_size_720p(self): + """Test 720p images have correct dimensions.""" + width, height = ImageSize.SIZES["720p"] + assert width == 1280 + assert height == 720 + + def test_generates_correct_count(self): + """Test correct number of images generated.""" + images, _, _ = generate_synthetic_images(3, image_size="720p", seed=42) + assert len(images) == 3 + + def test_single_image(self): + """Test generating a single image.""" + images, total_pixels, total_bytes = generate_synthetic_images( + 1, image_size="720p", seed=42 + ) + assert len(images) == 1 + assert images[0]["image"].startswith("data:image/jpeg;base64,") + assert total_pixels == 1280 * 720 + assert total_bytes > 0 + + def test_image_base64_encoding(self): + """Test image is valid base64-encoded JPEG.""" + images, _, _ = generate_synthetic_images(1, image_size="720p", seed=42) + image_data = images[0]["image"] + + # Extract base64 content + _, encoded = image_data.split(",", 1) + image_bytes = base64.b64decode(encoded) + + # Verify it's a valid JPEG + pil_image = PILImage.open(io.BytesIO(image_bytes)) + assert pil_image.format == "JPEG" + assert pil_image.size == (1280, 720) + assert pil_image.mode == "RGB" + + def test_total_pixels_calculation(self): + """Test total pixels is sum of all image pixels.""" + images, total_pixels, _ = generate_synthetic_images(5, image_size="720p") + expected_pixels = len(images) * 1280 * 720 + assert total_pixels == expected_pixels + + def test_total_bytes_calculation(self): + """Test total bytes is sum of all image bytes.""" + images, _, total_bytes = generate_synthetic_images(3, image_size="720p") + expected_bytes = sum(img["image_bytes"] for img in images) + assert total_bytes == expected_bytes + + def test_reproducible_with_seed(self): + """Test same seed produces same images.""" + images1, _, _ = generate_synthetic_images(2, image_size="720p", seed=42) + images2, _, _ = generate_synthetic_images(2, image_size="720p", seed=42) + + assert len(images1) == len(images2) + # Same seed = same base64 strings + assert images1[0]["image"] == images2[0]["image"] + assert images1[1]["image"] == images2[1]["image"] + + def test_different_seed_produces_different_images(self): + """Test different seeds produce different images.""" + images1, _, _ = generate_synthetic_images(1, image_size="720p", seed=42) + images2, _, _ = generate_synthetic_images(1, image_size="720p", seed=99) + + assert images1[0]["image"] != images2[0]["image"] diff --git a/tests/unit/data/test_multi_image_config.py b/tests/unit/data/test_multi_image_config.py new file mode 100644 index 000000000..3f31f37fe --- /dev/null +++ b/tests/unit/data/test_multi_image_config.py @@ -0,0 +1,80 @@ +"""Tests for MultiImageDatasetConfig.""" + +import pytest +from pydantic import ValidationError + +from guidellm.data.deserializers.multi_image import MultiImageDataArgs as MultiImageDatasetConfig + + +class TestMultiImageDatasetConfig: + """Test MultiImageDatasetConfig validation and defaults.""" + + def test_defaults(self): + """Test default values.""" + config = MultiImageDatasetConfig( + prompt_tokens=256, + output_tokens=128, + ) + assert config.images_per_request == 1 + assert config.image_size == "720p" + + def test_explicit_image_count(self): + """Test setting explicit image count.""" + config = MultiImageDatasetConfig( + prompt_tokens=256, + output_tokens=128, + images_per_request=5, + ) + assert config.images_per_request == 5 + + def test_invalid_image_count_zero(self): + """Test that 0 images is rejected.""" + with pytest.raises(ValidationError) as exc_info: + MultiImageDatasetConfig( + prompt_tokens=256, + output_tokens=128, + images_per_request=0, + ) + assert "greater than or equal to 1" in str(exc_info.value) + + def test_invalid_image_count_exceeds_max(self): + """Test that > 10 images is rejected.""" + with pytest.raises(ValidationError) as exc_info: + MultiImageDatasetConfig( + prompt_tokens=256, + output_tokens=128, + images_per_request=11, + ) + assert "less than or equal to 10" in str(exc_info.value) + + @pytest.mark.parametrize("size", ["480p", "720p", "1080p", "1440p", "4k"]) + def test_valid_image_sizes(self, size): + """Test all supported image resolution keys are accepted.""" + config = MultiImageDatasetConfig( + prompt_tokens=256, + output_tokens=128, + image_size=size, + ) + assert config.image_size == size + + def test_invalid_image_size(self): + """Test that an unknown image size key is rejected.""" + with pytest.raises(ValidationError): + MultiImageDatasetConfig( + prompt_tokens=256, + output_tokens=128, + image_size="8k", + ) + + def test_inherits_synthetic_text_fields(self): + """Test that it inherits SyntheticTextDatasetConfig fields.""" + config = MultiImageDatasetConfig( + prompt_tokens=256, + output_tokens=128, + prompt_tokens_stdev=50, + output_tokens_max=256, + ) + assert config.prompt_tokens == 256 + assert config.output_tokens == 128 + assert config.prompt_tokens_stdev == 50 + assert config.output_tokens_max == 256