Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 63 additions & 13 deletions docs/getting-started/benchmark.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,19 +55,20 @@ GuideLLM offers a wide range of configuration options to customize your benchmar

### Key Parameters

| Parameter | Description | Example |
| ---------------- | ---------------------------------------------- | ------------------------------------------------------------------ |
| `--target` | URL of the OpenAI-compatible server | `--target "http://localhost:8000"` |
| `--model` | Model name to benchmark | `--model "Meta-Llama-3.1-8B-Instruct"` |
| `--data` | Data configuration for benchmarking | `--data "kind=synthetic_text,prompt_tokens=256,output_tokens=128"` |
| `--profile` | Type of benchmark profile to run | `--profile kind=sweep` |
| `--rate` | Request rate or number of benchmarks for sweep | `--rate 10` |
| `--random-seed` | Random seed for reproducibility | `--random-seed 42` |
| `--max-seconds` | Duration for each benchmark in seconds | `--max-seconds 30` |
| `--max-requests` | Maximum number of requests for each benchmark | `--max-requests 1000` |
| `--data-samples` | Maximum number of dataset rows to load | `--data-samples 1000` |
| `--output-dir` | Directory path to save output files | `--output-dir results/` |
| `--outputs` | Output formats to generate | `--outputs json csv html` |
| Parameter | Description | Example |
| ---------------------- | ---------------------------------------------- | ------------------------------------------------------------------ |
| `--target` | URL of the OpenAI-compatible server | `--target "http://localhost:8000"` |
| `--model` | Model name to benchmark | `--model "Meta-Llama-3.1-8B-Instruct"` |
| `--data` | Data configuration for benchmarking | `--data "kind=synthetic_text,prompt_tokens=256,output_tokens=128"` |
| `--profile` | Type of benchmark profile to run | `--profile kind=sweep` |
| `--rate` | Request rate or number of benchmarks for sweep | `--rate 10` |
| `--images-per-request` | Number of images per request for vision benchmarks | `--images-per-request "1,2,5"` |
| `--random-seed` | Random seed for reproducibility | `--random-seed 42` |
| `--max-seconds` | Duration for each benchmark in seconds | `--max-seconds 30` |
| `--max-requests` | Maximum number of requests for each benchmark | `--max-requests 1000` |
| `--data-samples` | Maximum number of dataset rows to load | `--data-samples 1000` |
| `--output-dir` | Directory path to save output files | `--output-dir results/` |
| `--outputs` | Output formats to generate | `--outputs json csv html` |

### Random Seed (`--random-seed`)

Expand Down Expand Up @@ -264,6 +265,55 @@ guidellm benchmark \
--rate 5
```

### Multi-Image Benchmarking

When benchmarking vision-language models with multiple images per request, use `--images-per-request` to measure the latency impact. This is useful for understanding how time to first token (TTFT) and inter-token latency (ITL) scale as the number of frames or images per request increases:

```bash
guidellm benchmark \
--target "http://localhost:8000" \
--data "prompt_tokens=256,output_tokens=128" \
--images-per-request 1,2,5 \
--profile constant \
--rate 10 \
--max-seconds 30
```

This runs three sequential benchmarks (1, 2, and 5 images per request) using synthetic 720p images, and the report includes comparative latency metrics for each image count.

**Single image count:**

```bash
guidellm benchmark \
--target "http://localhost:8000" \
--images-per-request 3 \
--profile constant \
--rate 5
```

**Programmatic usage:**

```python
from guidellm.benchmark import MultiImageBenchmark

# Create multi-image benchmark configuration
bench = MultiImageBenchmark(
image_counts=[1, 2, 5],
prompt_tokens=256,
output_tokens=128,
)

# Get configs for each image count
configs = bench.get_configs() # {1: config, 2: config, 5: config}

# Get image statistics
for img_count in [1, 2, 5]:
    stats = bench.get_image_stats(img_count)
    print(f"{img_count} images: {stats['total_bytes']} bytes total")
```

**Note:** Multi-image benchmarking requires the vision dependencies (`pip install "guidellm[vision]"`).

## Output Options

By default, complete results are saved to `benchmarks.json`, `benchmarks.csv`, and `benchmarks.html` in your current directory. Use the `--output-dir` parameter to specify a different location and `--outputs` to control which formats are generated.
Expand Down
3 changes: 3 additions & 0 deletions src/guidellm/benchmark/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from .benchmarker import Benchmarker
from .entrypoints import benchmark_generative_text, reimport_benchmarks_report
from .multi_image import MultiImageBenchmark, MultiImageBenchmarkResults
from .outputs import (
GenerativeBenchmarkerConsole,
GenerativeBenchmarkerCSV,
Expand Down Expand Up @@ -82,6 +83,8 @@
"GenerativeRequestsAccumulator",
"GenerativeTextMetricsSummary",
"GenerativeVideoMetricsSummary",
"MultiImageBenchmark",
"MultiImageBenchmarkResults",
"Profile",
"RunningMetricStats",
"SchedulerMetrics",
Expand Down
131 changes: 131 additions & 0 deletions src/guidellm/benchmark/multi_image.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
"""Programmatic API for multi-image benchmarking."""

from __future__ import annotations

from dataclasses import dataclass
from typing import Any

from guidellm.data.deserializers.multi_image import MultiImageDataArgs as MultiImageDatasetConfig
from guidellm.data.generators.multi_image import generate_synthetic_images

__all__ = ["MultiImageBenchmark", "MultiImageBenchmarkResults"]


@dataclass
class MultiImageBenchmarkResults:
    """Results from multi-image benchmark comparing multiple frame counts.

    ``results`` maps an images-per-request count to the benchmark result
    produced for that count. The accessors below read
    ``result.requests.stats.<metric>.mean`` and skip any result that does not
    expose that full attribute path, so a partially populated mapping yields
    a smaller dict instead of raising.
    """

    results: dict[int, Any]  # {image_count: benchmark_result}

    def _mean_by_count(self, metric: str) -> dict[int, float]:
        """Collect ``requests.stats.<metric>.mean`` for every usable result.

        Args:
            metric: Attribute name to look up on ``result.requests.stats``.

        Returns:
            Dict mapping image count to the metric's mean. Results with a
            missing or falsy ``requests``, a missing ``stats`` or metric, or
            a mean of ``None`` are left out.
        """
        means: dict[int, float] = {}
        for img_count, result in self.results.items():
            requests = getattr(result, "requests", None)
            if not requests:
                continue
            stats = getattr(requests, "stats", None)
            # getattr with a default tolerates stats/metric being None, which
            # a bare hasattr() check followed by ``.mean`` would not.
            mean = getattr(getattr(stats, metric, None), "mean", None)
            if mean is not None:
                means[img_count] = mean
        return means

    def ttft_by_count(self) -> dict[int, float]:
        """Return mean TTFT (ms) for each image count."""
        return self._mean_by_count("ttft_ms")

    def itl_by_count(self) -> dict[int, float]:
        """Return mean ITL (ms) for each image count."""
        return self._mean_by_count("itl_ms")


class MultiImageBenchmark:
    """
    Benchmark latency impact of multiple images per request.

    Stores the settings shared by a sweep over several images-per-request
    counts and builds one ``MultiImageDatasetConfig`` per count.

    Example:
        bench = MultiImageBenchmark(
            image_counts=[1, 2, 5],
            prompt_tokens=256,
            output_tokens=128,
        )
        config_dict = bench.get_configs()
        # Use configs with benchmark runner
    """

    def __init__(
        self,
        image_counts: list[int],
        prompt_tokens: int = 256,
        output_tokens: int = 128,
        image_size: str = "720p",
        random_seed: int | None = None,
        **kwargs: Any,
    ):
        """
        Initialize multi-image benchmark configuration.

        Args:
            image_counts: List of image counts to benchmark (e.g., [1, 2, 5]).
                Stored sorted and de-duplicated; the input is not modified.
            prompt_tokens: Average prompt token count
            output_tokens: Average output token count
            image_size: Image resolution ("720p")
            random_seed: Random seed for reproducible image generation
            **kwargs: Additional arguments for MultiImageDatasetConfig
        """
        # De-duplicate as well as sort: get_configs() keys its result by
        # count, so repeated counts collapse there anyway and image_counts
        # should list exactly the configurations that will exist.
        self.image_counts = sorted(set(image_counts))
        self.prompt_tokens = prompt_tokens
        self.output_tokens = output_tokens
        self.image_size = image_size
        self.random_seed = random_seed
        self.kwargs = kwargs

    def get_configs(self) -> dict[int, MultiImageDatasetConfig]:
        """
        Get MultiImageDatasetConfig for each image count.

        ``random_seed`` is not forwarded to the configs; in this class it is
        only used by ``generate_images``.

        Returns:
            Dict mapping image_count to MultiImageDatasetConfig
        """
        return {
            img_count: MultiImageDatasetConfig(
                prompt_tokens=self.prompt_tokens,
                output_tokens=self.output_tokens,
                images_per_request=img_count,
                image_size=self.image_size,
                **self.kwargs,
            )
            for img_count in self.image_counts
        }

    def generate_images(self, img_count: int) -> tuple[list[dict], int, int]:
        """
        Generate synthetic images for a given count.

        Delegates to ``generate_synthetic_images`` using this benchmark's
        ``image_size`` and ``random_seed``.

        Args:
            img_count: Number of images to generate

        Returns:
            Tuple of (images_list, total_pixels, total_bytes)
        """
        return generate_synthetic_images(
            num_images=img_count,
            image_size=self.image_size,
            seed=self.random_seed,
        )

    def get_image_stats(self, img_count: int) -> dict[str, int]:
        """
        Get image statistics (pixels, bytes) for a given count.

        The images are generated through ``generate_images`` in order to be
        measured; the image list itself is discarded.

        Args:
            img_count: Number of images

        Returns:
            Dict with 'image_count', 'total_pixels', 'total_bytes',
            'pixels_per_image' and 'bytes_per_image'. The per-image values
            use integer division and are 0 when ``img_count`` is not positive.
        """
        _, total_pixels, total_bytes = self.generate_images(img_count)
        return {
            "image_count": img_count,
            "total_pixels": total_pixels,
            "total_bytes": total_bytes,
            "pixels_per_image": (total_pixels // img_count) if img_count > 0 else 0,
            "bytes_per_image": (total_bytes // img_count) if img_count > 0 else 0,
        }
22 changes: 22 additions & 0 deletions src/guidellm/cli/benchmark/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,16 @@
type=int,
help="Random seed for reproducibility.",
)
@click.option(
"--images-per-request",
type=str,
default=None,
help=(
"Number of images per request for vision benchmarks. "
"Single value (e.g., '2') or comma-separated list (e.g., '1,2,5'). "
"When a list is provided, runs sequential benchmarks for each count."
),
)
# Output configuration
@click.option(
"--output-dir",
Expand Down Expand Up @@ -325,6 +335,18 @@ def run(**kwargs): # noqa: C901, PLR0915, PLR0912
# Only set CLI args that differ from click defaults
kwargs = cli_tools.set_if_not_default(ctx, **kwargs)

# Handle images_per_request parameter
images_per_request = kwargs.pop("images_per_request", None)
if images_per_request is not None:
    # Compare against None rather than relying on truthiness so that an
    # explicitly empty value is reported as invalid below instead of being
    # silently ignored.
    try:
        # Only the int() conversion can raise ValueError; keep the try body
        # limited to it.
        parts = [int(x.strip()) for x in images_per_request.split(",")]
    except ValueError as err:
        # Chain the original error so the failing token stays visible.
        raise click.BadParameter(
            "--images-per-request must be an integer or comma-separated "
            f"integers, got '{images_per_request}'"
        ) from err
    # A single count is stored as a plain int; several counts as a list.
    kwargs["images_per_request"] = parts if len(parts) > 1 else parts[0]

# Handle output path remapping
if (output_path := kwargs.pop("output_path", None)) is not None:
if kwargs.get("output_dir", None) is not None:
Expand Down
3 changes: 3 additions & 0 deletions src/guidellm/data/deserializers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
InMemoryItemListDataArgs,
InMemoryItemListDatasetDeserializer,
)
from .multi_image import MultiImageDataArgs, MultiImageDatasetDeserializer
from .synthetic import (
SyntheticTextDataArgs,
SyntheticTextDataset,
Expand All @@ -49,6 +50,8 @@
"InMemoryItemListDataArgs",
"InMemoryItemListDatasetDeserializer",
"JSONFileDatasetDeserializer",
"MultiImageDataArgs",
"MultiImageDatasetDeserializer",
"ParquetFileDatasetDeserializer",
"SyntheticTextDataArgs",
"SyntheticTextDataset",
Expand Down
82 changes: 82 additions & 0 deletions src/guidellm/data/deserializers/multi_image.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
"""Multi-image synthetic data deserializer for vision benchmarking."""

from __future__ import annotations

from collections.abc import Callable
from typing import Literal

from datasets import IterableDataset
from pydantic import Field, field_validator
from transformers import PreTrainedTokenizerBase

from guidellm.data.deserializers.deserializer import (
DatasetDeserializerFactory,
)
from guidellm.data.deserializers.synthetic import (
SyntheticTextDataArgs,
SyntheticTextDataset,
SyntheticTextDatasetDeserializer,
)
from guidellm.data.generators.multi_image import ImageSize
from guidellm.data.schemas import DataArgs

__all__ = [
"MultiImageDataArgs",
"MultiImageDatasetDeserializer",
]

_VALID_IMAGE_SIZES = sorted(ImageSize.SIZES.keys())


@DataArgs.register("multi_image")
class MultiImageDataArgs(SyntheticTextDataArgs):
    """
    Configuration for synthetic prompts that carry several images each.

    Adds an image count and a resolution key on top of the fields inherited
    from SyntheticTextDataArgs.
    """

    kind: Literal["multi_image"] = Field(  # type: ignore[assignment]
        default="multi_image",
        description="Type identifier for the multi-image dataset configuration.",
    )
    images_per_request: int = Field(
        default=1,
        ge=1,
        le=10,
        description="Number of images to include per request.",
    )
    image_size: str = Field(
        default="720p",
        description=(
            f"Standard image resolution key. Valid values: {_VALID_IMAGE_SIZES}."
        ),
    )

    @field_validator("image_size")
    @classmethod
    def validate_image_size(cls, value: str) -> str:
        """Accept only resolution keys that are present in ImageSize.SIZES."""
        if value in ImageSize.SIZES:
            return value
        raise ValueError(
            f"Invalid image_size {value!r}. Valid options: {_VALID_IMAGE_SIZES}"
        )


# Alias that keeps the earlier `MultiImageDatasetConfig` name importable for backwards compatibility.
MultiImageDatasetConfig = MultiImageDataArgs


@DatasetDeserializerFactory.register("multi_image")
class MultiImageDatasetDeserializer(SyntheticTextDatasetDeserializer):
    """
    Deserializer registered under the "multi_image" key.

    Builds a SyntheticTextDataset from a MultiImageDataArgs config.

    NOTE(review): nothing in this class reads ``images_per_request`` or
    ``image_size``; confirm that SyntheticTextDataset consumes those fields,
    otherwise the resulting dataset carries text only.
    """

    def __call__(
        self,
        config: MultiImageDataArgs,
        processor_factory: Callable[[], PreTrainedTokenizerBase],
        random_seed: int,
    ) -> IterableDataset:
        """Instantiate the processor and wrap the config in a dataset."""
        processor = processor_factory()
        return SyntheticTextDataset(
            config=config,
            processor=processor,
            random_seed=random_seed,
        )
7 changes: 7 additions & 0 deletions src/guidellm/data/generators/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""Data generators for GuideLLM."""

from guidellm.data.generators.multi_image import generate_synthetic_images

__all__ = [
"generate_synthetic_images",
]
Loading