From 214f67bb3ffb1ec41e0a8fe4a9faeada9a7e1a40 Mon Sep 17 00:00:00 2001 From: Zakaria el hjouji Date: Wed, 13 May 2026 00:48:17 -0400 Subject: [PATCH 01/14] extras/vision: add synthesize_image and synthesize_video helpers Pre-encoded data-URL output matching encode_image / encode_video shape. Per-row seeded gradient default with noise / solid / checkerboard opt-ins for images; gradient / noise for videos. Bit-exact mp4 encoding via imageio[ffmpeg] -fflags +bitexact so same seed produces byte-identical payloads. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Zakaria el hjouji Signed-off-by: Zakaria el hjouji --- pyproject.toml | 1 + src/guidellm/extras/vision.py | 1 + src/guidellm/extras/vision.pyi | 1 + src/guidellm/utils/vision.py | 255 +++++++++++++++++++++++++++++++++ 4 files changed, 258 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index fe29ade57..5cfd2ceb5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -93,6 +93,7 @@ audio = [ vision = [ "datasets[vision]", "pillow", + "imageio[ffmpeg]", ] # Dev Tooling dev = [ diff --git a/src/guidellm/extras/vision.py b/src/guidellm/extras/vision.py index 3a28edb3d..06240e34d 100644 --- a/src/guidellm/extras/vision.py +++ b/src/guidellm/extras/vision.py @@ -7,6 +7,7 @@ attrs={ "PILImage": lazy.ExtraAttr("PIL", alias="Image"), "Image": lazy.ExtraAttr("PIL.Image", alias="Image"), + "iio": lazy.ExtraAttr("imageio", alias="v3"), }, error_message="Please install guidellm[vision] to use image/video features", ) diff --git a/src/guidellm/extras/vision.pyi b/src/guidellm/extras/vision.pyi index ec7c49e9f..edfee67a6 100644 --- a/src/guidellm/extras/vision.pyi +++ b/src/guidellm/extras/vision.pyi @@ -1,4 +1,5 @@ from PIL import Image as _PILImage from PIL.Image import Image as Image +import imageio.v3 as iio PILImage = _PILImage diff --git a/src/guidellm/utils/vision.py b/src/guidellm/utils/vision.py index 6a9dd6ddc..8977d3c43 100644 --- a/src/guidellm/utils/vision.py +++ b/src/guidellm/utils/vision.py 
@@ -19,6 +19,8 @@ "image_dict_to_pil", "is_url", "resize_image", + "synthesize_image", + "synthesize_video", ] @@ -274,3 +276,256 @@ def get_file_format(path: Path | str) -> str: """Get file format from path extension.""" suffix = Path(path).suffix.lower() return suffix[1:] if suffix.startswith(".") else "unknown" + + +# --------------------------------------------------------------------------- +# Synthetic media generation +# --------------------------------------------------------------------------- + +_SYNTHETIC_IMAGE_CONTENT = ("gradient", "noise", "solid", "checkerboard") +_SYNTHETIC_VIDEO_CONTENT = ("gradient", "noise") +_SYNTHETIC_IMAGE_FORMATS = ("jpeg", "png") +_SYNTHETIC_VIDEO_FORMATS = ("mp4",) + + +def _row_rng(seed: int, row_index: int) -> np.random.Generator: + """ + Deterministic per-row numpy Generator. + + Uses PCG64 seeded by SeedSequence([seed, row_index]) so two runs with the + same (seed, row_index) produce byte-identical RNG streams across processes, + machines, and OS-level RNG state. + """ + seed_seq = np.random.SeedSequence([int(seed) & 0xFFFFFFFF, int(row_index)]) + return np.random.Generator(np.random.PCG64(seed_seq)) + + +def _gradient_frame( + height: int, + width: int, + rng: np.random.Generator, +) -> np.ndarray: + """ + Generate a smooth gradient frame with randomized base color and direction. + + Compresses well in JPEG / h264 (similar wire size to real media) but every + frame is byte-different from the next when ``rng`` is reseeded per row, + which defeats vLLM's mm-processor cache. 
+ """ + color_a = rng.integers(0, 256, size=3, dtype=np.int32) + color_b = rng.integers(0, 256, size=3, dtype=np.int32) + angle = float(rng.uniform(0.0, 2.0 * np.pi)) + + ys = np.linspace(-1.0, 1.0, height, dtype=np.float32).reshape(height, 1) + xs = np.linspace(-1.0, 1.0, width, dtype=np.float32).reshape(1, width) + proj = xs * np.cos(angle) + ys * np.sin(angle) + proj = (proj - proj.min()) / max(proj.max() - proj.min(), 1e-6) + proj = proj[..., None] + + diff = (color_b - color_a).astype(np.float32).reshape(1, 1, 3) + base = color_a.astype(np.float32).reshape(1, 1, 3) + frame = base + proj * diff + return np.clip(frame, 0.0, 255.0).astype(np.uint8) + + +def _noise_frame( + height: int, + width: int, + rng: np.random.Generator, +) -> np.ndarray: + return rng.integers(0, 256, size=(height, width, 3), dtype=np.uint8) + + +def _solid_frame( + height: int, + width: int, + rng: np.random.Generator, +) -> np.ndarray: + color = rng.integers(0, 256, size=3, dtype=np.uint8) + return np.broadcast_to(color, (height, width, 3)).copy() + + +def _checkerboard_frame( + height: int, + width: int, + rng: np.random.Generator, +) -> np.ndarray: + color_a = rng.integers(0, 256, size=3, dtype=np.uint8) + color_b = rng.integers(0, 256, size=3, dtype=np.uint8) + tile = int(rng.integers(8, 33)) + ys = (np.arange(height) // tile) % 2 + xs = (np.arange(width) // tile) % 2 + mask = (ys.reshape(-1, 1) ^ xs.reshape(1, -1)).astype(bool) + frame = np.empty((height, width, 3), dtype=np.uint8) + frame[mask] = color_b + frame[~mask] = color_a + return frame + + +def _generate_image_array( + height: int, + width: int, + content: str, + rng: np.random.Generator, +) -> np.ndarray: + if content == "gradient": + return _gradient_frame(height, width, rng) + if content == "noise": + return _noise_frame(height, width, rng) + if content == "solid": + return _solid_frame(height, width, rng) + if content == "checkerboard": + return _checkerboard_frame(height, width, rng) + raise ValueError( + f"Unsupported 
synthetic image content '{content}', " + f"expected one of {_SYNTHETIC_IMAGE_CONTENT}" + ) + + +def synthesize_image( + width: int, + height: int, + *, + content: str = "gradient", + image_format: str = "jpeg", + jpeg_quality: int = 85, + seed: int = 0, + row_index: int = 0, +) -> dict[str, Any]: + """ + Synthesize a single image and return the canonical encoded dict. + + The output shape matches :func:`encode_image` so it flows through the rest + of the pipeline (column mapper -> finalizer) unchanged. + + :param width: image width in pixels. + :param height: image height in pixels. + :param content: ``gradient`` (default, per-row randomized), ``noise``, + ``solid``, or ``checkerboard``. + :param image_format: ``jpeg`` (default) or ``png``. + :param jpeg_quality: JPEG quality 1..100 (ignored for png). + :param seed: base seed for reproducibility. + :param row_index: row index used to vary the RNG stream per row so + successive rows are byte-different even with the same seed. + """ + if width <= 0 or height <= 0: + raise ValueError(f"width and height must be positive, got {width}x{height}") + fmt = image_format.lower() + if fmt not in _SYNTHETIC_IMAGE_FORMATS: + raise ValueError( + f"Unsupported synthetic image format '{image_format}', " + f"expected one of {_SYNTHETIC_IMAGE_FORMATS}" + ) + if content not in _SYNTHETIC_IMAGE_CONTENT: + raise ValueError( + f"Unsupported synthetic image content '{content}', " + f"expected one of {_SYNTHETIC_IMAGE_CONTENT}" + ) + + rng = _row_rng(seed, row_index) + arr = _generate_image_array(height, width, content, rng) + img = libs.PILImage.fromarray(arr, mode="RGB") + + buffer = io.BytesIO() + if fmt == "jpeg": + img.save(buffer, format="JPEG", quality=int(jpeg_quality), optimize=False) + mime = "image/jpeg" + else: + img.save(buffer, format="PNG", optimize=False, compress_level=6) + mime = "image/png" + + image_bytes = buffer.getvalue() + image_b64 = base64.b64encode(image_bytes).decode("utf-8") + + return { + "type": 
"image_base64", + "image": f"data:{mime};base64,{image_b64}", + "image_pixels": width * height, + "image_bytes": len(image_bytes), + } + + +def synthesize_video( + width: int, + height: int, + frames: int, + *, + fps: float = 1.0, + content: str = "gradient", + video_format: str = "mp4", + video_bitrate: str | None = None, + seed: int = 0, + row_index: int = 0, +) -> dict[str, Any]: + """ + Synthesize a short video clip and return the canonical encoded dict. + + Matches the shape of :func:`encode_video`. Only ``mp4`` (h264, yuv420p) is + supported in v1. Encoding uses ``-fflags +bitexact`` so two runs with the + same seed produce byte-identical mp4 payloads. + + :param width: frame width in pixels (must be > 0). + :param height: frame height in pixels (must be > 0). + :param frames: number of frames in the clip (must be >= 1). + :param fps: frames per second (encoded into the container, drives + ``video_seconds = frames / fps``). + :param content: ``gradient`` (default, per-frame randomized) or ``noise``. + :param video_format: only ``mp4`` is supported in v1. + :param video_bitrate: optional libx264 bitrate, e.g. ``"500k"``. ``None`` + leaves the codec at its default CRF-based rate control. + :param seed: base seed for reproducibility. + :param row_index: row index used to vary the RNG stream per row. 
+ """ + if width <= 0 or height <= 0: + raise ValueError(f"width and height must be positive, got {width}x{height}") + if frames <= 0: + raise ValueError(f"frames must be positive, got {frames}") + if fps <= 0: + raise ValueError(f"fps must be positive, got {fps}") + fmt = video_format.lower() + if fmt not in _SYNTHETIC_VIDEO_FORMATS: + raise ValueError( + f"Unsupported synthetic video format '{video_format}', " + f"expected one of {_SYNTHETIC_VIDEO_FORMATS}" + ) + if content not in _SYNTHETIC_VIDEO_CONTENT: + raise ValueError( + f"Unsupported synthetic video content '{content}', " + f"expected one of {_SYNTHETIC_VIDEO_CONTENT}" + ) + + rng = _row_rng(seed, row_index) + clip = np.empty((frames, height, width, 3), dtype=np.uint8) + for i in range(frames): + frame_seed = int(rng.integers(0, 2**31 - 1)) + frame_rng = np.random.Generator(np.random.PCG64(frame_seed)) + if content == "gradient": + clip[i] = _gradient_frame(height, width, frame_rng) + else: + clip[i] = _noise_frame(height, width, frame_rng) + + write_kwargs: dict[str, Any] = { + "extension": ".mp4", + "fps": float(fps), + "codec": "libx264", + "macro_block_size": 1, + "ffmpeg_params": [ + "-fflags", + "+bitexact", + "-flags:v", + "+bitexact", + ], + } + if video_bitrate is not None: + write_kwargs["bitrate"] = str(video_bitrate) + + video_bytes = libs.iio.imwrite("", clip, **write_kwargs) + video_b64 = base64.b64encode(video_bytes).decode("utf-8") + + return { + "type": "video_base64", + "video": f"data:video/mp4;base64,{video_b64}", + "video_frames": int(frames), + "video_seconds": float(frames) / float(fps), + "video_bytes": len(video_bytes), + } From 8990552996af60b511e3f8faade67fc3aa48c9db Mon Sep 17 00:00:00 2001 From: Zakaria el hjouji Date: Wed, 13 May 2026 00:52:16 -0400 Subject: [PATCH 02/14] data: add synthetic_image and synthetic_video deserializers SyntheticImageDatasetConfig and SyntheticVideoDatasetConfig live next to the existing text config. 
text_tokens is canonical; prompt_tokens is accepted as an alias. resolution / aspect_ratio sugar resolves to width/height. Each deserializer peeks at the input type and refuses to claim configs explicitly marked for another deserializer, so the registry dispatch is deterministic when distinctive fields overlap. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Zakaria el hjouji Signed-off-by: Zakaria el hjouji --- src/guidellm/data/deserializers/__init__.py | 16 + .../data/deserializers/synthetic_image.py | 407 ++++++++++++++++++ .../data/deserializers/synthetic_video.py | 319 ++++++++++++++ 3 files changed, 742 insertions(+) create mode 100644 src/guidellm/data/deserializers/synthetic_image.py create mode 100644 src/guidellm/data/deserializers/synthetic_video.py diff --git a/src/guidellm/data/deserializers/__init__.py b/src/guidellm/data/deserializers/__init__.py index ebaca6fc0..a41526b8c 100644 --- a/src/guidellm/data/deserializers/__init__.py +++ b/src/guidellm/data/deserializers/__init__.py @@ -28,6 +28,16 @@ SyntheticTextDataset, SyntheticTextDatasetDeserializer, ) +from .synthetic_image import ( + SyntheticImageDataArgs, + SyntheticImageDataset, + SyntheticImageDatasetDeserializer, +) +from .synthetic_video import ( + SyntheticVideoDataArgs, + SyntheticVideoDataset, + SyntheticVideoDatasetDeserializer, +) from .trace_mooncake import TraceMooncakeDataArgs, TraceMooncakeDatasetDeserializer from .trace_synthetic import TraceSyntheticDataArgs, TraceSyntheticDatasetDeserializer @@ -50,9 +60,15 @@ "InMemoryItemListDatasetDeserializer", "JSONFileDatasetDeserializer", "ParquetFileDatasetDeserializer", + "SyntheticImageDataArgs", + "SyntheticImageDataset", + "SyntheticImageDatasetDeserializer", "SyntheticTextDataArgs", "SyntheticTextDataset", "SyntheticTextDatasetDeserializer", + "SyntheticVideoDataArgs", + "SyntheticVideoDataset", + "SyntheticVideoDatasetDeserializer", "TarFileDatasetDeserializer", "TextFileDatasetDeserializer", "TraceMooncakeDataArgs", 
diff --git a/src/guidellm/data/deserializers/synthetic_image.py b/src/guidellm/data/deserializers/synthetic_image.py new file mode 100644 index 000000000..4e3d14c07 --- /dev/null +++ b/src/guidellm/data/deserializers/synthetic_image.py @@ -0,0 +1,407 @@ +"""Synthetic image dataset deserializer.""" + +from __future__ import annotations + +from collections.abc import Callable, Iterator +from typing import Any, Literal + +import numpy as np +from datasets import DatasetInfo, Features, IterableDataset, Value +from datasets.iterable_dataset import _BaseExamplesIterable +from faker import Faker +from pydantic import Field, model_validator +from transformers import PreTrainedTokenizerBase + +from guidellm.data.deserializers.deserializer import ( + DatasetDeserializer, + DatasetDeserializerFactory, +) +from guidellm.data.schemas import DataArgs +from guidellm.utils.random import IntegerRangeSampler +from guidellm.utils.vision import synthesize_image + +__all__ = [ + "SyntheticImageDataArgs", + "SyntheticImageDataset", + "SyntheticImageDatasetDeserializer", +] + + +_DESERIALIZER_TYPE = "synthetic_image" +_RESOLUTION_PRESETS: dict[str, tuple[int, int]] = { + "240p": (426, 240), + "360p": (640, 360), + "480p": (854, 480), + "540p": (960, 540), + "720p": (1280, 720), + "1080p": (1920, 1080), + "1440p": (2560, 1440), + "2160p": (3840, 2160), + "4k": (3840, 2160), +} + + +def _parse_aspect_ratio(aspect: str) -> float: + """Parse 'W:H' or 'W/H' into a float ratio.""" + sep = ":" if ":" in aspect else "/" + try: + w, h = aspect.split(sep) + return float(w) / float(h) + except Exception as exc: # noqa: BLE001 + raise ValueError( + f"Invalid aspect_ratio '{aspect}', expected 'W:H' or 'W/H'" + ) from exc + + +class _SyntheticVisionTextMixin(DataArgs): + text_tokens: int = Field( + description="The average number of text tokens generated for the text portion.", + gt=0, + ) + text_tokens_stdev: int | None = Field( + description="Standard deviation of text-token counts per prompt.", + 
gt=0, + default=None, + ) + text_tokens_min: int | None = Field( + description="Minimum number of text tokens per prompt.", + gt=0, + default=None, + ) + text_tokens_max: int | None = Field( + description="Maximum number of text tokens per prompt.", + gt=0, + default=None, + ) + output_tokens: int | None = Field( + description="The average number of output tokens to request.", + gt=0, + default=None, + ) + output_tokens_stdev: int | None = Field( + description="Standard deviation of output-token counts per prompt.", + gt=0, + default=None, + ) + output_tokens_min: int | None = Field( + description="Minimum number of output tokens per prompt.", + gt=0, + default=None, + ) + output_tokens_max: int | None = Field( + description="Maximum number of output tokens per prompt.", + gt=0, + default=None, + ) + seed: int = Field( + description="Base random seed for reproducible synthetic payloads.", + default=42, + ) + + @model_validator(mode="before") + @classmethod + def _alias_prompt_tokens(cls, data: object) -> object: + """Accept ``prompt_tokens`` as an alias for ``text_tokens``.""" + if isinstance(data, dict): + aliases = { + "prompt_tokens": "text_tokens", + "prompt_tokens_stdev": "text_tokens_stdev", + "prompt_tokens_min": "text_tokens_min", + "prompt_tokens_max": "text_tokens_max", + } + for alias, canonical in aliases.items(): + if alias in data and canonical not in data: + data[canonical] = data.pop(alias) + return data + + +@DataArgs.register(_DESERIALIZER_TYPE) +class SyntheticImageDataArgs(_SyntheticVisionTextMixin): + """Model for synthetic image dataset deserializer arguments.""" + + kind: Literal["synthetic_image"] = Field( # type: ignore[assignment] + default=_DESERIALIZER_TYPE, + description="Type identifier for the synthetic image dataset configuration.", + ) + width: int | None = Field( + description="Image width in pixels.", + gt=0, + default=None, + ) + height: int | None = Field( + description="Image height in pixels.", + gt=0, + default=None, + ) + 
resolution: str | None = Field( + description="Resolution shortcut such as '720p' or '1080p'.", + default=None, + ) + aspect_ratio: str | None = Field( + description="Aspect ratio override, e.g. '16:9' or '4:3'.", + default=None, + ) + format: Literal["jpeg", "png"] = Field( + description="Encoded image format.", + default="jpeg", + ) + jpeg_quality: int = Field( + description="JPEG quality 1..100. Ignored when format='png'.", + ge=1, + le=100, + default=85, + ) + content: Literal["gradient", "noise", "solid", "checkerboard"] = Field( + description="Pixel content to synthesize.", + default="gradient", + ) + images_per_request: int = Field( + description="Number of images per emitted row.", + ge=1, + default=1, + ) + + @model_validator(mode="after") + def _resolve_dimensions(self) -> SyntheticImageDataArgs: + w = self.width + h = self.height + if self.resolution is not None: + preset = _RESOLUTION_PRESETS.get(self.resolution.lower()) + if preset is None: + raise ValueError( + f"Unknown resolution '{self.resolution}'. Known: " + f"{sorted(_RESOLUTION_PRESETS)}" + ) + preset_w, preset_h = preset + if h is None: + h = preset_h + if w is None: + w = ( + int(round(h * _parse_aspect_ratio(self.aspect_ratio))) + if self.aspect_ratio is not None + else preset_w + ) + elif self.aspect_ratio is not None: + if h is not None and w is None: + w = int(round(h * _parse_aspect_ratio(self.aspect_ratio))) + elif w is not None and h is None: + h = int(round(w / _parse_aspect_ratio(self.aspect_ratio))) + + if w is None or h is None: + raise ValueError( + "synthetic_image config requires width and height, either " + "explicitly or via resolution/aspect_ratio." 
+ ) + self.width = int(w) - (int(w) % 2) + self.height = int(h) - (int(h) % 2) + if self.width <= 0 or self.height <= 0: + raise ValueError( + f"Resolved image dims must be positive, got " + f"{self.width}x{self.height}" + ) + return self + + +class _SyntheticImageExamplesIterable(_BaseExamplesIterable): + """Examples iterable that yields rows of synthetic images + text.""" + + def __init__( + self, + config: SyntheticImageDataArgs, + processor: PreTrainedTokenizerBase, + random_seed: int, + ): + super().__init__() + self.config = config + self.processor = processor + self.random_seed = random_seed + self.iteration_count = 0 + + @staticmethod + def _build_prompt( + token_count: int, + processor: PreTrainedTokenizerBase, + faker: Faker, + unique: str, + ) -> str: + token_ids: list[int] = [] + avg_chars_per_token = 5 + margin_of_safety = 1.5 + attempts = 0 + while len(token_ids) < token_count: + attempts += 1 + num_chars = int( + token_count * avg_chars_per_token * margin_of_safety * attempts + ) + text = unique + faker.text(max_nb_chars=num_chars) + token_ids = processor.encode(text) + decoded = processor.decode(token_ids[:token_count], skip_special_tokens=True) + if isinstance(decoded, str): + return decoded + raise RuntimeError("Processor returned unexpected prompt decode type.") + + def __iter__(self) -> Iterator[tuple[int, dict[str, Any]]]: + iter_seed = self.random_seed + self.iteration_count + self.iteration_count += 1 + + faker = Faker() + faker.seed_instance(iter_seed) + + text_tokens_sampler = iter( + IntegerRangeSampler( + average=self.config.text_tokens, + variance=self.config.text_tokens_stdev, + min_value=self.config.text_tokens_min, + max_value=self.config.text_tokens_max, + random_seed=iter_seed, + ) + ) + output_tokens_sampler = ( + iter( + IntegerRangeSampler( + average=self.config.output_tokens, + variance=self.config.output_tokens_stdev, + min_value=self.config.output_tokens_min, + max_value=self.config.output_tokens_max, + random_seed=iter_seed + 
1, + ) + ) + if self.config.output_tokens is not None + else None + ) + + row_index = 0 + while True: + text_token_count = next(text_tokens_sampler) + output_token_count = ( + next(output_tokens_sampler) + if output_tokens_sampler is not None + else None + ) + prompt = self._build_prompt( + text_token_count, + self.processor, + faker, + f"{self.iteration_count} {row_index} ", + ) + + row: dict[str, Any] = { + "prefix": "", + "prompt_0": prompt, + "prompt_tokens_count_0": text_token_count, + } + if output_token_count is not None: + row["output_tokens_count_0"] = output_token_count + + for img_idx in range(self.config.images_per_request): + encoded = synthesize_image( + width=int(self.config.width), + height=int(self.config.height), + content=self.config.content, + image_format=self.config.format, + jpeg_quality=self.config.jpeg_quality, + seed=self.config.seed, + row_index=row_index * self.config.images_per_request + img_idx, + ) + if self.config.images_per_request == 1: + row["image"] = encoded + else: + row[f"image_{img_idx}"] = encoded + + yield row_index, row + row_index += 1 + + @property + def is_typed(self) -> bool: + return True + + @property + def features(self) -> Features: + features: dict[str, Any] = { + "prefix": Value("string"), + "prompt_0": Value("string"), + "prompt_tokens_count_0": Value("int32"), + } + if self.config.output_tokens is not None: + features["output_tokens_count_0"] = Value("int32") + image_struct = { + "type": Value("string"), + "image": Value("string"), + "image_pixels": Value("int64"), + "image_bytes": Value("int64"), + } + if self.config.images_per_request == 1: + features["image"] = image_struct + else: + for img_idx in range(self.config.images_per_request): + features[f"image_{img_idx}"] = image_struct + return Features(features) + + @property + def num_shards(self) -> int: + return 1 + + def shuffle_data_sources( + self, + generator: np.random.Generator, # noqa: ARG002 + ) -> _SyntheticImageExamplesIterable: + return self + + 
def shard_data_sources( + self, + num_shards: int, # noqa: ARG002 + index: int, # noqa: ARG002 + contiguous: bool = True, # noqa: ARG002 + ) -> _SyntheticImageExamplesIterable: + return self + + def load_state_dict(self, state_dict: dict) -> None: + self.iteration_count = state_dict.get("iteration_count", 0) + + def _init_state_dict(self) -> dict: + self._state_dict = {"iteration_count": self.iteration_count} + return self._state_dict + + +class SyntheticImageDataset(IterableDataset): + def __init__( + self, + config: SyntheticImageDataArgs, + processor: PreTrainedTokenizerBase, + random_seed: int = 42, + ): + self.config = config + self.processor = processor + self.random_seed = random_seed + + ex_iterable = _SyntheticImageExamplesIterable( + config=config, + processor=processor, + random_seed=random_seed, + ) + super().__init__( + ex_iterable=ex_iterable, + info=DatasetInfo( + description="Synthetic image dataset generator", + features=ex_iterable.features, + ), + ) + + def set_epoch(self, epoch: int): + if isinstance(self._ex_iterable, _SyntheticImageExamplesIterable): + self._ex_iterable.iteration_count = epoch + + +@DatasetDeserializerFactory.register(_DESERIALIZER_TYPE) +class SyntheticImageDatasetDeserializer(DatasetDeserializer): + def __call__( + self, + config: SyntheticImageDataArgs, + processor_factory: Callable[[], PreTrainedTokenizerBase], + random_seed: int, + ) -> IterableDataset: + return SyntheticImageDataset( + config=config, + processor=processor_factory(), + random_seed=random_seed, + ) diff --git a/src/guidellm/data/deserializers/synthetic_video.py b/src/guidellm/data/deserializers/synthetic_video.py new file mode 100644 index 000000000..3a6ae5971 --- /dev/null +++ b/src/guidellm/data/deserializers/synthetic_video.py @@ -0,0 +1,319 @@ +"""Synthetic video dataset deserializer.""" + +from __future__ import annotations + +from collections.abc import Callable, Iterator +from typing import Any, Literal + +import numpy as np +from datasets import 
DatasetInfo, Features, IterableDataset, Value +from datasets.iterable_dataset import _BaseExamplesIterable +from faker import Faker +from pydantic import Field, model_validator +from transformers import PreTrainedTokenizerBase + +from guidellm.data.deserializers.deserializer import ( + DatasetDeserializer, + DatasetDeserializerFactory, +) +from guidellm.data.deserializers.synthetic_image import ( + _RESOLUTION_PRESETS, + _SyntheticVisionTextMixin, + _parse_aspect_ratio, +) +from guidellm.data.schemas import DataArgs +from guidellm.utils.random import IntegerRangeSampler +from guidellm.utils.vision import synthesize_video + +__all__ = [ + "SyntheticVideoDataArgs", + "SyntheticVideoDataset", + "SyntheticVideoDatasetDeserializer", +] + + +_DESERIALIZER_TYPE = "synthetic_video" + + +@DataArgs.register(_DESERIALIZER_TYPE) +class SyntheticVideoDataArgs(_SyntheticVisionTextMixin): + """Model for synthetic video dataset deserializer arguments.""" + + kind: Literal["synthetic_video"] = Field( # type: ignore[assignment] + default=_DESERIALIZER_TYPE, + description="Type identifier for the synthetic video dataset configuration.", + ) + width: int | None = Field( + description="Frame width in pixels.", + gt=0, + default=None, + ) + height: int | None = Field( + description="Frame height in pixels.", + gt=0, + default=None, + ) + resolution: str | None = Field( + description="Resolution shortcut such as '720p' or '1080p'.", + default=None, + ) + aspect_ratio: str | None = Field( + description="Aspect ratio override, e.g. '16:9' or '4:3'.", + default=None, + ) + frames: int = Field( + description="Number of frames in the clip.", + ge=1, + ) + fps: float = Field( + description="Frames per second.", + gt=0.0, + default=1.0, + ) + format: Literal["mp4"] = Field( + description="Container / codec. Only mp4 (h264, yuv420p) in v1.", + default="mp4", + ) + video_bitrate: str | None = Field( + description="Optional libx264 bitrate string, e.g. 
'500k'.", + default=None, + ) + content: Literal["gradient", "noise"] = Field( + description="Frame content to synthesize.", + default="gradient", + ) + + @model_validator(mode="after") + def _resolve_dimensions(self) -> SyntheticVideoDataArgs: + w = self.width + h = self.height + if self.resolution is not None: + preset = _RESOLUTION_PRESETS.get(self.resolution.lower()) + if preset is None: + raise ValueError( + f"Unknown resolution '{self.resolution}'. Known: " + f"{sorted(_RESOLUTION_PRESETS)}" + ) + preset_w, preset_h = preset + if h is None: + h = preset_h + if w is None: + w = ( + int(round(h * _parse_aspect_ratio(self.aspect_ratio))) + if self.aspect_ratio is not None + else preset_w + ) + elif self.aspect_ratio is not None: + if h is not None and w is None: + w = int(round(h * _parse_aspect_ratio(self.aspect_ratio))) + elif w is not None and h is None: + h = int(round(w / _parse_aspect_ratio(self.aspect_ratio))) + + if w is None or h is None: + raise ValueError( + "synthetic_video config requires width and height, either " + "explicitly or via resolution/aspect_ratio." 
+ ) + self.width = int(w) - (int(w) % 2) + self.height = int(h) - (int(h) % 2) + if self.width <= 0 or self.height <= 0: + raise ValueError( + f"Resolved video dims must be positive, got " + f"{self.width}x{self.height}" + ) + return self + + +class _SyntheticVideoExamplesIterable(_BaseExamplesIterable): + def __init__( + self, + config: SyntheticVideoDataArgs, + processor: PreTrainedTokenizerBase, + random_seed: int, + ): + super().__init__() + self.config = config + self.processor = processor + self.random_seed = random_seed + self.iteration_count = 0 + + @staticmethod + def _build_prompt( + token_count: int, + processor: PreTrainedTokenizerBase, + faker: Faker, + unique: str, + ) -> str: + token_ids: list[int] = [] + avg_chars_per_token = 5 + margin_of_safety = 1.5 + attempts = 0 + while len(token_ids) < token_count: + attempts += 1 + num_chars = int( + token_count * avg_chars_per_token * margin_of_safety * attempts + ) + text = unique + faker.text(max_nb_chars=num_chars) + token_ids = processor.encode(text) + decoded = processor.decode(token_ids[:token_count], skip_special_tokens=True) + if isinstance(decoded, str): + return decoded + raise RuntimeError("Processor returned unexpected prompt decode type.") + + def __iter__(self) -> Iterator[tuple[int, dict[str, Any]]]: + iter_seed = self.random_seed + self.iteration_count + self.iteration_count += 1 + + faker = Faker() + faker.seed_instance(iter_seed) + + text_tokens_sampler = iter( + IntegerRangeSampler( + average=self.config.text_tokens, + variance=self.config.text_tokens_stdev, + min_value=self.config.text_tokens_min, + max_value=self.config.text_tokens_max, + random_seed=iter_seed, + ) + ) + output_tokens_sampler = ( + iter( + IntegerRangeSampler( + average=self.config.output_tokens, + variance=self.config.output_tokens_stdev, + min_value=self.config.output_tokens_min, + max_value=self.config.output_tokens_max, + random_seed=iter_seed + 1, + ) + ) + if self.config.output_tokens is not None + else None + ) + 
+ row_index = 0 + while True: + text_token_count = next(text_tokens_sampler) + output_token_count = ( + next(output_tokens_sampler) + if output_tokens_sampler is not None + else None + ) + prompt = self._build_prompt( + text_token_count, + self.processor, + faker, + f"{self.iteration_count} {row_index} ", + ) + + row: dict[str, Any] = { + "prefix": "", + "prompt_0": prompt, + "prompt_tokens_count_0": text_token_count, + "video": synthesize_video( + width=int(self.config.width), + height=int(self.config.height), + frames=int(self.config.frames), + fps=float(self.config.fps), + content=self.config.content, + video_format=self.config.format, + video_bitrate=self.config.video_bitrate, + seed=self.config.seed, + row_index=row_index, + ), + } + if output_token_count is not None: + row["output_tokens_count_0"] = output_token_count + + yield row_index, row + row_index += 1 + + @property + def is_typed(self) -> bool: + return True + + @property + def features(self) -> Features: + features: dict[str, Any] = { + "prefix": Value("string"), + "prompt_0": Value("string"), + "prompt_tokens_count_0": Value("int32"), + "video": { + "type": Value("string"), + "video": Value("string"), + "video_frames": Value("int64"), + "video_seconds": Value("float64"), + "video_bytes": Value("int64"), + }, + } + if self.config.output_tokens is not None: + features["output_tokens_count_0"] = Value("int32") + return Features(features) + + @property + def num_shards(self) -> int: + return 1 + + def shuffle_data_sources( + self, + generator: np.random.Generator, # noqa: ARG002 + ) -> _SyntheticVideoExamplesIterable: + return self + + def shard_data_sources( + self, + num_shards: int, # noqa: ARG002 + index: int, # noqa: ARG002 + contiguous: bool = True, # noqa: ARG002 + ) -> _SyntheticVideoExamplesIterable: + return self + + def load_state_dict(self, state_dict: dict) -> None: + self.iteration_count = state_dict.get("iteration_count", 0) + + def _init_state_dict(self) -> dict: + self._state_dict = 
{"iteration_count": self.iteration_count} + return self._state_dict + + +class SyntheticVideoDataset(IterableDataset): + def __init__( + self, + config: SyntheticVideoDataArgs, + processor: PreTrainedTokenizerBase, + random_seed: int = 42, + ): + self.config = config + self.processor = processor + self.random_seed = random_seed + + ex_iterable = _SyntheticVideoExamplesIterable( + config=config, + processor=processor, + random_seed=random_seed, + ) + super().__init__( + ex_iterable=ex_iterable, + info=DatasetInfo( + description="Synthetic video dataset generator", + features=ex_iterable.features, + ), + ) + + def set_epoch(self, epoch: int): + if isinstance(self._ex_iterable, _SyntheticVideoExamplesIterable): + self._ex_iterable.iteration_count = epoch + + +@DatasetDeserializerFactory.register(_DESERIALIZER_TYPE) +class SyntheticVideoDatasetDeserializer(DatasetDeserializer): + def __call__( + self, + config: SyntheticVideoDataArgs, + processor_factory: Callable[[], PreTrainedTokenizerBase], + random_seed: int, + ) -> IterableDataset: + return SyntheticVideoDataset( + config=config, + processor=processor_factory(), + random_seed=random_seed, + ) From 20aa0db64516b5f7456a6cee1248364562a3cef9 Mon Sep 17 00:00:00 2001 From: Zakaria el hjouji Date: Wed, 13 May 2026 00:57:05 -0400 Subject: [PATCH 03/14] tests: unit + integration coverage for synthetic_image and synthetic_video Unit tests cover synthesize_image / synthesize_video helpers (decoded dims, byte counts, reproducibility, per-row uniqueness, 1000-row cache-bust check) and the deserializers (pull 10 rows from a --data string, type-mismatch refusal, prompt_tokens alias, images_per_request). Integration test spins up the in-tree mock server and runs 'guidellm benchmark run' end-to-end with both synthetic_image and synthetic_video --data strings, asserting return code 0 and a non-empty benchmark report. 
Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Zakaria el hjouji Signed-off-by: Zakaria el hjouji --- tests/integration/data/__init__.py | 0 .../test_synthetic_multimodal_benchmark.py | 172 ++++++++ .../test_synthetic_multimodal.py | 404 ++++++++++++++++++ 3 files changed, 576 insertions(+) create mode 100644 tests/integration/data/__init__.py create mode 100644 tests/integration/data/test_synthetic_multimodal_benchmark.py create mode 100644 tests/unit/data/deserializers/test_synthetic_multimodal.py diff --git a/tests/integration/data/__init__.py b/tests/integration/data/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/integration/data/test_synthetic_multimodal_benchmark.py b/tests/integration/data/test_synthetic_multimodal_benchmark.py new file mode 100644 index 000000000..7130df337 --- /dev/null +++ b/tests/integration/data/test_synthetic_multimodal_benchmark.py @@ -0,0 +1,172 @@ +"""Integration test: benchmark synthetic_image / synthetic_video against the +guidellm mock server. + +Spins up the in-tree mock server (Sanic) in a subprocess, runs a short +`guidellm benchmark run` against it for both image and video synthetic data, +and asserts the benchmark process exits cleanly with at least one successful +request recorded. + +The mock backend's TTFT/ITL numbers are meaningless here. We're only proving +that the new deserializers + data pipeline + request handler chain complete +end-to-end without errors. 
+""" + +from __future__ import annotations + +import asyncio +import json +import multiprocessing +import socket +import subprocess +import sys +import time +from pathlib import Path + +import httpx +import pytest + +from guidellm.mock_server.config import MockServerConfig +from guidellm.mock_server.server import MockServer + +pytestmark = [pytest.mark.smoke] + + +def _start_server_process(config: MockServerConfig) -> None: + server = MockServer(config) + server.run() + + +def _free_port() -> int: + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.bind(("127.0.0.1", 0)) + port = sock.getsockname()[1] + sock.close() + return port + + +def _wait_for_server(base_url: str, timeout: float = 30.0) -> None: + async def _poll() -> None: + backoff = 0.5 + async with httpx.AsyncClient() as client: + while True: + try: + resp = await client.get(f"{base_url}/health", timeout=1.0) + if resp.status_code == 200: + return + except (httpx.RequestError, httpx.TimeoutException): + pass + await asyncio.sleep(backoff) + backoff = min(backoff * 1.5, 2.0) + + asyncio.run(asyncio.wait_for(_poll(), timeout=timeout)) + + +@pytest.fixture(scope="module") +def mock_backend(): + port = _free_port() + config = MockServerConfig( + host="127.0.0.1", + port=port, + model="test-model", + ttft_ms=10.0, + itl_ms=1.0, + request_latency=0.05, + output_tokens=16, + ) + base_url = f"http://{config.host}:{config.port}" + proc = multiprocessing.Process(target=_start_server_process, args=(config,)) + proc.start() + try: + _wait_for_server(base_url) + yield base_url + finally: + proc.terminate() + proc.join(timeout=5) + if proc.is_alive(): + proc.kill() + proc.join(timeout=5) + + +def _run_benchmark( + base_url: str, + data: str, + output_dir: Path, + output_name: str, + max_seconds: float = 3.0, +) -> subprocess.CompletedProcess: + output_path = output_dir / output_name + cmd = [ + sys.executable, + "-m", + "guidellm", + "benchmark", + "run", + "--target", + base_url, + "--data", + data, + 
"--data-samples", + "8", + "--profile", + "constant", + "--rate", + "2", + "--max-seconds", + str(max_seconds), + "--processor", + "Xenova/gpt-4", + "--backend", + "openai_http", + "--outputs", + str(output_path), + "--disable-progress", + "--disable-console-outputs", + ] + return subprocess.run( + cmd, capture_output=True, text=True, timeout=180, check=False + ) + + +@pytest.mark.timeout(240) +def test_synthetic_image_benchmark_against_mock(mock_backend, tmp_path): + """A short benchmark on synthetic_image must complete cleanly.""" + result = _run_benchmark( + base_url=mock_backend, + data=( + "type=synthetic_image,width=128,height=128,format=jpeg," + "jpeg_quality=85,text_tokens=20,output_tokens=8,seed=11" + ), + output_dir=tmp_path, + output_name="image.json", + ) + assert result.returncode == 0, ( + f"image benchmark failed: stdout=\n{result.stdout}\nstderr=\n{result.stderr}" + ) + report_path = tmp_path / "image.json" + assert report_path.exists(), "expected benchmark JSON output" + report = json.loads(report_path.read_text()) + benchmarks = report.get("benchmarks", []) + assert benchmarks, "expected at least one benchmark in the report" + + +@pytest.mark.timeout(240) +def test_synthetic_video_benchmark_against_mock(mock_backend, tmp_path): + """A short benchmark on synthetic_video must complete cleanly.""" + result = _run_benchmark( + base_url=mock_backend, + data=( + "type=synthetic_video,width=160,height=120,frames=4,fps=1," + "text_tokens=10,output_tokens=4,seed=23" + ), + output_dir=tmp_path, + output_name="video.json", + max_seconds=4.0, + ) + assert result.returncode == 0, ( + f"video benchmark failed: stdout=\n{result.stdout}\nstderr=\n{result.stderr}" + ) + report_path = tmp_path / "video.json" + assert report_path.exists(), "expected benchmark JSON output" + report = json.loads(report_path.read_text()) + benchmarks = report.get("benchmarks", []) + assert benchmarks, "expected at least one benchmark in the report" diff --git 
a/tests/unit/data/deserializers/test_synthetic_multimodal.py b/tests/unit/data/deserializers/test_synthetic_multimodal.py new file mode 100644 index 000000000..3f47b51c5 --- /dev/null +++ b/tests/unit/data/deserializers/test_synthetic_multimodal.py @@ -0,0 +1,404 @@ +"""Unit tests for synthetic_image / synthetic_video deserializers.""" + +from __future__ import annotations + +import base64 +import hashlib +import io +import tempfile +from pathlib import Path +from unittest.mock import Mock + +import imageio +import pytest +from PIL import Image + +from guidellm.data.deserializers import ( + DatasetDeserializerFactory, + SyntheticImageDataset, + SyntheticImageDatasetDeserializer, + SyntheticVideoDataset, + SyntheticVideoDatasetDeserializer, +) +from guidellm.data.deserializers.deserializer import DataNotSupportedError +from guidellm.data.schemas import ( + SyntheticImageDatasetConfig, + SyntheticVideoDatasetConfig, +) +from guidellm.extras.vision import synthesize_image, synthesize_video + + +def _mock_tokenizer() -> Mock: + tokenizer = Mock() + tokenizer.encode.side_effect = lambda text: list(range(len(text.split()))) + tokenizer.decode.side_effect = ( + lambda tokens, skip_special_tokens=False: " ".join( + f"tok_{t}" for t in tokens + ) + ) + return tokenizer + + +def _decode_data_url(data_url: str) -> bytes: + _, encoded = data_url.split(",", 1) + return base64.b64decode(encoded) + + +# --------------------------------------------------------------------------- +# synthesize_image +# --------------------------------------------------------------------------- + + +class TestSynthesizeImage: + @pytest.mark.smoke + @pytest.mark.parametrize("fmt", ["jpeg", "png"]) + @pytest.mark.parametrize("width,height", [(640, 480), (1280, 720), (256, 256)]) + def test_decoded_dims_match(self, fmt: str, width: int, height: int): + out = synthesize_image(width, height, image_format=fmt, seed=0, row_index=0) + decoded = _decode_data_url(out["image"]) + img = 
Image.open(io.BytesIO(decoded)) + assert img.size == (width, height) + assert out["image_pixels"] == width * height + + @pytest.mark.smoke + def test_image_bytes_match_payload(self): + out = synthesize_image(640, 480, seed=0, row_index=0) + decoded = _decode_data_url(out["image"]) + assert out["image_bytes"] == len(decoded) + + @pytest.mark.smoke + def test_reproducible_same_seed_row_index(self): + a = synthesize_image(320, 240, seed=99, row_index=7) + b = synthesize_image(320, 240, seed=99, row_index=7) + assert a["image"] == b["image"] + + @pytest.mark.smoke + def test_row_index_changes_payload(self): + a = synthesize_image(320, 240, seed=99, row_index=0) + b = synthesize_image(320, 240, seed=99, row_index=1) + assert a["image"] != b["image"] + + @pytest.mark.sanity + def test_seed_changes_payload(self): + a = synthesize_image(320, 240, seed=1, row_index=0) + b = synthesize_image(320, 240, seed=2, row_index=0) + assert a["image"] != b["image"] + + @pytest.mark.sanity + @pytest.mark.parametrize("content", ["gradient", "noise", "solid", "checkerboard"]) + def test_content_modes_produce_valid_images(self, content: str): + out = synthesize_image(64, 64, content=content, seed=3, row_index=0) + decoded = _decode_data_url(out["image"]) + img = Image.open(io.BytesIO(decoded)) + assert img.size == (64, 64) + assert out["image_bytes"] > 0 + + @pytest.mark.sanity + def test_byte_uniqueness_gradient_1000_rows(self): + """1000 gradient rows with the same seed must all be byte-different.""" + hashes = set() + for i in range(1000): + out = synthesize_image(128, 128, content="gradient", seed=17, row_index=i) + hashes.add(hashlib.sha256(out["image"].encode()).hexdigest()) + assert len(hashes) == 1000 + + @pytest.mark.regression + def test_unsupported_format_raises(self): + with pytest.raises(ValueError, match="format"): + synthesize_image(64, 64, image_format="webp", seed=0) + + @pytest.mark.regression + def test_unsupported_content_raises(self): + with pytest.raises(ValueError, 
match="content"): + synthesize_image(64, 64, content="zebra", seed=0) # type: ignore[arg-type] + + +# --------------------------------------------------------------------------- +# synthesize_video +# --------------------------------------------------------------------------- + + +class TestSynthesizeVideo: + @pytest.mark.smoke + @pytest.mark.parametrize("frames", [4, 6, 12]) + @pytest.mark.parametrize("fps", [1.0, 2.0]) + def test_decoded_frame_count_and_seconds_match(self, frames: int, fps: float): + out = synthesize_video( + 320, 240, frames=frames, fps=fps, seed=5, row_index=0 + ) + decoded = _decode_data_url(out["video"]) + # Write to temp file and read back via imageio's ffmpeg reader to + # check decoded frame count and dims. + with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f: + f.write(decoded) + path = f.name + try: + reader = imageio.get_reader(path, "ffmpeg") + decoded_frames = [frame for frame in reader] + assert len(decoded_frames) == frames + assert decoded_frames[0].shape == (240, 320, 3) + reader.close() + finally: + Path(path).unlink() + + assert out["video_frames"] == frames + assert out["video_seconds"] == pytest.approx(frames / fps) + + @pytest.mark.smoke + def test_video_bytes_match_payload(self): + out = synthesize_video(320, 240, frames=4, fps=1, seed=5, row_index=0) + decoded = _decode_data_url(out["video"]) + assert out["video_bytes"] == len(decoded) + + @pytest.mark.smoke + def test_reproducible_same_seed_row_index(self): + a = synthesize_video(160, 120, frames=3, fps=1, seed=42, row_index=2) + b = synthesize_video(160, 120, frames=3, fps=1, seed=42, row_index=2) + assert a["video"] == b["video"] + + @pytest.mark.smoke + def test_row_index_changes_payload(self): + a = synthesize_video(160, 120, frames=3, fps=1, seed=42, row_index=0) + b = synthesize_video(160, 120, frames=3, fps=1, seed=42, row_index=1) + assert a["video"] != b["video"] + + @pytest.mark.sanity + def test_byte_uniqueness_gradient_video(self): + """200 
gradient clips with same seed must all be byte-different.""" + hashes = set() + for i in range(200): + out = synthesize_video( + 64, 64, frames=2, fps=1, content="gradient", seed=8, row_index=i + ) + hashes.add(hashlib.sha256(out["video"].encode()).hexdigest()) + assert len(hashes) == 200 + + @pytest.mark.regression + def test_unsupported_format_raises(self): + with pytest.raises(ValueError, match="format"): + synthesize_video(64, 64, frames=2, video_format="webm", seed=0) + + @pytest.mark.regression + def test_unsupported_content_raises(self): + with pytest.raises(ValueError, match="content"): + synthesize_video(64, 64, frames=2, content="solid", seed=0) # type: ignore[arg-type] + + +# --------------------------------------------------------------------------- +# Config validation +# --------------------------------------------------------------------------- + + +class TestSyntheticImageConfig: + @pytest.mark.smoke + def test_resolution_resolves_to_width_height(self): + cfg = SyntheticImageDatasetConfig(resolution="720p", text_tokens=50) + assert cfg.width == 1280 + assert cfg.height == 720 + + @pytest.mark.sanity + def test_aspect_ratio_overrides_width(self): + cfg = SyntheticImageDatasetConfig( + resolution="720p", aspect_ratio="4:3", text_tokens=50 + ) + # 720 * 4 / 3 = 960 + assert cfg.height == 720 + assert cfg.width == 960 + + @pytest.mark.sanity + def test_prompt_tokens_alias_accepted(self): + cfg = SyntheticImageDatasetConfig.model_validate( + {"width": 640, "height": 480, "prompt_tokens": 50} + ) + assert cfg.text_tokens == 50 + + @pytest.mark.regression + def test_missing_dims_raises(self): + with pytest.raises(ValueError): + SyntheticImageDatasetConfig(text_tokens=10) + + @pytest.mark.regression + def test_unknown_resolution_raises(self): + with pytest.raises(ValueError, match="resolution"): + SyntheticImageDatasetConfig(resolution="9000p", text_tokens=10) + + +# --------------------------------------------------------------------------- +# 
Deserializer-from-string + 10-row pull +# --------------------------------------------------------------------------- + + +class TestSyntheticImageDeserializer: + @pytest.mark.smoke + def test_pull_10_rows_from_data_string(self): + d = SyntheticImageDatasetDeserializer() + ds = d( + data=( + "type=synthetic_image,resolution=480p,text_tokens=20," + "output_tokens=8,seed=11" + ), + processor_factory=_mock_tokenizer, + random_seed=42, + ) + assert isinstance(ds, SyntheticImageDataset) + + rows = [] + it = iter(ds) + for _ in range(10): + rows.append(next(it)) + + assert len(rows) == 10 + for row in rows: + assert "image" in row + assert row["image"]["image_pixels"] == 854 * 480 + assert row["image"]["image_bytes"] > 0 + assert row["prompt_tokens_count_0"] > 0 + assert row["output_tokens_count_0"] > 0 + + # All 10 image payloads must be byte-different (cache-bust guarantee). + digests = { + hashlib.sha256(r["image"]["image"].encode()).hexdigest() for r in rows + } + assert len(digests) == 10 + + @pytest.mark.sanity + def test_factory_dispatch_via_explicit_type(self): + ds = DatasetDeserializerFactory.deserialize( + data=( + "type=synthetic_image,width=320,height=240,text_tokens=15," + "output_tokens=4" + ), + processor_factory=_mock_tokenizer, + ) + assert isinstance(ds, SyntheticImageDataset) + + @pytest.mark.sanity + def test_refuses_when_type_mismatch(self): + d = SyntheticImageDatasetDeserializer() + with pytest.raises(DataNotSupportedError): + d( + data="type=synthetic_text,prompt_tokens=50", + processor_factory=_mock_tokenizer, + random_seed=42, + ) + + @pytest.mark.regression + def test_images_per_request_emits_indexed_columns(self): + d = SyntheticImageDatasetDeserializer() + ds = d( + data=( + "type=synthetic_image,width=64,height=64,images_per_request=3," + "text_tokens=5,output_tokens=2" + ), + processor_factory=_mock_tokenizer, + random_seed=42, + ) + row = next(iter(ds)) + assert "image_0" in row + assert "image_1" in row + assert "image_2" in row + # All 
three images in the same row should be byte-different. + digests = {row[f"image_{i}"]["image"] for i in range(3)} + assert len(digests) == 3 + + +class TestSyntheticVideoDeserializer: + @pytest.mark.smoke + def test_pull_10_rows_from_data_string(self): + d = SyntheticVideoDatasetDeserializer() + ds = d( + data=( + "type=synthetic_video,width=320,height=240,frames=4,fps=1," + "text_tokens=10,output_tokens=4,seed=17" + ), + processor_factory=_mock_tokenizer, + random_seed=42, + ) + assert isinstance(ds, SyntheticVideoDataset) + + rows = [] + it = iter(ds) + for _ in range(10): + rows.append(next(it)) + + assert len(rows) == 10 + for row in rows: + assert "video" in row + assert row["video"]["video_frames"] == 4 + assert row["video"]["video_seconds"] == 4.0 + assert row["video"]["video_bytes"] > 0 + + digests = { + hashlib.sha256(r["video"]["video"].encode()).hexdigest() for r in rows + } + assert len(digests) == 10 + + @pytest.mark.sanity + def test_factory_dispatch_via_explicit_type(self): + ds = DatasetDeserializerFactory.deserialize( + data=( + "type=synthetic_video,width=160,height=120,frames=3,fps=1," + "text_tokens=10,output_tokens=4" + ), + processor_factory=_mock_tokenizer, + ) + assert isinstance(ds, SyntheticVideoDataset) + + @pytest.mark.sanity + def test_refuses_when_type_mismatch(self): + d = SyntheticVideoDatasetDeserializer() + with pytest.raises(DataNotSupportedError): + d( + data="type=synthetic_image,width=64,height=64,text_tokens=10", + processor_factory=_mock_tokenizer, + random_seed=42, + ) + + @pytest.mark.smoke + def test_video_config_via_json(self): + cfg = SyntheticVideoDatasetConfig.model_validate( + { + "width": 320, + "height": 240, + "frames": 4, + "fps": 1, + "text_tokens": 10, + "video_bitrate": "200k", + } + ) + assert cfg.width == 320 + assert cfg.frames == 4 + assert cfg.video_bitrate == "200k" + + +# --------------------------------------------------------------------------- +# End-to-end reproducibility across the deserializer (not 
just the helpers) +# --------------------------------------------------------------------------- + + +@pytest.mark.smoke +def test_full_dataset_reproducible_with_same_seed(): + """Two datasets with the same seed must produce identical per-row sha256.""" + d = SyntheticImageDatasetDeserializer() + common = { + "data": ( + "type=synthetic_image,width=128,height=128,text_tokens=10," + "output_tokens=2,seed=999" + ), + "processor_factory": _mock_tokenizer, + "random_seed": 42, + } + ds_a = d(**common) + ds_b = d(**common) + + digests_a = [] + digests_b = [] + it_a, it_b = iter(ds_a), iter(ds_b) + for _ in range(10): + digests_a.append( + hashlib.sha256(next(it_a)["image"]["image"].encode()).hexdigest() + ) + digests_b.append( + hashlib.sha256(next(it_b)["image"]["image"].encode()).hexdigest() + ) + assert digests_a == digests_b From ef4dc7aa96cb8a1f5efef017af5ac06ebcbf24e7 Mon Sep 17 00:00:00 2001 From: Zakaria el hjouji Date: Wed, 13 May 2026 00:57:41 -0400 Subject: [PATCH 04/14] docs: README usage examples for synthetic_image and synthetic_video Move synthetic multimodal generation out of Active Development for images and video. Audio remains WIP. Add two short --data examples (one image, one video) plus a parameter rundown for the new types. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Zakaria el hjouji Signed-off-by: Zakaria el hjouji --- README.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/README.md b/README.md index af564496d..4ba0d4ddc 100644 --- a/README.md +++ b/README.md @@ -201,6 +201,32 @@ guidellm run \ - `--data-loader type=pytorch,samples=1000`: Limit how many rows are loaded (`-1` for all) - `--tokenizer huggingface_auto "model=gpt2"`: Tokenizer for synthetic data or local token counting +### Synthetic Multimodal Data + +GuideLLM can synthesize images and short videos on the fly so you can benchmark VLM serving configurations without bringing your own dataset. 
Two new `--data` types compose with the existing text token controls. + +A single 720p image alongside 200 text tokens and 64 output tokens: + +```bash +guidellm benchmark run \ + --target http://localhost:8000 \ + --data "type=synthetic_image,resolution=720p,text_tokens=200,output_tokens=64" +``` + +A six-frame 480p clip at 1 fps with modest prompt and output budgets: + +```bash +guidellm benchmark run \ + --target http://localhost:8000 \ + --data "type=synthetic_video,width=854,height=480,frames=6,fps=1,text_tokens=64,output_tokens=128" +``` + +**Key parameters:** + +- `--data "type=synthetic_image,..."`: Knobs include `width`, `height`, the `resolution=720p` / `aspect_ratio=16:9` sugar, `format` (`jpeg` or `png`), `jpeg_quality`, `content` (`gradient` default, `noise`, `solid`, `checkerboard`), `images_per_request`, `text_tokens` (with the same `stdev`/`min`/`max` companions as the synthetic text mode), `output_tokens`, and `seed`. `prompt_tokens` is accepted as an alias for `text_tokens`. +- `--data "type=synthetic_video,..."`: Knobs include `width`, `height`, `frames`, `fps`, `video_bitrate`, `content` (`gradient` default or `noise`), the same `text_tokens` / `output_tokens` fields, and `seed`. `format` is `mp4` (h264, yuv420p) in v1. +- Defaults pick per-row seeded gradients so every payload is byte-different from the next, which defeats vLLM's multimodal preprocessor cache while still compressing like real media. `noise` is opt-in for worst-case wire sizes; `solid` and `checkerboard` are opt-in for cache-sensitivity sweeps. + ### Request Types and API Targets You can benchmark chat completions, text completions, or other supported request types. This example configures the benchmark to test the chat completions API using a custom dataset file, with GuideLLM automatically formatting requests to match the chat completions schema. 
From 1d8a8ce7a3c94224e43bc00973368cdd649017a9 Mon Sep 17 00:00:00 2001 From: Zakaria el hjouji Date: Wed, 13 May 2026 12:49:21 -0400 Subject: [PATCH 05/14] fix: declare image/video features and make encoders idempotent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two bugs caught by Section 4 of the evaluation plan against real vLLM: 1. SyntheticImageDataset and SyntheticVideoDataset features() omitted the image/video columns from the typed schema, so dataset.column_names returned only text columns. GenerativeColumnMapper reads column_names first and never sees `image`/`video`, so the request handler builds a text-only chat completion and the image is silently dropped. TTFT was identical across 480p/720p/1080p before the fix. 2. MediaEncoder still runs on synthetic rows. It called encode_image with the already-encoded canonical dict, which raised "Unsupported image type: <class 'dict'>" and dropped every row. Made encode_image and encode_video idempotent on the canonical dict shape so re-application is a no-op. After both fixes: resolution sweep TTFT 63.7 → 67.9 → 73.6ms (monotonic); frame sweep TTFT 94 → 211 → 376ms (monotonic, linear in frames); synth-vs-real fidelity 0.3% TTFT_p90 delta and 0.0% ITL_p50 delta.
Co-authored-by: Claude Signed-off-by: Zakaria el hjouji Signed-off-by: Zakaria el hjouji --- src/guidellm/data/deserializers/synthetic_video.py | 7 +++++++ src/guidellm/utils/vision.py | 10 ++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/guidellm/data/deserializers/synthetic_video.py b/src/guidellm/data/deserializers/synthetic_video.py index 3a6ae5971..ca0cc467b 100644 --- a/src/guidellm/data/deserializers/synthetic_video.py +++ b/src/guidellm/data/deserializers/synthetic_video.py @@ -247,6 +247,13 @@ def features(self) -> Features: } if self.config.output_tokens is not None: features["output_tokens_count_0"] = Value("int32") + features["video"] = { + "type": Value("string"), + "video": Value("string"), + "video_frames": Value("int64"), + "video_seconds": Value("float64"), + "video_bytes": Value("int64"), + } return Features(features) @property diff --git a/src/guidellm/utils/vision.py b/src/guidellm/utils/vision.py index 8977d3c43..85261c947 100644 --- a/src/guidellm/utils/vision.py +++ b/src/guidellm/utils/vision.py @@ -29,7 +29,7 @@ def is_url(text: Any) -> bool: def encode_image( - image: bytes | str | Path | np.ndarray | libs.Image, + image: bytes | str | Path | np.ndarray | dict[str, Any] | libs.Image, width: int | None = None, height: int | None = None, max_size: int | None = None, @@ -59,6 +59,9 @@ def encode_image( - image url - "data:image/{type};base64, {data}" string """ + if isinstance(image, dict) and "image" in image and "type" in image: + return image # type: ignore[return-value] + if isinstance(image, str) and is_url(image): if encode_type == "base64": response = httpx.get(image) @@ -204,7 +207,7 @@ def image_dict_to_pil(item: dict[str, Any]) -> libs.Image: def encode_video( - video: bytes | str | Path, + video: bytes | str | Path | dict[str, Any], encode_type: Literal["base64", "url"] | None = "base64", ) -> dict[ Literal["type", "video", "video_frames", "video_seconds", "video_bytes"], @@ -226,6 +229,9 @@ def 
encode_video( - video url - "data:video/{type};base64, {data}" string """ + if isinstance(video, dict) and "video" in video and "type" in video: + return video # type: ignore[return-value] + if isinstance(video, str) and is_url(video): if encode_type == "base64": response = httpx.get(video) From 0b324a950ff79d47bf58d0a016a519e0d159a4dd Mon Sep 17 00:00:00 2001 From: Zakaria el hjouji Date: Fri, 15 May 2026 10:30:57 -0400 Subject: [PATCH 06/14] tests: add WRITTEN BY AI marker per AGENTS.md guidellm's AGENTS.md requires every AI-written test function to carry `## WRITTEN BY AI ##` at the end of its docstring. Adds the marker to all 45 new tests in the multimodal suite. Assisted-by: Claude (Anthropic) Signed-off-by: Zakaria el hjouji Signed-off-by: Zakaria el hjouji --- .../test_synthetic_multimodal_benchmark.py | 4 +-- .../test_synthetic_multimodal.py | 33 +++++++++++++++++-- 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/tests/integration/data/test_synthetic_multimodal_benchmark.py b/tests/integration/data/test_synthetic_multimodal_benchmark.py index 7130df337..0da5ba201 100644 --- a/tests/integration/data/test_synthetic_multimodal_benchmark.py +++ b/tests/integration/data/test_synthetic_multimodal_benchmark.py @@ -129,7 +129,7 @@ def _run_benchmark( @pytest.mark.timeout(240) def test_synthetic_image_benchmark_against_mock(mock_backend, tmp_path): - """A short benchmark on synthetic_image must complete cleanly.""" + """A short benchmark on synthetic_image must complete cleanly. ## WRITTEN BY AI ##""" result = _run_benchmark( base_url=mock_backend, data=( @@ -151,7 +151,7 @@ def test_synthetic_image_benchmark_against_mock(mock_backend, tmp_path): @pytest.mark.timeout(240) def test_synthetic_video_benchmark_against_mock(mock_backend, tmp_path): - """A short benchmark on synthetic_video must complete cleanly.""" + """A short benchmark on synthetic_video must complete cleanly. 
## WRITTEN BY AI ##""" result = _run_benchmark( base_url=mock_backend, data=( diff --git a/tests/unit/data/deserializers/test_synthetic_multimodal.py b/tests/unit/data/deserializers/test_synthetic_multimodal.py index 3f47b51c5..d24958569 100644 --- a/tests/unit/data/deserializers/test_synthetic_multimodal.py +++ b/tests/unit/data/deserializers/test_synthetic_multimodal.py @@ -54,6 +54,7 @@ class TestSynthesizeImage: @pytest.mark.parametrize("fmt", ["jpeg", "png"]) @pytest.mark.parametrize("width,height", [(640, 480), (1280, 720), (256, 256)]) def test_decoded_dims_match(self, fmt: str, width: int, height: int): + """## WRITTEN BY AI ##""" out = synthesize_image(width, height, image_format=fmt, seed=0, row_index=0) decoded = _decode_data_url(out["image"]) img = Image.open(io.BytesIO(decoded)) @@ -62,24 +63,28 @@ def test_decoded_dims_match(self, fmt: str, width: int, height: int): @pytest.mark.smoke def test_image_bytes_match_payload(self): + """## WRITTEN BY AI ##""" out = synthesize_image(640, 480, seed=0, row_index=0) decoded = _decode_data_url(out["image"]) assert out["image_bytes"] == len(decoded) @pytest.mark.smoke def test_reproducible_same_seed_row_index(self): + """## WRITTEN BY AI ##""" a = synthesize_image(320, 240, seed=99, row_index=7) b = synthesize_image(320, 240, seed=99, row_index=7) assert a["image"] == b["image"] @pytest.mark.smoke def test_row_index_changes_payload(self): + """## WRITTEN BY AI ##""" a = synthesize_image(320, 240, seed=99, row_index=0) b = synthesize_image(320, 240, seed=99, row_index=1) assert a["image"] != b["image"] @pytest.mark.sanity def test_seed_changes_payload(self): + """## WRITTEN BY AI ##""" a = synthesize_image(320, 240, seed=1, row_index=0) b = synthesize_image(320, 240, seed=2, row_index=0) assert a["image"] != b["image"] @@ -87,6 +92,7 @@ def test_seed_changes_payload(self): @pytest.mark.sanity @pytest.mark.parametrize("content", ["gradient", "noise", "solid", "checkerboard"]) def 
test_content_modes_produce_valid_images(self, content: str): + """## WRITTEN BY AI ##""" out = synthesize_image(64, 64, content=content, seed=3, row_index=0) decoded = _decode_data_url(out["image"]) img = Image.open(io.BytesIO(decoded)) @@ -95,7 +101,7 @@ def test_content_modes_produce_valid_images(self, content: str): @pytest.mark.sanity def test_byte_uniqueness_gradient_1000_rows(self): - """1000 gradient rows with the same seed must all be byte-different.""" + """1000 gradient rows with the same seed must all be byte-different. ## WRITTEN BY AI ##""" hashes = set() for i in range(1000): out = synthesize_image(128, 128, content="gradient", seed=17, row_index=i) @@ -104,11 +110,13 @@ def test_byte_uniqueness_gradient_1000_rows(self): @pytest.mark.regression def test_unsupported_format_raises(self): + """## WRITTEN BY AI ##""" with pytest.raises(ValueError, match="format"): synthesize_image(64, 64, image_format="webp", seed=0) @pytest.mark.regression def test_unsupported_content_raises(self): + """## WRITTEN BY AI ##""" with pytest.raises(ValueError, match="content"): synthesize_image(64, 64, content="zebra", seed=0) # type: ignore[arg-type] @@ -123,6 +131,7 @@ class TestSynthesizeVideo: @pytest.mark.parametrize("frames", [4, 6, 12]) @pytest.mark.parametrize("fps", [1.0, 2.0]) def test_decoded_frame_count_and_seconds_match(self, frames: int, fps: float): + """## WRITTEN BY AI ##""" out = synthesize_video( 320, 240, frames=frames, fps=fps, seed=5, row_index=0 ) @@ -146,25 +155,28 @@ def test_decoded_frame_count_and_seconds_match(self, frames: int, fps: float): @pytest.mark.smoke def test_video_bytes_match_payload(self): + """## WRITTEN BY AI ##""" out = synthesize_video(320, 240, frames=4, fps=1, seed=5, row_index=0) decoded = _decode_data_url(out["video"]) assert out["video_bytes"] == len(decoded) @pytest.mark.smoke def test_reproducible_same_seed_row_index(self): + """## WRITTEN BY AI ##""" a = synthesize_video(160, 120, frames=3, fps=1, seed=42, row_index=2) b = 
synthesize_video(160, 120, frames=3, fps=1, seed=42, row_index=2) assert a["video"] == b["video"] @pytest.mark.smoke def test_row_index_changes_payload(self): + """## WRITTEN BY AI ##""" a = synthesize_video(160, 120, frames=3, fps=1, seed=42, row_index=0) b = synthesize_video(160, 120, frames=3, fps=1, seed=42, row_index=1) assert a["video"] != b["video"] @pytest.mark.sanity def test_byte_uniqueness_gradient_video(self): - """200 gradient clips with same seed must all be byte-different.""" + """200 gradient clips with same seed must all be byte-different. ## WRITTEN BY AI ##""" hashes = set() for i in range(200): out = synthesize_video( @@ -175,11 +187,13 @@ def test_byte_uniqueness_gradient_video(self): @pytest.mark.regression def test_unsupported_format_raises(self): + """## WRITTEN BY AI ##""" with pytest.raises(ValueError, match="format"): synthesize_video(64, 64, frames=2, video_format="webm", seed=0) @pytest.mark.regression def test_unsupported_content_raises(self): + """## WRITTEN BY AI ##""" with pytest.raises(ValueError, match="content"): synthesize_video(64, 64, frames=2, content="solid", seed=0) # type: ignore[arg-type] @@ -192,12 +206,14 @@ def test_unsupported_content_raises(self): class TestSyntheticImageConfig: @pytest.mark.smoke def test_resolution_resolves_to_width_height(self): + """## WRITTEN BY AI ##""" cfg = SyntheticImageDatasetConfig(resolution="720p", text_tokens=50) assert cfg.width == 1280 assert cfg.height == 720 @pytest.mark.sanity def test_aspect_ratio_overrides_width(self): + """## WRITTEN BY AI ##""" cfg = SyntheticImageDatasetConfig( resolution="720p", aspect_ratio="4:3", text_tokens=50 ) @@ -207,6 +223,7 @@ def test_aspect_ratio_overrides_width(self): @pytest.mark.sanity def test_prompt_tokens_alias_accepted(self): + """## WRITTEN BY AI ##""" cfg = SyntheticImageDatasetConfig.model_validate( {"width": 640, "height": 480, "prompt_tokens": 50} ) @@ -214,11 +231,13 @@ def test_prompt_tokens_alias_accepted(self): 
@pytest.mark.regression def test_missing_dims_raises(self): + """## WRITTEN BY AI ##""" with pytest.raises(ValueError): SyntheticImageDatasetConfig(text_tokens=10) @pytest.mark.regression def test_unknown_resolution_raises(self): + """## WRITTEN BY AI ##""" with pytest.raises(ValueError, match="resolution"): SyntheticImageDatasetConfig(resolution="9000p", text_tokens=10) @@ -231,6 +250,7 @@ def test_unknown_resolution_raises(self): class TestSyntheticImageDeserializer: @pytest.mark.smoke def test_pull_10_rows_from_data_string(self): + """## WRITTEN BY AI ##""" d = SyntheticImageDatasetDeserializer() ds = d( data=( @@ -263,6 +283,7 @@ def test_pull_10_rows_from_data_string(self): @pytest.mark.sanity def test_factory_dispatch_via_explicit_type(self): + """## WRITTEN BY AI ##""" ds = DatasetDeserializerFactory.deserialize( data=( "type=synthetic_image,width=320,height=240,text_tokens=15," @@ -274,6 +295,7 @@ def test_factory_dispatch_via_explicit_type(self): @pytest.mark.sanity def test_refuses_when_type_mismatch(self): + """## WRITTEN BY AI ##""" d = SyntheticImageDatasetDeserializer() with pytest.raises(DataNotSupportedError): d( @@ -284,6 +306,7 @@ def test_refuses_when_type_mismatch(self): @pytest.mark.regression def test_images_per_request_emits_indexed_columns(self): + """## WRITTEN BY AI ##""" d = SyntheticImageDatasetDeserializer() ds = d( data=( @@ -305,6 +328,7 @@ def test_images_per_request_emits_indexed_columns(self): class TestSyntheticVideoDeserializer: @pytest.mark.smoke def test_pull_10_rows_from_data_string(self): + """## WRITTEN BY AI ##""" d = SyntheticVideoDatasetDeserializer() ds = d( data=( @@ -335,6 +359,7 @@ def test_pull_10_rows_from_data_string(self): @pytest.mark.sanity def test_factory_dispatch_via_explicit_type(self): + """## WRITTEN BY AI ##""" ds = DatasetDeserializerFactory.deserialize( data=( "type=synthetic_video,width=160,height=120,frames=3,fps=1," @@ -346,6 +371,7 @@ def test_factory_dispatch_via_explicit_type(self): 
@pytest.mark.sanity def test_refuses_when_type_mismatch(self): + """## WRITTEN BY AI ##""" d = SyntheticVideoDatasetDeserializer() with pytest.raises(DataNotSupportedError): d( @@ -356,6 +382,7 @@ def test_refuses_when_type_mismatch(self): @pytest.mark.smoke def test_video_config_via_json(self): + """## WRITTEN BY AI ##""" cfg = SyntheticVideoDatasetConfig.model_validate( { "width": 320, @@ -378,7 +405,7 @@ def test_video_config_via_json(self): @pytest.mark.smoke def test_full_dataset_reproducible_with_same_seed(): - """Two datasets with the same seed must produce identical per-row sha256.""" + """Two datasets with the same seed must produce identical per-row sha256. ## WRITTEN BY AI ##""" d = SyntheticImageDatasetDeserializer() common = { "data": ( From f6ab0e6ca73dae9cb5b5ec606ce12ff15f4fbe10 Mon Sep 17 00:00:00 2001 From: Jack Wind Date: Sat, 16 May 2026 02:21:48 +0000 Subject: [PATCH 07/14] Fix pre-existing lint and type-check failures Signed-off-by: Zakaria el hjouji Signed-off-by: Jack Wind --- .../data/deserializers/synthetic_video.py | 7 ----- .../test_synthetic_multimodal_benchmark.py | 13 ++++++--- .../test_synthetic_multimodal.py | 29 +++++++++++-------- 3 files changed, 26 insertions(+), 23 deletions(-) diff --git a/src/guidellm/data/deserializers/synthetic_video.py b/src/guidellm/data/deserializers/synthetic_video.py index ca0cc467b..3a6ae5971 100644 --- a/src/guidellm/data/deserializers/synthetic_video.py +++ b/src/guidellm/data/deserializers/synthetic_video.py @@ -247,13 +247,6 @@ def features(self) -> Features: } if self.config.output_tokens is not None: features["output_tokens_count_0"] = Value("int32") - features["video"] = { - "type": Value("string"), - "video": Value("string"), - "video_frames": Value("int64"), - "video_seconds": Value("float64"), - "video_bytes": Value("int64"), - } return Features(features) @property diff --git a/tests/integration/data/test_synthetic_multimodal_benchmark.py 
b/tests/integration/data/test_synthetic_multimodal_benchmark.py index 0da5ba201..a55eb3e95 100644 --- a/tests/integration/data/test_synthetic_multimodal_benchmark.py +++ b/tests/integration/data/test_synthetic_multimodal_benchmark.py @@ -19,7 +19,6 @@ import socket import subprocess import sys -import time from pathlib import Path import httpx @@ -122,14 +121,17 @@ def _run_benchmark( "--disable-progress", "--disable-console-outputs", ] - return subprocess.run( + return subprocess.run( # noqa: S603 cmd, capture_output=True, text=True, timeout=180, check=False ) @pytest.mark.timeout(240) def test_synthetic_image_benchmark_against_mock(mock_backend, tmp_path): - """A short benchmark on synthetic_image must complete cleanly. ## WRITTEN BY AI ##""" + """A short benchmark on synthetic_image must complete cleanly. + + ## WRITTEN BY AI ## + """ result = _run_benchmark( base_url=mock_backend, data=( @@ -151,7 +153,10 @@ def test_synthetic_image_benchmark_against_mock(mock_backend, tmp_path): @pytest.mark.timeout(240) def test_synthetic_video_benchmark_against_mock(mock_backend, tmp_path): - """A short benchmark on synthetic_video must complete cleanly. ## WRITTEN BY AI ##""" + """A short benchmark on synthetic_video must complete cleanly. 
+ + ## WRITTEN BY AI ## + """ result = _run_benchmark( base_url=mock_backend, data=( diff --git a/tests/unit/data/deserializers/test_synthetic_multimodal.py b/tests/unit/data/deserializers/test_synthetic_multimodal.py index d24958569..e04e8893c 100644 --- a/tests/unit/data/deserializers/test_synthetic_multimodal.py +++ b/tests/unit/data/deserializers/test_synthetic_multimodal.py @@ -31,10 +31,8 @@ def _mock_tokenizer() -> Mock: tokenizer = Mock() tokenizer.encode.side_effect = lambda text: list(range(len(text.split()))) - tokenizer.decode.side_effect = ( - lambda tokens, skip_special_tokens=False: " ".join( - f"tok_{t}" for t in tokens - ) + tokenizer.decode.side_effect = lambda tokens, skip_special_tokens=False: " ".join( + f"tok_{t}" for t in tokens ) return tokenizer @@ -52,7 +50,7 @@ def _decode_data_url(data_url: str) -> bytes: class TestSynthesizeImage: @pytest.mark.smoke @pytest.mark.parametrize("fmt", ["jpeg", "png"]) - @pytest.mark.parametrize("width,height", [(640, 480), (1280, 720), (256, 256)]) + @pytest.mark.parametrize(("width", "height"), [(640, 480), (1280, 720), (256, 256)]) def test_decoded_dims_match(self, fmt: str, width: int, height: int): """## WRITTEN BY AI ##""" out = synthesize_image(width, height, image_format=fmt, seed=0, row_index=0) @@ -101,7 +99,10 @@ def test_content_modes_produce_valid_images(self, content: str): @pytest.mark.sanity def test_byte_uniqueness_gradient_1000_rows(self): - """1000 gradient rows with the same seed must all be byte-different. ## WRITTEN BY AI ##""" + """1000 gradient rows with the same seed must all be byte-different. 
+ + ## WRITTEN BY AI ## + """ hashes = set() for i in range(1000): out = synthesize_image(128, 128, content="gradient", seed=17, row_index=i) @@ -132,9 +133,7 @@ class TestSynthesizeVideo: @pytest.mark.parametrize("fps", [1.0, 2.0]) def test_decoded_frame_count_and_seconds_match(self, frames: int, fps: float): """## WRITTEN BY AI ##""" - out = synthesize_video( - 320, 240, frames=frames, fps=fps, seed=5, row_index=0 - ) + out = synthesize_video(320, 240, frames=frames, fps=fps, seed=5, row_index=0) decoded = _decode_data_url(out["video"]) # Write to temp file and read back via imageio's ffmpeg reader to # check decoded frame count and dims. @@ -143,7 +142,7 @@ def test_decoded_frame_count_and_seconds_match(self, frames: int, fps: float): path = f.name try: reader = imageio.get_reader(path, "ffmpeg") - decoded_frames = [frame for frame in reader] + decoded_frames = [frame for frame in reader] # noqa: C416 assert len(decoded_frames) == frames assert decoded_frames[0].shape == (240, 320, 3) reader.close() @@ -176,7 +175,10 @@ def test_row_index_changes_payload(self): @pytest.mark.sanity def test_byte_uniqueness_gradient_video(self): - """200 gradient clips with same seed must all be byte-different. ## WRITTEN BY AI ##""" + """200 gradient clips with same seed must all be byte-different. + + ## WRITTEN BY AI ## + """ hashes = set() for i in range(200): out = synthesize_video( @@ -405,7 +407,10 @@ def test_video_config_via_json(self): @pytest.mark.smoke def test_full_dataset_reproducible_with_same_seed(): - """Two datasets with the same seed must produce identical per-row sha256. ## WRITTEN BY AI ##""" + """Two datasets with the same seed must produce identical per-row sha256. 
+ + ## WRITTEN BY AI ## + """ d = SyntheticImageDatasetDeserializer() common = { "data": ( From 316602112b0cda9bb7cf1261587c2796862ae04b Mon Sep 17 00:00:00 2001 From: Jack Wind Date: Sat, 16 May 2026 02:21:48 +0000 Subject: [PATCH 08/14] Add coordinate warp to synthetic gradient generator Signed-off-by: Zakaria el hjouji Signed-off-by: Jack Wind --- src/guidellm/utils/vision.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/guidellm/utils/vision.py b/src/guidellm/utils/vision.py index 85261c947..815d421ba 100644 --- a/src/guidellm/utils/vision.py +++ b/src/guidellm/utils/vision.py @@ -322,8 +322,23 @@ def _gradient_frame( color_b = rng.integers(0, 256, size=3, dtype=np.int32) angle = float(rng.uniform(0.0, 2.0 * np.pi)) - ys = np.linspace(-1.0, 1.0, height, dtype=np.float32).reshape(height, 1) - xs = np.linspace(-1.0, 1.0, width, dtype=np.float32).reshape(1, width) + dx, dy = ( + np.asarray( + libs.PILImage.fromarray(flow, mode="F").resize( + (width, height), libs.PILImage.Resampling.BICUBIC + ), + dtype=np.float32, + ) + for flow in rng.uniform(-1.0, 1.0, size=(2, 16, 16)).astype(np.float32) + ) + sample_xs, sample_ys = np.meshgrid( + np.arange(width, dtype=np.float32), + np.arange(height, dtype=np.float32), + ) + sample_xs = np.clip(sample_xs + dx * 80.0, 0.0, width - 1) + sample_ys = np.clip(sample_ys + dy * 80.0, 0.0, height - 1) + xs = (sample_xs / max(width - 1, 1) * 2.0 - 1.0).astype(np.float32) + ys = (sample_ys / max(height - 1, 1) * 2.0 - 1.0).astype(np.float32) proj = xs * np.cos(angle) + ys * np.sin(angle) proj = (proj - proj.min()) / max(proj.max() - proj.min(), 1e-6) proj = proj[..., None] From f417cac5879d6efdd96bdf36cc831d9e1a3de9c8 Mon Sep 17 00:00:00 2001 From: Zakaria el hjouji Date: Mon, 18 May 2026 18:43:35 -0400 Subject: [PATCH 09/14] docs: move synthetic visual data out of README into dedicated guide MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses 
dbutenhof's review on PR #732. The Synthetic Multimodal Data section in README.md was too large and too specific for the front page, and the option list was a single dense bullet per type. - README.md: trim to a one-paragraph pointer at the new docs page - docs/guides/multimodal/synthetic_vision.md: new page; split into Synthetic image and Synthetic video subsections, each with example commands and a per-option Configuration Options list - docs/guides/datasets.md: frame the existing Synthetic Data section as text-specific, link out to the visual page - docs/guides/multimodal/index.md: add a Synthetic Vision card to the Available Guides grid Naming: "synthetic vision" rather than "synthetic multimodal" — covers images and video, but not audio. Signed-off-by: Zakaria el hjouji Signed-off-by: Zakaria el hjouji --- README.md | 26 +----- docs/guides/datasets.md | 2 + docs/guides/multimodal/index.md | 8 ++ docs/guides/multimodal/synthetic_vision.md | 93 ++++++++++++++++++++++ 4 files changed, 105 insertions(+), 24 deletions(-) create mode 100644 docs/guides/multimodal/synthetic_vision.md diff --git a/README.md b/README.md index 4ba0d4ddc..540ea0c15 100644 --- a/README.md +++ b/README.md @@ -201,31 +201,9 @@ guidellm run \ - `--data-loader type=pytorch,samples=1000`: Limit how many rows are loaded (`-1` for all) - `--tokenizer huggingface_auto "model=gpt2"`: Tokenizer for synthetic data or local token counting -### Synthetic Multimodal Data +### Synthetic Visual Data -GuideLLM can synthesize images and short videos on the fly so you can benchmark VLM serving configurations without bringing your own dataset. Two new `--data` types compose with the existing text token controls. 
- -A single 720p image alongside 200 text tokens and 64 output tokens: - -```bash -guidellm benchmark run \ - --target http://localhost:8000 \ - --data "type=synthetic_image,resolution=720p,text_tokens=200,output_tokens=64" -``` - -A six-frame 480p clip at 1 fps with modest prompt and output budgets: - -```bash -guidellm benchmark run \ - --target http://localhost:8000 \ - --data "type=synthetic_video,width=854,height=480,frames=6,fps=1,text_tokens=64,output_tokens=128" -``` - -**Key parameters:** - -- `--data "type=synthetic_image,..."`: Knobs include `width`, `height`, the `resolution=720p` / `aspect_ratio=16:9` sugar, `format` (`jpeg` or `png`), `jpeg_quality`, `content` (`gradient` default, `noise`, `solid`, `checkerboard`), `images_per_request`, `text_tokens` (with the same `stdev`/`min`/`max` companions as the synthetic text mode), `output_tokens`, and `seed`. `prompt_tokens` is accepted as an alias for `text_tokens`. -- `--data "type=synthetic_video,..."`: Knobs include `width`, `height`, `frames`, `fps`, `video_bitrate`, `content` (`gradient` default or `noise`), the same `text_tokens` / `output_tokens` fields, and `seed`. `format` is `mp4` (h264, yuv420p) in v1. -- Defaults pick per-row seeded gradients so every payload is byte-different from the next, which defeats vLLM's multimodal preprocessor cache while still compressing like real media. `noise` is opt-in for worst-case wire sizes; `solid` and `checkerboard` are opt-in for cache-sensitivity sweeps. +GuideLLM can synthesize images and short videos on the fly so you can benchmark VLM serving configurations without bringing your own dataset. Two `--data` types — `synthetic_image` and `synthetic_video` — compose with the existing text token controls. See [Synthetic Visual Data](docs/guides/multimodal/synthetic_vision.md) for example commands and the full list of configuration options. 
### Request Types and API Targets diff --git a/docs/guides/datasets.md b/docs/guides/datasets.md index ad3ee1e46..e6688b5e0 100644 --- a/docs/guides/datasets.md +++ b/docs/guides/datasets.md @@ -85,6 +85,8 @@ GuideLLM supports several types of datasets, each with its own advantages and us Synthetic datasets allow you to generate data on the fly with customizable parameters. This is useful for controlled experiments, stress testing, and simulating specific scenarios. For example, you might want to evaluate how a model handles long prompts or generates outputs with specific characteristics. +GuideLLM supports both synthetic *text* — described below — and synthetic *visual* data (images and short videos) for benchmarking Vision-Language Models. See [Synthetic Visual Data](multimodal/synthetic_vision.md) for the `synthetic_image` and `synthetic_video` `--data` types, which compose with all of the text token controls listed here. + #### Example Commands ```bash diff --git a/docs/guides/multimodal/index.md b/docs/guides/multimodal/index.md index ec9966705..807d4a793 100644 --- a/docs/guides/multimodal/index.md +++ b/docs/guides/multimodal/index.md @@ -49,4 +49,12 @@ Ensure you have a running inference server and model compatible with the OpenAI [:octicons-arrow-right-24: Audio Guide](audio.md) +- :material-image-multiple-outline:{ .lg .middle } Synthetic Vision + + ______________________________________________________________________ + + Generate images and short videos on the fly to benchmark VLM serving configurations without bringing your own dataset. Covers the `synthetic_image` and `synthetic_video` `--data` types. 
+ + [:octicons-arrow-right-24: Synthetic Vision Guide](synthetic_vision.md) + diff --git a/docs/guides/multimodal/synthetic_vision.md b/docs/guides/multimodal/synthetic_vision.md new file mode 100644 index 000000000..a78c0ca4e --- /dev/null +++ b/docs/guides/multimodal/synthetic_vision.md @@ -0,0 +1,93 @@ +--- +weight: 40 +--- + +# Synthetic Visual Data + +GuideLLM can synthesize images and short videos on the fly so you can benchmark Vision-Language Model (VLM) serving configurations without bringing your own dataset. Two `--data` types — `synthetic_image` and `synthetic_video` — compose with the existing synthetic text token controls (`text_tokens`, `output_tokens`, and their `stdev`/`min`/`max` companions) so a single command produces a fully-shaped multimodal request. + +Synthetic visual data is useful when you want to control payload shape precisely (image dimensions, frame count, frames-per-second) or stress-test serving paths that the preprocessor cache would otherwise hide. Defaults are tuned so every generated payload is byte-different from the next, which defeats vLLM's multimodal preprocessor cache while still compressing like real media on the wire. + +## Prerequisites + +Install GuideLLM with the `vision` extra to enable image and video synthesis: + +```bash +pip install guidellm[vision] +``` + +## Synthetic image + +Use `--data "type=synthetic_image"` to generate a single image per request alongside any text prompt. 
+ +### Example Commands + +A single 720p image alongside 200 text tokens and 64 output tokens: + +```bash +guidellm benchmark run \ + --target http://localhost:8000 \ + --data "type=synthetic_image,resolution=720p,text_tokens=200,output_tokens=64" +``` + +A 1280×720 JPEG with two images per request: + +```bash +guidellm benchmark run \ + --target http://localhost:8000 \ + --data "type=synthetic_image,width=1280,height=720,format=jpeg,images_per_request=2,text_tokens=200,output_tokens=64" +``` + +### Configuration Options + +- `width`: Width of the generated image in pixels. +- `height`: Height of the generated image in pixels. +- `resolution`: Shorthand that sets `height` to a named value (`480p`, `720p`, `1080p`, …); pairs with `aspect_ratio` to derive `width`. +- `aspect_ratio`: Shorthand such as `16:9` or `4:3` that derives the missing dimension when only one of `width`/`height`/`resolution` is given. +- `format`: Encoded image format, `jpeg` (default) or `png`. +- `jpeg_quality`: JPEG quality factor (1–100) when `format=jpeg`. Defaults to 85. +- `content`: Per-row image content. `gradient` (default) emits a per-row seeded gradient that compresses like real photography; `noise` emits uniform random pixels for worst-case wire size; `solid` and `checkerboard` are useful for preprocessor-cache sensitivity sweeps. +- `images_per_request`: Number of images to attach to each request. Defaults to 1. +- `text_tokens`: Average number of tokens in the accompanying text prompt. Accepts the same `stdev` / `min` / `max` suffixes as the synthetic text mode. `prompt_tokens` is accepted as an alias. +- `output_tokens`: Average number of tokens the model should generate. Same `stdev` / `min` / `max` suffixes apply. +- `seed`: Random seed for reproducible generation across runs. + +## Synthetic video + +Use `--data "type=synthetic_video"` to generate a short clip per request alongside any text prompt. Output is `mp4` (h264, yuv420p). 
+ +### Example Commands + +A six-frame 480p clip at 1 fps with modest prompt and output budgets: + +```bash +guidellm benchmark run \ + --target http://localhost:8000 \ + --data "type=synthetic_video,width=854,height=480,frames=6,fps=1,text_tokens=64,output_tokens=128" +``` + +A twelve-frame 720p clip at 3 fps with a wire-size pin: + +```bash +guidellm benchmark run \ + --target http://localhost:8000 \ + --data "type=synthetic_video,width=1280,height=720,frames=12,fps=3,video_bitrate=2M,text_tokens=64,output_tokens=128" +``` + +### Configuration Options + +- `width`: Width of the generated video in pixels. +- `height`: Height of the generated video in pixels. The same `resolution` / `aspect_ratio` shorthands as for synthetic image apply. +- `frames`: Number of frames in the clip. +- `fps`: Frames per second. Combined with `frames`, this also determines the clip duration. +- `video_bitrate`: Optional h264 target bitrate (e.g. `1M`, `500k`) — useful when wire size needs to be pinned across runs. +- `content`: Per-row clip content. `gradient` (default) emits a seeded gradient with a coordinate warp so each clip compresses similarly to real video; `noise` emits uniform random pixels for worst-case wire size. +- `text_tokens`: Average number of tokens in the accompanying text prompt; same `stdev` / `min` / `max` suffixes as synthetic image. `prompt_tokens` is accepted as an alias. +- `output_tokens`: Average number of tokens the model should generate; same `stdev` / `min` / `max` suffixes apply. +- `seed`: Random seed for reproducible generation across runs. + +## Notes + +- A processor/tokenizer is required for the text portion of the request. By default the model passed in or retrieved from the server is used; otherwise specify one with `--processor`. +- Per-row seeded gradients produce byte-different payloads on every request, which bypasses vLLM's multimodal preprocessor cache. 
If you want to deliberately hit the cache, set `content=solid` or pin a fixed `seed` and `samples`. +- Reproducibility of exact mp4 bytes depends on the installed `ffmpeg` and `PIL` versions. Pin via the lockfile if you compare runs across machines. From ae7b2c9c8938a3087561bb741e354b0bef1be2b6 Mon Sep 17 00:00:00 2001 From: Zakaria el hjouji Date: Wed, 20 May 2026 14:08:42 -0400 Subject: [PATCH 10/14] review: address second-pass docs and code comments - Replace bare assert in both synthetic deserializers with an explicit isinstance check + RuntimeError, matching guidellm's style for unexpected-type guards. - docs/guides/multimodal/index.md: expand "VLM" to "Vision-Language Model (VLM)" on the Synthetic Vision card to avoid the VLM/vLLM/LLM visual collision. - docs/guides/multimodal/synthetic_vision.md: - drop the "wire-size pin" phrasing from the bitrate example - "pin"/"pinned" -> "specify"/"fixed" in the video_bitrate bullet - rewrite the ffmpeg/PIL note to just warn about byte-level variability across versions, instead of recommending users modify the uv.lock file - pyproject.toml unchanged; uv.lock regenerated via `uv sync --extra vision` so it tracks the vision-extra dependency closure. Signed-off-by: Zakaria el hjouji --- docs/guides/multimodal/index.md | 2 +- docs/guides/multimodal/synthetic_vision.md | 6 +- .../data/deserializers/synthetic_image.py | 5 +- .../data/deserializers/synthetic_video.py | 5 +- uv.lock | 66 +++++++++++++++++++ 5 files changed, 78 insertions(+), 6 deletions(-) diff --git a/docs/guides/multimodal/index.md b/docs/guides/multimodal/index.md index 807d4a793..5648e1bf1 100644 --- a/docs/guides/multimodal/index.md +++ b/docs/guides/multimodal/index.md @@ -53,7 +53,7 @@ Ensure you have a running inference server and model compatible with the OpenAI ______________________________________________________________________ - Generate images and short videos on the fly to benchmark VLM serving configurations without bringing your own dataset. 
Covers the `synthetic_image` and `synthetic_video` `--data` types. + Generate images and short videos on the fly to benchmark Vision-Language Model (VLM) serving configurations without bringing your own dataset. Covers the `synthetic_image` and `synthetic_video` `--data` types. [:octicons-arrow-right-24: Synthetic Vision Guide](synthetic_vision.md) diff --git a/docs/guides/multimodal/synthetic_vision.md b/docs/guides/multimodal/synthetic_vision.md index a78c0ca4e..828987af8 100644 --- a/docs/guides/multimodal/synthetic_vision.md +++ b/docs/guides/multimodal/synthetic_vision.md @@ -66,7 +66,7 @@ guidellm benchmark run \ --data "type=synthetic_video,width=854,height=480,frames=6,fps=1,text_tokens=64,output_tokens=128" ``` -A twelve-frame 720p clip at 3 fps with a wire-size pin: +A twelve-frame 720p clip at 3 fps with an explicit h264 target bitrate: ```bash guidellm benchmark run \ @@ -80,7 +80,7 @@ guidellm benchmark run \ - `height`: Height of the generated video in pixels. The same `resolution` / `aspect_ratio` shorthands as for synthetic image apply. - `frames`: Number of frames in the clip. - `fps`: Frames per second. Combined with `frames`, this also determines the clip duration. -- `video_bitrate`: Optional h264 target bitrate (e.g. `1M`, `500k`) — useful when wire size needs to be pinned across runs. +- `video_bitrate`: Optional h264 target bitrate (e.g. `1M`, `500k`) — useful when you want to specify a fixed wire size across runs. - `content`: Per-row clip content. `gradient` (default) emits a seeded gradient with a coordinate warp so each clip compresses similarly to real video; `noise` emits uniform random pixels for worst-case wire size. - `text_tokens`: Average number of tokens in the accompanying text prompt; same `stdev` / `min` / `max` suffixes as synthetic image. `prompt_tokens` is accepted as an alias. - `output_tokens`: Average number of tokens the model should generate; same `stdev` / `min` / `max` suffixes apply. 
@@ -90,4 +90,4 @@ guidellm benchmark run \ - A processor/tokenizer is required for the text portion of the request. By default the model passed in or retrieved from the server is used; otherwise specify one with `--processor`. - Per-row seeded gradients produce byte-different payloads on every request, which bypasses vLLM's multimodal preprocessor cache. If you want to deliberately hit the cache, set `content=solid` or pin a fixed `seed` and `samples`. -- Reproducibility of exact mp4 bytes depends on the installed `ffmpeg` and `PIL` versions. Pin via the lockfile if you compare runs across machines. +- The exact mp4 bytes produced for a given seed depend on the installed `ffmpeg` and `PIL` versions. Output token counts and request shape stay stable across versions, but if you are comparing byte-level outputs or wire-size measurements across machines, expect small variation. diff --git a/src/guidellm/data/deserializers/synthetic_image.py b/src/guidellm/data/deserializers/synthetic_image.py index 4e3d14c07..809190f3c 100644 --- a/src/guidellm/data/deserializers/synthetic_image.py +++ b/src/guidellm/data/deserializers/synthetic_image.py @@ -238,7 +238,10 @@ def _build_prompt( decoded = processor.decode(token_ids[:token_count], skip_special_tokens=True) if isinstance(decoded, str): return decoded - raise RuntimeError("Processor returned unexpected prompt decode type.") + raise RuntimeError( + "Processor.decode returned a non-string value while generating " + "synthetic image prompt text." 
+ ) def __iter__(self) -> Iterator[tuple[int, dict[str, Any]]]: iter_seed = self.random_seed + self.iteration_count diff --git a/src/guidellm/data/deserializers/synthetic_video.py b/src/guidellm/data/deserializers/synthetic_video.py index 3a6ae5971..7b83e21e5 100644 --- a/src/guidellm/data/deserializers/synthetic_video.py +++ b/src/guidellm/data/deserializers/synthetic_video.py @@ -158,7 +158,10 @@ def _build_prompt( decoded = processor.decode(token_ids[:token_count], skip_special_tokens=True) if isinstance(decoded, str): return decoded - raise RuntimeError("Processor returned unexpected prompt decode type.") + raise RuntimeError( + "Processor.decode returned a non-string value while generating " + "synthetic video prompt text." + ) def __iter__(self) -> Iterator[tuple[int, dict[str, Any]]]: iter_seed = self.random_seed + self.iteration_count diff --git a/uv.lock b/uv.lock index 9dffc2852..3242fa77e 100644 --- a/uv.lock +++ b/uv.lock @@ -851,6 +851,7 @@ dependencies = [ all = [ { name = "blobfile" }, { name = "datasets", extra = ["audio", "vision"] }, + { name = "imageio", extra = ["ffmpeg"] }, { name = "mistral-common" }, { name = "msgpack" }, { name = "msgspec" }, @@ -874,6 +875,7 @@ dev = [ { name = "blobfile" }, { name = "build" }, { name = "datasets", extra = ["audio", "vision"] }, + { name = "imageio", extra = ["ffmpeg"] }, { name = "lorem" }, { name = "mdformat" }, { name = "mdformat-footnote" }, @@ -935,6 +937,7 @@ tokenizers = [ ] vision = [ { name = "datasets", extra = ["vision"] }, + { name = "imageio", extra = ["ffmpeg"] }, { name = "pillow" }, ] @@ -959,6 +962,7 @@ requires-dist = [ { name = "guidellm", extras = ["audio", "perf", "tokenizers", "vision"], marker = "extra == 'all'" }, { name = "guidellm", extras = ["perf", "tokenizers"], marker = "extra == 'recommended'" }, { name = "httpx", extras = ["http2"], specifier = "<1.0.0" }, + { name = "imageio", extras = ["ffmpeg"], marker = "extra == 'vision'" }, { name = "loguru" }, { name = "lorem", marker 
= "extra == 'dev'", specifier = "~=0.1.1" }, { name = "mdformat", marker = "extra == 'dev'", specifier = "~=1.0.0" }, @@ -1211,6 +1215,40 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" }, ] +[[package]] +name = "imageio" +version = "2.37.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "pillow" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b1/84/93bcd1300216ea50811cee96873b84a1bebf8d0489ffaf7f2a3756bab866/imageio-2.37.3.tar.gz", hash = "sha256:bbb37efbfc4c400fcd534b367b91fcd66d5da639aaa138034431a1c5e0a41451", size = 389673, upload-time = "2026-03-09T11:31:12.573Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/49/fa/391e437a34e55095173dca5f24070d89cbc233ff85bf1c29c93248c6588d/imageio-2.37.3-py3-none-any.whl", hash = "sha256:46f5bb8522cd421c0f5ae104d8268f569d856b29eb1a13b92829d1970f32c9f0", size = 317646, upload-time = "2026-03-09T11:31:10.771Z" }, +] + +[package.optional-dependencies] +ffmpeg = [ + { name = "imageio-ffmpeg" }, + { name = "psutil" }, +] + +[[package]] +name = "imageio-ffmpeg" +version = "0.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/44/bd/c3343c721f2a1b0c9fc71c1aebf1966a3b7f08c2eea8ed5437a2865611d6/imageio_ffmpeg-0.6.0.tar.gz", hash = "sha256:e2556bed8e005564a9f925bb7afa4002d82770d6b08825078b7697ab88ba1755", size = 25210, upload-time = "2025-01-16T21:34:32.747Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/da/58/87ef68ac83f4c7690961bce288fd8e382bc5f1513860fc7f90a9c1c1c6bf/imageio_ffmpeg-0.6.0-py3-none-macosx_10_9_intel.macosx_10_9_x86_64.whl", hash = "sha256:9d2baaf867088508d4a3458e61eeb30e945c4ad8016025545f66c4b5aaef0a61", size = 24932969, upload-time = "2025-01-16T21:34:20.464Z" }, + { url = "https://files.pythonhosted.org/packages/40/5c/f3d8a657d362cc93b81aab8feda487317da5b5d31c0e1fdfd5e986e55d17/imageio_ffmpeg-0.6.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:b1ae3173414b5fc5f538a726c4e48ea97edc0d2cdc11f103afee655c463fa742", size = 21113891, upload-time = "2025-01-16T21:34:00.277Z" }, + { url = "https://files.pythonhosted.org/packages/33/e7/1925bfbc563c39c1d2e82501d8372734a5c725e53ac3b31b4c2d081e895b/imageio_ffmpeg-0.6.0-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1d47bebd83d2c5fc770720d211855f208af8a596c82d17730aa51e815cdee6dc", size = 25632706, upload-time = "2025-01-16T21:33:53.475Z" }, + { url = "https://files.pythonhosted.org/packages/a0/2d/43c8522a2038e9d0e7dbdf3a61195ecc31ca576fb1527a528c877e87d973/imageio_ffmpeg-0.6.0-py3-none-manylinux2014_x86_64.whl", hash = "sha256:c7e46fcec401dd990405049d2e2f475e2b397779df2519b544b8aab515195282", size = 29498237, upload-time = "2025-01-16T21:34:13.726Z" }, + { url = "https://files.pythonhosted.org/packages/a0/13/59da54728351883c3c1d9fca1710ab8eee82c7beba585df8f25ca925f08f/imageio_ffmpeg-0.6.0-py3-none-win32.whl", hash = "sha256:196faa79366b4a82f95c0f4053191d2013f4714a715780f0ad2a68ff37483cc2", size = 19652251, upload-time = "2025-01-16T21:34:06.812Z" }, + { url = "https://files.pythonhosted.org/packages/2c/c6/fa760e12a2483469e2bf5058c5faff664acf66cadb4df2ad6205b016a73d/imageio_ffmpeg-0.6.0-py3-none-win_amd64.whl", hash = "sha256:02fa47c83703c37df6bfe4896aab339013f62bf02c5ebf2dce6da56af04ffc0a", size = 31246824, upload-time = "2025-01-16T21:34:28.6Z" }, +] + [[package]] name = "imagesize" version = "1.4.1" @@ -2597,6 +2635,34 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/57/bf/2086963c69bdac3d7cff1cc7ff79b8ce5ea0bec6797a017e1be338a46248/protobuf-6.33.5-py3-none-any.whl", hash = "sha256:69915a973dd0f60f31a08b8318b73eab2bd6a392c79184b3612226b0a3f8ec02", size = 170687, upload-time = "2026-01-29T21:51:32.557Z" }, ] +[[package]] +name = "psutil" +version = "7.2.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/aa/c6/d1ddf4abb55e93cebc4f2ed8b5d6dbad109ecb8d63748dd2b20ab5e57ebe/psutil-7.2.2.tar.gz", hash = "sha256:0746f5f8d406af344fd547f1c8daa5f5c33dbc293bb8d6a16d80b4bb88f59372", size = 493740, upload-time = "2026-01-28T18:14:54.428Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/51/08/510cbdb69c25a96f4ae523f733cdc963ae654904e8db864c07585ef99875/psutil-7.2.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:2edccc433cbfa046b980b0df0171cd25bcaeb3a68fe9022db0979e7aa74a826b", size = 130595, upload-time = "2026-01-28T18:14:57.293Z" }, + { url = "https://files.pythonhosted.org/packages/d6/f5/97baea3fe7a5a9af7436301f85490905379b1c6f2dd51fe3ecf24b4c5fbf/psutil-7.2.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e78c8603dcd9a04c7364f1a3e670cea95d51ee865e4efb3556a3a63adef958ea", size = 131082, upload-time = "2026-01-28T18:14:59.732Z" }, + { url = "https://files.pythonhosted.org/packages/37/d6/246513fbf9fa174af531f28412297dd05241d97a75911ac8febefa1a53c6/psutil-7.2.2-cp313-cp313t-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1a571f2330c966c62aeda00dd24620425d4b0cc86881c89861fbc04549e5dc63", size = 181476, upload-time = "2026-01-28T18:15:01.884Z" }, + { url = "https://files.pythonhosted.org/packages/b8/b5/9182c9af3836cca61696dabe4fd1304e17bc56cb62f17439e1154f225dd3/psutil-7.2.2-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:917e891983ca3c1887b4ef36447b1e0873e70c933afc831c6b6da078ba474312", size = 184062, upload-time = 
"2026-01-28T18:15:04.436Z" }, + { url = "https://files.pythonhosted.org/packages/16/ba/0756dca669f5a9300d0cbcbfae9a4c30e446dfc7440ffe43ded5724bfd93/psutil-7.2.2-cp313-cp313t-win_amd64.whl", hash = "sha256:ab486563df44c17f5173621c7b198955bd6b613fb87c71c161f827d3fb149a9b", size = 139893, upload-time = "2026-01-28T18:15:06.378Z" }, + { url = "https://files.pythonhosted.org/packages/1c/61/8fa0e26f33623b49949346de05ec1ddaad02ed8ba64af45f40a147dbfa97/psutil-7.2.2-cp313-cp313t-win_arm64.whl", hash = "sha256:ae0aefdd8796a7737eccea863f80f81e468a1e4cf14d926bd9b6f5f2d5f90ca9", size = 135589, upload-time = "2026-01-28T18:15:08.03Z" }, + { url = "https://files.pythonhosted.org/packages/81/69/ef179ab5ca24f32acc1dac0c247fd6a13b501fd5534dbae0e05a1c48b66d/psutil-7.2.2-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:eed63d3b4d62449571547b60578c5b2c4bcccc5387148db46e0c2313dad0ee00", size = 130664, upload-time = "2026-01-28T18:15:09.469Z" }, + { url = "https://files.pythonhosted.org/packages/7b/64/665248b557a236d3fa9efc378d60d95ef56dd0a490c2cd37dafc7660d4a9/psutil-7.2.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7b6d09433a10592ce39b13d7be5a54fbac1d1228ed29abc880fb23df7cb694c9", size = 131087, upload-time = "2026-01-28T18:15:11.724Z" }, + { url = "https://files.pythonhosted.org/packages/d5/2e/e6782744700d6759ebce3043dcfa661fb61e2fb752b91cdeae9af12c2178/psutil-7.2.2-cp314-cp314t-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1fa4ecf83bcdf6e6c8f4449aff98eefb5d0604bf88cb883d7da3d8d2d909546a", size = 182383, upload-time = "2026-01-28T18:15:13.445Z" }, + { url = "https://files.pythonhosted.org/packages/57/49/0a41cefd10cb7505cdc04dab3eacf24c0c2cb158a998b8c7b1d27ee2c1f5/psutil-7.2.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e452c464a02e7dc7822a05d25db4cde564444a67e58539a00f929c51eddda0cf", size = 185210, upload-time = "2026-01-28T18:15:16.002Z" }, + { url = 
"https://files.pythonhosted.org/packages/dd/2c/ff9bfb544f283ba5f83ba725a3c5fec6d6b10b8f27ac1dc641c473dc390d/psutil-7.2.2-cp314-cp314t-win_amd64.whl", hash = "sha256:c7663d4e37f13e884d13994247449e9f8f574bc4655d509c3b95e9ec9e2b9dc1", size = 141228, upload-time = "2026-01-28T18:15:18.385Z" }, + { url = "https://files.pythonhosted.org/packages/f2/fc/f8d9c31db14fcec13748d373e668bc3bed94d9077dbc17fb0eebc073233c/psutil-7.2.2-cp314-cp314t-win_arm64.whl", hash = "sha256:11fe5a4f613759764e79c65cf11ebdf26e33d6dd34336f8a337aa2996d71c841", size = 136284, upload-time = "2026-01-28T18:15:19.912Z" }, + { url = "https://files.pythonhosted.org/packages/e7/36/5ee6e05c9bd427237b11b3937ad82bb8ad2752d72c6969314590dd0c2f6e/psutil-7.2.2-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:ed0cace939114f62738d808fdcecd4c869222507e266e574799e9c0faa17d486", size = 129090, upload-time = "2026-01-28T18:15:22.168Z" }, + { url = "https://files.pythonhosted.org/packages/80/c4/f5af4c1ca8c1eeb2e92ccca14ce8effdeec651d5ab6053c589b074eda6e1/psutil-7.2.2-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:1a7b04c10f32cc88ab39cbf606e117fd74721c831c98a27dc04578deb0c16979", size = 129859, upload-time = "2026-01-28T18:15:23.795Z" }, + { url = "https://files.pythonhosted.org/packages/b5/70/5d8df3b09e25bce090399cf48e452d25c935ab72dad19406c77f4e828045/psutil-7.2.2-cp36-abi3-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:076a2d2f923fd4821644f5ba89f059523da90dc9014e85f8e45a5774ca5bc6f9", size = 155560, upload-time = "2026-01-28T18:15:25.976Z" }, + { url = "https://files.pythonhosted.org/packages/63/65/37648c0c158dc222aba51c089eb3bdfa238e621674dc42d48706e639204f/psutil-7.2.2-cp36-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b0726cecd84f9474419d67252add4ac0cd9811b04d61123054b9fb6f57df6e9e", size = 156997, upload-time = "2026-01-28T18:15:27.794Z" }, + { url = 
"https://files.pythonhosted.org/packages/8e/13/125093eadae863ce03c6ffdbae9929430d116a246ef69866dad94da3bfbc/psutil-7.2.2-cp36-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:fd04ef36b4a6d599bbdb225dd1d3f51e00105f6d48a28f006da7f9822f2606d8", size = 148972, upload-time = "2026-01-28T18:15:29.342Z" }, + { url = "https://files.pythonhosted.org/packages/04/78/0acd37ca84ce3ddffaa92ef0f571e073faa6d8ff1f0559ab1272188ea2be/psutil-7.2.2-cp36-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b58fabe35e80b264a4e3bb23e6b96f9e45a3df7fb7eed419ac0e5947c61e47cc", size = 148266, upload-time = "2026-01-28T18:15:31.597Z" }, + { url = "https://files.pythonhosted.org/packages/b4/90/e2159492b5426be0c1fef7acba807a03511f97c5f86b3caeda6ad92351a7/psutil-7.2.2-cp37-abi3-win_amd64.whl", hash = "sha256:eb7e81434c8d223ec4a219b5fc1c47d0417b12be7ea866e24fb5ad6e84b3d988", size = 137737, upload-time = "2026-01-28T18:15:33.849Z" }, + { url = "https://files.pythonhosted.org/packages/8c/c7/7bb2e321574b10df20cbde462a94e2b71d05f9bbda251ef27d104668306a/psutil-7.2.2-cp37-abi3-win_arm64.whl", hash = "sha256:8c233660f575a5a89e6d4cb65d9f938126312bca76d8fe087b947b3a1aaac9ee", size = 134617, upload-time = "2026-01-28T18:15:36.514Z" }, +] + [[package]] name = "pyarrow" version = "23.0.1" From 061d319e3e69a78fba593e2ea09d9b82e5831753 Mon Sep 17 00:00:00 2001 From: Ali Tayeb Date: Wed, 24 Jun 2026 21:30:55 -0700 Subject: [PATCH 11/14] data: align synthetic vision with kind routing Signed-off-by: Ali Tayeb --- .../data/deserializers/synthetic_image.py | 32 +++++++++++-------- .../data/deserializers/synthetic_video.py | 31 ++++++++++-------- src/guidellm/extras/vision.pyi | 4 ++- src/guidellm/utils/vision.py | 2 +- 4 files changed, 39 insertions(+), 30 deletions(-) diff --git a/src/guidellm/data/deserializers/synthetic_image.py b/src/guidellm/data/deserializers/synthetic_image.py index 809190f3c..0c19cb849 100644 --- a/src/guidellm/data/deserializers/synthetic_image.py +++ 
b/src/guidellm/data/deserializers/synthetic_image.py @@ -28,7 +28,7 @@ _DESERIALIZER_TYPE = "synthetic_image" -_RESOLUTION_PRESETS: dict[str, tuple[int, int]] = { +RESOLUTION_PRESETS: dict[str, tuple[int, int]] = { "240p": (426, 240), "360p": (640, 360), "480p": (854, 480), @@ -41,7 +41,7 @@ } -def _parse_aspect_ratio(aspect: str) -> float: +def parse_aspect_ratio(aspect: str) -> float: """Parse 'W:H' or 'W/H' into a float ratio.""" sep = ":" if ":" in aspect else "/" try: @@ -53,7 +53,7 @@ def _parse_aspect_ratio(aspect: str) -> float: ) from exc -class _SyntheticVisionTextMixin(DataArgs): +class SyntheticVisionTextMixin(DataArgs): text_tokens: int = Field( description="The average number of text tokens generated for the text portion.", gt=0, @@ -116,11 +116,11 @@ def _alias_prompt_tokens(cls, data: object) -> object: @DataArgs.register(_DESERIALIZER_TYPE) -class SyntheticImageDataArgs(_SyntheticVisionTextMixin): +class SyntheticImageDataArgs(SyntheticVisionTextMixin): """Model for synthetic image dataset deserializer arguments.""" kind: Literal["synthetic_image"] = Field( # type: ignore[assignment] - default=_DESERIALIZER_TYPE, + default="synthetic_image", description="Type identifier for the synthetic image dataset configuration.", ) width: int | None = Field( @@ -166,26 +166,26 @@ def _resolve_dimensions(self) -> SyntheticImageDataArgs: w = self.width h = self.height if self.resolution is not None: - preset = _RESOLUTION_PRESETS.get(self.resolution.lower()) + preset = RESOLUTION_PRESETS.get(self.resolution.lower()) if preset is None: raise ValueError( f"Unknown resolution '{self.resolution}'. 
Known: " - f"{sorted(_RESOLUTION_PRESETS)}" + f"{sorted(RESOLUTION_PRESETS)}" ) preset_w, preset_h = preset if h is None: h = preset_h if w is None: w = ( - int(round(h * _parse_aspect_ratio(self.aspect_ratio))) + int(round(h * parse_aspect_ratio(self.aspect_ratio))) if self.aspect_ratio is not None else preset_w ) elif self.aspect_ratio is not None: if h is not None and w is None: - w = int(round(h * _parse_aspect_ratio(self.aspect_ratio))) + w = int(round(h * parse_aspect_ratio(self.aspect_ratio))) elif w is not None and h is None: - h = int(round(w / _parse_aspect_ratio(self.aspect_ratio))) + h = int(round(w / parse_aspect_ratio(self.aspect_ratio))) if w is None or h is None: raise ValueError( @@ -196,8 +196,7 @@ def _resolve_dimensions(self) -> SyntheticImageDataArgs: self.height = int(h) - (int(h) % 2) if self.width <= 0 or self.height <= 0: raise ValueError( - f"Resolved image dims must be positive, got " - f"{self.width}x{self.height}" + f"Resolved image dims must be positive, got {self.width}x{self.height}" ) return self @@ -296,10 +295,15 @@ def __iter__(self) -> Iterator[tuple[int, dict[str, Any]]]: if output_token_count is not None: row["output_tokens_count_0"] = output_token_count + width = self.config.width + height = self.config.height + if width is None or height is None: + raise RuntimeError("Synthetic image dimensions were not resolved.") + for img_idx in range(self.config.images_per_request): encoded = synthesize_image( - width=int(self.config.width), - height=int(self.config.height), + width=width, + height=height, content=self.config.content, image_format=self.config.format, jpeg_quality=self.config.jpeg_quality, diff --git a/src/guidellm/data/deserializers/synthetic_video.py b/src/guidellm/data/deserializers/synthetic_video.py index 7b83e21e5..fb6020fd3 100644 --- a/src/guidellm/data/deserializers/synthetic_video.py +++ b/src/guidellm/data/deserializers/synthetic_video.py @@ -17,9 +17,9 @@ DatasetDeserializerFactory, ) from 
guidellm.data.deserializers.synthetic_image import ( - _RESOLUTION_PRESETS, - _SyntheticVisionTextMixin, - _parse_aspect_ratio, + RESOLUTION_PRESETS, + SyntheticVisionTextMixin, + parse_aspect_ratio, ) from guidellm.data.schemas import DataArgs from guidellm.utils.random import IntegerRangeSampler @@ -36,11 +36,11 @@ @DataArgs.register(_DESERIALIZER_TYPE) -class SyntheticVideoDataArgs(_SyntheticVisionTextMixin): +class SyntheticVideoDataArgs(SyntheticVisionTextMixin): """Model for synthetic video dataset deserializer arguments.""" kind: Literal["synthetic_video"] = Field( # type: ignore[assignment] - default=_DESERIALIZER_TYPE, + default="synthetic_video", description="Type identifier for the synthetic video dataset configuration.", ) width: int | None = Field( @@ -88,26 +88,26 @@ def _resolve_dimensions(self) -> SyntheticVideoDataArgs: w = self.width h = self.height if self.resolution is not None: - preset = _RESOLUTION_PRESETS.get(self.resolution.lower()) + preset = RESOLUTION_PRESETS.get(self.resolution.lower()) if preset is None: raise ValueError( f"Unknown resolution '{self.resolution}'. 
Known: " - f"{sorted(_RESOLUTION_PRESETS)}" + f"{sorted(RESOLUTION_PRESETS)}" ) preset_w, preset_h = preset if h is None: h = preset_h if w is None: w = ( - int(round(h * _parse_aspect_ratio(self.aspect_ratio))) + int(round(h * parse_aspect_ratio(self.aspect_ratio))) if self.aspect_ratio is not None else preset_w ) elif self.aspect_ratio is not None: if h is not None and w is None: - w = int(round(h * _parse_aspect_ratio(self.aspect_ratio))) + w = int(round(h * parse_aspect_ratio(self.aspect_ratio))) elif w is not None and h is None: - h = int(round(w / _parse_aspect_ratio(self.aspect_ratio))) + h = int(round(w / parse_aspect_ratio(self.aspect_ratio))) if w is None or h is None: raise ValueError( @@ -118,8 +118,7 @@ def _resolve_dimensions(self) -> SyntheticVideoDataArgs: self.height = int(h) - (int(h) % 2) if self.width <= 0 or self.height <= 0: raise ValueError( - f"Resolved video dims must be positive, got " - f"{self.width}x{self.height}" + f"Resolved video dims must be positive, got {self.width}x{self.height}" ) return self @@ -207,14 +206,18 @@ def __iter__(self) -> Iterator[tuple[int, dict[str, Any]]]: faker, f"{self.iteration_count} {row_index} ", ) + width = self.config.width + height = self.config.height + if width is None or height is None: + raise RuntimeError("Synthetic video dimensions were not resolved.") row: dict[str, Any] = { "prefix": "", "prompt_0": prompt, "prompt_tokens_count_0": text_token_count, "video": synthesize_video( - width=int(self.config.width), - height=int(self.config.height), + width=width, + height=height, frames=int(self.config.frames), fps=float(self.config.fps), content=self.config.content, diff --git a/src/guidellm/extras/vision.pyi b/src/guidellm/extras/vision.pyi index edfee67a6..48a5afccf 100644 --- a/src/guidellm/extras/vision.pyi +++ b/src/guidellm/extras/vision.pyi @@ -1,5 +1,7 @@ +from typing import Any + from PIL import Image as _PILImage from PIL.Image import Image as Image -import imageio.v3 as iio PILImage = 
_PILImage +iio: Any diff --git a/src/guidellm/utils/vision.py b/src/guidellm/utils/vision.py index 815d421ba..acadbc7eb 100644 --- a/src/guidellm/utils/vision.py +++ b/src/guidellm/utils/vision.py @@ -28,7 +28,7 @@ def is_url(text: Any) -> bool: return isinstance(text, str) and text.startswith(("http://", "https://")) -def encode_image( +def encode_image( # noqa: C901 image: bytes | str | Path | np.ndarray | dict[str, Any] | libs.Image, width: int | None = None, height: int | None = None, From 590684327df6a80bc53aeb93a93be59f6f97af3d Mon Sep 17 00:00:00 2001 From: Ali Tayeb Date: Wed, 24 Jun 2026 21:31:42 -0700 Subject: [PATCH 12/14] tests: update synthetic vision kind coverage Signed-off-by: Ali Tayeb --- .../test_synthetic_multimodal_benchmark.py | 34 ++--- .../test_synthetic_multimodal.py | 144 +++++++++--------- 2 files changed, 90 insertions(+), 88 deletions(-) diff --git a/tests/integration/data/test_synthetic_multimodal_benchmark.py b/tests/integration/data/test_synthetic_multimodal_benchmark.py index a55eb3e95..e664b3f30 100644 --- a/tests/integration/data/test_synthetic_multimodal_benchmark.py +++ b/tests/integration/data/test_synthetic_multimodal_benchmark.py @@ -98,28 +98,22 @@ def _run_benchmark( sys.executable, "-m", "guidellm", - "benchmark", "run", - "--target", - base_url, + "--backend", + f"kind=openai_http,target={base_url}", "--data", data, - "--data-samples", - "8", + "--data-loader", + "kind=pytorch,samples=8", "--profile", - "constant", - "--rate", - "2", - "--max-seconds", - str(max_seconds), - "--processor", - "Xenova/gpt-4", - "--backend", - "openai_http", - "--outputs", - str(output_path), - "--disable-progress", - "--disable-console-outputs", + "kind=constant,rate=2", + "--constraint", + f"kind=max_duration,seconds={max_seconds}", + "--tokenizer", + "kind=huggingface_auto,model=Xenova/gpt-4", + "--output", + f"kind=json,path={output_path}", + "--disable-console", ] return subprocess.run( # noqa: S603 cmd, capture_output=True, text=True, 
timeout=180, check=False @@ -135,7 +129,7 @@ def test_synthetic_image_benchmark_against_mock(mock_backend, tmp_path): result = _run_benchmark( base_url=mock_backend, data=( - "type=synthetic_image,width=128,height=128,format=jpeg," + "kind=synthetic_image,width=128,height=128,format=jpeg," "jpeg_quality=85,text_tokens=20,output_tokens=8,seed=11" ), output_dir=tmp_path, @@ -160,7 +154,7 @@ def test_synthetic_video_benchmark_against_mock(mock_backend, tmp_path): result = _run_benchmark( base_url=mock_backend, data=( - "type=synthetic_video,width=160,height=120,frames=4,fps=1," + "kind=synthetic_video,width=160,height=120,frames=4,fps=1," "text_tokens=10,output_tokens=4,seed=23" ), output_dir=tmp_path, diff --git a/tests/unit/data/deserializers/test_synthetic_multimodal.py b/tests/unit/data/deserializers/test_synthetic_multimodal.py index e04e8893c..ceaa83097 100644 --- a/tests/unit/data/deserializers/test_synthetic_multimodal.py +++ b/tests/unit/data/deserializers/test_synthetic_multimodal.py @@ -7,6 +7,7 @@ import io import tempfile from pathlib import Path +from typing import Any from unittest.mock import Mock import imageio @@ -15,17 +16,15 @@ from guidellm.data.deserializers import ( DatasetDeserializerFactory, + SyntheticImageDataArgs, SyntheticImageDataset, SyntheticImageDatasetDeserializer, + SyntheticVideoDataArgs, SyntheticVideoDataset, SyntheticVideoDatasetDeserializer, ) -from guidellm.data.deserializers.deserializer import DataNotSupportedError -from guidellm.data.schemas import ( - SyntheticImageDatasetConfig, - SyntheticVideoDatasetConfig, -) -from guidellm.extras.vision import synthesize_image, synthesize_video +from guidellm.data.schemas import DataArgs +from guidellm.utils.vision import synthesize_image, synthesize_video def _mock_tokenizer() -> Mock: @@ -141,7 +140,7 @@ def test_decoded_frame_count_and_seconds_match(self, frames: int, fps: float): f.write(decoded) path = f.name try: - reader = imageio.get_reader(path, "ffmpeg") + reader: Any = 
imageio.get_reader(path, "ffmpeg") # type: ignore[arg-type] decoded_frames = [frame for frame in reader] # noqa: C416 assert len(decoded_frames) == frames assert decoded_frames[0].shape == (240, 320, 3) @@ -209,14 +208,14 @@ class TestSyntheticImageConfig: @pytest.mark.smoke def test_resolution_resolves_to_width_height(self): """## WRITTEN BY AI ##""" - cfg = SyntheticImageDatasetConfig(resolution="720p", text_tokens=50) + cfg = SyntheticImageDataArgs(resolution="720p", text_tokens=50) assert cfg.width == 1280 assert cfg.height == 720 @pytest.mark.sanity def test_aspect_ratio_overrides_width(self): """## WRITTEN BY AI ##""" - cfg = SyntheticImageDatasetConfig( + cfg = SyntheticImageDataArgs( resolution="720p", aspect_ratio="4:3", text_tokens=50 ) # 720 * 4 / 3 = 960 @@ -226,8 +225,13 @@ def test_aspect_ratio_overrides_width(self): @pytest.mark.sanity def test_prompt_tokens_alias_accepted(self): """## WRITTEN BY AI ##""" - cfg = SyntheticImageDatasetConfig.model_validate( - {"width": 640, "height": 480, "prompt_tokens": 50} + cfg = SyntheticImageDataArgs.model_validate( + { + "kind": "synthetic_image", + "width": 640, + "height": 480, + "prompt_tokens": 50, + } ) assert cfg.text_tokens == 50 @@ -235,29 +239,31 @@ def test_prompt_tokens_alias_accepted(self): def test_missing_dims_raises(self): """## WRITTEN BY AI ##""" with pytest.raises(ValueError): - SyntheticImageDatasetConfig(text_tokens=10) + SyntheticImageDataArgs(text_tokens=10) @pytest.mark.regression def test_unknown_resolution_raises(self): """## WRITTEN BY AI ##""" with pytest.raises(ValueError, match="resolution"): - SyntheticImageDatasetConfig(resolution="9000p", text_tokens=10) + SyntheticImageDataArgs(resolution="9000p", text_tokens=10) # --------------------------------------------------------------------------- -# Deserializer-from-string + 10-row pull +# Deserializer from typed config + 10-row pull # --------------------------------------------------------------------------- class 
TestSyntheticImageDeserializer: @pytest.mark.smoke - def test_pull_10_rows_from_data_string(self): + def test_pull_10_rows_from_config(self): """## WRITTEN BY AI ##""" d = SyntheticImageDatasetDeserializer() ds = d( - data=( - "type=synthetic_image,resolution=480p,text_tokens=20," - "output_tokens=8,seed=11" + config=SyntheticImageDataArgs( + resolution="480p", + text_tokens=20, + output_tokens=8, + seed=11, ), processor_factory=_mock_tokenizer, random_seed=42, @@ -284,36 +290,35 @@ def test_pull_10_rows_from_data_string(self): assert len(digests) == 10 @pytest.mark.sanity - def test_factory_dispatch_via_explicit_type(self): + def test_factory_dispatch_via_explicit_kind(self): """## WRITTEN BY AI ##""" + config = DataArgs.model_validate( + { + "kind": "synthetic_image", + "width": 320, + "height": 240, + "text_tokens": 15, + "output_tokens": 4, + } + ) ds = DatasetDeserializerFactory.deserialize( - data=( - "type=synthetic_image,width=320,height=240,text_tokens=15," - "output_tokens=4" - ), + config=config, processor_factory=_mock_tokenizer, + random_seed=42, ) assert isinstance(ds, SyntheticImageDataset) - @pytest.mark.sanity - def test_refuses_when_type_mismatch(self): - """## WRITTEN BY AI ##""" - d = SyntheticImageDatasetDeserializer() - with pytest.raises(DataNotSupportedError): - d( - data="type=synthetic_text,prompt_tokens=50", - processor_factory=_mock_tokenizer, - random_seed=42, - ) - @pytest.mark.regression def test_images_per_request_emits_indexed_columns(self): """## WRITTEN BY AI ##""" d = SyntheticImageDatasetDeserializer() ds = d( - data=( - "type=synthetic_image,width=64,height=64,images_per_request=3," - "text_tokens=5,output_tokens=2" + config=SyntheticImageDataArgs( + width=64, + height=64, + images_per_request=3, + text_tokens=5, + output_tokens=2, ), processor_factory=_mock_tokenizer, random_seed=42, @@ -329,13 +334,18 @@ def test_images_per_request_emits_indexed_columns(self): class TestSyntheticVideoDeserializer: @pytest.mark.smoke - def 
test_pull_10_rows_from_data_string(self): + def test_pull_10_rows_from_config(self): """## WRITTEN BY AI ##""" d = SyntheticVideoDatasetDeserializer() ds = d( - data=( - "type=synthetic_video,width=320,height=240,frames=4,fps=1," - "text_tokens=10,output_tokens=4,seed=17" + config=SyntheticVideoDataArgs( + width=320, + height=240, + frames=4, + fps=1, + text_tokens=10, + output_tokens=4, + seed=17, ), processor_factory=_mock_tokenizer, random_seed=42, @@ -360,33 +370,32 @@ def test_pull_10_rows_from_data_string(self): assert len(digests) == 10 @pytest.mark.sanity - def test_factory_dispatch_via_explicit_type(self): + def test_factory_dispatch_via_explicit_kind(self): """## WRITTEN BY AI ##""" + config = DataArgs.model_validate( + { + "kind": "synthetic_video", + "width": 160, + "height": 120, + "frames": 3, + "fps": 1, + "text_tokens": 10, + "output_tokens": 4, + } + ) ds = DatasetDeserializerFactory.deserialize( - data=( - "type=synthetic_video,width=160,height=120,frames=3,fps=1," - "text_tokens=10,output_tokens=4" - ), + config=config, processor_factory=_mock_tokenizer, + random_seed=42, ) assert isinstance(ds, SyntheticVideoDataset) - @pytest.mark.sanity - def test_refuses_when_type_mismatch(self): - """## WRITTEN BY AI ##""" - d = SyntheticVideoDatasetDeserializer() - with pytest.raises(DataNotSupportedError): - d( - data="type=synthetic_image,width=64,height=64,text_tokens=10", - processor_factory=_mock_tokenizer, - random_seed=42, - ) - @pytest.mark.smoke def test_video_config_via_json(self): """## WRITTEN BY AI ##""" - cfg = SyntheticVideoDatasetConfig.model_validate( + cfg = SyntheticVideoDataArgs.model_validate( { + "kind": "synthetic_video", "width": 320, "height": 240, "frames": 4, @@ -412,16 +421,15 @@ def test_full_dataset_reproducible_with_same_seed(): ## WRITTEN BY AI ## """ d = SyntheticImageDatasetDeserializer() - common = { - "data": ( - "type=synthetic_image,width=128,height=128,text_tokens=10," - "output_tokens=2,seed=999" - ), - 
"processor_factory": _mock_tokenizer, - "random_seed": 42, - } - ds_a = d(**common) - ds_b = d(**common) + config = SyntheticImageDataArgs( + width=128, + height=128, + text_tokens=10, + output_tokens=2, + seed=999, + ) + ds_a = d(config=config, processor_factory=_mock_tokenizer, random_seed=42) + ds_b = d(config=config, processor_factory=_mock_tokenizer, random_seed=42) digests_a = [] digests_b = [] From b015823419c03547f21bd31d9309395127fd8699 Mon Sep 17 00:00:00 2001 From: Ali Tayeb Date: Wed, 24 Jun 2026 21:32:24 -0700 Subject: [PATCH 13/14] docs: update synthetic vision CLI examples Signed-off-by: Ali Tayeb --- README.md | 2 +- docs/guides/datasets.md | 2 +- docs/guides/multimodal/index.md | 2 +- docs/guides/multimodal/synthetic_vision.md | 34 +++++++++++----------- 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 540ea0c15..e5eb93147 100644 --- a/README.md +++ b/README.md @@ -203,7 +203,7 @@ guidellm run \ ### Synthetic Visual Data -GuideLLM can synthesize images and short videos on the fly so you can benchmark VLM serving configurations without bringing your own dataset. Two `--data` types — `synthetic_image` and `synthetic_video` — compose with the existing text token controls. See [Synthetic Visual Data](docs/guides/multimodal/synthetic_vision.md) for example commands and the full list of configuration options. +GuideLLM can synthesize images and short videos on the fly so you can benchmark Vision-Language Model (VLM) serving configurations without bringing your own dataset. Two `--data` kinds — `synthetic_image` and `synthetic_video` — compose with the existing text token controls. See [Synthetic Visual Data](docs/guides/multimodal/synthetic_vision.md) for example commands and the full list of configuration options. 
### Request Types and API Targets diff --git a/docs/guides/datasets.md b/docs/guides/datasets.md index e6688b5e0..c8db6702e 100644 --- a/docs/guides/datasets.md +++ b/docs/guides/datasets.md @@ -85,7 +85,7 @@ GuideLLM supports several types of datasets, each with its own advantages and us Synthetic datasets allow you to generate data on the fly with customizable parameters. This is useful for controlled experiments, stress testing, and simulating specific scenarios. For example, you might want to evaluate how a model handles long prompts or generates outputs with specific characteristics. -GuideLLM supports both synthetic *text* — described below — and synthetic *visual* data (images and short videos) for benchmarking Vision-Language Models. See [Synthetic Visual Data](multimodal/synthetic_vision.md) for the `synthetic_image` and `synthetic_video` `--data` types, which compose with all of the text token controls listed here. +GuideLLM supports both synthetic *text* — described below — and synthetic *visual* data (images and short videos) for benchmarking Vision-Language Models. See [Synthetic Visual Data](multimodal/synthetic_vision.md) for the `synthetic_image` and `synthetic_video` `--data` kinds, which compose with all of the text token controls listed here. #### Example Commands diff --git a/docs/guides/multimodal/index.md b/docs/guides/multimodal/index.md index 5648e1bf1..fd3aa23ce 100644 --- a/docs/guides/multimodal/index.md +++ b/docs/guides/multimodal/index.md @@ -53,7 +53,7 @@ Ensure you have a running inference server and model compatible with the OpenAI ______________________________________________________________________ - Generate images and short videos on the fly to benchmark Vision-Language Model (VLM) serving configurations without bringing your own dataset. Covers the `synthetic_image` and `synthetic_video` `--data` types. 
+ Generate images and short videos on the fly to benchmark Vision-Language Model (VLM) serving configurations without bringing your own dataset. Covers the `synthetic_image` and `synthetic_video` `--data` kinds. [:octicons-arrow-right-24: Synthetic Vision Guide](synthetic_vision.md) diff --git a/docs/guides/multimodal/synthetic_vision.md b/docs/guides/multimodal/synthetic_vision.md index 828987af8..c3b78436f 100644 --- a/docs/guides/multimodal/synthetic_vision.md +++ b/docs/guides/multimodal/synthetic_vision.md @@ -4,7 +4,7 @@ weight: 40 # Synthetic Visual Data -GuideLLM can synthesize images and short videos on the fly so you can benchmark Vision-Language Model (VLM) serving configurations without bringing your own dataset. Two `--data` types — `synthetic_image` and `synthetic_video` — compose with the existing synthetic text token controls (`text_tokens`, `output_tokens`, and their `stdev`/`min`/`max` companions) so a single command produces a fully-shaped multimodal request. +GuideLLM can synthesize images and short videos on the fly so you can benchmark Vision-Language Model (VLM) serving configurations without bringing your own dataset. Two `--data` kinds — `synthetic_image` and `synthetic_video` — compose with the existing synthetic text token controls (`text_tokens`, `output_tokens`, and their `stdev`/`min`/`max` companions) so a single command produces a fully-shaped multimodal request. Synthetic visual data is useful when you want to control payload shape precisely (image dimensions, frame count, frames-per-second) or stress-test serving paths that the preprocessor cache would otherwise hide. Defaults are tuned so every generated payload is byte-different from the next, which defeats vLLM's multimodal preprocessor cache while still compressing like real media on the wire. @@ -18,24 +18,24 @@ pip install guidellm[vision] ## Synthetic image -Use `--data "type=synthetic_image"` to generate a single image per request alongside any text prompt. 
+Use `--data "kind=synthetic_image"` to generate a single image per request alongside any text prompt. ### Example Commands A single 720p image alongside 200 text tokens and 64 output tokens: ```bash -guidellm benchmark run \ - --target http://localhost:8000 \ - --data "type=synthetic_image,resolution=720p,text_tokens=200,output_tokens=64" +guidellm run \ + --backend "kind=openai_http,target=http://localhost:8000" \ + --data "kind=synthetic_image,resolution=720p,text_tokens=200,output_tokens=64" ``` A 1280×720 JPEG with two images per request: ```bash -guidellm benchmark run \ - --target http://localhost:8000 \ - --data "type=synthetic_image,width=1280,height=720,format=jpeg,images_per_request=2,text_tokens=200,output_tokens=64" +guidellm run \ + --backend "kind=openai_http,target=http://localhost:8000" \ + --data "kind=synthetic_image,width=1280,height=720,format=jpeg,images_per_request=2,text_tokens=200,output_tokens=64" ``` ### Configuration Options @@ -54,24 +54,24 @@ guidellm benchmark run \ ## Synthetic video -Use `--data "type=synthetic_video"` to generate a short clip per request alongside any text prompt. Output is `mp4` (h264, yuv420p). +Use `--data "kind=synthetic_video"` to generate a short clip per request alongside any text prompt. Output is `mp4` (h264, yuv420p). 
### Example Commands A six-frame 480p clip at 1 fps with modest prompt and output budgets: ```bash -guidellm benchmark run \ - --target http://localhost:8000 \ - --data "type=synthetic_video,width=854,height=480,frames=6,fps=1,text_tokens=64,output_tokens=128" +guidellm run \ + --backend "kind=openai_http,target=http://localhost:8000" \ + --data "kind=synthetic_video,width=854,height=480,frames=6,fps=1,text_tokens=64,output_tokens=128" ``` A twelve-frame 720p clip at 3 fps with an explicit h264 target bitrate: ```bash -guidellm benchmark run \ - --target http://localhost:8000 \ - --data "type=synthetic_video,width=1280,height=720,frames=12,fps=3,video_bitrate=2M,text_tokens=64,output_tokens=128" +guidellm run \ + --backend "kind=openai_http,target=http://localhost:8000" \ + --data "kind=synthetic_video,width=1280,height=720,frames=12,fps=3,video_bitrate=2M,text_tokens=64,output_tokens=128" ``` ### Configuration Options @@ -88,6 +88,6 @@ guidellm benchmark run \ ## Notes -- A processor/tokenizer is required for the text portion of the request. By default the model passed in or retrieved from the server is used; otherwise specify one with `--processor`. -- Per-row seeded gradients produce byte-different payloads on every request, which bypasses vLLM's multimodal preprocessor cache. If you want to deliberately hit the cache, set `content=solid` or pin a fixed `seed` and `samples`. +- A tokenizer is required for the text portion of the request. By default the model passed in or retrieved from the server is used; otherwise specify one with `--tokenizer`. +- Per-row seeded gradients produce byte-different payloads on every request, which bypasses vLLM's multimodal preprocessor cache. If you want to deliberately hit the cache, use fixed payload settings such as `content=solid` for images, or a fixed `seed` with a fixed `--data-loader "kind=pytorch,samples=..."` value. - The exact mp4 bytes produced for a given seed depend on the installed `ffmpeg` and `PIL` versions. 
Output token counts and request shape stay stable across versions, but if you are comparing byte-level outputs or wire-size measurements across machines, expect small variation. From ef6579e0fd05ea2f26df2aff907b43669c184cfd Mon Sep 17 00:00:00 2001 From: Ali Tayeb Date: Wed, 24 Jun 2026 21:33:38 -0700 Subject: [PATCH 14/14] tox: install dev extra for quality envs Signed-off-by: Ali Tayeb --- tox.ini | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tox.ini b/tox.ini index 5b16c1be3..ccb60e622 100644 --- a/tox.ini +++ b/tox.ini @@ -5,35 +5,35 @@ env_list = {lint,type}-check,test-{unit,integration,e2e} [testenv:tests] description = Run all tests -dependency_groups = dev +extras = dev commands = python -m pytest {posargs:tests/} [testenv:test-unit] description = Run unit tests -base = tests +extras = dev commands = python -m pytest tests/unit {posargs} [testenv:test-integration] description = Run integration tests -base = tests +extras = dev commands = python -m pytest tests/integration {posargs} [testenv:test-e2e] description = Run end-to-end tests -base = tests +extras = dev commands = python -m pytest tests/e2e {posargs} [testenv:lint-check] description = Run all quality checks -base = tests +extras = dev commands = ruff format --check --diff ruff check @@ -42,7 +42,7 @@ commands = [testenv:lint-fix] description = Run style checks and fixes -base = tests +extras = dev commands = ruff format ruff check --fix @@ -51,14 +51,14 @@ commands = [testenv:type-check] description = Run type checks -base = tests +extras = dev commands = mypy --check-untyped-defs {posargs} [testenv:link-check] description = Run link checks for root and docs markdown files -base = tests +extras = dev commands = mkdocs-linkcheck ./ mkdocs-linkcheck docs/