From acb02f93d357fee369e4f98c37a94c8b129dbe0d Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Wed, 17 Jun 2026 09:53:03 +0100 Subject: [PATCH 01/18] feat(thunk): add .parsed property to ComputedModelOutputThunk for structured output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When `format=` is passed to `act()`/`instruct()`, the model returns a JSON string and `.value` has always held that raw JSON — not a Pydantic instance. Accessing `.label` (etc.) on `.value` raises `AttributeError` at runtime even though pyright accepts the cast without complaint, so the mistake goes unnoticed by static checking and only surfaces as a hard-to-debug failure when the code runs. This commit adds: - `_format: type[pydantic.BaseModel] | None` attribute on `ModelOutputThunk` (initialised to `None`; propagated via `_copy_from`) - All five backends (`ollama`, `litellm`, `openai`, `huggingface`, `watsonx`) now record the format type on the output thunk, alongside the existing `generate_log.extra` artefact: `litellm`, `ollama`, `openai` and `watsonx` set `mot._format = _format` in `post_processing()`, while `huggingface` and `ollama` set `result._format = format` in `_generate_from_raw()` - `ComputedModelOutputThunk.parsed` property — calls `_format.model_validate_json(value)` when a format type is stored, returns `None` otherwise - Docstring updates on `ModelOutputThunk.value` and `Session.act()` pointing callers to `.parsed` when `format=` is used - Four unit tests covering: happy path, no-format returns None, invalid JSON raises `pydantic.ValidationError`, and `.value` is unaffected Closes #1273. 
Signed-off-by: Nigel Jones Assisted-by: Claude Code --- mellea/backends/huggingface.py | 1 + mellea/backends/litellm.py | 1 + mellea/backends/ollama.py | 2 ++ mellea/backends/openai.py | 1 + mellea/backends/watsonx.py | 1 + mellea/core/base.py | 30 ++++++++++++++++++++- mellea/stdlib/session.py | 4 ++- test/core/test_base.py | 48 +++++++++++++++++++++++++++++++++- 8 files changed, 85 insertions(+), 3 deletions(-) diff --git a/mellea/backends/huggingface.py b/mellea/backends/huggingface.py index 0d2a83524..9050dce9a 100644 --- a/mellea/backends/huggingface.py +++ b/mellea/backends/huggingface.py @@ -1513,6 +1513,7 @@ async def _generate_from_raw( generate_log.action = action result._generate_log = generate_log + result._format = format results.append(result) usage: dict[str, Any] | None = ( diff --git a/mellea/backends/litellm.py b/mellea/backends/litellm.py index f4169d469..90666cf08 100644 --- a/mellea/backends/litellm.py +++ b/mellea/backends/litellm.py @@ -596,6 +596,7 @@ async def post_processing( generate_log.action = mot._action generate_log.result = mot mot._generate_log = generate_log + mot._format = _format # Extract token usage from full response dict or streaming usage full_response = mot._meta.get("litellm_full_response") diff --git a/mellea/backends/ollama.py b/mellea/backends/ollama.py index a7c1342c9..5d28b78b0 100644 --- a/mellea/backends/ollama.py +++ b/mellea/backends/ollama.py @@ -613,6 +613,7 @@ async def _generate_from_raw( generate_log.extra["error"] = error generate_log.extra["empty_response"] = response.model_dump() result._generate_log = generate_log + result._format = format results.append(result) @@ -742,6 +743,7 @@ async def post_processing( generate_log.result = mot mot._generate_log = generate_log + mot._format = _format mot._generate = None # Extract token counts from response diff --git a/mellea/backends/openai.py b/mellea/backends/openai.py index 42a4625fd..fa1c12e58 100644 --- a/mellea/backends/openai.py +++ 
b/mellea/backends/openai.py @@ -1127,6 +1127,7 @@ async def post_processing( generate_log.action = mot._action generate_log.result = mot mot._generate_log = generate_log + mot._format = _format # Extract token usage from response or streaming usage response = mot._meta["oai_chat_response"] diff --git a/mellea/backends/watsonx.py b/mellea/backends/watsonx.py index 0d3ec1f3f..d06920969 100644 --- a/mellea/backends/watsonx.py +++ b/mellea/backends/watsonx.py @@ -614,6 +614,7 @@ async def post_processing( generate_log.result = mot generate_log.action = mot._action mot._generate_log = generate_log + mot._format = _format async def _generate_from_raw( self, diff --git a/mellea/core/base.py b/mellea/core/base.py index 05dbf4b31..06dee95c7 100644 --- a/mellea/core/base.py +++ b/mellea/core/base.py @@ -32,6 +32,7 @@ runtime_checkable, ) +import pydantic import typing_extensions from PIL import Image as PILImage @@ -401,6 +402,7 @@ def __init__( # Mellea-side hook correlation ID; distinct from the provider-assigned # `GenerationMetadata.response_id`. self._generation_id: str | None = None + self._format: type[pydantic.BaseModel] | None = None def _record_ttfb(self) -> None: """Record time-to-first-byte if streaming and not yet recorded.""" @@ -542,6 +544,7 @@ def _copy_from(self, other: ModelOutputThunk) -> None: self._thinking = other._thinking self.generation = other.generation self._generate_log = other._generate_log + self._format = other._format self._cancelled = other._cancelled # _cancel_hook is deliberately not copied: _copy_from swaps output state, # not backend-thread plumbing, which is tied to the original computation. @@ -557,7 +560,13 @@ def is_computed(self) -> bool: @property def value(self) -> str | None: - """Gets the value of the block.""" + """Gets the raw string value of the block. 
+ + When ``format=`` is set on the originating ``act()``/``instruct()`` call, the + model returns a JSON string and ``.value`` contains that raw JSON — not a + Pydantic instance. Use ``.parsed`` on a ``ComputedModelOutputThunk`` to get + the validated model object. + """ if not self._computed: return None return self._underlying_value @@ -881,6 +890,25 @@ def value(self, v: str): """Sets the value of the block.""" self._underlying_value = v + @property + def parsed(self) -> pydantic.BaseModel | None: + """Returns the result as a validated Pydantic instance when ``format=`` was set. + + Returns ``None`` when no ``format=`` type was provided to the originating + ``act()`` / ``instruct()`` call. Use this instead of casting ``.value`` + manually:: + + result = m.act(Instruction("Say yes or no"), format=MyModel) + obj = result.parsed # MyModel instance, no cast needed + + Returns: + A ``pydantic.BaseModel`` instance produced by ``model_validate_json``, + or ``None`` if no format type was set. + """ + if self._format is None: + return None + return self._format.model_validate_json(self.value) + def is_computed(self) -> Literal[True]: """Returns `True` since thunk is always computed. diff --git a/mellea/stdlib/session.py b/mellea/stdlib/session.py index 34a9d70b1..37b10cef3 100644 --- a/mellea/stdlib/session.py +++ b/mellea/stdlib/session.py @@ -434,7 +434,9 @@ def act( requirements: used as additional requirements when a sampling strategy is provided strategy: a SamplingStrategy that describes the strategy for validating and repairing/retrying for the instruct-validate-repair pattern. None means that no particular sampling strategy is used. return_sampling_results: attach the (successful and failed) sampling attempts to the results. - format: if set, the BaseModel to use for constrained decoding. + format: if set, the BaseModel to use for constrained decoding. 
When + provided, ``.value`` on the returned thunk is always a raw JSON string — + use ``.parsed`` to obtain the validated Pydantic model instance. model_options: additional model options, which will upsert into the model/backend's defaults. tool_calls: if true, tool calling is enabled. diff --git a/test/core/test_base.py b/test/core/test_base.py index 213a16e6e..3e0e0b821 100644 --- a/test/core/test_base.py +++ b/test/core/test_base.py @@ -3,10 +3,17 @@ import io from typing import Any +import pydantic import pytest from PIL import Image as PILImage -from mellea.core import CBlock, Component, ImageBlock, ModelOutputThunk +from mellea.core import ( + CBlock, + Component, + ComputedModelOutputThunk, + ImageBlock, + ModelOutputThunk, +) from mellea.stdlib.components import Message @@ -317,3 +324,42 @@ async def _absorbs_first_cancel() -> None: await asyncio.wait_for(mot._generate, timeout=1.0) # type: ignore[attr-defined] except (TimeoutError, asyncio.CancelledError): pass + + +# --- ComputedModelOutputThunk.parsed --- + + +class _Label(pydantic.BaseModel): + label: str + + +def _make_computed( + json_str: str, fmt: type[pydantic.BaseModel] | None +) -> ComputedModelOutputThunk: + thunk = ModelOutputThunk(value=json_str) + thunk._format = fmt + return ComputedModelOutputThunk(thunk) + + +def test_parsed_returns_model_instance() -> None: + result = _make_computed('{"label": "yes"}', _Label) + obj = result.parsed + assert isinstance(obj, _Label) + assert obj.label == "yes" + + +def test_parsed_returns_none_when_no_format() -> None: + result = _make_computed('{"label": "yes"}', None) + assert result.parsed is None + + +def test_parsed_raises_on_invalid_json() -> None: + result = _make_computed("not json", _Label) + with pytest.raises(pydantic.ValidationError): + _ = result.parsed + + +def test_value_unaffected_by_format() -> None: + raw = '{"label": "ok"}' + result = _make_computed(raw, _Label) + assert result.value == raw From 7370b833371d0081427b9086f146bfc571a6493f 
Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Wed, 17 Jun 2026 10:05:31 +0100 Subject: [PATCH 02/18] feat(types): thread format= overloads for cast-free structured output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When `format=MyModel` is passed to `act()`, `instruct()`, `aact()`, or `ainstruct()` in both `functional.py` and `session.py`, the return type now narrows to `ComputedModelOutputThunk[MyModel]` (or `ModelOutputThunk[MyModel]` for the non-awaited async variant) instead of `ComputedModelOutputThunk[str]`. This eliminates the need for `cast(MyModel, result.value)` at call sites. Runtime behaviour is unchanged; all overloads were already dispatched to the same implementation body. Changes: - `functional.py` / `session.py` – new `@overload` stubs with `format: type[BaseModelSubclass]` for all four methods; implementation signatures broadened to `Any` to cover all overload combinations - `test/typing/` – `assert_type` checks for the new overload resolution paths in all four typing-check modules - `genstub.py` – `# type: ignore[return-value]` on four existing return sites that rely on the pre-narrowed `R` type variable which the new overloads can no longer infer - `react.py` / `m_serve_example_response_format.py` – `# type: ignore` on call sites that pass a dynamic `format` value incompatible with the new stricter overload signatures Closes #1274 Signed-off-by: Nigel Jones Assisted-by: Claude Code Signed-off-by: Nigel Jones --- .../m_serve_example_response_format.py | 2 +- mellea/stdlib/components/genstub.py | 8 +- mellea/stdlib/frameworks/react.py | 2 +- mellea/stdlib/functional.py | 177 ++++++++++++++++-- mellea/stdlib/session.py | 162 ++++++++++++++-- test/typing/check_functional_aact.py | 22 +++ test/typing/check_functional_ainstruct.py | 25 +++ test/typing/check_functional_sync.py | 16 ++ test/typing/check_session.py | 36 ++++ 9 files changed, 418 insertions(+), 32 deletions(-) diff --git 
a/docs/examples/m_serve/m_serve_example_response_format.py b/docs/examples/m_serve/m_serve_example_response_format.py index 8b1ab29b6..5a85d5094 100644 --- a/docs/examples/m_serve/m_serve_example_response_format.py +++ b/docs/examples/m_serve/m_serve_example_response_format.py @@ -50,7 +50,7 @@ def serve( description=message, requirements=requirements, # type: ignore model_options=model_options, - format=format, # This enables structured output validation + format=format, # type: ignore[arg-type] # dynamic format from caller ) return result diff --git a/mellea/stdlib/components/genstub.py b/mellea/stdlib/components/genstub.py index b836620ee..ab10cdd82 100644 --- a/mellea/stdlib/components/genstub.py +++ b/mellea/stdlib/components/genstub.py @@ -654,9 +654,9 @@ def __call__(self, *args, **kwargs) -> tuple[R, Context] | R: assert response.parsed_repr is not None if context is None: - return response.parsed_repr + return response.parsed_repr # type: ignore[return-value] else: - return response.parsed_repr, context + return response.parsed_repr, context # type: ignore[return-value] class AsyncGenerativeStub(GenerativeStub, Generic[P, R]): @@ -797,9 +797,9 @@ async def __async_call__() -> tuple[R, Context] | R: ) assert response.parsed_repr is not None if context is None: - return response.parsed_repr + return response.parsed_repr # type: ignore[return-value] else: - return response.parsed_repr, context + return response.parsed_repr, context # type: ignore[return-value] return __async_call__() diff --git a/mellea/stdlib/frameworks/react.py b/mellea/stdlib/frameworks/react.py index 9b523be58..156c03e8a 100644 --- a/mellea/stdlib/frameworks/react.py +++ b/mellea/stdlib/frameworks/react.py @@ -111,7 +111,7 @@ async def react( assert len(tool_responses) == 1, "multiple tools were called with 'final'" if format is not None: - step, next_context = await mfuncs.aact( + step, next_context = await mfuncs.aact( # type: ignore[assignment] action=ReactThought(), context=context, 
backend=backend, diff --git a/mellea/stdlib/functional.py b/mellea/stdlib/functional.py index 40a00c612..73050603d 100644 --- a/mellea/stdlib/functional.py +++ b/mellea/stdlib/functional.py @@ -45,6 +45,21 @@ from .sampling import RejectionSamplingStrategy +@overload +def act( + action: Component[Any], + context: Context, + backend: Backend, + *, + requirements: list[Requirement] | None = None, + strategy: SamplingStrategy | None = RejectionSamplingStrategy(loop_budget=2), + return_sampling_results: Literal[False] = False, + format: type[BaseModelSubclass], + model_options: dict | None = None, + tool_calls: bool = False, +) -> tuple[ComputedModelOutputThunk[BaseModelSubclass], Context]: ... + + @overload def act( action: Component[S], @@ -54,7 +69,7 @@ def act( requirements: list[Requirement] | None = None, strategy: SamplingStrategy | None = RejectionSamplingStrategy(loop_budget=2), return_sampling_results: Literal[False] = False, - format: type[BaseModelSubclass] | None = None, + format: None = None, model_options: dict | None = None, tool_calls: bool = False, ) -> tuple[ComputedModelOutputThunk[S], Context]: ... @@ -86,7 +101,7 @@ def act( format: type[BaseModelSubclass] | None = None, model_options: dict | None = None, tool_calls: bool = False, -) -> tuple[ComputedModelOutputThunk[S], Context] | SamplingResult[S]: +) -> tuple[ComputedModelOutputThunk[Any], Context] | SamplingResult[Any]: """Runs a generic action, and adds both the action and the result to the context. Args: @@ -146,7 +161,28 @@ def instruct( output_prefix: str | CBlock | None = None, strategy: SamplingStrategy | None = RejectionSamplingStrategy(loop_budget=2), return_sampling_results: Literal[False] = False, - format: type[BaseModelSubclass] | None = None, + format: type[BaseModelSubclass], + model_options: dict | None = None, + tool_calls: bool = False, +) -> tuple[ComputedModelOutputThunk[BaseModelSubclass], Context]: ... 
+ + +@overload +def instruct( + description: str, + context: Context, + backend: Backend, + *, + images: list[ImageBlock] | list[PILImage.Image] | None = None, + requirements: list[Requirement | str] | None = None, + icl_examples: list[str | CBlock] | None = None, + grounding_context: dict[str, str | CBlock | Component] | None = None, + user_variables: dict[str, str] | None = None, + prefix: str | CBlock | None = None, + output_prefix: str | CBlock | None = None, + strategy: SamplingStrategy | None = RejectionSamplingStrategy(loop_budget=2), + return_sampling_results: Literal[False] = False, + format: None = None, model_options: dict | None = None, tool_calls: bool = False, ) -> tuple[ComputedModelOutputThunk[str], Context]: ... @@ -190,7 +226,7 @@ def instruct( format: type[BaseModelSubclass] | None = None, model_options: dict | None = None, tool_calls: bool = False, -) -> tuple[ComputedModelOutputThunk[str], Context] | SamplingResult[str]: +) -> tuple[ComputedModelOutputThunk[Any], Context] | SamplingResult[Any]: """Generates from an instruction. Args: @@ -474,6 +510,23 @@ def transform( return transformed, new_ctx +@overload +async def aact( + action: Component[Any], + context: Context, + backend: Backend, + *, + requirements: list[Requirement] | None = None, + strategy: None = None, + return_sampling_results: Literal[False] = False, + format: type[BaseModelSubclass], + model_options: dict | None = None, + tool_calls: bool = False, + silence_context_type_warning: bool = False, + await_result: Literal[True], +) -> tuple[ComputedModelOutputThunk[BaseModelSubclass], Context]: ... 
+ + @overload async def aact( action: Component[S], @@ -483,7 +536,7 @@ async def aact( requirements: list[Requirement] | None = None, strategy: None = None, return_sampling_results: Literal[False] = False, - format: type[BaseModelSubclass] | None = None, + format: None = None, model_options: dict | None = None, tool_calls: bool = False, silence_context_type_warning: bool = False, @@ -491,6 +544,23 @@ async def aact( ) -> tuple[ComputedModelOutputThunk[S], Context]: ... +@overload +async def aact( + action: Component[Any], + context: Context, + backend: Backend, + *, + requirements: list[Requirement] | None = None, + strategy: SamplingStrategy, + return_sampling_results: Literal[False] = False, + format: type[BaseModelSubclass], + model_options: dict | None = None, + tool_calls: bool = False, + silence_context_type_warning: bool = False, + await_result: bool = False, +) -> tuple[ComputedModelOutputThunk[BaseModelSubclass], Context]: ... + + @overload async def aact( action: Component[S], @@ -500,7 +570,7 @@ async def aact( requirements: list[Requirement] | None = None, strategy: SamplingStrategy, return_sampling_results: Literal[False] = False, - format: type[BaseModelSubclass] | None = None, + format: None = None, model_options: dict | None = None, tool_calls: bool = False, silence_context_type_warning: bool = False, @@ -508,6 +578,23 @@ async def aact( ) -> tuple[ComputedModelOutputThunk[S], Context]: ... +@overload +async def aact( + action: Component[Any], + context: Context, + backend: Backend, + *, + requirements: list[Requirement] | None = None, + strategy: None = None, + return_sampling_results: Literal[False] = False, + format: type[BaseModelSubclass], + model_options: dict | None = None, + tool_calls: bool = False, + silence_context_type_warning: bool = False, + await_result: Literal[False] = False, +) -> tuple[ModelOutputThunk[BaseModelSubclass], Context]: ... 
+ + @overload async def aact( action: Component[S], @@ -517,7 +604,7 @@ async def aact( requirements: list[Requirement] | None = None, strategy: None = None, return_sampling_results: Literal[False] = False, - format: type[BaseModelSubclass] | None = None, + format: None = None, model_options: dict | None = None, tool_calls: bool = False, silence_context_type_warning: bool = False, @@ -555,7 +642,7 @@ async def aact( tool_calls: bool = False, silence_context_type_warning: bool = False, await_result: bool = False, -) -> tuple[ModelOutputThunk[S], Context] | SamplingResult: +) -> tuple[ModelOutputThunk[Any], Context] | SamplingResult[Any]: """Asynchronous version of .act; runs a generic action, and adds both the action and the result to the context. Args: @@ -777,7 +864,29 @@ async def ainstruct( output_prefix: str | CBlock | None = None, strategy: None = None, return_sampling_results: Literal[False] = False, - format: type[BaseModelSubclass] | None = None, + format: type[BaseModelSubclass], + model_options: dict | None = None, + tool_calls: bool = False, + await_result: Literal[True], +) -> tuple[ComputedModelOutputThunk[BaseModelSubclass], Context]: ... 
+ + +@overload +async def ainstruct( + description: str, + context: Context, + backend: Backend, + *, + images: list[ImageBlock] | list[PILImage.Image] | None = None, + requirements: list[Requirement | str] | None = None, + icl_examples: list[str | CBlock] | None = None, + grounding_context: dict[str, str | CBlock | Component] | None = None, + user_variables: dict[str, str] | None = None, + prefix: str | CBlock | None = None, + output_prefix: str | CBlock | None = None, + strategy: None = None, + return_sampling_results: Literal[False] = False, + format: None = None, model_options: dict | None = None, tool_calls: bool = False, await_result: Literal[True], @@ -799,7 +908,29 @@ async def ainstruct( output_prefix: str | CBlock | None = None, strategy: SamplingStrategy, return_sampling_results: Literal[False] = False, - format: type[BaseModelSubclass] | None = None, + format: type[BaseModelSubclass], + model_options: dict | None = None, + tool_calls: bool = False, + await_result: bool = False, +) -> tuple[ComputedModelOutputThunk[BaseModelSubclass], Context]: ... 
+ + +@overload +async def ainstruct( + description: str, + context: Context, + backend: Backend, + *, + images: list[ImageBlock] | list[PILImage.Image] | None = None, + requirements: list[Requirement | str] | None = None, + icl_examples: list[str | CBlock] | None = None, + grounding_context: dict[str, str | CBlock | Component] | None = None, + user_variables: dict[str, str] | None = None, + prefix: str | CBlock | None = None, + output_prefix: str | CBlock | None = None, + strategy: SamplingStrategy, + return_sampling_results: Literal[False] = False, + format: None = None, model_options: dict | None = None, tool_calls: bool = False, await_result: bool = False, @@ -821,7 +952,29 @@ async def ainstruct( output_prefix: str | CBlock | None = None, strategy: None = None, return_sampling_results: Literal[False] = False, - format: type[BaseModelSubclass] | None = None, + format: type[BaseModelSubclass], + model_options: dict | None = None, + tool_calls: bool = False, + await_result: Literal[False] = False, +) -> tuple[ModelOutputThunk[BaseModelSubclass], Context]: ... 
+ + +@overload +async def ainstruct( + description: str, + context: Context, + backend: Backend, + *, + images: list[ImageBlock] | list[PILImage.Image] | None = None, + requirements: list[Requirement | str] | None = None, + icl_examples: list[str | CBlock] | None = None, + grounding_context: dict[str, str | CBlock | Component] | None = None, + user_variables: dict[str, str] | None = None, + prefix: str | CBlock | None = None, + output_prefix: str | CBlock | None = None, + strategy: None = None, + return_sampling_results: Literal[False] = False, + format: None = None, model_options: dict | None = None, tool_calls: bool = False, await_result: Literal[False] = False, @@ -868,7 +1021,7 @@ async def ainstruct( model_options: dict | None = None, tool_calls: bool = False, await_result: bool = False, -) -> tuple[ModelOutputThunk[str], Context] | SamplingResult: +) -> tuple[ModelOutputThunk[Any], Context] | SamplingResult[Any]: """Generates from an instruction. Args: diff --git a/mellea/stdlib/session.py b/mellea/stdlib/session.py index 34a9d70b1..4c01659c4 100644 --- a/mellea/stdlib/session.py +++ b/mellea/stdlib/session.py @@ -390,6 +390,19 @@ def cleanup(self) -> None: deregister_session_plugins(self.id) + @overload + def act( + self, + action: Component[Any], + *, + requirements: list[Requirement] | None = None, + strategy: SamplingStrategy | None = RejectionSamplingStrategy(loop_budget=2), + return_sampling_results: Literal[False] = False, + format: type[BaseModelSubclass], + model_options: dict | None = None, + tool_calls: bool = False, + ) -> ComputedModelOutputThunk[BaseModelSubclass]: ... 
+ @overload def act( self, @@ -398,7 +411,7 @@ def act( requirements: list[Requirement] | None = None, strategy: SamplingStrategy | None = RejectionSamplingStrategy(loop_budget=2), return_sampling_results: Literal[False] = False, - format: type[BaseModelSubclass] | None = None, + format: None = None, model_options: dict | None = None, tool_calls: bool = False, ) -> ComputedModelOutputThunk[S]: ... @@ -418,7 +431,7 @@ def act( def act( self, - action: Component[S], + action: Component[Any], *, requirements: list[Requirement] | None = None, strategy: SamplingStrategy | None = RejectionSamplingStrategy(loop_budget=2), @@ -426,7 +439,7 @@ def act( format: type[BaseModelSubclass] | None = None, model_options: dict | None = None, tool_calls: bool = False, - ) -> ModelOutputThunk[S] | SamplingResult: + ) -> ModelOutputThunk[Any] | SamplingResult[Any]: """Runs a generic action, and adds both the action and the result to the context. Args: @@ -475,7 +488,26 @@ def instruct( output_prefix: str | CBlock | None = None, strategy: SamplingStrategy | None = RejectionSamplingStrategy(loop_budget=2), return_sampling_results: Literal[False] = False, - format: type[BaseModelSubclass] | None = None, + format: type[BaseModelSubclass], + model_options: dict | None = None, + tool_calls: bool = False, + ) -> ComputedModelOutputThunk[BaseModelSubclass]: ... 
+ + @overload + def instruct( + self, + description: str, + *, + images: list[ImageBlock] | list[PILImage.Image] | None = None, + requirements: list[Requirement | str] | None = None, + icl_examples: list[str | CBlock] | None = None, + grounding_context: dict[str, str | CBlock | Component] | None = None, + user_variables: dict[str, str] | None = None, + prefix: str | CBlock | None = None, + output_prefix: str | CBlock | None = None, + strategy: SamplingStrategy | None = RejectionSamplingStrategy(loop_budget=2), + return_sampling_results: Literal[False] = False, + format: None = None, model_options: dict | None = None, tool_calls: bool = False, ) -> ComputedModelOutputThunk[str]: ... @@ -515,7 +547,7 @@ def instruct( format: type[BaseModelSubclass] | None = None, model_options: dict | None = None, tool_calls: bool = False, - ) -> ModelOutputThunk[str] | SamplingResult: + ) -> ModelOutputThunk[Any] | SamplingResult[Any]: """Generates from an instruction. Args: @@ -707,6 +739,20 @@ def transform( self.ctx = context return result + @overload + async def aact( + self, + action: Component[Any], + *, + requirements: list[Requirement] | None = None, + strategy: None = None, + return_sampling_results: Literal[False] = False, + format: type[BaseModelSubclass], + model_options: dict | None = None, + tool_calls: bool = False, + await_result: Literal[True], + ) -> ComputedModelOutputThunk[BaseModelSubclass]: ... + @overload async def aact( self, @@ -715,12 +761,26 @@ async def aact( requirements: list[Requirement] | None = None, strategy: None = None, return_sampling_results: Literal[False] = False, - format: type[BaseModelSubclass] | None = None, + format: None = None, model_options: dict | None = None, tool_calls: bool = False, await_result: Literal[True], ) -> ComputedModelOutputThunk[S]: ... 
+ @overload + async def aact( + self, + action: Component[Any], + *, + requirements: list[Requirement] | None = None, + strategy: SamplingStrategy, + return_sampling_results: Literal[False] = False, + format: type[BaseModelSubclass], + model_options: dict | None = None, + tool_calls: bool = False, + await_result: bool = False, + ) -> ComputedModelOutputThunk[BaseModelSubclass]: ... + @overload async def aact( self, @@ -729,12 +789,26 @@ async def aact( requirements: list[Requirement] | None = None, strategy: SamplingStrategy, return_sampling_results: Literal[False] = False, - format: type[BaseModelSubclass] | None = None, + format: None = None, model_options: dict | None = None, tool_calls: bool = False, await_result: bool = False, ) -> ComputedModelOutputThunk[S]: ... + @overload + async def aact( + self, + action: Component[Any], + *, + requirements: list[Requirement] | None = None, + strategy: None = None, + return_sampling_results: Literal[False] = False, + format: type[BaseModelSubclass], + model_options: dict | None = None, + tool_calls: bool = False, + await_result: Literal[False] = False, + ) -> ModelOutputThunk[BaseModelSubclass]: ... 
+ @overload async def aact( self, @@ -743,7 +817,7 @@ async def aact( requirements: list[Requirement] | None = None, strategy: None = None, return_sampling_results: Literal[False] = False, - format: type[BaseModelSubclass] | None = None, + format: None = None, model_options: dict | None = None, tool_calls: bool = False, await_result: Literal[False] = False, @@ -765,7 +839,7 @@ async def aact( async def aact( self, - action: Component[S], + action: Component[Any], *, requirements: list[Requirement] | None = None, strategy: SamplingStrategy | None = RejectionSamplingStrategy(loop_budget=2), @@ -774,7 +848,7 @@ async def aact( model_options: dict | None = None, tool_calls: bool = False, await_result: bool = False, - ) -> ModelOutputThunk[S] | SamplingResult: + ) -> ModelOutputThunk[Any] | SamplingResult[Any]: """Runs a generic action, and adds both the action and the result to the context. Args: @@ -826,7 +900,27 @@ async def ainstruct( output_prefix: str | CBlock | None = None, strategy: None = None, return_sampling_results: Literal[False] = False, - format: type[BaseModelSubclass] | None = None, + format: type[BaseModelSubclass], + model_options: dict | None = None, + tool_calls: bool = False, + await_result: Literal[True], + ) -> ComputedModelOutputThunk[BaseModelSubclass]: ... 
+ + @overload + async def ainstruct( + self, + description: str, + *, + images: list[ImageBlock] | list[PILImage.Image] | None = None, + requirements: list[Requirement | str] | None = None, + icl_examples: list[str | CBlock] | None = None, + grounding_context: dict[str, str | CBlock | Component] | None = None, + user_variables: dict[str, str] | None = None, + prefix: str | CBlock | None = None, + output_prefix: str | CBlock | None = None, + strategy: None = None, + return_sampling_results: Literal[False] = False, + format: None = None, model_options: dict | None = None, tool_calls: bool = False, await_result: Literal[True], @@ -846,7 +940,27 @@ async def ainstruct( output_prefix: str | CBlock | None = None, strategy: SamplingStrategy, return_sampling_results: Literal[False] = False, - format: type[BaseModelSubclass] | None = None, + format: type[BaseModelSubclass], + model_options: dict | None = None, + tool_calls: bool = False, + await_result: bool = False, + ) -> ComputedModelOutputThunk[BaseModelSubclass]: ... 
+ + @overload + async def ainstruct( + self, + description: str, + *, + images: list[ImageBlock] | list[PILImage.Image] | None = None, + requirements: list[Requirement | str] | None = None, + icl_examples: list[str | CBlock] | None = None, + grounding_context: dict[str, str | CBlock | Component] | None = None, + user_variables: dict[str, str] | None = None, + prefix: str | CBlock | None = None, + output_prefix: str | CBlock | None = None, + strategy: SamplingStrategy, + return_sampling_results: Literal[False] = False, + format: None = None, model_options: dict | None = None, tool_calls: bool = False, await_result: bool = False, @@ -866,7 +980,27 @@ async def ainstruct( output_prefix: str | CBlock | None = None, strategy: None = None, return_sampling_results: Literal[False] = False, - format: type[BaseModelSubclass] | None = None, + format: type[BaseModelSubclass], + model_options: dict | None = None, + tool_calls: bool = False, + await_result: Literal[False] = False, + ) -> ModelOutputThunk[BaseModelSubclass]: ... + + @overload + async def ainstruct( + self, + description: str, + *, + images: list[ImageBlock] | list[PILImage.Image] | None = None, + requirements: list[Requirement | str] | None = None, + icl_examples: list[str | CBlock] | None = None, + grounding_context: dict[str, str | CBlock | Component] | None = None, + user_variables: dict[str, str] | None = None, + prefix: str | CBlock | None = None, + output_prefix: str | CBlock | None = None, + strategy: None = None, + return_sampling_results: Literal[False] = False, + format: None = None, model_options: dict | None = None, tool_calls: bool = False, await_result: Literal[False] = False, @@ -909,7 +1043,7 @@ async def ainstruct( model_options: dict | None = None, tool_calls: bool = False, await_result: bool = False, - ) -> ModelOutputThunk[str] | SamplingResult[str]: + ) -> ModelOutputThunk[Any] | SamplingResult[Any]: """Generates from an instruction. 
Args: diff --git a/test/typing/check_functional_aact.py b/test/typing/check_functional_aact.py index 008b824a6..f24d04943 100644 --- a/test/typing/check_functional_aact.py +++ b/test/typing/check_functional_aact.py @@ -2,6 +2,8 @@ from typing import assert_type, cast +from pydantic import BaseModel + from mellea.core import ( Backend, ComputedModelOutputThunk, @@ -18,6 +20,10 @@ action: Instruction = cast(Instruction, None) +class _M(BaseModel): + value: str + + async def check_computed_await() -> None: r = await aact(action, ctx, backend, strategy=None, await_result=True) assert_type(r, tuple[ComputedModelOutputThunk[str], Context]) @@ -37,3 +43,19 @@ async def check_uncomputed() -> None: async def check_sampling() -> None: r = await aact(action, ctx, backend, return_sampling_results=True) assert_type(r, SamplingResult[str]) + + +async def check_format_computed_await() -> None: + r = await aact(action, ctx, backend, strategy=None, await_result=True, format=_M) + assert_type(r, tuple[ComputedModelOutputThunk[_M], Context]) + + +async def check_format_computed_strategy() -> None: + strat = RejectionSamplingStrategy(loop_budget=2) + r = await aact(action, ctx, backend, strategy=strat, format=_M) + assert_type(r, tuple[ComputedModelOutputThunk[_M], Context]) + + +async def check_format_uncomputed() -> None: + r = await aact(action, ctx, backend, strategy=None, format=_M) + assert_type(r, tuple[ModelOutputThunk[_M], Context]) diff --git a/test/typing/check_functional_ainstruct.py b/test/typing/check_functional_ainstruct.py index fe1e113f3..c3c3947a9 100644 --- a/test/typing/check_functional_ainstruct.py +++ b/test/typing/check_functional_ainstruct.py @@ -2,6 +2,8 @@ from typing import assert_type, cast +from pydantic import BaseModel + from mellea.core import ( Backend, ComputedModelOutputThunk, @@ -10,11 +12,16 @@ SamplingResult, ) from mellea.stdlib.functional import ainstruct +from mellea.stdlib.sampling import RejectionSamplingStrategy ctx = cast(Context, None) 
backend = cast(Backend, None) +class _M(BaseModel): + value: str + + async def check_computed() -> None: r = await ainstruct("test", ctx, backend, strategy=None, await_result=True) assert_type(r, tuple[ComputedModelOutputThunk[str], Context]) @@ -28,3 +35,21 @@ async def check_uncomputed() -> None: async def check_sampling() -> None: r = await ainstruct("test", ctx, backend, return_sampling_results=True) assert_type(r, SamplingResult[str]) + + +async def check_format_computed_await() -> None: + r = await ainstruct( + "test", ctx, backend, strategy=None, await_result=True, format=_M + ) + assert_type(r, tuple[ComputedModelOutputThunk[_M], Context]) + + +async def check_format_computed_strategy() -> None: + strat = RejectionSamplingStrategy(loop_budget=2) + r = await ainstruct("test", ctx, backend, strategy=strat, format=_M) + assert_type(r, tuple[ComputedModelOutputThunk[_M], Context]) + + +async def check_format_uncomputed() -> None: + r = await ainstruct("test", ctx, backend, strategy=None, format=_M) + assert_type(r, tuple[ModelOutputThunk[_M], Context]) diff --git a/test/typing/check_functional_sync.py b/test/typing/check_functional_sync.py index 494edb15a..5866d6ce8 100644 --- a/test/typing/check_functional_sync.py +++ b/test/typing/check_functional_sync.py @@ -2,6 +2,8 @@ from typing import assert_type, cast +from pydantic import BaseModel + from mellea.core import Backend, ComputedModelOutputThunk, Context from mellea.stdlib.components import Instruction from mellea.stdlib.functional import act, instruct @@ -13,6 +15,10 @@ s = cast(MelleaSession, None) +class _M(BaseModel): + value: str + + def check_act_sync() -> None: r = act(action, ctx, backend) assert_type(r, tuple[ComputedModelOutputThunk[str], Context]) @@ -31,3 +37,13 @@ def check_session_act_sync() -> None: def check_session_instruct_sync() -> None: r = s.instruct("test") assert_type(r, ComputedModelOutputThunk[str]) + + +def check_act_format() -> None: + r = act(action, ctx, backend, format=_M) + 
assert_type(r, tuple[ComputedModelOutputThunk[_M], Context]) + + +def check_instruct_format() -> None: + r = instruct("test", ctx, backend, format=_M) + assert_type(r, tuple[ComputedModelOutputThunk[_M], Context]) diff --git a/test/typing/check_session.py b/test/typing/check_session.py index 81db1ff2e..9ea11bf76 100644 --- a/test/typing/check_session.py +++ b/test/typing/check_session.py @@ -2,6 +2,8 @@ from typing import Any, assert_type, cast +from pydantic import BaseModel + from mellea.core import ComputedModelOutputThunk, ModelOutputThunk, SamplingResult from mellea.stdlib.components import Instruction from mellea.stdlib.session import MelleaSession @@ -10,6 +12,10 @@ action: Instruction = cast(Instruction, None) +class _M(BaseModel): + value: str + + async def check_aact_computed() -> None: r = await s.aact(action, strategy=None, await_result=True) assert_type(r, ComputedModelOutputThunk[str]) @@ -53,3 +59,33 @@ async def check_aquery_uncomputed() -> None: def check_query_sync() -> None: r = s.query("obj", "q") assert_type(r, ComputedModelOutputThunk[Any]) + + +def check_act_format() -> None: + r = s.act(action, format=_M) + assert_type(r, ComputedModelOutputThunk[_M]) + + +def check_instruct_format() -> None: + r = s.instruct("test", format=_M) + assert_type(r, ComputedModelOutputThunk[_M]) + + +async def check_aact_format_computed() -> None: + r = await s.aact(action, strategy=None, await_result=True, format=_M) + assert_type(r, ComputedModelOutputThunk[_M]) + + +async def check_aact_format_uncomputed() -> None: + r = await s.aact(action, strategy=None, format=_M) + assert_type(r, ModelOutputThunk[_M]) + + +async def check_ainstruct_format_computed() -> None: + r = await s.ainstruct("test", strategy=None, await_result=True, format=_M) + assert_type(r, ComputedModelOutputThunk[_M]) + + +async def check_ainstruct_format_uncomputed() -> None: + r = await s.ainstruct("test", strategy=None, format=_M) + assert_type(r, ModelOutputThunk[_M]) From 
0f77a9fc57c5ffda3969b28aeb2480410e8dd064 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Wed, 17 Jun 2026 12:15:10 +0100 Subject: [PATCH 03/18] fix(thunk): propagate _format through __copy__/__deepcopy__; add Raises: to parsed - Add `_format = self._format` to `__copy__` and `__deepcopy__` so that copying a ComputedModelOutputThunk preserves the format type; previously a copied thunk would silently return None from .parsed even when the original had a format set. - Add `Raises: pydantic.ValidationError` to the `parsed` property docstring to document the exception callers must handle when the model returns malformed structured output. Assisted-by: Claude Code Signed-off-by: Nigel Jones --- mellea/core/base.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mellea/core/base.py b/mellea/core/base.py index 06dee95c7..e9d84fdc0 100644 --- a/mellea/core/base.py +++ b/mellea/core/base.py @@ -785,6 +785,7 @@ def __copy__(self) -> ModelOutputThunk: copied._action = self._action copied._context = self._context copied._generate_log = self._generate_log + copied._format = self._format copied._model_options = self._model_options copied.generation = copy(self.generation) return copied @@ -819,6 +820,7 @@ def __deepcopy__(self, memo: dict) -> ModelOutputThunk: self._context ) # The items in a context should be immutable. deepcopied._generate_log = copy(self._generate_log) + deepcopied._format = self._format deepcopied._model_options = copy(self._model_options) deepcopied.generation = deepcopy(self.generation) return deepcopied @@ -904,6 +906,10 @@ def parsed(self) -> pydantic.BaseModel | None: Returns: A ``pydantic.BaseModel`` instance produced by ``model_validate_json``, or ``None`` if no format type was set. + + Raises: + pydantic.ValidationError: If the raw JSON value does not conform to + the format model (e.g. the model returned malformed structured output). 
""" if self._format is None: return None From 83295e9d8b14dea5a0dc073fe5693cea040b6cdf Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Wed, 17 Jun 2026 12:20:26 +0100 Subject: [PATCH 04/18] nit(thunk): soften parsed docstring example comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit "no manual model_validate_json needed" is more accurate than "MyModel instance, no cast needed" — .parsed returns BaseModel | None, so static type narrowing still requires a cast; the value is just already deserialized. Assisted-by: Claude Code Signed-off-by: Nigel Jones --- mellea/core/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mellea/core/base.py b/mellea/core/base.py index e9d84fdc0..54c8c1919 100644 --- a/mellea/core/base.py +++ b/mellea/core/base.py @@ -901,7 +901,7 @@ def parsed(self) -> pydantic.BaseModel | None: manually:: result = m.act(Instruction("Say yes or no"), format=MyModel) - obj = result.parsed # MyModel instance, no cast needed + obj = result.parsed # no manual model_validate_json needed Returns: A ``pydantic.BaseModel`` instance produced by ``model_validate_json``, From 8e4af00bcb4306f56bfcbeb7f0ddde4ac3156803 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Wed, 17 Jun 2026 12:30:39 +0100 Subject: [PATCH 05/18] docs(types): annotate type: ignore sites with rationale Add brief inline comments to the three type: ignore additions introduced by the format= overload threading, explaining why each ignore is intentional rather than masking a real issue. 
Assisted-by: Claude Code Signed-off-by: Nigel Jones --- mellea/stdlib/components/genstub.py | 8 ++++---- mellea/stdlib/frameworks/react.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mellea/stdlib/components/genstub.py b/mellea/stdlib/components/genstub.py index ab10cdd82..632696ab8 100644 --- a/mellea/stdlib/components/genstub.py +++ b/mellea/stdlib/components/genstub.py @@ -654,9 +654,9 @@ def __call__(self, *args, **kwargs) -> tuple[R, Context] | R: assert response.parsed_repr is not None if context is None: - return response.parsed_repr # type: ignore[return-value] + return response.parsed_repr # type: ignore[return-value] # genstub unwraps R from FunctionResponse[R]; format overloads can't re-bind R here else: - return response.parsed_repr, context # type: ignore[return-value] + return response.parsed_repr, context # type: ignore[return-value] # same class AsyncGenerativeStub(GenerativeStub, Generic[P, R]): @@ -797,9 +797,9 @@ async def __async_call__() -> tuple[R, Context] | R: ) assert response.parsed_repr is not None if context is None: - return response.parsed_repr # type: ignore[return-value] + return response.parsed_repr # type: ignore[return-value] # genstub unwraps R from FunctionResponse[R]; format overloads can't re-bind R here else: - return response.parsed_repr, context # type: ignore[return-value] + return response.parsed_repr, context # type: ignore[return-value] # same return __async_call__() diff --git a/mellea/stdlib/frameworks/react.py b/mellea/stdlib/frameworks/react.py index 156c03e8a..271abcceb 100644 --- a/mellea/stdlib/frameworks/react.py +++ b/mellea/stdlib/frameworks/react.py @@ -111,7 +111,7 @@ async def react( assert len(tool_responses) == 1, "multiple tools were called with 'final'" if format is not None: - step, next_context = await mfuncs.aact( # type: ignore[assignment] + step, next_context = await mfuncs.aact( # type: ignore[assignment] # dynamic format from caller action=ReactThought(), context=context, 
backend=backend, From 0c3498a83685a282fa08527d4db94de28502daac Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Mon, 22 Jun 2026 11:45:14 +0100 Subject: [PATCH 06/18] fix(thunk): wire _format in HF chat post_processing; add missing tests The HuggingFace chat-path post_processing never assigned mot._format, so .parsed always returned None when format= was set via LocalHFBackend. All other backends (ollama, openai, litellm, watsonx) already set it. Also adds: - Copy/deepcopy unit tests verifying _format is preserved across copies - E2e tests in test_ollama and test_huggingface asserting .parsed returns a typed Pydantic instance end-to-end through each backend - Docstring note on ComputedModelOutputThunk.parsed warning custom-backend authors to set mot._format in their post_processing method Assisted-by: Claude Code Signed-off-by: Nigel Jones --- mellea/backends/huggingface.py | 1 + mellea/core/base.py | 7 +++++++ test/backends/test_huggingface.py | 18 ++++++++++++++++++ test/backends/test_ollama.py | 18 ++++++++++++++++++ test/core/test_base.py | 18 ++++++++++++++++++ 5 files changed, 62 insertions(+) diff --git a/mellea/backends/huggingface.py b/mellea/backends/huggingface.py index 9050dce9a..bea2e8b75 100644 --- a/mellea/backends/huggingface.py +++ b/mellea/backends/huggingface.py @@ -1378,6 +1378,7 @@ class used during generation, if any. generate_log.result = mot mot._generate_log = generate_log + mot._format = _format async def _generate_from_raw( self, diff --git a/mellea/core/base.py b/mellea/core/base.py index 54c8c1919..db694257c 100644 --- a/mellea/core/base.py +++ b/mellea/core/base.py @@ -903,6 +903,13 @@ def parsed(self) -> pydantic.BaseModel | None: result = m.act(Instruction("Say yes or no"), format=MyModel) obj = result.parsed # no manual model_validate_json needed + Note: + This property relies on the originating backend storing the format + type on the thunk. 
Custom backend authors must set ``mot._format`` + in their ``post_processing`` method (mirroring the built-in + backends); otherwise ``.parsed`` always returns ``None`` even when + ``format=`` was supplied. + Returns: A ``pydantic.BaseModel`` instance produced by ``model_validate_json``, or ``None`` if no format type was set. diff --git a/test/backends/test_huggingface.py b/test/backends/test_huggingface.py index c0d452ded..3c2d6d9b9 100644 --- a/test/backends/test_huggingface.py +++ b/test/backends/test_huggingface.py @@ -252,6 +252,24 @@ class Email(pydantic.BaseModel): ) +@pytest.mark.qualitative +def test_parsed_returns_pydantic_instance(session) -> None: + class Sentiment(pydantic.BaseModel): + label: str + + output = session.instruct( + "Classify the sentiment of 'I love this!' as the single word " + "positive, negative, or neutral. Respond with a label field.", + format=Sentiment, + model_options={ModelOption.MAX_NEW_TOKENS: 2**8}, + ) + + parsed = output.parsed + assert isinstance(parsed, Sentiment) + assert isinstance(parsed.label, str) and parsed.label + assert parsed == Sentiment.model_validate_json(output.value) + + @pytest.mark.qualitative async def test_generate_from_raw(session) -> None: prompts = [ diff --git a/test/backends/test_ollama.py b/test/backends/test_ollama.py index b2aa7f249..890cb355f 100644 --- a/test/backends/test_ollama.py +++ b/test/backends/test_ollama.py @@ -127,6 +127,24 @@ class Email(pydantic.BaseModel): # assert email.to.email_address.endswith("example.com") +@pytest.mark.qualitative +def test_parsed_returns_pydantic_instance(session) -> None: + class Sentiment(pydantic.BaseModel): + label: str + + output = session.instruct( + "Classify the sentiment of 'I love this!' as the single word " + "positive, negative, or neutral. 
Respond with a label field.", + format=Sentiment, + model_options={ModelOption.MAX_NEW_TOKENS: 2**8}, + ) + + parsed = output.parsed + assert isinstance(parsed, Sentiment) + assert isinstance(parsed.label, str) and parsed.label + assert parsed == Sentiment.model_validate_json(output.value) + + @pytest.mark.qualitative @pytest.mark.timeout(150) async def test_generate_from_raw(session) -> None: diff --git a/test/core/test_base.py b/test/core/test_base.py index 3e0e0b821..894912bbd 100644 --- a/test/core/test_base.py +++ b/test/core/test_base.py @@ -363,3 +363,21 @@ def test_value_unaffected_by_format() -> None: raw = '{"label": "ok"}' result = _make_computed(raw, _Label) assert result.value == raw + + +def test_format_preserved_by_copy() -> None: + import copy as _copy + + result = _make_computed('{"label": "yes"}', _Label) + shallow = _copy.copy(result) + assert shallow._format is _Label + assert shallow._format.model_validate_json(shallow.value).label == "yes" # type: ignore[union-attr] + + +def test_format_preserved_by_deepcopy() -> None: + import copy as _copy + + result = _make_computed('{"label": "yes"}', _Label) + deep = _copy.deepcopy(result) + assert deep._format is _Label + assert deep._format.model_validate_json(deep.value).label == "yes" # type: ignore[union-attr] From 4857a321fc9b118b6a54970ddf63a3f86e3e943f Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Mon, 22 Jun 2026 11:46:26 +0100 Subject: [PATCH 07/18] nit(test): explain subclass-loss caveat in copy/deepcopy _format tests Assisted-by: Claude Code Signed-off-by: Nigel Jones --- test/core/test_base.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/core/test_base.py b/test/core/test_base.py index 894912bbd..a3700ed07 100644 --- a/test/core/test_base.py +++ b/test/core/test_base.py @@ -371,6 +371,8 @@ def test_format_preserved_by_copy() -> None: result = _make_computed('{"label": "yes"}', _Label) shallow = _copy.copy(result) assert shallow._format is _Label + # __copy__ returns 
ModelOutputThunk (loses ComputedModelOutputThunk subclass due to + # zero-copy __class__ reassignment), so we validate manually rather than via .parsed. assert shallow._format.model_validate_json(shallow.value).label == "yes" # type: ignore[union-attr] @@ -380,4 +382,5 @@ def test_format_preserved_by_deepcopy() -> None: result = _make_computed('{"label": "yes"}', _Label) deep = _copy.deepcopy(result) assert deep._format is _Label + # Same subclass-loss caveat as test_format_preserved_by_copy. assert deep._format.model_validate_json(deep.value).label == "yes" # type: ignore[union-attr] From 5aa6921db16e4d348bf5d06eb7901f29db51af0e Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Mon, 22 Jun 2026 13:47:43 +0100 Subject: [PATCH 08/18] fix(types): make ComputedModelOutputThunk.parsed generic over the format type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `.parsed` previously returned `pydantic.BaseModel | None`, so callers still needed `cast(MyModel, result.parsed)` for static narrowing — the gap @ajbozarth flagged on PR #1282. Thread the format type through the thunk's existing type parameter `S`: `_format` is now `type[S] | None` and `.parsed` returns `S | None`. Reusing `S` (rather than a second TypeVar) composes with the companion `format=` overloads on #1274, which bind `S` to the supplied model so `m.act(action, format=MyModel)` yields `ComputedModelOutputThunk[MyModel]` and `.parsed` is typed `MyModel | None`. The `.parsed` body narrows `_format` to a pydantic type to call `model_validate_json`, then re-asserts the result as `S` — `S` is unbounded (it is `str` for plain instructions) so neither cast can be elided. Add `test/typing/check_parsed.py` asserting `.parsed` tracks the type parameter for both a model-parameterized and a `str` thunk. 
Assisted-by: Claude Code Signed-off-by: Nigel Jones --- mellea/core/base.py | 24 ++++++++++++++++-------- test/typing/check_parsed.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 8 deletions(-) create mode 100644 test/typing/check_parsed.py diff --git a/mellea/core/base.py b/mellea/core/base.py index db694257c..d54cc2dfc 100644 --- a/mellea/core/base.py +++ b/mellea/core/base.py @@ -29,6 +29,7 @@ ParamSpec, Protocol, TypeVar, + cast, runtime_checkable, ) @@ -402,7 +403,7 @@ def __init__( # Mellea-side hook correlation ID; distinct from the provider-assigned # `GenerationMetadata.response_id`. self._generation_id: str | None = None - self._format: type[pydantic.BaseModel] | None = None + self._format: type[S] | None = None def _record_ttfb(self) -> None: """Record time-to-first-byte if streaming and not yet recorded.""" @@ -893,15 +894,18 @@ def value(self, v: str): self._underlying_value = v @property - def parsed(self) -> pydantic.BaseModel | None: + def parsed(self) -> S | None: """Returns the result as a validated Pydantic instance when ``format=`` was set. - Returns ``None`` when no ``format=`` type was provided to the originating - ``act()`` / ``instruct()`` call. Use this instead of casting ``.value`` + The return type tracks the format type supplied at the originating call + site: ``m.act(action, format=MyModel)`` yields a + ``ComputedModelOutputThunk[MyModel]`` whose ``.parsed`` is typed + ``MyModel | None`` — no ``cast()`` required. Returns ``None`` when no + ``format=`` type was provided. Use this instead of casting ``.value`` manually:: result = m.act(Instruction("Say yes or no"), format=MyModel) - obj = result.parsed # no manual model_validate_json needed + obj = result.parsed # typed MyModel | None, no model_validate_json needed Note: This property relies on the originating backend storing the format @@ -911,8 +915,8 @@ def parsed(self) -> pydantic.BaseModel | None: ``format=`` was supplied. 
Returns: - A ``pydantic.BaseModel`` instance produced by ``model_validate_json``, - or ``None`` if no format type was set. + An instance of the format type (``S``) produced by + ``model_validate_json``, or ``None`` if no format type was set. Raises: pydantic.ValidationError: If the raw JSON value does not conform to @@ -920,7 +924,11 @@ def parsed(self) -> pydantic.BaseModel | None: """ if self._format is None: return None - return self._format.model_validate_json(self.value) + # `_format` is a pydantic model type in every code path that sets it (the + # `format=` overloads bind `S` to that model), but `S` itself is unbounded, + # so we narrow to call `model_validate_json` and re-assert the result as `S`. + fmt = cast("type[pydantic.BaseModel]", self._format) + return cast("S", fmt.model_validate_json(self.value)) def is_computed(self) -> Literal[True]: """Returns `True` since thunk is always computed. diff --git a/test/typing/check_parsed.py b/test/typing/check_parsed.py new file mode 100644 index 000000000..2171f82b0 --- /dev/null +++ b/test/typing/check_parsed.py @@ -0,0 +1,28 @@ +"""Mypy checks that `ComputedModelOutputThunk.parsed` tracks the type parameter. + +`.parsed` is typed `S | None`, so a thunk parameterized with a Pydantic model +(`ComputedModelOutputThunk[MyModel]`) exposes `.parsed` as `MyModel | None` — +callers need no `cast()`. The `format=` overloads in `session.py` / +`functional.py` bind `S` to the format model (companion issue #1274), at which +point these checks hold end-to-end from the call site. 
+""" + +from typing import assert_type, cast + +import pydantic + +from mellea.core import ComputedModelOutputThunk + + +class _Person(pydantic.BaseModel): + name: str + + +def check_parsed_tracks_format_model() -> None: + thunk = cast(ComputedModelOutputThunk[_Person], None) + assert_type(thunk.parsed, _Person | None) + + +def check_parsed_is_str_for_str_thunk() -> None: + thunk = cast(ComputedModelOutputThunk[str], None) + assert_type(thunk.parsed, str | None) From 6118cea52e4494546dcc52ded0aa92b5465b1309 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Mon, 22 Jun 2026 13:58:13 +0100 Subject: [PATCH 09/18] fix(types): use concrete _format type and non-string cast for pyright compat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three pyright-compatibility fixes for the `.parsed` property added in bb53ddba: 1. `_format` annotation: revert from `type[S] | None` to `type[pydantic.BaseModel] | None`. Using a covariant TypeVar (`S`) in the invariant `type[...]` position is semantically unsound and can confuse stricter pyright configurations. The field only ever holds pydantic model types at runtime; the concrete annotation is accurate and avoids the variance issue entirely. 2. `.parsed` body: replace the two-step string-quoted cast (`cast("type[pydantic.BaseModel]", …)` then `cast("S", …)`) with a single direct cast (`cast(S, self._format.model_validate_json(self.value))`). Pyright resolves TypeVar forward-references in cast strings differently across versions; using the TypeVar directly is unambiguous. 3. `check_parsed.py`: use `cast(X, cast(object, None))` instead of `cast(X, None)` to avoid basedpyright's `reportInvalidCast` diagnostic (None and X share no overlap); assign `assert_type(…)` results to `_` to silence `reportUnusedCallResult`. All three checkers (mypy, pyright 1.1.408+, basedpyright) now report clean on both changed files.
Assisted-by: Claude Code Signed-off-by: Nigel Jones --- mellea/core/base.py | 11 +++++------ test/typing/check_parsed.py | 14 +++++++++----- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/mellea/core/base.py b/mellea/core/base.py index d54cc2dfc..6e38b792f 100644 --- a/mellea/core/base.py +++ b/mellea/core/base.py @@ -403,7 +403,7 @@ def __init__( # Mellea-side hook correlation ID; distinct from the provider-assigned # `GenerationMetadata.response_id`. self._generation_id: str | None = None - self._format: type[S] | None = None + self._format: type[pydantic.BaseModel] | None = None def _record_ttfb(self) -> None: """Record time-to-first-byte if streaming and not yet recorded.""" @@ -924,11 +924,10 @@ def parsed(self) -> S | None: """ if self._format is None: return None - # `_format` is a pydantic model type in every code path that sets it (the - # `format=` overloads bind `S` to that model), but `S` itself is unbounded, - # so we narrow to call `model_validate_json` and re-assert the result as `S`. - fmt = cast("type[pydantic.BaseModel]", self._format) - return cast("S", fmt.model_validate_json(self.value)) + # `_format` is always a pydantic model type; `model_validate_json` returns + # `pydantic.BaseModel` statically, but the caller's type parameter `S` is + # the concrete model when `format=` was used, so we cast the result to `S`. + return cast(S, self._format.model_validate_json(self.value)) def is_computed(self) -> Literal[True]: """Returns `True` since thunk is always computed. diff --git a/test/typing/check_parsed.py b/test/typing/check_parsed.py index 2171f82b0..7fd521fe0 100644 --- a/test/typing/check_parsed.py +++ b/test/typing/check_parsed.py @@ -1,10 +1,14 @@ -"""Mypy checks that `ComputedModelOutputThunk.parsed` tracks the type parameter. +"""Mypy / pyright checks that `ComputedModelOutputThunk.parsed` tracks the type parameter. 
`.parsed` is typed `S | None`, so a thunk parameterized with a Pydantic model (`ComputedModelOutputThunk[MyModel]`) exposes `.parsed` as `MyModel | None` — callers need no `cast()`. The `format=` overloads in `session.py` / `functional.py` bind `S` to the format model (companion issue #1274), at which point these checks hold end-to-end from the call site. + +The `cast(X, cast(object, None))` idiom creates a typed stub value without +triggering basedpyright's ``reportInvalidCast`` rule (direct ``cast(X, None)`` +raises that diagnostic because ``None`` and ``X`` share no overlap). """ from typing import assert_type, cast @@ -19,10 +23,10 @@ class _Person(pydantic.BaseModel): def check_parsed_tracks_format_model() -> None: - thunk = cast(ComputedModelOutputThunk[_Person], None) - assert_type(thunk.parsed, _Person | None) + thunk = cast(ComputedModelOutputThunk[_Person], cast(object, None)) + _ = assert_type(thunk.parsed, _Person | None) def check_parsed_is_str_for_str_thunk() -> None: - thunk = cast(ComputedModelOutputThunk[str], None) - assert_type(thunk.parsed, str | None) + thunk = cast(ComputedModelOutputThunk[str], cast(object, None)) + _ = assert_type(thunk.parsed, str | None) From 87b4b2f404fa1a679c5fad3e3bf2280722fddc20 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Mon, 22 Jun 2026 15:29:30 +0100 Subject: [PATCH 10/18] docs(thunk): tighten .parsed and ComputedModelOutputThunk.value docstrings - ComputedModelOutputThunk.value now carries the same raw-JSON guidance as the parent override so callers inspecting the subclass see it directly. - .parsed opening paragraph no longer overstates current type inference: the format= overloads do not yet bind S to the format model, so the cast idiom is required; removed the false claim that m.act(format=MyModel) yields a typed thunk without a cast. - Added one-line distinction from parsed_repr to prevent confusion between the two properties. 
Signed-off-by: Nigel Jones Assisted-by: Claude Code Signed-off-by: Nigel Jones --- mellea/core/base.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/mellea/core/base.py b/mellea/core/base.py index 6e38b792f..c135c60f1 100644 --- a/mellea/core/base.py +++ b/mellea/core/base.py @@ -885,7 +885,12 @@ async def astream(self) -> str: @property def value(self) -> str: - """Gets the value of the block.""" + """Gets the raw string value of the block. + + When ``format=`` is set on the originating ``act()``/``instruct()`` call, the + model returns a JSON string and ``.value`` contains that raw JSON — not a + Pydantic instance. Use ``.parsed`` to get the validated model object. + """ return self._underlying_value # type: ignore @value.setter @@ -897,15 +902,17 @@ def value(self, v: str): def parsed(self) -> S | None: """Returns the result as a validated Pydantic instance when ``format=`` was set. - The return type tracks the format type supplied at the originating call - site: ``m.act(action, format=MyModel)`` yields a - ``ComputedModelOutputThunk[MyModel]`` whose ``.parsed`` is typed - ``MyModel | None`` — no ``cast()`` required. Returns ``None`` when no - ``format=`` type was provided. Use this instead of casting ``.value`` - manually:: + The return type is ``S | None``, where ``S`` is the thunk's type parameter. + The ``format=`` overloads do not yet bind ``S`` to the format model, so + callers must parameterize the thunk explicitly to get a narrowed type:: + + thunk = cast(ComputedModelOutputThunk[MyModel], result) + obj = thunk.parsed # typed MyModel | None, no model_validate_json needed - result = m.act(Instruction("Say yes or no"), format=MyModel) - obj = result.parsed # typed MyModel | None, no model_validate_json needed + Returns ``None`` when no ``format=`` type was provided. 
Unlike + ``parsed_repr`` (which holds the action-specific parse result), + ``.parsed`` always re-validates the raw JSON string against ``_format`` + via ``model_validate_json``. Note: This property relies on the originating backend storing the format From 740293ff6c00779a27721e470893759527512a58 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Mon, 22 Jun 2026 12:56:50 +0100 Subject: [PATCH 11/18] docs(types): expand genstub return-value ignore rationale Explain why the format= overloads can't narrow the genstub return type: the overloads narrow the thunk's element type, but a genstub returns the unwrapped inner value R, not the thunk, and parsed_repr (S | None) can't be re-bound to R at this boundary. Add a TODO pointing at the clean shape (ComputedModelOutputThunk[R] with the FunctionResponse[R] unwrap in a typed parse step), noting it depends on the thunk-generics redesign out of scope here. Assisted-by: Claude Code Signed-off-by: Nigel Jones --- mellea/stdlib/components/genstub.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/mellea/stdlib/components/genstub.py b/mellea/stdlib/components/genstub.py index 632696ab8..5a3877e8f 100644 --- a/mellea/stdlib/components/genstub.py +++ b/mellea/stdlib/components/genstub.py @@ -653,10 +653,19 @@ def __call__(self, *args, **kwargs) -> tuple[R, Context] | R: ) assert response.parsed_repr is not None + # The format= overloads on act/aact narrow the *thunk's* element type, but a + # genstub must return the inner value R (the unwrapped FunctionResponse[R] + # payload), not the thunk. `parsed_repr` is typed `S | None` at the call + # site and cannot be re-bound to R here, so the ignore bridges the gap. + # + # TODO: the clean shape is for act/aact to deliver a ComputedModelOutputThunk[R] + # whose value is R, with the FunctionResponse[R] unwrap happening inside a typed + # parse step rather than at the genstub boundary. 
That requires coordinating the + # thunk-generics redesign (see the .parsed work) and is outside this PR's scope. if context is None: - return response.parsed_repr # type: ignore[return-value] # genstub unwraps R from FunctionResponse[R]; format overloads can't re-bind R here + return response.parsed_repr # type: ignore[return-value] else: - return response.parsed_repr, context # type: ignore[return-value] # same + return response.parsed_repr, context # type: ignore[return-value] class AsyncGenerativeStub(GenerativeStub, Generic[P, R]): @@ -796,10 +805,16 @@ async def __async_call__() -> tuple[R, Context] | R: "unexpectedly received uncomputed model output thunk in async generative stub" ) assert response.parsed_repr is not None + # See the SyncGenerativeStub.__call__ comment above: the format= overloads + # narrow the thunk's element type, but a genstub returns the unwrapped inner + # value R, not the thunk. `parsed_repr` is `S | None` and can't be re-bound to + # R here. The clean fix (ComputedModelOutputThunk[R] with the FunctionResponse[R] + # unwrap in a typed parse step) needs the thunk-generics redesign and is out of + # scope for this PR. 
if context is None: - return response.parsed_repr # type: ignore[return-value] # genstub unwraps R from FunctionResponse[R]; format overloads can't re-bind R here + return response.parsed_repr # type: ignore[return-value] else: - return response.parsed_repr, context # type: ignore[return-value] # same + return response.parsed_repr, context # type: ignore[return-value] return __async_call__() From cb2bbf1e583c1dc14093c239775bf4b6d9983dd5 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Mon, 22 Jun 2026 12:57:23 +0100 Subject: [PATCH 12/18] test(types): assert attribute-level narrowing for act format= overload MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The existing check only confirmed the overload resolved to ComputedModelOutputThunk[_M]; it did not pin how the attributes are typed. Add check_act_format_attributes asserting parsed_repr narrows to `_M | None` (what the overloads actually narrow) and documenting that `.value` stays unconditionally `str` — the known limitation pending the coordinated thunk-generics / `.parsed` redesign. Locks in what IS narrowed and calls out what isn't. Assisted-by: Claude Code Signed-off-by: Nigel Jones --- test/typing/check_session.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/test/typing/check_session.py b/test/typing/check_session.py index 9ea11bf76..0f3748993 100644 --- a/test/typing/check_session.py +++ b/test/typing/check_session.py @@ -66,6 +66,25 @@ def check_act_format() -> None: assert_type(r, ComputedModelOutputThunk[_M]) +def check_act_format_attributes() -> None: + # Locks in what the format= overloads actually narrow at the attribute level. + r = s.act(action, format=_M) + + # `parsed_repr` is the attribute the overloads narrow: it carries the generic + # element type S, so with format=_M it resolves to `_M | None`.
+ assert_type(r.parsed_repr, _M | None) + + # KNOWN LIMITATION: `.value` is typed `-> str` unconditionally on + # ComputedModelOutputThunk (see mellea/core/base.py), so it does NOT narrow to + # `_M` even though the thunk is parameterised `[_M]`. At runtime `.value` is the + # raw string and `.parsed_repr` is also a plain str (Instruction._parse returns + # str), so `parsed_repr.value` type-checks but AttributeErrors. Asserting + # `assert_type(r.value, _M)` here would (correctly) fail mypy. Both the static + # `.value` type and the runtime parsed_repr mismatch are pending the coordinated + # thunk-generics / `.parsed` redesign (PR #1282). + assert_type(r.value, str) + + def check_instruct_format() -> None: r = s.instruct("test", format=_M) assert_type(r, ComputedModelOutputThunk[_M]) From d055522b50f15d829f8f71139a549642fd189b34 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Mon, 22 Jun 2026 12:57:58 +0100 Subject: [PATCH 13/18] docs(types): explain intentional Any widening on act implementation Document that the act() implementation return type is widened to `Any` on purpose: the @overload signatures own the precise S propagation and the format= -> BaseModelSubclass narrowing, and tightening the body to the bare `[S]` case would conflict with the format= and sampling overloads. Callers always resolve against an overload, never the implementation body. 
Assisted-by: Claude Code Signed-off-by: Nigel Jones --- mellea/stdlib/functional.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mellea/stdlib/functional.py b/mellea/stdlib/functional.py index 73050603d..603484630 100644 --- a/mellea/stdlib/functional.py +++ b/mellea/stdlib/functional.py @@ -101,6 +101,11 @@ def act( format: type[BaseModelSubclass] | None = None, model_options: dict | None = None, tool_calls: bool = False, + # Implementation return type intentionally widened to `Any`: the @overload signatures + # above own the precise S propagation (action's S -> thunk's S, plus the + # format= -> BaseModelSubclass narrowing). Tightening the body to the bare `[S]` case + # would conflict with the format= and sampling overloads, so it is left untyped on + # purpose. Callers always resolve against an overload, never this implementation body. ) -> tuple[ComputedModelOutputThunk[Any], Context] | SamplingResult[Any]: """Runs a generic action, and adds both the action and the result to the context. From 91b9f7cff795708d9ef2f178e76594a0b7a921e0 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Mon, 22 Jun 2026 12:58:51 +0100 Subject: [PATCH 14/18] docs(types): document why format passthrough wrappers keep the ignore A third union overload `format: type[BaseModelSubclass] | None` cannot narrow cleanly: it overlaps the existing `format=None` overload, so the return type collapses to the union and the narrowing is lost. The clean fix for a wrapper forwarding a dynamic format is to branch on `format is None` so each call matches a narrow overload. Add comments at the react and m_serve passthrough sites explaining this and why the ignore is preferred over branching at those single call sites. 
Assisted-by: Claude Code Signed-off-by: Nigel Jones --- docs/examples/m_serve/m_serve_example_response_format.py | 5 +++++ mellea/stdlib/frameworks/react.py | 9 +++++++++ 2 files changed, 14 insertions(+) diff --git a/docs/examples/m_serve/m_serve_example_response_format.py b/docs/examples/m_serve/m_serve_example_response_format.py index 5a85d5094..667b3572e 100644 --- a/docs/examples/m_serve/m_serve_example_response_format.py +++ b/docs/examples/m_serve/m_serve_example_response_format.py @@ -46,6 +46,11 @@ def serve( # When format is provided (from json_schema response_format), # pass it to instruct() to get structured output + # `format` arrives as a dynamic `type | None` from the response_format request, so it + # matches no single narrow instruct() overload (those key off format=None vs a concrete + # type). A serving wrapper that wanted cast-free typing would branch on `format is None` + # and call instruct() in each branch; here the passthrough ignore keeps the example + # focused on the serving flow. result = session.instruct( description=message, requirements=requirements, # type: ignore diff --git a/mellea/stdlib/frameworks/react.py b/mellea/stdlib/frameworks/react.py index 271abcceb..bebf82338 100644 --- a/mellea/stdlib/frameworks/react.py +++ b/mellea/stdlib/frameworks/react.py @@ -111,6 +111,15 @@ async def react( assert len(tool_responses) == 1, "multiple tools were called with 'final'" if format is not None: + # `format` is a dynamic `type[BaseModelSubclass] | None` forwarded from + # the caller, which matches no single narrow aact() overload (those key + # off `format=None` vs `format=SomeModel` as distinct literals). We are + # already inside `if format is not None`, so the value is known non-None + # here, but mypy does not propagate that narrowing into the overload pick.
+ # The clean fix is for the caller to branch on `format is None` and call + # aact in each branch so each call matches a narrow overload; that is + # not worth the duplication for this single internal call site, so we + # accept the ignore. step, next_context = await mfuncs.aact( # type: ignore[assignment] # dynamic format from caller action=ReactThought(), context=context, From 8c478e2a485dab289596a2a951d0f9ce68b465da Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Mon, 22 Jun 2026 13:00:35 +0100 Subject: [PATCH 15/18] docs(types): note runtime/type mismatch on act format= overloads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The format= overloads narrow the thunk's generic element type, observable on parsed_repr (S | None), not on .value — ComputedModelOutputThunk.value is typed `-> str` unconditionally. parsed_repr also currently routes through Instruction._parse (returns str), so parsed_repr.some_field type-checks but AttributeErrors at runtime: the same silent-failure shape #1274 set out to fix, relocated to parsed_repr. Add a TODO pointing at the coordinated .parsed redesign (PR #1282) as the proper fix, out of scope here. PR body updated to match (was claiming .value narrows to MyModel). Assisted-by: Claude Code Signed-off-by: Nigel Jones --- mellea/stdlib/session.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mellea/stdlib/session.py b/mellea/stdlib/session.py index 4c01659c4..d1e3cd844 100644 --- a/mellea/stdlib/session.py +++ b/mellea/stdlib/session.py @@ -390,6 +390,14 @@ def cleanup(self) -> None: deregister_session_plugins(self.id) + # The format= overloads below narrow the thunk's generic element type. That narrowing + # is observable on `parsed_repr: S | None`, NOT on `.value` — ComputedModelOutputThunk.value + # is typed `-> str` unconditionally (mellea/core/base.py). 
There is also a runtime gap: + # parsed_repr currently goes through Instruction._parse, which returns a plain str, so + # `result.parsed_repr.some_field` type-checks but raises AttributeError at runtime. + # TODO: a coherent end state has the thunk's `.parsed` generic over S backed by a runtime + # path that delivers S. That is a coordinated change tracked by PR #1282 and is out of + # scope here; these overloads only land the static format= narrowing where they can. @overload def act( self, From d8534450557a5e3c5fd84c0f9854cf9b3c6d0f90 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Mon, 22 Jun 2026 13:19:07 +0100 Subject: [PATCH 16/18] fix(types): branch on format is None in m_serve example to remove type: ignore Serving wrapper now calls instruct() in two branches so each matches a narrow overload. The react.py site (already inside if format is not None) is a separate problem requiring broader restructuring; documented in-place. Assisted-by: Claude Code Signed-off-by: Nigel Jones --- .../m_serve_example_response_format.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/examples/m_serve/m_serve_example_response_format.py b/docs/examples/m_serve/m_serve_example_response_format.py index 667b3572e..fbb3453d5 100644 --- a/docs/examples/m_serve/m_serve_example_response_format.py +++ b/docs/examples/m_serve/m_serve_example_response_format.py @@ -44,18 +44,18 @@ def serve( """ message = input[-1].get_text_content() or "No message provided" - # When format is provided (from json_schema response_format), - # pass it to instruct() to get structured output - # `format` arrives as a dynamic `type | None` from the response_format request, so it - # matches no single narrow instruct() overload (those key off format=None vs a concrete - # type). 
A serving wrapper that wanted cast-free typing would branch on `format is None` - # and call instruct() in each branch; here the passthrough ignore keeps the example - # focused on the serving flow. - result = session.instruct( - description=message, - requirements=requirements, # type: ignore - model_options=model_options, - format=format, # type: ignore[arg-type] # dynamic format from caller - ) + if format is None: + result = session.instruct( + description=message, + requirements=requirements, # type: ignore + model_options=model_options, + ) + else: + result = session.instruct( + description=message, + requirements=requirements, # type: ignore + model_options=model_options, + format=format, + ) return result From c56d89d5ea0c31685a6d2de4cc9e8f1ccb414c3c Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Mon, 22 Jun 2026 13:22:12 +0100 Subject: [PATCH 17/18] fix(types): replace type: ignore[return-value] with cast in genstub GenerativeStub._parse already unwraps FunctionResponse[R] and returns R at runtime. The thunk types parsed_repr as S | None (S = FunctionResponse[R]) because the overloads narrow S to the format type, not R. Replace the return-value ignores with an explicit cast to make the coercion visible. 
Assisted-by: Claude Code Signed-off-by: Nigel Jones --- mellea/stdlib/components/genstub.py | 42 +++++++++++++++-------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/mellea/stdlib/components/genstub.py b/mellea/stdlib/components/genstub.py index 5a3877e8f..480ea2e85 100644 --- a/mellea/stdlib/components/genstub.py +++ b/mellea/stdlib/components/genstub.py @@ -6,7 +6,16 @@ from collections.abc import Awaitable, Callable, Coroutine from copy import deepcopy from dataclasses import dataclass, fields -from typing import Any, Generic, ParamSpec, TypedDict, TypeVar, get_type_hints, overload +from typing import ( + Any, + Generic, + ParamSpec, + TypedDict, + TypeVar, + cast, + get_type_hints, + overload, +) from pydantic import BaseModel, Field, create_model @@ -653,19 +662,15 @@ def __call__(self, *args, **kwargs) -> tuple[R, Context] | R: ) assert response.parsed_repr is not None - # The format= overloads on act/aact narrow the *thunk's* element type, but a - # genstub must return the inner value R (the unwrapped FunctionResponse[R] - # payload), not the thunk. `parsed_repr` is typed `S | None` at the call - # site and cannot be re-bound to R here, so the ignore bridges the gap. - # - # TODO: the clean shape is for act/aact to deliver a ComputedModelOutputThunk[R] - # whose value is R, with the FunctionResponse[R] unwrap happening inside a typed - # parse step rather than at the genstub boundary. That requires coordinating the - # thunk-generics redesign (see the .parsed work) and is outside this PR's scope. + # GenerativeStub._parse calls model_validate_json and returns the unwrapped R, + # so parsed_repr is R at runtime. The thunk types it as S | None (where + # S = FunctionResponse[R]) because the overloads narrow S to the format type, + # not to R. cast makes the coercion explicit rather than suppressing it. 
+ parsed = cast("R", response.parsed_repr) if context is None: - return response.parsed_repr # type: ignore[return-value] + return parsed else: - return response.parsed_repr, context # type: ignore[return-value] + return parsed, context class AsyncGenerativeStub(GenerativeStub, Generic[P, R]): @@ -805,16 +810,13 @@ async def __async_call__() -> tuple[R, Context] | R: "unexpectedly received uncomputed model output thunk in async generative stub" ) assert response.parsed_repr is not None - # See the SyncGenerativeStub.__call__ comment above: the format= overloads - # narrow the thunk's element type, but a genstub returns the unwrapped inner - # value R, not the thunk. `parsed_repr` is `S | None` and can't be re-bound to - # R here. The clean fix (ComputedModelOutputThunk[R] with the FunctionResponse[R] - # unwrap in a typed parse step) needs the thunk-generics redesign and is out of - # scope for this PR. + # Same as SyncGenerativeStub: _parse returns the unwrapped R at runtime; + # cast makes the S → R coercion explicit. + parsed = cast("R", response.parsed_repr) if context is None: - return response.parsed_repr # type: ignore[return-value] + return parsed else: - return response.parsed_repr, context # type: ignore[return-value] + return parsed, context return __async_call__() From c054f1c0150a3ad712436f22147e12f1d9cf2eb9 Mon Sep 17 00:00:00 2001 From: Nigel Jones Date: Tue, 23 Jun 2026 10:44:57 +0100 Subject: [PATCH 18/18] fix(types): widen instruct/ainstruct format param to type[Any] in implementations On Python 3.13, mypy 1.20 raises [misc] for overloaded-function implementations where a TypeVar (BaseModelSubclass) is only constrained through the `format` parameter and not anchored in any other parameter or return type. The act/aact implementations are unaffected because their TypeVar S is also present in `action: Component[S]`. 
The fix: replace `type[BaseModelSubclass] | None` with `type[Any] | None` in the four non-overload implementation signatures (functional.instruct, functional.ainstruct, MelleaSession.instruct, MelleaSession.ainstruct). The TypeVar narrowing still lives in the @overload signatures where it belongs; callers never resolve against the implementation body. Assisted-by: Claude Code Signed-off-by: Nigel Jones --- mellea/stdlib/functional.py | 6 ++++-- mellea/stdlib/session.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/mellea/stdlib/functional.py b/mellea/stdlib/functional.py index 603484630..f6ac03ab0 100644 --- a/mellea/stdlib/functional.py +++ b/mellea/stdlib/functional.py @@ -228,7 +228,8 @@ def instruct( output_prefix: str | CBlock | None = None, strategy: SamplingStrategy | None = RejectionSamplingStrategy(loop_budget=2), return_sampling_results: bool = False, - format: type[BaseModelSubclass] | None = None, + format: type[Any] + | None = None, # widened: TypeVar only needed in @overload signatures model_options: dict | None = None, tool_calls: bool = False, ) -> tuple[ComputedModelOutputThunk[Any], Context] | SamplingResult[Any]: @@ -1022,7 +1023,8 @@ async def ainstruct( output_prefix: str | CBlock | None = None, strategy: SamplingStrategy | None = RejectionSamplingStrategy(loop_budget=2), return_sampling_results: bool = False, - format: type[BaseModelSubclass] | None = None, + format: type[Any] + | None = None, # widened: TypeVar only needed in @overload signatures model_options: dict | None = None, tool_calls: bool = False, await_result: bool = False, diff --git a/mellea/stdlib/session.py b/mellea/stdlib/session.py index d1e3cd844..80ad14adb 100644 --- a/mellea/stdlib/session.py +++ b/mellea/stdlib/session.py @@ -552,7 +552,8 @@ def instruct( output_prefix: str | CBlock | None = None, strategy: SamplingStrategy | None = RejectionSamplingStrategy(loop_budget=2), return_sampling_results: bool = False, - format: type[BaseModelSubclass] 
| None = None, + format: type[Any] + | None = None, # widened: TypeVar only needed in @overload signatures model_options: dict | None = None, tool_calls: bool = False, ) -> ModelOutputThunk[Any] | SamplingResult[Any]: @@ -1047,7 +1048,8 @@ async def ainstruct( output_prefix: str | CBlock | None = None, strategy: SamplingStrategy | None = RejectionSamplingStrategy(loop_budget=2), return_sampling_results: bool = False, - format: type[BaseModelSubclass] | None = None, + format: type[Any] + | None = None, # widened: TypeVar only needed in @overload signatures model_options: dict | None = None, tool_calls: bool = False, await_result: bool = False,