From acb02f93d357fee369e4f98c37a94c8b129dbe0d Mon Sep 17 00:00:00 2001
From: Nigel Jones <jonesn@uk.ibm.com>
Date: Wed, 17 Jun 2026 09:53:03 +0100
Subject: [PATCH 01/18] feat(thunk): add .parsed property to
 ComputedModelOutputThunk for structured output
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When `format=` is passed to `act()`/`instruct()`, the model returns a JSON string and
`.value` has always held that raw JSON — not a Pydantic instance.  Accessing `.label`
(etc.) on `.value` raises `AttributeError` at runtime even though pyright accepts the
cast without complaint, so the mistake goes unnoticed until the code actually runs.

This commit adds:
- `_format: type[pydantic.BaseModel] | None` attribute on `ModelOutputThunk` (initialised
  to `None`; propagated via `_copy_from`)
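- All five backends (`ollama`, `litellm`, `openai`, `huggingface`, `watsonx`) now store
  the format type on the output thunk: `mot._format = _format` in `post_processing()`
  (`litellm`, `ollama`, `openai`, `watsonx`) and `result._format = format` in
  `_generate_from_raw()` (`huggingface`, `ollama`), alongside the existing
  `generate_log.extra` artefact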
- `ComputedModelOutputThunk.parsed` property — calls `_format.model_validate_json(value)`
  when a format type is stored, returns `None` otherwise
- Docstring updates on `ModelOutputThunk.value` and `Session.act()` pointing callers to
  `.parsed` when `format=` is used
- Four unit tests covering: happy path, no-format returns None, invalid JSON raises
  `pydantic.ValidationError`, and `.value` is unaffected

Closes #1273.

Signed-off-by: Nigel Jones <jonesn@uk.ibm.com>
Assisted-by: Claude Code
---
 mellea/backends/huggingface.py |  1 +
 mellea/backends/litellm.py     |  1 +
 mellea/backends/ollama.py      |  2 ++
 mellea/backends/openai.py      |  1 +
 mellea/backends/watsonx.py     |  1 +
 mellea/core/base.py            | 30 ++++++++++++++++++++-
 mellea/stdlib/session.py       |  4 ++-
 test/core/test_base.py         | 48 +++++++++++++++++++++++++++++++++-
 8 files changed, 85 insertions(+), 3 deletions(-)

diff --git a/mellea/backends/huggingface.py b/mellea/backends/huggingface.py
index 0d2a83524..9050dce9a 100644
--- a/mellea/backends/huggingface.py
+++ b/mellea/backends/huggingface.py
@@ -1513,6 +1513,7 @@ async def _generate_from_raw(
             generate_log.action = action
 
             result._generate_log = generate_log
+            result._format = format
             results.append(result)
 
         usage: dict[str, Any] | None = (
diff --git a/mellea/backends/litellm.py b/mellea/backends/litellm.py
index f4169d469..90666cf08 100644
--- a/mellea/backends/litellm.py
+++ b/mellea/backends/litellm.py
@@ -596,6 +596,7 @@ async def post_processing(
         generate_log.action = mot._action
         generate_log.result = mot
         mot._generate_log = generate_log
+        mot._format = _format
 
         # Extract token usage from full response dict or streaming usage
         full_response = mot._meta.get("litellm_full_response")
diff --git a/mellea/backends/ollama.py b/mellea/backends/ollama.py
index a7c1342c9..5d28b78b0 100644
--- a/mellea/backends/ollama.py
+++ b/mellea/backends/ollama.py
@@ -613,6 +613,7 @@ async def _generate_from_raw(
                 generate_log.extra["error"] = error
                 generate_log.extra["empty_response"] = response.model_dump()
             result._generate_log = generate_log
+            result._format = format
 
             results.append(result)
 
@@ -742,6 +743,7 @@ async def post_processing(
         generate_log.result = mot
 
         mot._generate_log = generate_log
+        mot._format = _format
         mot._generate = None
 
         # Extract token counts from response
diff --git a/mellea/backends/openai.py b/mellea/backends/openai.py
index 42a4625fd..fa1c12e58 100644
--- a/mellea/backends/openai.py
+++ b/mellea/backends/openai.py
@@ -1127,6 +1127,7 @@ async def post_processing(
         generate_log.action = mot._action
         generate_log.result = mot
         mot._generate_log = generate_log
+        mot._format = _format
 
         # Extract token usage from response or streaming usage
         response = mot._meta["oai_chat_response"]
diff --git a/mellea/backends/watsonx.py b/mellea/backends/watsonx.py
index 0d3ec1f3f..d06920969 100644
--- a/mellea/backends/watsonx.py
+++ b/mellea/backends/watsonx.py
@@ -614,6 +614,7 @@ async def post_processing(
         generate_log.result = mot
         generate_log.action = mot._action
         mot._generate_log = generate_log
+        mot._format = _format
 
     async def _generate_from_raw(
         self,
diff --git a/mellea/core/base.py b/mellea/core/base.py
index 05dbf4b31..06dee95c7 100644
--- a/mellea/core/base.py
+++ b/mellea/core/base.py
@@ -32,6 +32,7 @@
     runtime_checkable,
 )
 
+import pydantic
 import typing_extensions
 from PIL import Image as PILImage
 
@@ -401,6 +402,7 @@ def __init__(
         # Mellea-side hook correlation ID; distinct from the provider-assigned
         # `GenerationMetadata.response_id`.
         self._generation_id: str | None = None
+        self._format: type[pydantic.BaseModel] | None = None
 
     def _record_ttfb(self) -> None:
         """Record time-to-first-byte if streaming and not yet recorded."""
@@ -542,6 +544,7 @@ def _copy_from(self, other: ModelOutputThunk) -> None:
         self._thinking = other._thinking
         self.generation = other.generation
         self._generate_log = other._generate_log
+        self._format = other._format
         self._cancelled = other._cancelled
         # _cancel_hook is deliberately not copied: _copy_from swaps output state,
         # not backend-thread plumbing, which is tied to the original computation.
@@ -557,7 +560,13 @@ def is_computed(self) -> bool:
 
     @property
     def value(self) -> str | None:
-        """Gets the value of the block."""
+        """Gets the raw string value of the block.
+
+        When ``format=`` is set on the originating ``act()``/``instruct()`` call, the
+        model returns a JSON string and ``.value`` contains that raw JSON — not a
+        Pydantic instance.  Use ``.parsed`` on a ``ComputedModelOutputThunk`` to get
+        the validated model object.
+        """
         if not self._computed:
             return None
         return self._underlying_value
@@ -881,6 +890,25 @@ def value(self, v: str):
         """Sets the value of the block."""
         self._underlying_value = v
 
+    @property
+    def parsed(self) -> pydantic.BaseModel | None:
+        """Returns the result as a validated Pydantic instance when ``format=`` was set.
+
+        Returns ``None`` when no ``format=`` type was provided to the originating
+        ``act()`` / ``instruct()`` call.  Use this instead of casting ``.value``
+        manually::
+
+            result = m.act(Instruction("Say yes or no"), format=MyModel)
+            obj = result.parsed  # MyModel instance, no cast needed
+
+        Returns:
+            A ``pydantic.BaseModel`` instance produced by ``model_validate_json``,
+            or ``None`` if no format type was set.
+        """
+        if self._format is None:
+            return None
+        return self._format.model_validate_json(self.value)
+
     def is_computed(self) -> Literal[True]:
         """Returns `True` since thunk is always computed.
 
diff --git a/mellea/stdlib/session.py b/mellea/stdlib/session.py
index 34a9d70b1..37b10cef3 100644
--- a/mellea/stdlib/session.py
+++ b/mellea/stdlib/session.py
@@ -434,7 +434,9 @@ def act(
             requirements: used as additional requirements when a sampling strategy is provided
             strategy: a SamplingStrategy that describes the strategy for validating and repairing/retrying for the instruct-validate-repair pattern. None means that no particular sampling strategy is used.
             return_sampling_results: attach the (successful and failed) sampling attempts to the results.
-            format: if set, the BaseModel to use for constrained decoding.
+            format: if set, the BaseModel to use for constrained decoding.  When
+                provided, ``.value`` on the returned thunk is always a raw JSON string —
+                use ``.parsed`` to obtain the validated Pydantic model instance.
             model_options: additional model options, which will upsert into the model/backend's defaults.
             tool_calls: if true, tool calling is enabled.
 
diff --git a/test/core/test_base.py b/test/core/test_base.py
index 213a16e6e..3e0e0b821 100644
--- a/test/core/test_base.py
+++ b/test/core/test_base.py
@@ -3,10 +3,17 @@
 import io
 from typing import Any
 
+import pydantic
 import pytest
 from PIL import Image as PILImage
 
-from mellea.core import CBlock, Component, ImageBlock, ModelOutputThunk
+from mellea.core import (
+    CBlock,
+    Component,
+    ComputedModelOutputThunk,
+    ImageBlock,
+    ModelOutputThunk,
+)
 from mellea.stdlib.components import Message
 
 
@@ -317,3 +324,42 @@ async def _absorbs_first_cancel() -> None:
         await asyncio.wait_for(mot._generate, timeout=1.0)  # type: ignore[attr-defined]
     except (TimeoutError, asyncio.CancelledError):
         pass
+
+
+# --- ComputedModelOutputThunk.parsed ---
+
+
+class _Label(pydantic.BaseModel):
+    label: str
+
+
+def _make_computed(
+    json_str: str, fmt: type[pydantic.BaseModel] | None
+) -> ComputedModelOutputThunk:
+    thunk = ModelOutputThunk(value=json_str)
+    thunk._format = fmt
+    return ComputedModelOutputThunk(thunk)
+
+
+def test_parsed_returns_model_instance() -> None:
+    result = _make_computed('{"label": "yes"}', _Label)
+    obj = result.parsed
+    assert isinstance(obj, _Label)
+    assert obj.label == "yes"
+
+
+def test_parsed_returns_none_when_no_format() -> None:
+    result = _make_computed('{"label": "yes"}', None)
+    assert result.parsed is None
+
+
+def test_parsed_raises_on_invalid_json() -> None:
+    result = _make_computed("not json", _Label)
+    with pytest.raises(pydantic.ValidationError):
+        _ = result.parsed
+
+
+def test_value_unaffected_by_format() -> None:
+    raw = '{"label": "ok"}'
+    result = _make_computed(raw, _Label)
+    assert result.value == raw

From 7370b833371d0081427b9086f146bfc571a6493f Mon Sep 17 00:00:00 2001
From: Nigel Jones <jonesn@uk.ibm.com>
Date: Wed, 17 Jun 2026 10:05:31 +0100
Subject: [PATCH 02/18] feat(types): thread format= overloads for cast-free
 structured output
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When `format=MyModel` is passed to `act()`, `instruct()`, `aact()`, or
`ainstruct()` in both `functional.py` and `session.py`, the return type
now narrows to `ComputedModelOutputThunk[MyModel]` (or
`ModelOutputThunk[MyModel]` for the non-awaited async variant) instead
of the previous type parameter (`S` for `act()`/`aact()`, `str` for
`instruct()`/`ainstruct()`).

This eliminates the need for `cast(MyModel, result.value)` at call
sites. Runtime behaviour is unchanged; all overloads were already
dispatched to the same implementation body.

Changes:
- `functional.py` / `session.py` – new `@overload` stubs with
  `format: type[BaseModelSubclass]` for all four methods; implementation
  signatures broadened to `Any` to cover all overload combinations
- `test/typing/` – `assert_type` checks for the new overload resolution
  paths in all four typing-check modules
- `genstub.py` – `# type: ignore[return-value]` on four existing return
  sites that rely on the pre-narrowed `R` type variable which the new
  overloads can no longer infer
- `react.py` / `m_serve_example_response_format.py` – `# type: ignore`
  on call sites that pass a dynamic `format` value incompatible with the
  new stricter overload signatures

Closes #1274

Signed-off-by: Nigel Jones <jonesn@uk.ibm.com>
Assisted-by: Claude Code
Signed-off-by: Nigel Jones <jonesn@uk.ibm.com>
---
 .../m_serve_example_response_format.py        |   2 +-
 mellea/stdlib/components/genstub.py           |   8 +-
 mellea/stdlib/frameworks/react.py             |   2 +-
 mellea/stdlib/functional.py                   | 177 ++++++++++++++++--
 mellea/stdlib/session.py                      | 162 ++++++++++++++--
 test/typing/check_functional_aact.py          |  22 +++
 test/typing/check_functional_ainstruct.py     |  25 +++
 test/typing/check_functional_sync.py          |  16 ++
 test/typing/check_session.py                  |  36 ++++
 9 files changed, 418 insertions(+), 32 deletions(-)

diff --git a/docs/examples/m_serve/m_serve_example_response_format.py b/docs/examples/m_serve/m_serve_example_response_format.py
index 8b1ab29b6..5a85d5094 100644
--- a/docs/examples/m_serve/m_serve_example_response_format.py
+++ b/docs/examples/m_serve/m_serve_example_response_format.py
@@ -50,7 +50,7 @@ def serve(
         description=message,
         requirements=requirements,  # type: ignore
         model_options=model_options,
-        format=format,  # This enables structured output validation
+        format=format,  # type: ignore[arg-type]  # dynamic format from caller
     )
 
     return result
diff --git a/mellea/stdlib/components/genstub.py b/mellea/stdlib/components/genstub.py
index b836620ee..ab10cdd82 100644
--- a/mellea/stdlib/components/genstub.py
+++ b/mellea/stdlib/components/genstub.py
@@ -654,9 +654,9 @@ def __call__(self, *args, **kwargs) -> tuple[R, Context] | R:
 
         assert response.parsed_repr is not None
         if context is None:
-            return response.parsed_repr
+            return response.parsed_repr  # type: ignore[return-value]
         else:
-            return response.parsed_repr, context
+            return response.parsed_repr, context  # type: ignore[return-value]
 
 
 class AsyncGenerativeStub(GenerativeStub, Generic[P, R]):
@@ -797,9 +797,9 @@ async def __async_call__() -> tuple[R, Context] | R:
             )
             assert response.parsed_repr is not None
             if context is None:
-                return response.parsed_repr
+                return response.parsed_repr  # type: ignore[return-value]
             else:
-                return response.parsed_repr, context
+                return response.parsed_repr, context  # type: ignore[return-value]
 
         return __async_call__()
 
diff --git a/mellea/stdlib/frameworks/react.py b/mellea/stdlib/frameworks/react.py
index 9b523be58..156c03e8a 100644
--- a/mellea/stdlib/frameworks/react.py
+++ b/mellea/stdlib/frameworks/react.py
@@ -111,7 +111,7 @@ async def react(
             assert len(tool_responses) == 1, "multiple tools were called with 'final'"
 
             if format is not None:
-                step, next_context = await mfuncs.aact(
+                step, next_context = await mfuncs.aact(  # type: ignore[assignment]
                     action=ReactThought(),
                     context=context,
                     backend=backend,
diff --git a/mellea/stdlib/functional.py b/mellea/stdlib/functional.py
index 40a00c612..73050603d 100644
--- a/mellea/stdlib/functional.py
+++ b/mellea/stdlib/functional.py
@@ -45,6 +45,21 @@
 from .sampling import RejectionSamplingStrategy
 
 
+@overload
+def act(
+    action: Component[Any],
+    context: Context,
+    backend: Backend,
+    *,
+    requirements: list[Requirement] | None = None,
+    strategy: SamplingStrategy | None = RejectionSamplingStrategy(loop_budget=2),
+    return_sampling_results: Literal[False] = False,
+    format: type[BaseModelSubclass],
+    model_options: dict | None = None,
+    tool_calls: bool = False,
+) -> tuple[ComputedModelOutputThunk[BaseModelSubclass], Context]: ...
+
+
 @overload
 def act(
     action: Component[S],
@@ -54,7 +69,7 @@ def act(
     requirements: list[Requirement] | None = None,
     strategy: SamplingStrategy | None = RejectionSamplingStrategy(loop_budget=2),
     return_sampling_results: Literal[False] = False,
-    format: type[BaseModelSubclass] | None = None,
+    format: None = None,
     model_options: dict | None = None,
     tool_calls: bool = False,
 ) -> tuple[ComputedModelOutputThunk[S], Context]: ...
@@ -86,7 +101,7 @@ def act(
     format: type[BaseModelSubclass] | None = None,
     model_options: dict | None = None,
     tool_calls: bool = False,
-) -> tuple[ComputedModelOutputThunk[S], Context] | SamplingResult[S]:
+) -> tuple[ComputedModelOutputThunk[Any], Context] | SamplingResult[Any]:
     """Runs a generic action, and adds both the action and the result to the context.
 
     Args:
@@ -146,7 +161,28 @@ def instruct(
     output_prefix: str | CBlock | None = None,
     strategy: SamplingStrategy | None = RejectionSamplingStrategy(loop_budget=2),
     return_sampling_results: Literal[False] = False,
-    format: type[BaseModelSubclass] | None = None,
+    format: type[BaseModelSubclass],
+    model_options: dict | None = None,
+    tool_calls: bool = False,
+) -> tuple[ComputedModelOutputThunk[BaseModelSubclass], Context]: ...
+
+
+@overload
+def instruct(
+    description: str,
+    context: Context,
+    backend: Backend,
+    *,
+    images: list[ImageBlock] | list[PILImage.Image] | None = None,
+    requirements: list[Requirement | str] | None = None,
+    icl_examples: list[str | CBlock] | None = None,
+    grounding_context: dict[str, str | CBlock | Component] | None = None,
+    user_variables: dict[str, str] | None = None,
+    prefix: str | CBlock | None = None,
+    output_prefix: str | CBlock | None = None,
+    strategy: SamplingStrategy | None = RejectionSamplingStrategy(loop_budget=2),
+    return_sampling_results: Literal[False] = False,
+    format: None = None,
     model_options: dict | None = None,
     tool_calls: bool = False,
 ) -> tuple[ComputedModelOutputThunk[str], Context]: ...
@@ -190,7 +226,7 @@ def instruct(
     format: type[BaseModelSubclass] | None = None,
     model_options: dict | None = None,
     tool_calls: bool = False,
-) -> tuple[ComputedModelOutputThunk[str], Context] | SamplingResult[str]:
+) -> tuple[ComputedModelOutputThunk[Any], Context] | SamplingResult[Any]:
     """Generates from an instruction.
 
     Args:
@@ -474,6 +510,23 @@ def transform(
     return transformed, new_ctx
 
 
+@overload
+async def aact(
+    action: Component[Any],
+    context: Context,
+    backend: Backend,
+    *,
+    requirements: list[Requirement] | None = None,
+    strategy: None = None,
+    return_sampling_results: Literal[False] = False,
+    format: type[BaseModelSubclass],
+    model_options: dict | None = None,
+    tool_calls: bool = False,
+    silence_context_type_warning: bool = False,
+    await_result: Literal[True],
+) -> tuple[ComputedModelOutputThunk[BaseModelSubclass], Context]: ...
+
+
 @overload
 async def aact(
     action: Component[S],
@@ -483,7 +536,7 @@ async def aact(
     requirements: list[Requirement] | None = None,
     strategy: None = None,
     return_sampling_results: Literal[False] = False,
-    format: type[BaseModelSubclass] | None = None,
+    format: None = None,
     model_options: dict | None = None,
     tool_calls: bool = False,
     silence_context_type_warning: bool = False,
@@ -491,6 +544,23 @@ async def aact(
 ) -> tuple[ComputedModelOutputThunk[S], Context]: ...
 
 
+@overload
+async def aact(
+    action: Component[Any],
+    context: Context,
+    backend: Backend,
+    *,
+    requirements: list[Requirement] | None = None,
+    strategy: SamplingStrategy,
+    return_sampling_results: Literal[False] = False,
+    format: type[BaseModelSubclass],
+    model_options: dict | None = None,
+    tool_calls: bool = False,
+    silence_context_type_warning: bool = False,
+    await_result: bool = False,
+) -> tuple[ComputedModelOutputThunk[BaseModelSubclass], Context]: ...
+
+
 @overload
 async def aact(
     action: Component[S],
@@ -500,7 +570,7 @@ async def aact(
     requirements: list[Requirement] | None = None,
     strategy: SamplingStrategy,
     return_sampling_results: Literal[False] = False,
-    format: type[BaseModelSubclass] | None = None,
+    format: None = None,
     model_options: dict | None = None,
     tool_calls: bool = False,
     silence_context_type_warning: bool = False,
@@ -508,6 +578,23 @@ async def aact(
 ) -> tuple[ComputedModelOutputThunk[S], Context]: ...
 
 
+@overload
+async def aact(
+    action: Component[Any],
+    context: Context,
+    backend: Backend,
+    *,
+    requirements: list[Requirement] | None = None,
+    strategy: None = None,
+    return_sampling_results: Literal[False] = False,
+    format: type[BaseModelSubclass],
+    model_options: dict | None = None,
+    tool_calls: bool = False,
+    silence_context_type_warning: bool = False,
+    await_result: Literal[False] = False,
+) -> tuple[ModelOutputThunk[BaseModelSubclass], Context]: ...
+
+
 @overload
 async def aact(
     action: Component[S],
@@ -517,7 +604,7 @@ async def aact(
     requirements: list[Requirement] | None = None,
     strategy: None = None,
     return_sampling_results: Literal[False] = False,
-    format: type[BaseModelSubclass] | None = None,
+    format: None = None,
     model_options: dict | None = None,
     tool_calls: bool = False,
     silence_context_type_warning: bool = False,
@@ -555,7 +642,7 @@ async def aact(
     tool_calls: bool = False,
     silence_context_type_warning: bool = False,
     await_result: bool = False,
-) -> tuple[ModelOutputThunk[S], Context] | SamplingResult:
+) -> tuple[ModelOutputThunk[Any], Context] | SamplingResult[Any]:
     """Asynchronous version of .act; runs a generic action, and adds both the action and the result to the context.
 
     Args:
@@ -777,7 +864,29 @@ async def ainstruct(
     output_prefix: str | CBlock | None = None,
     strategy: None = None,
     return_sampling_results: Literal[False] = False,
-    format: type[BaseModelSubclass] | None = None,
+    format: type[BaseModelSubclass],
+    model_options: dict | None = None,
+    tool_calls: bool = False,
+    await_result: Literal[True],
+) -> tuple[ComputedModelOutputThunk[BaseModelSubclass], Context]: ...
+
+
+@overload
+async def ainstruct(
+    description: str,
+    context: Context,
+    backend: Backend,
+    *,
+    images: list[ImageBlock] | list[PILImage.Image] | None = None,
+    requirements: list[Requirement | str] | None = None,
+    icl_examples: list[str | CBlock] | None = None,
+    grounding_context: dict[str, str | CBlock | Component] | None = None,
+    user_variables: dict[str, str] | None = None,
+    prefix: str | CBlock | None = None,
+    output_prefix: str | CBlock | None = None,
+    strategy: None = None,
+    return_sampling_results: Literal[False] = False,
+    format: None = None,
     model_options: dict | None = None,
     tool_calls: bool = False,
     await_result: Literal[True],
@@ -799,7 +908,29 @@ async def ainstruct(
     output_prefix: str | CBlock | None = None,
     strategy: SamplingStrategy,
     return_sampling_results: Literal[False] = False,
-    format: type[BaseModelSubclass] | None = None,
+    format: type[BaseModelSubclass],
+    model_options: dict | None = None,
+    tool_calls: bool = False,
+    await_result: bool = False,
+) -> tuple[ComputedModelOutputThunk[BaseModelSubclass], Context]: ...
+
+
+@overload
+async def ainstruct(
+    description: str,
+    context: Context,
+    backend: Backend,
+    *,
+    images: list[ImageBlock] | list[PILImage.Image] | None = None,
+    requirements: list[Requirement | str] | None = None,
+    icl_examples: list[str | CBlock] | None = None,
+    grounding_context: dict[str, str | CBlock | Component] | None = None,
+    user_variables: dict[str, str] | None = None,
+    prefix: str | CBlock | None = None,
+    output_prefix: str | CBlock | None = None,
+    strategy: SamplingStrategy,
+    return_sampling_results: Literal[False] = False,
+    format: None = None,
     model_options: dict | None = None,
     tool_calls: bool = False,
     await_result: bool = False,
@@ -821,7 +952,29 @@ async def ainstruct(
     output_prefix: str | CBlock | None = None,
     strategy: None = None,
     return_sampling_results: Literal[False] = False,
-    format: type[BaseModelSubclass] | None = None,
+    format: type[BaseModelSubclass],
+    model_options: dict | None = None,
+    tool_calls: bool = False,
+    await_result: Literal[False] = False,
+) -> tuple[ModelOutputThunk[BaseModelSubclass], Context]: ...
+
+
+@overload
+async def ainstruct(
+    description: str,
+    context: Context,
+    backend: Backend,
+    *,
+    images: list[ImageBlock] | list[PILImage.Image] | None = None,
+    requirements: list[Requirement | str] | None = None,
+    icl_examples: list[str | CBlock] | None = None,
+    grounding_context: dict[str, str | CBlock | Component] | None = None,
+    user_variables: dict[str, str] | None = None,
+    prefix: str | CBlock | None = None,
+    output_prefix: str | CBlock | None = None,
+    strategy: None = None,
+    return_sampling_results: Literal[False] = False,
+    format: None = None,
     model_options: dict | None = None,
     tool_calls: bool = False,
     await_result: Literal[False] = False,
@@ -868,7 +1021,7 @@ async def ainstruct(
     model_options: dict | None = None,
     tool_calls: bool = False,
     await_result: bool = False,
-) -> tuple[ModelOutputThunk[str], Context] | SamplingResult:
+) -> tuple[ModelOutputThunk[Any], Context] | SamplingResult[Any]:
     """Generates from an instruction.
 
     Args:
diff --git a/mellea/stdlib/session.py b/mellea/stdlib/session.py
index 34a9d70b1..4c01659c4 100644
--- a/mellea/stdlib/session.py
+++ b/mellea/stdlib/session.py
@@ -390,6 +390,19 @@ def cleanup(self) -> None:
 
             deregister_session_plugins(self.id)
 
+    @overload
+    def act(
+        self,
+        action: Component[Any],
+        *,
+        requirements: list[Requirement] | None = None,
+        strategy: SamplingStrategy | None = RejectionSamplingStrategy(loop_budget=2),
+        return_sampling_results: Literal[False] = False,
+        format: type[BaseModelSubclass],
+        model_options: dict | None = None,
+        tool_calls: bool = False,
+    ) -> ComputedModelOutputThunk[BaseModelSubclass]: ...
+
     @overload
     def act(
         self,
@@ -398,7 +411,7 @@ def act(
         requirements: list[Requirement] | None = None,
         strategy: SamplingStrategy | None = RejectionSamplingStrategy(loop_budget=2),
         return_sampling_results: Literal[False] = False,
-        format: type[BaseModelSubclass] | None = None,
+        format: None = None,
         model_options: dict | None = None,
         tool_calls: bool = False,
     ) -> ComputedModelOutputThunk[S]: ...
@@ -418,7 +431,7 @@ def act(
 
     def act(
         self,
-        action: Component[S],
+        action: Component[Any],
         *,
         requirements: list[Requirement] | None = None,
         strategy: SamplingStrategy | None = RejectionSamplingStrategy(loop_budget=2),
@@ -426,7 +439,7 @@ def act(
         format: type[BaseModelSubclass] | None = None,
         model_options: dict | None = None,
         tool_calls: bool = False,
-    ) -> ModelOutputThunk[S] | SamplingResult:
+    ) -> ModelOutputThunk[Any] | SamplingResult[Any]:
         """Runs a generic action, and adds both the action and the result to the context.
 
         Args:
@@ -475,7 +488,26 @@ def instruct(
         output_prefix: str | CBlock | None = None,
         strategy: SamplingStrategy | None = RejectionSamplingStrategy(loop_budget=2),
         return_sampling_results: Literal[False] = False,
-        format: type[BaseModelSubclass] | None = None,
+        format: type[BaseModelSubclass],
+        model_options: dict | None = None,
+        tool_calls: bool = False,
+    ) -> ComputedModelOutputThunk[BaseModelSubclass]: ...
+
+    @overload
+    def instruct(
+        self,
+        description: str,
+        *,
+        images: list[ImageBlock] | list[PILImage.Image] | None = None,
+        requirements: list[Requirement | str] | None = None,
+        icl_examples: list[str | CBlock] | None = None,
+        grounding_context: dict[str, str | CBlock | Component] | None = None,
+        user_variables: dict[str, str] | None = None,
+        prefix: str | CBlock | None = None,
+        output_prefix: str | CBlock | None = None,
+        strategy: SamplingStrategy | None = RejectionSamplingStrategy(loop_budget=2),
+        return_sampling_results: Literal[False] = False,
+        format: None = None,
         model_options: dict | None = None,
         tool_calls: bool = False,
     ) -> ComputedModelOutputThunk[str]: ...
@@ -515,7 +547,7 @@ def instruct(
         format: type[BaseModelSubclass] | None = None,
         model_options: dict | None = None,
         tool_calls: bool = False,
-    ) -> ModelOutputThunk[str] | SamplingResult:
+    ) -> ModelOutputThunk[Any] | SamplingResult[Any]:
         """Generates from an instruction.
 
         Args:
@@ -707,6 +739,20 @@ def transform(
         self.ctx = context
         return result
 
+    @overload
+    async def aact(
+        self,
+        action: Component[Any],
+        *,
+        requirements: list[Requirement] | None = None,
+        strategy: None = None,
+        return_sampling_results: Literal[False] = False,
+        format: type[BaseModelSubclass],
+        model_options: dict | None = None,
+        tool_calls: bool = False,
+        await_result: Literal[True],
+    ) -> ComputedModelOutputThunk[BaseModelSubclass]: ...
+
     @overload
     async def aact(
         self,
@@ -715,12 +761,26 @@ async def aact(
         requirements: list[Requirement] | None = None,
         strategy: None = None,
         return_sampling_results: Literal[False] = False,
-        format: type[BaseModelSubclass] | None = None,
+        format: None = None,
         model_options: dict | None = None,
         tool_calls: bool = False,
         await_result: Literal[True],
     ) -> ComputedModelOutputThunk[S]: ...
 
+    @overload
+    async def aact(
+        self,
+        action: Component[Any],
+        *,
+        requirements: list[Requirement] | None = None,
+        strategy: SamplingStrategy,
+        return_sampling_results: Literal[False] = False,
+        format: type[BaseModelSubclass],
+        model_options: dict | None = None,
+        tool_calls: bool = False,
+        await_result: bool = False,
+    ) -> ComputedModelOutputThunk[BaseModelSubclass]: ...
+
     @overload
     async def aact(
         self,
@@ -729,12 +789,26 @@ async def aact(
         requirements: list[Requirement] | None = None,
         strategy: SamplingStrategy,
         return_sampling_results: Literal[False] = False,
-        format: type[BaseModelSubclass] | None = None,
+        format: None = None,
         model_options: dict | None = None,
         tool_calls: bool = False,
         await_result: bool = False,
     ) -> ComputedModelOutputThunk[S]: ...
 
+    @overload
+    async def aact(
+        self,
+        action: Component[Any],
+        *,
+        requirements: list[Requirement] | None = None,
+        strategy: None = None,
+        return_sampling_results: Literal[False] = False,
+        format: type[BaseModelSubclass],
+        model_options: dict | None = None,
+        tool_calls: bool = False,
+        await_result: Literal[False] = False,
+    ) -> ModelOutputThunk[BaseModelSubclass]: ...
+
     @overload
     async def aact(
         self,
@@ -743,7 +817,7 @@ async def aact(
         requirements: list[Requirement] | None = None,
         strategy: None = None,
         return_sampling_results: Literal[False] = False,
-        format: type[BaseModelSubclass] | None = None,
+        format: None = None,
         model_options: dict | None = None,
         tool_calls: bool = False,
         await_result: Literal[False] = False,
@@ -765,7 +839,7 @@ async def aact(
 
     async def aact(
         self,
-        action: Component[S],
+        action: Component[Any],
         *,
         requirements: list[Requirement] | None = None,
         strategy: SamplingStrategy | None = RejectionSamplingStrategy(loop_budget=2),
@@ -774,7 +848,7 @@ async def aact(
         model_options: dict | None = None,
         tool_calls: bool = False,
         await_result: bool = False,
-    ) -> ModelOutputThunk[S] | SamplingResult:
+    ) -> ModelOutputThunk[Any] | SamplingResult[Any]:
         """Runs a generic action, and adds both the action and the result to the context.
 
         Args:
@@ -826,7 +900,27 @@ async def ainstruct(
         output_prefix: str | CBlock | None = None,
         strategy: None = None,
         return_sampling_results: Literal[False] = False,
-        format: type[BaseModelSubclass] | None = None,
+        format: type[BaseModelSubclass],
+        model_options: dict | None = None,
+        tool_calls: bool = False,
+        await_result: Literal[True],
+    ) -> ComputedModelOutputThunk[BaseModelSubclass]: ...
+
+    @overload
+    async def ainstruct(
+        self,
+        description: str,
+        *,
+        images: list[ImageBlock] | list[PILImage.Image] | None = None,
+        requirements: list[Requirement | str] | None = None,
+        icl_examples: list[str | CBlock] | None = None,
+        grounding_context: dict[str, str | CBlock | Component] | None = None,
+        user_variables: dict[str, str] | None = None,
+        prefix: str | CBlock | None = None,
+        output_prefix: str | CBlock | None = None,
+        strategy: None = None,
+        return_sampling_results: Literal[False] = False,
+        format: None = None,
         model_options: dict | None = None,
         tool_calls: bool = False,
         await_result: Literal[True],
@@ -846,7 +940,27 @@ async def ainstruct(
         output_prefix: str | CBlock | None = None,
         strategy: SamplingStrategy,
         return_sampling_results: Literal[False] = False,
-        format: type[BaseModelSubclass] | None = None,
+        format: type[BaseModelSubclass],
+        model_options: dict | None = None,
+        tool_calls: bool = False,
+        await_result: bool = False,
+    ) -> ComputedModelOutputThunk[BaseModelSubclass]: ...
+
+    @overload
+    async def ainstruct(
+        self,
+        description: str,
+        *,
+        images: list[ImageBlock] | list[PILImage.Image] | None = None,
+        requirements: list[Requirement | str] | None = None,
+        icl_examples: list[str | CBlock] | None = None,
+        grounding_context: dict[str, str | CBlock | Component] | None = None,
+        user_variables: dict[str, str] | None = None,
+        prefix: str | CBlock | None = None,
+        output_prefix: str | CBlock | None = None,
+        strategy: SamplingStrategy,
+        return_sampling_results: Literal[False] = False,
+        format: None = None,
         model_options: dict | None = None,
         tool_calls: bool = False,
         await_result: bool = False,
@@ -866,7 +980,27 @@ async def ainstruct(
         output_prefix: str | CBlock | None = None,
         strategy: None = None,
         return_sampling_results: Literal[False] = False,
-        format: type[BaseModelSubclass] | None = None,
+        format: type[BaseModelSubclass],
+        model_options: dict | None = None,
+        tool_calls: bool = False,
+        await_result: Literal[False] = False,
+    ) -> ModelOutputThunk[BaseModelSubclass]: ...
+
+    @overload
+    async def ainstruct(
+        self,
+        description: str,
+        *,
+        images: list[ImageBlock] | list[PILImage.Image] | None = None,
+        requirements: list[Requirement | str] | None = None,
+        icl_examples: list[str | CBlock] | None = None,
+        grounding_context: dict[str, str | CBlock | Component] | None = None,
+        user_variables: dict[str, str] | None = None,
+        prefix: str | CBlock | None = None,
+        output_prefix: str | CBlock | None = None,
+        strategy: None = None,
+        return_sampling_results: Literal[False] = False,
+        format: None = None,
         model_options: dict | None = None,
         tool_calls: bool = False,
         await_result: Literal[False] = False,
@@ -909,7 +1043,7 @@ async def ainstruct(
         model_options: dict | None = None,
         tool_calls: bool = False,
         await_result: bool = False,
-    ) -> ModelOutputThunk[str] | SamplingResult[str]:
+    ) -> ModelOutputThunk[Any] | SamplingResult[Any]:
         """Generates from an instruction.
 
         Args:
diff --git a/test/typing/check_functional_aact.py b/test/typing/check_functional_aact.py
index 008b824a6..f24d04943 100644
--- a/test/typing/check_functional_aact.py
+++ b/test/typing/check_functional_aact.py
@@ -2,6 +2,8 @@
 
 from typing import assert_type, cast
 
+from pydantic import BaseModel
+
 from mellea.core import (
     Backend,
     ComputedModelOutputThunk,
@@ -18,6 +20,10 @@
 action: Instruction = cast(Instruction, None)
 
 
+class _M(BaseModel):
+    value: str
+
+
 async def check_computed_await() -> None:
     r = await aact(action, ctx, backend, strategy=None, await_result=True)
     assert_type(r, tuple[ComputedModelOutputThunk[str], Context])
@@ -37,3 +43,19 @@ async def check_uncomputed() -> None:
 async def check_sampling() -> None:
     r = await aact(action, ctx, backend, return_sampling_results=True)
     assert_type(r, SamplingResult[str])
+
+
+async def check_format_computed_await() -> None:
+    r = await aact(action, ctx, backend, strategy=None, await_result=True, format=_M)
+    assert_type(r, tuple[ComputedModelOutputThunk[_M], Context])
+
+
+async def check_format_computed_strategy() -> None:
+    strat = RejectionSamplingStrategy(loop_budget=2)
+    r = await aact(action, ctx, backend, strategy=strat, format=_M)
+    assert_type(r, tuple[ComputedModelOutputThunk[_M], Context])
+
+
+async def check_format_uncomputed() -> None:
+    r = await aact(action, ctx, backend, strategy=None, format=_M)
+    assert_type(r, tuple[ModelOutputThunk[_M], Context])
diff --git a/test/typing/check_functional_ainstruct.py b/test/typing/check_functional_ainstruct.py
index fe1e113f3..c3c3947a9 100644
--- a/test/typing/check_functional_ainstruct.py
+++ b/test/typing/check_functional_ainstruct.py
@@ -2,6 +2,8 @@
 
 from typing import assert_type, cast
 
+from pydantic import BaseModel
+
 from mellea.core import (
     Backend,
     ComputedModelOutputThunk,
@@ -10,11 +12,16 @@
     SamplingResult,
 )
 from mellea.stdlib.functional import ainstruct
+from mellea.stdlib.sampling import RejectionSamplingStrategy
 
 ctx = cast(Context, None)
 backend = cast(Backend, None)
 
 
+class _M(BaseModel):
+    value: str
+
+
 async def check_computed() -> None:
     r = await ainstruct("test", ctx, backend, strategy=None, await_result=True)
     assert_type(r, tuple[ComputedModelOutputThunk[str], Context])
@@ -28,3 +35,21 @@ async def check_uncomputed() -> None:
 async def check_sampling() -> None:
     r = await ainstruct("test", ctx, backend, return_sampling_results=True)
     assert_type(r, SamplingResult[str])
+
+
+async def check_format_computed_await() -> None:
+    r = await ainstruct(
+        "test", ctx, backend, strategy=None, await_result=True, format=_M
+    )
+    assert_type(r, tuple[ComputedModelOutputThunk[_M], Context])
+
+
+async def check_format_computed_strategy() -> None:
+    strat = RejectionSamplingStrategy(loop_budget=2)
+    r = await ainstruct("test", ctx, backend, strategy=strat, format=_M)
+    assert_type(r, tuple[ComputedModelOutputThunk[_M], Context])
+
+
+async def check_format_uncomputed() -> None:
+    r = await ainstruct("test", ctx, backend, strategy=None, format=_M)
+    assert_type(r, tuple[ModelOutputThunk[_M], Context])
diff --git a/test/typing/check_functional_sync.py b/test/typing/check_functional_sync.py
index 494edb15a..5866d6ce8 100644
--- a/test/typing/check_functional_sync.py
+++ b/test/typing/check_functional_sync.py
@@ -2,6 +2,8 @@
 
 from typing import assert_type, cast
 
+from pydantic import BaseModel
+
 from mellea.core import Backend, ComputedModelOutputThunk, Context
 from mellea.stdlib.components import Instruction
 from mellea.stdlib.functional import act, instruct
@@ -13,6 +15,10 @@
 s = cast(MelleaSession, None)
 
 
+class _M(BaseModel):
+    value: str
+
+
 def check_act_sync() -> None:
     r = act(action, ctx, backend)
     assert_type(r, tuple[ComputedModelOutputThunk[str], Context])
@@ -31,3 +37,13 @@ def check_session_act_sync() -> None:
 def check_session_instruct_sync() -> None:
     r = s.instruct("test")
     assert_type(r, ComputedModelOutputThunk[str])
+
+
+def check_act_format() -> None:
+    r = act(action, ctx, backend, format=_M)
+    assert_type(r, tuple[ComputedModelOutputThunk[_M], Context])
+
+
+def check_instruct_format() -> None:
+    r = instruct("test", ctx, backend, format=_M)
+    assert_type(r, tuple[ComputedModelOutputThunk[_M], Context])
diff --git a/test/typing/check_session.py b/test/typing/check_session.py
index 81db1ff2e..9ea11bf76 100644
--- a/test/typing/check_session.py
+++ b/test/typing/check_session.py
@@ -2,6 +2,8 @@
 
 from typing import Any, assert_type, cast
 
+from pydantic import BaseModel
+
 from mellea.core import ComputedModelOutputThunk, ModelOutputThunk, SamplingResult
 from mellea.stdlib.components import Instruction
 from mellea.stdlib.session import MelleaSession
@@ -10,6 +12,10 @@
 action: Instruction = cast(Instruction, None)
 
 
+class _M(BaseModel):
+    value: str
+
+
 async def check_aact_computed() -> None:
     r = await s.aact(action, strategy=None, await_result=True)
     assert_type(r, ComputedModelOutputThunk[str])
@@ -53,3 +59,33 @@ async def check_aquery_uncomputed() -> None:
 def check_query_sync() -> None:
     r = s.query("obj", "q")
     assert_type(r, ComputedModelOutputThunk[Any])
+
+
+def check_act_format() -> None:
+    r = s.act(action, format=_M)
+    assert_type(r, ComputedModelOutputThunk[_M])
+
+
+def check_instruct_format() -> None:
+    r = s.instruct("test", format=_M)
+    assert_type(r, ComputedModelOutputThunk[_M])
+
+
+async def check_aact_format_computed() -> None:
+    r = await s.aact(action, strategy=None, await_result=True, format=_M)
+    assert_type(r, ComputedModelOutputThunk[_M])
+
+
+async def check_aact_format_uncomputed() -> None:
+    r = await s.aact(action, strategy=None, format=_M)
+    assert_type(r, ModelOutputThunk[_M])
+
+
+async def check_ainstruct_format_computed() -> None:
+    r = await s.ainstruct("test", strategy=None, await_result=True, format=_M)
+    assert_type(r, ComputedModelOutputThunk[_M])
+
+
+async def check_ainstruct_format_uncomputed() -> None:
+    r = await s.ainstruct("test", strategy=None, format=_M)
+    assert_type(r, ModelOutputThunk[_M])

From 0f77a9fc57c5ffda3969b28aeb2480410e8dd064 Mon Sep 17 00:00:00 2001
From: Nigel Jones <jonesn@uk.ibm.com>
Date: Wed, 17 Jun 2026 12:15:10 +0100
Subject: [PATCH 03/18] fix(thunk): propagate _format through
 __copy__/__deepcopy__; add Raises: to parsed

- Add `_format = self._format` to `__copy__` and `__deepcopy__` so that
  copying a ComputedModelOutputThunk preserves the format type; previously
  a copied thunk would silently return None from .parsed even when the
  original had a format set.
- Add `Raises: pydantic.ValidationError` to the `parsed` property docstring
  to document the exception callers must handle when the model returns
  malformed structured output.

Assisted-by: Claude Code
Signed-off-by: Nigel Jones <jonesn@uk.ibm.com>
---
 mellea/core/base.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/mellea/core/base.py b/mellea/core/base.py
index 06dee95c7..e9d84fdc0 100644
--- a/mellea/core/base.py
+++ b/mellea/core/base.py
@@ -785,6 +785,7 @@ def __copy__(self) -> ModelOutputThunk:
         copied._action = self._action
         copied._context = self._context
         copied._generate_log = self._generate_log
+        copied._format = self._format
         copied._model_options = self._model_options
         copied.generation = copy(self.generation)
         return copied
@@ -819,6 +820,7 @@ def __deepcopy__(self, memo: dict) -> ModelOutputThunk:
             self._context
         )  # The items in a context should be immutable.
         deepcopied._generate_log = copy(self._generate_log)
+        deepcopied._format = self._format
         deepcopied._model_options = copy(self._model_options)
         deepcopied.generation = deepcopy(self.generation)
         return deepcopied
@@ -904,6 +906,10 @@ def parsed(self) -> pydantic.BaseModel | None:
         Returns:
             A ``pydantic.BaseModel`` instance produced by ``model_validate_json``,
             or ``None`` if no format type was set.
+
+        Raises:
+            pydantic.ValidationError: If the raw JSON value does not conform to
+                the format model (e.g. the model returned malformed structured output).
         """
         if self._format is None:
             return None

From 83295e9d8b14dea5a0dc073fe5693cea040b6cdf Mon Sep 17 00:00:00 2001
From: Nigel Jones <jonesn@uk.ibm.com>
Date: Wed, 17 Jun 2026 12:20:26 +0100
Subject: [PATCH 04/18] nit(thunk): soften parsed docstring example comment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

"no manual model_validate_json needed" is more accurate than
"MyModel instance, no cast needed" — .parsed returns BaseModel | None,
so static type narrowing still requires a cast; the value is just
already deserialized.

Assisted-by: Claude Code
Signed-off-by: Nigel Jones <jonesn@uk.ibm.com>
---
 mellea/core/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mellea/core/base.py b/mellea/core/base.py
index e9d84fdc0..54c8c1919 100644
--- a/mellea/core/base.py
+++ b/mellea/core/base.py
@@ -901,7 +901,7 @@ def parsed(self) -> pydantic.BaseModel | None:
         manually::
 
             result = m.act(Instruction("Say yes or no"), format=MyModel)
-            obj = result.parsed  # MyModel instance, no cast needed
+            obj = result.parsed  # no manual model_validate_json needed
 
         Returns:
             A ``pydantic.BaseModel`` instance produced by ``model_validate_json``,

From 8e4af00bcb4306f56bfcbeb7f0ddde4ac3156803 Mon Sep 17 00:00:00 2001
From: Nigel Jones <jonesn@uk.ibm.com>
Date: Wed, 17 Jun 2026 12:30:39 +0100
Subject: [PATCH 05/18] docs(types): annotate type: ignore sites with rationale

Add brief inline comments to the five type: ignore sites (four in
genstub.py, one in react.py) associated with the format= overload
threading, explaining why each ignore is intentional rather than
masking a real issue.

Assisted-by: Claude Code
Signed-off-by: Nigel Jones <jonesn@uk.ibm.com>
---
 mellea/stdlib/components/genstub.py | 8 ++++----
 mellea/stdlib/frameworks/react.py   | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/mellea/stdlib/components/genstub.py b/mellea/stdlib/components/genstub.py
index ab10cdd82..632696ab8 100644
--- a/mellea/stdlib/components/genstub.py
+++ b/mellea/stdlib/components/genstub.py
@@ -654,9 +654,9 @@ def __call__(self, *args, **kwargs) -> tuple[R, Context] | R:
 
         assert response.parsed_repr is not None
         if context is None:
-            return response.parsed_repr  # type: ignore[return-value]
+            return response.parsed_repr  # type: ignore[return-value]  # genstub unwraps R from FunctionResponse[R]; format overloads can't re-bind R here
         else:
-            return response.parsed_repr, context  # type: ignore[return-value]
+            return response.parsed_repr, context  # type: ignore[return-value]  # same
 
 
 class AsyncGenerativeStub(GenerativeStub, Generic[P, R]):
@@ -797,9 +797,9 @@ async def __async_call__() -> tuple[R, Context] | R:
             )
             assert response.parsed_repr is not None
             if context is None:
-                return response.parsed_repr  # type: ignore[return-value]
+                return response.parsed_repr  # type: ignore[return-value]  # genstub unwraps R from FunctionResponse[R]; format overloads can't re-bind R here
             else:
-                return response.parsed_repr, context  # type: ignore[return-value]
+                return response.parsed_repr, context  # type: ignore[return-value]  # same
 
         return __async_call__()
 
diff --git a/mellea/stdlib/frameworks/react.py b/mellea/stdlib/frameworks/react.py
index 156c03e8a..271abcceb 100644
--- a/mellea/stdlib/frameworks/react.py
+++ b/mellea/stdlib/frameworks/react.py
@@ -111,7 +111,7 @@ async def react(
             assert len(tool_responses) == 1, "multiple tools were called with 'final'"
 
             if format is not None:
-                step, next_context = await mfuncs.aact(  # type: ignore[assignment]
+                step, next_context = await mfuncs.aact(  # type: ignore[assignment]  # dynamic format from caller
                     action=ReactThought(),
                     context=context,
                     backend=backend,

From 0c3498a83685a282fa08527d4db94de28502daac Mon Sep 17 00:00:00 2001
From: Nigel Jones <jonesn@uk.ibm.com>
Date: Mon, 22 Jun 2026 11:45:14 +0100
Subject: [PATCH 06/18] fix(thunk): wire _format in HF chat post_processing;
 add missing tests

The HuggingFace chat-path post_processing never assigned mot._format,
so .parsed always returned None when format= was set via LocalHFBackend.
All other backends (ollama, openai, litellm, watsonx) already set it.

Also adds:
- Copy/deepcopy unit tests verifying _format is preserved across copies
- E2e tests in test_ollama and test_huggingface asserting .parsed returns
  a typed Pydantic instance end-to-end through each backend
- Docstring note on ComputedModelOutputThunk.parsed warning custom-backend
  authors to set mot._format in their post_processing method

Assisted-by: Claude Code
Signed-off-by: Nigel Jones <jonesn@uk.ibm.com>
---
 mellea/backends/huggingface.py    |  1 +
 mellea/core/base.py               |  7 +++++++
 test/backends/test_huggingface.py | 18 ++++++++++++++++++
 test/backends/test_ollama.py      | 18 ++++++++++++++++++
 test/core/test_base.py            | 18 ++++++++++++++++++
 5 files changed, 62 insertions(+)

diff --git a/mellea/backends/huggingface.py b/mellea/backends/huggingface.py
index 9050dce9a..bea2e8b75 100644
--- a/mellea/backends/huggingface.py
+++ b/mellea/backends/huggingface.py
@@ -1378,6 +1378,7 @@ class used during generation, if any.
         generate_log.result = mot
 
         mot._generate_log = generate_log
+        mot._format = _format
 
     async def _generate_from_raw(
         self,
diff --git a/mellea/core/base.py b/mellea/core/base.py
index 54c8c1919..db694257c 100644
--- a/mellea/core/base.py
+++ b/mellea/core/base.py
@@ -903,6 +903,13 @@ def parsed(self) -> pydantic.BaseModel | None:
             result = m.act(Instruction("Say yes or no"), format=MyModel)
             obj = result.parsed  # no manual model_validate_json needed
 
+        Note:
+            This property relies on the originating backend storing the format
+            type on the thunk. Custom backend authors must set ``mot._format``
+            in their ``post_processing`` method (mirroring the built-in
+            backends); otherwise ``.parsed`` always returns ``None`` even when
+            ``format=`` was supplied.
+
         Returns:
             A ``pydantic.BaseModel`` instance produced by ``model_validate_json``,
             or ``None`` if no format type was set.
diff --git a/test/backends/test_huggingface.py b/test/backends/test_huggingface.py
index c0d452ded..3c2d6d9b9 100644
--- a/test/backends/test_huggingface.py
+++ b/test/backends/test_huggingface.py
@@ -252,6 +252,24 @@ class Email(pydantic.BaseModel):
     )
 
 
+@pytest.mark.qualitative
+def test_parsed_returns_pydantic_instance(session) -> None:
+    class Sentiment(pydantic.BaseModel):
+        label: str
+
+    output = session.instruct(
+        "Classify the sentiment of 'I love this!' as the single word "
+        "positive, negative, or neutral. Respond with a label field.",
+        format=Sentiment,
+        model_options={ModelOption.MAX_NEW_TOKENS: 2**8},
+    )
+
+    parsed = output.parsed
+    assert isinstance(parsed, Sentiment)
+    assert isinstance(parsed.label, str) and parsed.label
+    assert parsed == Sentiment.model_validate_json(output.value)
+
+
 @pytest.mark.qualitative
 async def test_generate_from_raw(session) -> None:
     prompts = [
diff --git a/test/backends/test_ollama.py b/test/backends/test_ollama.py
index b2aa7f249..890cb355f 100644
--- a/test/backends/test_ollama.py
+++ b/test/backends/test_ollama.py
@@ -127,6 +127,24 @@ class Email(pydantic.BaseModel):
     # assert email.to.email_address.endswith("example.com")
 
 
+@pytest.mark.qualitative
+def test_parsed_returns_pydantic_instance(session) -> None:
+    class Sentiment(pydantic.BaseModel):
+        label: str
+
+    output = session.instruct(
+        "Classify the sentiment of 'I love this!' as the single word "
+        "positive, negative, or neutral. Respond with a label field.",
+        format=Sentiment,
+        model_options={ModelOption.MAX_NEW_TOKENS: 2**8},
+    )
+
+    parsed = output.parsed
+    assert isinstance(parsed, Sentiment)
+    assert isinstance(parsed.label, str) and parsed.label
+    assert parsed == Sentiment.model_validate_json(output.value)
+
+
 @pytest.mark.qualitative
 @pytest.mark.timeout(150)
 async def test_generate_from_raw(session) -> None:
diff --git a/test/core/test_base.py b/test/core/test_base.py
index 3e0e0b821..894912bbd 100644
--- a/test/core/test_base.py
+++ b/test/core/test_base.py
@@ -363,3 +363,21 @@ def test_value_unaffected_by_format() -> None:
     raw = '{"label": "ok"}'
     result = _make_computed(raw, _Label)
     assert result.value == raw
+
+
+def test_format_preserved_by_copy() -> None:
+    import copy as _copy
+
+    result = _make_computed('{"label": "yes"}', _Label)
+    shallow = _copy.copy(result)
+    assert shallow._format is _Label
+    assert shallow._format.model_validate_json(shallow.value).label == "yes"  # type: ignore[union-attr]
+
+
+def test_format_preserved_by_deepcopy() -> None:
+    import copy as _copy
+
+    result = _make_computed('{"label": "yes"}', _Label)
+    deep = _copy.deepcopy(result)
+    assert deep._format is _Label
+    assert deep._format.model_validate_json(deep.value).label == "yes"  # type: ignore[union-attr]

From 4857a321fc9b118b6a54970ddf63a3f86e3e943f Mon Sep 17 00:00:00 2001
From: Nigel Jones <jonesn@uk.ibm.com>
Date: Mon, 22 Jun 2026 11:46:26 +0100
Subject: [PATCH 07/18] nit(test): explain subclass-loss caveat in
 copy/deepcopy _format tests

Assisted-by: Claude Code
Signed-off-by: Nigel Jones <jonesn@uk.ibm.com>
---
 test/core/test_base.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test/core/test_base.py b/test/core/test_base.py
index 894912bbd..a3700ed07 100644
--- a/test/core/test_base.py
+++ b/test/core/test_base.py
@@ -371,6 +371,8 @@ def test_format_preserved_by_copy() -> None:
     result = _make_computed('{"label": "yes"}', _Label)
     shallow = _copy.copy(result)
     assert shallow._format is _Label
+    # __copy__ returns ModelOutputThunk (loses ComputedModelOutputThunk subclass due to
+    # zero-copy __class__ reassignment), so we validate manually rather than via .parsed.
     assert shallow._format.model_validate_json(shallow.value).label == "yes"  # type: ignore[union-attr]
 
 
@@ -380,4 +382,5 @@ def test_format_preserved_by_deepcopy() -> None:
     result = _make_computed('{"label": "yes"}', _Label)
     deep = _copy.deepcopy(result)
     assert deep._format is _Label
+    # Same subclass-loss caveat as test_format_preserved_by_copy.
     assert deep._format.model_validate_json(deep.value).label == "yes"  # type: ignore[union-attr]

From 5aa6921db16e4d348bf5d06eb7901f29db51af0e Mon Sep 17 00:00:00 2001
From: Nigel Jones <jonesn@uk.ibm.com>
Date: Mon, 22 Jun 2026 13:47:43 +0100
Subject: [PATCH 08/18] fix(types): make ComputedModelOutputThunk.parsed
 generic over the format type
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`.parsed` previously returned `pydantic.BaseModel | None`, so callers still
needed `cast(MyModel, result.parsed)` for static narrowing — the gap
@ajbozarth flagged on PR #1282.

Thread the format type through the thunk's existing type parameter `S`:
`_format` is now `type[S] | None` and `.parsed` returns `S | None`. Reusing
`S` (rather than a second TypeVar) composes with the companion `format=`
overloads on #1274, which bind `S` to the supplied model so
`m.act(action, format=MyModel)` yields `ComputedModelOutputThunk[MyModel]`
and `.parsed` is typed `MyModel | None`.

The `.parsed` body narrows `_format` to a pydantic type to call
`model_validate_json`, then re-asserts the result as `S` — `S` is unbounded
(it is `str` for plain instructions) so neither cast can be elided.

Add `test/typing/check_parsed.py` asserting `.parsed` tracks the type
parameter for both a model-parameterized and a `str` thunk.

Assisted-by: Claude Code
Signed-off-by: Nigel Jones <jonesn@uk.ibm.com>
---
 mellea/core/base.py         | 24 ++++++++++++++++--------
 test/typing/check_parsed.py | 28 ++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 8 deletions(-)
 create mode 100644 test/typing/check_parsed.py

diff --git a/mellea/core/base.py b/mellea/core/base.py
index db694257c..d54cc2dfc 100644
--- a/mellea/core/base.py
+++ b/mellea/core/base.py
@@ -29,6 +29,7 @@
     ParamSpec,
     Protocol,
     TypeVar,
+    cast,
     runtime_checkable,
 )
 
@@ -402,7 +403,7 @@ def __init__(
         # Mellea-side hook correlation ID; distinct from the provider-assigned
         # `GenerationMetadata.response_id`.
         self._generation_id: str | None = None
-        self._format: type[pydantic.BaseModel] | None = None
+        self._format: type[S] | None = None
 
     def _record_ttfb(self) -> None:
         """Record time-to-first-byte if streaming and not yet recorded."""
@@ -893,15 +894,18 @@ def value(self, v: str):
         self._underlying_value = v
 
     @property
-    def parsed(self) -> pydantic.BaseModel | None:
+    def parsed(self) -> S | None:
         """Returns the result as a validated Pydantic instance when ``format=`` was set.
 
-        Returns ``None`` when no ``format=`` type was provided to the originating
-        ``act()`` / ``instruct()`` call.  Use this instead of casting ``.value``
+        The return type tracks the format type supplied at the originating call
+        site: ``m.act(action, format=MyModel)`` yields a
+        ``ComputedModelOutputThunk[MyModel]`` whose ``.parsed`` is typed
+        ``MyModel | None`` — no ``cast()`` required. Returns ``None`` when no
+        ``format=`` type was provided.  Use this instead of casting ``.value``
         manually::
 
             result = m.act(Instruction("Say yes or no"), format=MyModel)
-            obj = result.parsed  # no manual model_validate_json needed
+            obj = result.parsed  # typed MyModel | None, no model_validate_json needed
 
         Note:
             This property relies on the originating backend storing the format
@@ -911,8 +915,8 @@ def parsed(self) -> pydantic.BaseModel | None:
             ``format=`` was supplied.
 
         Returns:
-            A ``pydantic.BaseModel`` instance produced by ``model_validate_json``,
-            or ``None`` if no format type was set.
+            An instance of the format type (``S``) produced by
+            ``model_validate_json``, or ``None`` if no format type was set.
 
         Raises:
             pydantic.ValidationError: If the raw JSON value does not conform to
@@ -920,7 +924,11 @@ def parsed(self) -> pydantic.BaseModel | None:
         """
         if self._format is None:
             return None
-        return self._format.model_validate_json(self.value)
+        # `_format` is a pydantic model type in every code path that sets it (the
+        # `format=` overloads bind `S` to that model), but `S` itself is unbounded,
+        # so we narrow to call `model_validate_json` and re-assert the result as `S`.
+        fmt = cast("type[pydantic.BaseModel]", self._format)
+        return cast("S", fmt.model_validate_json(self.value))
 
     def is_computed(self) -> Literal[True]:
         """Returns `True` since thunk is always computed.
diff --git a/test/typing/check_parsed.py b/test/typing/check_parsed.py
new file mode 100644
index 000000000..2171f82b0
--- /dev/null
+++ b/test/typing/check_parsed.py
@@ -0,0 +1,28 @@
+"""Mypy checks that `ComputedModelOutputThunk.parsed` tracks the type parameter.
+
+`.parsed` is typed `S | None`, so a thunk parameterized with a Pydantic model
+(`ComputedModelOutputThunk[MyModel]`) exposes `.parsed` as `MyModel | None` —
+callers need no `cast()`. The `format=` overloads in `session.py` /
+`functional.py` bind `S` to the format model (companion issue #1274), at which
+point these checks hold end-to-end from the call site.
+"""
+
+from typing import assert_type, cast
+
+import pydantic
+
+from mellea.core import ComputedModelOutputThunk
+
+
+class _Person(pydantic.BaseModel):
+    name: str
+
+
+def check_parsed_tracks_format_model() -> None:
+    thunk = cast(ComputedModelOutputThunk[_Person], None)
+    assert_type(thunk.parsed, _Person | None)
+
+
+def check_parsed_is_str_for_str_thunk() -> None:
+    thunk = cast(ComputedModelOutputThunk[str], None)
+    assert_type(thunk.parsed, str | None)

From 6118cea52e4494546dcc52ded0aa92b5465b1309 Mon Sep 17 00:00:00 2001
From: Nigel Jones <jonesn@uk.ibm.com>
Date: Mon, 22 Jun 2026 13:58:13 +0100
Subject: [PATCH 09/18] fix(types): use concrete _format type and non-string
 cast for pyright compat
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three pyright-compatibility fixes for the `.parsed` work added in bb53ddba:

1. `_format` annotation: revert from `type[S] | None` to
   `type[pydantic.BaseModel] | None`. A mutable attribute is an invariant
   position (`type[...]` itself is covariant), so annotating it with the
   covariant TypeVar `S` is semantically unsound and can confuse stricter
   pyright configurations. The field only ever holds pydantic model
   types at runtime; the concrete annotation is accurate and avoids the
   variance issue entirely.

2. `.parsed` body: replace the two-step string-quoted cast
   (`cast("type[pydantic.BaseModel]", …)` then `cast("S", …)`) with a single
   direct cast (`cast(S, self._format.model_validate_json(self.value))`).
   Pyright resolves TypeVar forward-references in cast strings differently
   across versions; using the TypeVar directly is unambiguous.

3. `check_parsed.py`: use `cast(X, cast(object, None))` instead of
   `cast(X, None)` to avoid basedpyright's `reportInvalidCast` diagnostic
   (None and X share no overlap); assign `assert_type(…)` results to `_` to
   silence `reportUnusedCallResult`.

All three checkers (mypy, pyright 1.1.408+, basedpyright) now report clean
on both changed files.

Assisted-by: Claude Code
Signed-off-by: Nigel Jones <jonesn@uk.ibm.com>
---
 mellea/core/base.py         | 11 +++++------
 test/typing/check_parsed.py | 14 +++++++++-----
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/mellea/core/base.py b/mellea/core/base.py
index d54cc2dfc..6e38b792f 100644
--- a/mellea/core/base.py
+++ b/mellea/core/base.py
@@ -403,7 +403,7 @@ def __init__(
         # Mellea-side hook correlation ID; distinct from the provider-assigned
         # `GenerationMetadata.response_id`.
         self._generation_id: str | None = None
-        self._format: type[S] | None = None
+        self._format: type[pydantic.BaseModel] | None = None
 
     def _record_ttfb(self) -> None:
         """Record time-to-first-byte if streaming and not yet recorded."""
@@ -924,11 +924,10 @@ def parsed(self) -> S | None:
         """
         if self._format is None:
             return None
-        # `_format` is a pydantic model type in every code path that sets it (the
-        # `format=` overloads bind `S` to that model), but `S` itself is unbounded,
-        # so we narrow to call `model_validate_json` and re-assert the result as `S`.
-        fmt = cast("type[pydantic.BaseModel]", self._format)
-        return cast("S", fmt.model_validate_json(self.value))
+        # `_format` is always a pydantic model type; `model_validate_json` returns
+        # `pydantic.BaseModel` statically, but the caller's type parameter `S` is
+        # the concrete model when `format=` was used, so we cast the result to `S`.
+        return cast(S, self._format.model_validate_json(self.value))
 
     def is_computed(self) -> Literal[True]:
         """Returns `True` since thunk is always computed.
diff --git a/test/typing/check_parsed.py b/test/typing/check_parsed.py
index 2171f82b0..7fd521fe0 100644
--- a/test/typing/check_parsed.py
+++ b/test/typing/check_parsed.py
@@ -1,10 +1,14 @@
-"""Mypy checks that `ComputedModelOutputThunk.parsed` tracks the type parameter.
+"""Mypy / pyright checks that `ComputedModelOutputThunk.parsed` tracks the type parameter.
 
 `.parsed` is typed `S | None`, so a thunk parameterized with a Pydantic model
 (`ComputedModelOutputThunk[MyModel]`) exposes `.parsed` as `MyModel | None` —
 callers need no `cast()`. The `format=` overloads in `session.py` /
 `functional.py` bind `S` to the format model (companion issue #1274), at which
 point these checks hold end-to-end from the call site.
+
+The `cast(X, cast(object, None))` idiom creates a typed stub value without
+triggering basedpyright's ``reportInvalidCast`` rule (direct ``cast(X, None)``
+raises that diagnostic because ``None`` and ``X`` share no overlap).
 """
 
 from typing import assert_type, cast
@@ -19,10 +23,10 @@ class _Person(pydantic.BaseModel):
 
 
 def check_parsed_tracks_format_model() -> None:
-    thunk = cast(ComputedModelOutputThunk[_Person], None)
-    assert_type(thunk.parsed, _Person | None)
+    thunk = cast(ComputedModelOutputThunk[_Person], cast(object, None))
+    _ = assert_type(thunk.parsed, _Person | None)
 
 
 def check_parsed_is_str_for_str_thunk() -> None:
-    thunk = cast(ComputedModelOutputThunk[str], None)
-    assert_type(thunk.parsed, str | None)
+    thunk = cast(ComputedModelOutputThunk[str], cast(object, None))
+    _ = assert_type(thunk.parsed, str | None)

From 87b4b2f404fa1a679c5fad3e3bf2280722fddc20 Mon Sep 17 00:00:00 2001
From: Nigel Jones <jonesn@uk.ibm.com>
Date: Mon, 22 Jun 2026 15:29:30 +0100
Subject: [PATCH 10/18] docs(thunk): tighten .parsed and
 ComputedModelOutputThunk.value docstrings

- ComputedModelOutputThunk.value now carries the same raw-JSON guidance as
  the parent override so callers inspecting the subclass see it directly.
- .parsed opening paragraph no longer overstates current type inference: the
  format= overloads do not yet bind S to the format model, so the cast idiom
  is required; removed the false claim that m.act(format=MyModel) yields a
  typed thunk without a cast.
- Added one-line distinction from parsed_repr to prevent confusion between
  the two properties.

Signed-off-by: Nigel Jones <jonesn@uk.ibm.com>
Assisted-by: Claude Code
---
 mellea/core/base.py | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/mellea/core/base.py b/mellea/core/base.py
index 6e38b792f..c135c60f1 100644
--- a/mellea/core/base.py
+++ b/mellea/core/base.py
@@ -885,7 +885,12 @@ async def astream(self) -> str:
 
     @property
     def value(self) -> str:
-        """Gets the value of the block."""
+        """Gets the raw string value of the block.
+
+        When ``format=`` is set on the originating ``act()``/``instruct()`` call, the
+        model returns a JSON string and ``.value`` contains that raw JSON — not a
+        Pydantic instance.  Use ``.parsed`` to get the validated model object.
+        """
         return self._underlying_value  # type: ignore
 
     @value.setter
@@ -897,15 +902,17 @@ def value(self, v: str):
     def parsed(self) -> S | None:
         """Returns the result as a validated Pydantic instance when ``format=`` was set.
 
-        The return type tracks the format type supplied at the originating call
-        site: ``m.act(action, format=MyModel)`` yields a
-        ``ComputedModelOutputThunk[MyModel]`` whose ``.parsed`` is typed
-        ``MyModel | None`` — no ``cast()`` required. Returns ``None`` when no
-        ``format=`` type was provided.  Use this instead of casting ``.value``
-        manually::
+        The return type is ``S | None``, where ``S`` is the thunk's type parameter.
+        The ``format=`` overloads do not yet bind ``S`` to the format model, so
+        callers must parameterize the thunk explicitly to get a narrowed type::
+
+            thunk = cast(ComputedModelOutputThunk[MyModel], result)
+            obj = thunk.parsed  # typed MyModel | None, no model_validate_json needed
 
-            result = m.act(Instruction("Say yes or no"), format=MyModel)
-            obj = result.parsed  # typed MyModel | None, no model_validate_json needed
+        Returns ``None`` when no ``format=`` type was provided.  Unlike
+        ``parsed_repr`` (which holds the action-specific parse result),
+        ``.parsed`` always re-validates the raw JSON string against ``_format``
+        via ``model_validate_json``.
 
         Note:
             This property relies on the originating backend storing the format

From 740293ff6c00779a27721e470893759527512a58 Mon Sep 17 00:00:00 2001
From: Nigel Jones <jonesn@uk.ibm.com>
Date: Mon, 22 Jun 2026 12:56:50 +0100
Subject: [PATCH 11/18] docs(types): expand genstub return-value ignore
 rationale

Explain why the format= overloads can't narrow the genstub return type:
the overloads narrow the thunk's element type, but a genstub returns the
unwrapped inner value R, not the thunk, and parsed_repr (S | None) can't
be re-bound to R at this boundary. Add a TODO pointing at the clean shape
(ComputedModelOutputThunk[R] with the FunctionResponse[R] unwrap in a typed
parse step), noting it depends on the thunk-generics redesign out of scope here.

Assisted-by: Claude Code
Signed-off-by: Nigel Jones <jonesn@uk.ibm.com>
---
 mellea/stdlib/components/genstub.py | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/mellea/stdlib/components/genstub.py b/mellea/stdlib/components/genstub.py
index 632696ab8..5a3877e8f 100644
--- a/mellea/stdlib/components/genstub.py
+++ b/mellea/stdlib/components/genstub.py
@@ -653,10 +653,19 @@ def __call__(self, *args, **kwargs) -> tuple[R, Context] | R:
             )
 
         assert response.parsed_repr is not None
+        # The format= overloads on act/aact narrow the *thunk's* element type, but a
+        # genstub must return the inner value R (the unwrapped FunctionResponse[R]
+        # payload), not the thunk. `parsed_repr` is typed `S | None` at the call
+        # site and cannot be re-bound to R here, so the ignore bridges the gap.
+        #
+        # TODO: the clean shape is for act/aact to deliver a ComputedModelOutputThunk[R]
+        # whose value is R, with the FunctionResponse[R] unwrap happening inside a typed
+        # parse step rather than at the genstub boundary. That requires coordinating the
+        # thunk-generics redesign (see the .parsed work) and is outside this PR's scope.
         if context is None:
-            return response.parsed_repr  # type: ignore[return-value]  # genstub unwraps R from FunctionResponse[R]; format overloads can't re-bind R here
+            return response.parsed_repr  # type: ignore[return-value]
         else:
-            return response.parsed_repr, context  # type: ignore[return-value]  # same
+            return response.parsed_repr, context  # type: ignore[return-value]
 
 
 class AsyncGenerativeStub(GenerativeStub, Generic[P, R]):
@@ -796,10 +805,16 @@ async def __async_call__() -> tuple[R, Context] | R:
                 "unexpectedly received uncomputed model output thunk in async generative stub"
             )
             assert response.parsed_repr is not None
+            # See the SyncGenerativeStub.__call__ comment above: the format= overloads
+            # narrow the thunk's element type, but a genstub returns the unwrapped inner
+            # value R, not the thunk. `parsed_repr` is `S | None` and can't be re-bound to
+            # R here. The clean fix (ComputedModelOutputThunk[R] with the FunctionResponse[R]
+            # unwrap in a typed parse step) needs the thunk-generics redesign and is out of
+            # scope for this PR.
             if context is None:
-                return response.parsed_repr  # type: ignore[return-value]  # genstub unwraps R from FunctionResponse[R]; format overloads can't re-bind R here
+                return response.parsed_repr  # type: ignore[return-value]
             else:
-                return response.parsed_repr, context  # type: ignore[return-value]  # same
+                return response.parsed_repr, context  # type: ignore[return-value]
 
         return __async_call__()
 

From cb2bbf1e583c1dc14093c239775bf4b6d9983dd5 Mon Sep 17 00:00:00 2001
From: Nigel Jones <jonesn@uk.ibm.com>
Date: Mon, 22 Jun 2026 12:57:23 +0100
Subject: [PATCH 12/18] test(types): assert attribute-level narrowing for act
 format= overload
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The existing check only confirmed the overload resolved to
ComputedModelOutputThunk[_M]; it did not pin down how the attributes are typed.
Add check_act_format_attributes asserting parsed_repr narrows to `_M | None`
(what the overloads actually narrow) and documenting that `.value` stays
unconditionally `str` — the known limitation pending the coordinated
thunk-generics / `.parsed` redesign. Locks in what IS narrowed and calls
out what isn't.

Assisted-by: Claude Code
Signed-off-by: Nigel Jones <jonesn@uk.ibm.com>
---
 test/typing/check_session.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/test/typing/check_session.py b/test/typing/check_session.py
index 9ea11bf76..0f3748993 100644
--- a/test/typing/check_session.py
+++ b/test/typing/check_session.py
@@ -66,6 +66,25 @@ def check_act_format() -> None:
     assert_type(r, ComputedModelOutputThunk[_M])
 
 
+def check_act_format_attributes() -> None:
+    # Locks in what the format= overloads actually narrow at the attribute level.
+    r = s.act(action, format=_M)
+
+    # `parsed_repr` is the attribute the overloads narrow: it carries the generic
+    # element type S, so with format=_M it resolves to `_M | None`.
+    assert_type(r.parsed_repr, _M | None)
+
+    # KNOWN LIMITATION: `.value` is typed `-> str` unconditionally on
+    # ComputedModelOutputThunk (see mellea/core/base.py), so it does NOT narrow to
+    # `_M` even though the thunk is parameterised `[_M]`. At runtime `.value` is the
+    # raw string and `.parsed_repr` is also a plain str (Instruction._parse returns
+    # str), so `parsed_repr.value` type-checks but AttributeErrors. Asserting
+    # `assert_type(r.value, _M)` here would (correctly) fail mypy. Both the static
+    # `.value` type and the runtime parsed_repr mismatch are pending the coordinated
+    # thunk-generics / `.parsed` redesign (PR #1282).
+    assert_type(r.value, str)
+
+
 def check_instruct_format() -> None:
     r = s.instruct("test", format=_M)
     assert_type(r, ComputedModelOutputThunk[_M])

From d055522b50f15d829f8f71139a549642fd189b34 Mon Sep 17 00:00:00 2001
From: Nigel Jones <jonesn@uk.ibm.com>
Date: Mon, 22 Jun 2026 12:57:58 +0100
Subject: [PATCH 13/18] docs(types): explain intentional Any widening on act
 implementation

Document that the act() implementation return type is widened to `Any` on
purpose: the @overload signatures own the precise S propagation and the
format= -> BaseModelSubclass narrowing, and tightening the body to the bare
`[S]` case would conflict with the format= and sampling overloads. Callers
always resolve against an overload, never the implementation body.

Assisted-by: Claude Code
Signed-off-by: Nigel Jones <jonesn@uk.ibm.com>
---
 mellea/stdlib/functional.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/mellea/stdlib/functional.py b/mellea/stdlib/functional.py
index 73050603d..603484630 100644
--- a/mellea/stdlib/functional.py
+++ b/mellea/stdlib/functional.py
@@ -101,6 +101,11 @@ def act(
     format: type[BaseModelSubclass] | None = None,
     model_options: dict | None = None,
     tool_calls: bool = False,
+    # Implementation return type intentionally widened to `Any`: the @overload signatures
+    # above own the precise S propagation (action's S -> thunk's S, plus the
+    # format= -> BaseModelSubclass narrowing). Tightening the body to the bare `[S]` case
+    # would conflict with the format= and sampling overloads, so it is left untyped on
+    # purpose. Callers always resolve against an overload, never this implementation body.
 ) -> tuple[ComputedModelOutputThunk[Any], Context] | SamplingResult[Any]:
     """Runs a generic action, and adds both the action and the result to the context.
 

From 91b9f7cff795708d9ef2f178e76594a0b7a921e0 Mon Sep 17 00:00:00 2001
From: Nigel Jones <jonesn@uk.ibm.com>
Date: Mon, 22 Jun 2026 12:58:51 +0100
Subject: [PATCH 14/18] docs(types): document why format passthrough wrappers
 keep the ignore

A third union overload `format: type[BaseModelSubclass] | None` cannot narrow
cleanly: it overlaps the existing `format=None` overload, so the return type
collapses to the union and the narrowing is lost. The clean fix for a wrapper
forwarding a dynamic format is to branch on `format is None` so each call
matches a narrow overload. Add comments at the react and m_serve passthrough
sites explaining this and why the ignore is preferred over branching at those
single call sites.

Assisted-by: Claude Code
Signed-off-by: Nigel Jones <jonesn@uk.ibm.com>
---
 docs/examples/m_serve/m_serve_example_response_format.py | 5 +++++
 mellea/stdlib/frameworks/react.py                        | 9 +++++++++
 2 files changed, 14 insertions(+)

diff --git a/docs/examples/m_serve/m_serve_example_response_format.py b/docs/examples/m_serve/m_serve_example_response_format.py
index 5a85d5094..667b3572e 100644
--- a/docs/examples/m_serve/m_serve_example_response_format.py
+++ b/docs/examples/m_serve/m_serve_example_response_format.py
@@ -46,6 +46,11 @@ def serve(
 
     # When format is provided (from json_schema response_format),
     # pass it to instruct() to get structured output
+    # `format` arrives as a dynamic `type | None` from the response_format request, so it
+    # matches no single narrow instruct() overload (those key off format=None vs a concrete
+    # type). A serving wrapper that wanted cast-free typing would branch on `format is None`
+    # and call instruct() in each branch; here the passthrough ignore keeps the example
+    # focused on the serving flow.
     result = session.instruct(
         description=message,
         requirements=requirements,  # type: ignore
diff --git a/mellea/stdlib/frameworks/react.py b/mellea/stdlib/frameworks/react.py
index 271abcceb..bebf82338 100644
--- a/mellea/stdlib/frameworks/react.py
+++ b/mellea/stdlib/frameworks/react.py
@@ -111,6 +111,15 @@ async def react(
             assert len(tool_responses) == 1, "multiple tools were called with 'final'"
 
             if format is not None:
+                # `format` is a dynamic `type[BaseModelSubclass] | None` forwarded from
+                # the caller, which matches no single narrow aact() overload (those key
+                # off `format=None` vs `format=<type>` as distinct literals). We are
+                # already inside `if format is not None`, so the value is known non-None
+                # here, but mypy does not propagate that narrowing into the overload pick.
+                # The clean fix is for the caller to branch on `format is None` and call
+                # aact in each branch so each call matches a narrow overload; that is
+                # not worth the duplication for this single internal call site, so we
+                # accept the ignore.
                 step, next_context = await mfuncs.aact(  # type: ignore[assignment]  # dynamic format from caller
                     action=ReactThought(),
                     context=context,

From 8c478e2a485dab289596a2a951d0f9ce68b465da Mon Sep 17 00:00:00 2001
From: Nigel Jones <jonesn@uk.ibm.com>
Date: Mon, 22 Jun 2026 13:00:35 +0100
Subject: [PATCH 15/18] docs(types): note runtime/type mismatch on act format=
 overloads
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The format= overloads narrow the thunk's generic element type, observable on
parsed_repr (S | None), not on .value — ComputedModelOutputThunk.value is
typed `-> str` unconditionally. parsed_repr also currently routes through
Instruction._parse (returns str), so parsed_repr.some_field type-checks but
raises AttributeError at runtime: the same silent-failure shape #1274 set out to
fix, relocated to parsed_repr. Add a TODO pointing at the coordinated .parsed
redesign (PR #1282) as the proper fix, out of scope here. PR body updated to
match (was claiming .value narrows to MyModel).

Assisted-by: Claude Code
Signed-off-by: Nigel Jones <jonesn@uk.ibm.com>
---
 mellea/stdlib/session.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/mellea/stdlib/session.py b/mellea/stdlib/session.py
index 4c01659c4..d1e3cd844 100644
--- a/mellea/stdlib/session.py
+++ b/mellea/stdlib/session.py
@@ -390,6 +390,14 @@ def cleanup(self) -> None:
 
             deregister_session_plugins(self.id)
 
+    # The format= overloads below narrow the thunk's generic element type. That narrowing
+    # is observable on `parsed_repr: S | None`, NOT on `.value` — ComputedModelOutputThunk.value
+    # is typed `-> str` unconditionally (mellea/core/base.py). There is also a runtime gap:
+    # parsed_repr currently goes through Instruction._parse, which returns a plain str, so
+    # `result.parsed_repr.some_field` type-checks but raises AttributeError at runtime.
+    # TODO: a coherent end state has the thunk's `.parsed` generic over S backed by a runtime
+    # path that delivers S. That is a coordinated change tracked by PR #1282 and is out of
+    # scope here; these overloads only land the static format= narrowing where they can.
     @overload
     def act(
         self,

From d8534450557a5e3c5fd84c0f9854cf9b3c6d0f90 Mon Sep 17 00:00:00 2001
From: Nigel Jones <jonesn@uk.ibm.com>
Date: Mon, 22 Jun 2026 13:19:07 +0100
Subject: [PATCH 16/18] fix(types): branch on format is None in m_serve example
 to remove type: ignore

Serving wrapper now calls instruct() in two branches so each matches a
narrow overload. The react.py site (already inside if format is not None)
is a separate problem requiring broader restructuring; documented in-place.

Assisted-by: Claude Code
Signed-off-by: Nigel Jones <jonesn@uk.ibm.com>
---
 .../m_serve_example_response_format.py        | 26 +++++++++----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/docs/examples/m_serve/m_serve_example_response_format.py b/docs/examples/m_serve/m_serve_example_response_format.py
index 667b3572e..fbb3453d5 100644
--- a/docs/examples/m_serve/m_serve_example_response_format.py
+++ b/docs/examples/m_serve/m_serve_example_response_format.py
@@ -44,18 +44,18 @@ def serve(
     """
     message = input[-1].get_text_content() or "No message provided"
 
-    # When format is provided (from json_schema response_format),
-    # pass it to instruct() to get structured output
-    # `format` arrives as a dynamic `type | None` from the response_format request, so it
-    # matches no single narrow instruct() overload (those key off format=None vs a concrete
-    # type). A serving wrapper that wanted cast-free typing would branch on `format is None`
-    # and call instruct() in each branch; here the passthrough ignore keeps the example
-    # focused on the serving flow.
-    result = session.instruct(
-        description=message,
-        requirements=requirements,  # type: ignore
-        model_options=model_options,
-        format=format,  # type: ignore[arg-type]  # dynamic format from caller
-    )
+    if format is None:
+        result = session.instruct(
+            description=message,
+            requirements=requirements,  # type: ignore
+            model_options=model_options,
+        )
+    else:
+        result = session.instruct(
+            description=message,
+            requirements=requirements,  # type: ignore
+            model_options=model_options,
+            format=format,
+        )
 
     return result

From c56d89d5ea0c31685a6d2de4cc9e8f1ccb414c3c Mon Sep 17 00:00:00 2001
From: Nigel Jones <jonesn@uk.ibm.com>
Date: Mon, 22 Jun 2026 13:22:12 +0100
Subject: [PATCH 17/18] fix(types): replace type: ignore[return-value] with
 cast in genstub

GenerativeStub._parse already unwraps FunctionResponse[R] and returns R
at runtime. The thunk types parsed_repr as S | None (S = FunctionResponse[R])
because the overloads narrow S to the format type, not R. Replace the
return-value ignores with an explicit cast to make the coercion visible.

Assisted-by: Claude Code
Signed-off-by: Nigel Jones <jonesn@uk.ibm.com>
---
 mellea/stdlib/components/genstub.py | 42 +++++++++++++++--------------
 1 file changed, 22 insertions(+), 20 deletions(-)

diff --git a/mellea/stdlib/components/genstub.py b/mellea/stdlib/components/genstub.py
index 5a3877e8f..480ea2e85 100644
--- a/mellea/stdlib/components/genstub.py
+++ b/mellea/stdlib/components/genstub.py
@@ -6,7 +6,16 @@
 from collections.abc import Awaitable, Callable, Coroutine
 from copy import deepcopy
 from dataclasses import dataclass, fields
-from typing import Any, Generic, ParamSpec, TypedDict, TypeVar, get_type_hints, overload
+from typing import (
+    Any,
+    Generic,
+    ParamSpec,
+    TypedDict,
+    TypeVar,
+    cast,
+    get_type_hints,
+    overload,
+)
 
 from pydantic import BaseModel, Field, create_model
 
@@ -653,19 +662,15 @@ def __call__(self, *args, **kwargs) -> tuple[R, Context] | R:
             )
 
         assert response.parsed_repr is not None
-        # The format= overloads on act/aact narrow the *thunk's* element type, but a
-        # genstub must return the inner value R (the unwrapped FunctionResponse[R]
-        # payload), not the thunk. `parsed_repr` is typed `S | None` at the call
-        # site and cannot be re-bound to R here, so the ignore bridges the gap.
-        #
-        # TODO: the clean shape is for act/aact to deliver a ComputedModelOutputThunk[R]
-        # whose value is R, with the FunctionResponse[R] unwrap happening inside a typed
-        # parse step rather than at the genstub boundary. That requires coordinating the
-        # thunk-generics redesign (see the .parsed work) and is outside this PR's scope.
+        # GenerativeStub._parse calls model_validate_json and returns the unwrapped R,
+        # so parsed_repr is R at runtime. The thunk types it as S | None (where
+        # S = FunctionResponse[R]) because the overloads narrow S to the format type,
+        # not to R. cast makes the coercion explicit rather than suppressing it.
+        parsed = cast("R", response.parsed_repr)
         if context is None:
-            return response.parsed_repr  # type: ignore[return-value]
+            return parsed
         else:
-            return response.parsed_repr, context  # type: ignore[return-value]
+            return parsed, context
 
 
 class AsyncGenerativeStub(GenerativeStub, Generic[P, R]):
@@ -805,16 +810,13 @@ async def __async_call__() -> tuple[R, Context] | R:
                 "unexpectedly received uncomputed model output thunk in async generative stub"
             )
             assert response.parsed_repr is not None
-            # See the SyncGenerativeStub.__call__ comment above: the format= overloads
-            # narrow the thunk's element type, but a genstub returns the unwrapped inner
-            # value R, not the thunk. `parsed_repr` is `S | None` and can't be re-bound to
-            # R here. The clean fix (ComputedModelOutputThunk[R] with the FunctionResponse[R]
-            # unwrap in a typed parse step) needs the thunk-generics redesign and is out of
-            # scope for this PR.
+            # Same as SyncGenerativeStub: _parse returns the unwrapped R at runtime;
+            # cast makes the S → R coercion explicit.
+            parsed = cast("R", response.parsed_repr)
             if context is None:
-                return response.parsed_repr  # type: ignore[return-value]
+                return parsed
             else:
-                return response.parsed_repr, context  # type: ignore[return-value]
+                return parsed, context
 
         return __async_call__()
 

From c054f1c0150a3ad712436f22147e12f1d9cf2eb9 Mon Sep 17 00:00:00 2001
From: Nigel Jones <jonesn@uk.ibm.com>
Date: Tue, 23 Jun 2026 10:44:57 +0100
Subject: [PATCH 18/18] fix(types): widen instruct/ainstruct format param to
 type[Any] in implementations

On Python 3.13, mypy 1.20 raises [misc] for overloaded-function
implementations where a TypeVar (BaseModelSubclass) is only constrained
through the `format` parameter and not anchored in any other parameter or
return type.  The act/aact implementations are unaffected because their
TypeVar S is also present in `action: Component[S]`.

The fix: replace `type[BaseModelSubclass] | None` with `type[Any] | None`
in the four non-overload implementation signatures (functional.instruct,
functional.ainstruct, MelleaSession.instruct, MelleaSession.ainstruct).
The TypeVar narrowing still lives in the @overload signatures where it
belongs; callers never resolve against the implementation body.

Assisted-by: Claude Code
Signed-off-by: Nigel Jones <jonesn@uk.ibm.com>
---
 mellea/stdlib/functional.py | 6 ++++--
 mellea/stdlib/session.py    | 6 ++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/mellea/stdlib/functional.py b/mellea/stdlib/functional.py
index 603484630..f6ac03ab0 100644
--- a/mellea/stdlib/functional.py
+++ b/mellea/stdlib/functional.py
@@ -228,7 +228,8 @@ def instruct(
     output_prefix: str | CBlock | None = None,
     strategy: SamplingStrategy | None = RejectionSamplingStrategy(loop_budget=2),
     return_sampling_results: bool = False,
-    format: type[BaseModelSubclass] | None = None,
+    format: type[Any]
+    | None = None,  # widened: TypeVar only needed in @overload signatures
     model_options: dict | None = None,
     tool_calls: bool = False,
 ) -> tuple[ComputedModelOutputThunk[Any], Context] | SamplingResult[Any]:
@@ -1022,7 +1023,8 @@ async def ainstruct(
     output_prefix: str | CBlock | None = None,
     strategy: SamplingStrategy | None = RejectionSamplingStrategy(loop_budget=2),
     return_sampling_results: bool = False,
-    format: type[BaseModelSubclass] | None = None,
+    format: type[Any]
+    | None = None,  # widened: TypeVar only needed in @overload signatures
     model_options: dict | None = None,
     tool_calls: bool = False,
     await_result: bool = False,
diff --git a/mellea/stdlib/session.py b/mellea/stdlib/session.py
index d1e3cd844..80ad14adb 100644
--- a/mellea/stdlib/session.py
+++ b/mellea/stdlib/session.py
@@ -552,7 +552,8 @@ def instruct(
         output_prefix: str | CBlock | None = None,
         strategy: SamplingStrategy | None = RejectionSamplingStrategy(loop_budget=2),
         return_sampling_results: bool = False,
-        format: type[BaseModelSubclass] | None = None,
+        format: type[Any]
+        | None = None,  # widened: TypeVar only needed in @overload signatures
         model_options: dict | None = None,
         tool_calls: bool = False,
     ) -> ModelOutputThunk[Any] | SamplingResult[Any]:
@@ -1047,7 +1048,8 @@ async def ainstruct(
         output_prefix: str | CBlock | None = None,
         strategy: SamplingStrategy | None = RejectionSamplingStrategy(loop_budget=2),
         return_sampling_results: bool = False,
-        format: type[BaseModelSubclass] | None = None,
+        format: type[Any]
+        | None = None,  # widened: TypeVar only needed in @overload signatures
         model_options: dict | None = None,
         tool_calls: bool = False,
         await_result: bool = False,