From 40b97d63d548d402eb5473a861488516be410240 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Mon, 22 Jun 2026 13:53:20 -0400 Subject: [PATCH 1/2] refactor(core): promote ModelOutputThunk.thinking and deprecate _thinking alias Signed-off-by: Phil Williams --- docs/docs/integrations/openai.md | 4 +-- mellea/backends/litellm.py | 8 ++--- mellea/backends/ollama.py | 6 ++-- mellea/backends/openai.py | 8 ++--- mellea/backends/watsonx.py | 8 ++--- mellea/core/base.py | 29 ++++++++++++++++--- mellea/stdlib/requirements/safety/guardian.py | 2 +- test/backends/test_litellm_thinking.py | 22 +++++++------- test/backends/test_openai_unit.py | 14 ++++----- test/core/test_base.py | 13 +++++++++ 10 files changed, 74 insertions(+), 40 deletions(-) diff --git a/docs/docs/integrations/openai.md b/docs/docs/integrations/openai.md index b37268c02..a7396fbe4 100644 --- a/docs/docs/integrations/openai.md +++ b/docs/docs/integrations/openai.md @@ -357,7 +357,7 @@ Diagnose with: result = m.instruct("What is 2 + 2?") print(repr(result.value)) # '' print(result.generation.usage) # {'completion_tokens': 9, ...} -print(result._thinking) # populated reasoning content, if any +print(result.thinking) # populated reasoning content, if any ``` This affects models that default to thinking mode, most commonly Qwen3 served @@ -383,7 +383,7 @@ m = MelleaSession( Other inference servers expose the same control under different names — check your runtime's documentation. If you intend to use thinking mode, read the -reasoning trace from `result._thinking` rather than `result.value`. +reasoning trace from `result.thinking` rather than `result.value`. --- diff --git a/mellea/backends/litellm.py b/mellea/backends/litellm.py index 0caee3eb0..0f3c24e66 100644 --- a/mellea/backends/litellm.py +++ b/mellea/backends/litellm.py @@ -470,8 +470,8 @@ async def processing( chunk (litellm.ModelResponse | litellm.ModelResponseStream): A single response object or streaming chunk from LiteLLM. 
""" - if mot._thinking is None: - mot._thinking = "" + if mot.thinking is None: + mot.thinking = "" if mot._underlying_value is None: mot._underlying_value = "" @@ -490,7 +490,7 @@ async def processing( if thinking_chunk is None: thinking_chunk = message.get("reasoning") if thinking_chunk is not None: - mot._thinking += thinking_chunk + mot.thinking += thinking_chunk content_chunk = message.content if content_chunk is not None: @@ -509,7 +509,7 @@ async def processing( if thinking_chunk is None: thinking_chunk = message_delta.get("reasoning") if thinking_chunk is not None: - mot._thinking += thinking_chunk + mot.thinking += thinking_chunk content_chunk = message_delta.content if content_chunk is not None: diff --git a/mellea/backends/ollama.py b/mellea/backends/ollama.py index 0a7c28297..a26bc77f5 100644 --- a/mellea/backends/ollama.py +++ b/mellea/backends/ollama.py @@ -683,11 +683,11 @@ async def processing( tools (dict[str, AbstractMelleaTool]): Available tools, keyed by name, used for extracting tool call requests from the response. """ - if mot._thinking is None: - mot._thinking = "" + if mot.thinking is None: + mot.thinking = "" thinking_chunk = chunk.message.thinking if thinking_chunk is not None: - mot._thinking += thinking_chunk + mot.thinking += thinking_chunk if mot._underlying_value is None: mot._underlying_value = "" diff --git a/mellea/backends/openai.py b/mellea/backends/openai.py index 3683a81a3..17839b516 100644 --- a/mellea/backends/openai.py +++ b/mellea/backends/openai.py @@ -1002,8 +1002,8 @@ async def processing( chunk (ChatCompletion | ChatCompletionChunk): A single response object or streaming delta from the OpenAI API. 
""" - if mot._thinking is None: - mot._thinking = "" + if mot.thinking is None: + mot.thinking = "" if mot._underlying_value is None: mot._underlying_value = "" @@ -1016,7 +1016,7 @@ async def processing( if thinking_chunk is None: thinking_chunk = (message.model_extra or {}).get("reasoning") if thinking_chunk is not None: - mot._thinking += thinking_chunk + mot.thinking += thinking_chunk content_chunk = message.content if content_chunk is not None: @@ -1041,7 +1041,7 @@ async def processing( if thinking_chunk is None: thinking_chunk = (message_delta.model_extra or {}).get("reasoning") if thinking_chunk is not None: - mot._thinking += thinking_chunk + mot.thinking += thinking_chunk content_chunk = message_delta.content if content_chunk is not None: diff --git a/mellea/backends/watsonx.py b/mellea/backends/watsonx.py index 87b42cccb..13d2a28a6 100644 --- a/mellea/backends/watsonx.py +++ b/mellea/backends/watsonx.py @@ -484,8 +484,8 @@ async def processing(self, mot: ModelOutputThunk, chunk: dict): mot (ModelOutputThunk): The output thunk being populated. chunk (dict): A single response dict or streaming delta from the WatsonX API. 
""" - if mot._thinking is None: - mot._thinking = "" + if mot.thinking is None: + mot.thinking = "" if mot._underlying_value is None: mot._underlying_value = "" @@ -499,7 +499,7 @@ async def processing(self, mot: ModelOutputThunk, chunk: dict): thinking_chunk = message.get("reasoning_content", None) if thinking_chunk is not None: - mot._thinking += thinking_chunk + mot.thinking += thinking_chunk content_chunk = message.get("content", "") if content_chunk is not None: @@ -515,7 +515,7 @@ async def processing(self, mot: ModelOutputThunk, chunk: dict): thinking_chunk = message_delta.get("reasoning_content", None) if thinking_chunk is not None: - mot._thinking += thinking_chunk + mot.thinking += thinking_chunk content_chunk = message_delta.get("content", None) if content_chunk is not None: diff --git a/mellea/core/base.py b/mellea/core/base.py index ff2202559..933dc4e73 100644 --- a/mellea/core/base.py +++ b/mellea/core/base.py @@ -18,6 +18,7 @@ import datetime import enum import logging +import warnings from collections.abc import Callable, Coroutine, Iterable, Mapping from copy import copy, deepcopy from dataclasses import dataclass @@ -394,7 +395,7 @@ def __init__( # Additional fields that should be standardized across apis. self.tool_calls = tool_calls - self._thinking: str | None = None + self.thinking: str | None = None self.generation: GenerationMetadata = GenerationMetadata() """Backend execution metadata populated during generation.""" @@ -594,7 +595,7 @@ def _copy_from(self, other: ModelOutputThunk) -> None: self._meta = other._meta self.parsed_repr = other.parsed_repr self.tool_calls = other.tool_calls - self._thinking = other._thinking + self.thinking = other.thinking self.generation = other.generation self._generate_log = other._generate_log self._cancelled = other._cancelled @@ -611,6 +612,26 @@ def is_computed(self) -> bool: """ return self._computed + @property + def _thinking(self) -> str | None: + """Deprecated alias for :attr:`thinking`. 
+ + Returns: + str | None: The model's reasoning/thinking trace. + """ + warnings.warn( + "`ModelOutputThunk._thinking` is deprecated and will be removed in a " + "future minor release. Use `ModelOutputThunk.thinking` instead.", + DeprecationWarning, + stacklevel=2, + ) + return self.thinking + + @_thinking.setter + def _thinking(self, value: str | None) -> None: + """Deprecated write alias for :attr:`thinking`.""" + self.thinking = value + @property def value(self) -> str | None: """Gets the value of the block.""" @@ -829,7 +850,7 @@ def __copy__(self) -> ModelOutputThunk: # _cancel_hook is not forwarded: a copied MOT is a distinct computation # and must not share the original's backend thread signal. copied._cancel_hook = None - copied._thinking = self._thinking + copied.thinking = self.thinking copied._action = self._action copied._context = self._context copied._generate_log = self._generate_log @@ -862,7 +883,7 @@ def __deepcopy__(self, memo: dict) -> ModelOutputThunk: # _cancel_hook is not forwarded: a deepcopied MOT is a distinct computation # and must not share the original's backend thread signal. deepcopied._cancel_hook = None - deepcopied._thinking = self._thinking + deepcopied.thinking = self.thinking deepcopied._action = deepcopy(self._action) deepcopied._context = copy( self._context diff --git a/mellea/stdlib/requirements/safety/guardian.py b/mellea/stdlib/requirements/safety/guardian.py index 42c463612..58d15de64 100644 --- a/mellea/stdlib/requirements/safety/guardian.py +++ b/mellea/stdlib/requirements/safety/guardian.py @@ -389,7 +389,7 @@ async def validate( await mot.avalue() # Prefer explicit thinking if available, else try to split from output text. 
- trace = getattr(mot, "_thinking", None) + trace = mot.thinking text = mot.value or "" if trace is None and "</think>" in text: parts = text.split("</think>") diff --git a/test/backends/test_litellm_thinking.py b/test/backends/test_litellm_thinking.py index dfd08fe7e..68b182186 100644 --- a/test/backends/test_litellm_thinking.py +++ b/test/backends/test_litellm_thinking.py @@ -1,4 +1,4 @@ -"""Unit tests for LiteLLMBackend mot._thinking population. +"""Unit tests for LiteLLMBackend mot.thinking population. Covers the vLLM case where the wire key is ``"reasoning"`` instead of ``"reasoning_content"``, and the case where LiteLLM has already normalised @@ -75,7 +75,7 @@ async def test_processing_non_streaming_reasoning_content_key(backend: LiteLLMBa reasoning_value="France has its capital in Paris.", ) await backend.processing(mot, chunk) - assert mot._thinking == "France has its capital in Paris." + assert mot.thinking == "France has its capital in Paris." assert mot._underlying_value == "Paris" @@ -88,7 +88,7 @@ async def test_processing_non_streaming_reasoning_raw_key(backend: LiteLLMBacken reasoning_value="France has its capital in Paris.", ) await backend.processing(mot, chunk) - assert mot._thinking == "France has its capital in Paris." + assert mot.thinking == "France has its capital in Paris."
assert mot._underlying_value == "Paris" @@ -109,7 +109,7 @@ async def test_processing_non_streaming_reasoning_content_wins_over_reasoning( object="chat.completion", ) await backend.processing(mot, chunk) - assert mot._thinking == "from_reasoning_content" + assert mot.thinking == "from_reasoning_content" async def test_processing_non_streaming_no_reasoning(backend: LiteLLMBackend): @@ -121,7 +121,7 @@ async def test_processing_non_streaming_no_reasoning(backend: LiteLLMBackend): reasoning_value="should be ignored", ) await backend.processing(mot, chunk) - assert mot._thinking == "" + assert mot.thinking == "" assert mot._underlying_value == "Paris" @@ -147,7 +147,7 @@ async def test_processing_non_streaming_empty_reasoning_content_does_not_fall_ba object="chat.completion", ) await backend.processing(mot, chunk) - assert mot._thinking == "" + assert mot.thinking == "" # --------------------------------------------------------------------------- @@ -163,7 +163,7 @@ async def test_processing_streaming_reasoning_content_key(backend: LiteLLMBacken content="", reasoning_key="reasoning_content", reasoning_value=text ) await backend.processing(mot, stream_chunk) - assert mot._thinking == "chunk1 chunk2" + assert mot.thinking == "chunk1 chunk2" async def test_processing_streaming_reasoning_raw_key(backend: LiteLLMBackend): @@ -174,7 +174,7 @@ async def test_processing_streaming_reasoning_raw_key(backend: LiteLLMBackend): content="", reasoning_key="reasoning", reasoning_value=text ) await backend.processing(mot, stream_chunk) - assert mot._thinking == "chunk1 chunk2" + assert mot.thinking == "chunk1 chunk2" async def test_processing_streaming_reasoning_content_wins_over_reasoning( @@ -190,7 +190,7 @@ async def test_processing_streaming_reasoning_content_wins_over_reasoning( id="test", choices=[chunk_choice], created=0, model="openai/qwen3" ) await backend.processing(mot, stream_chunk) - assert mot._thinking == "from_reasoning_content" + assert mot.thinking == 
"from_reasoning_content" async def test_processing_streaming_no_reasoning(backend: LiteLLMBackend): @@ -200,7 +200,7 @@ async def test_processing_streaming_no_reasoning(backend: LiteLLMBackend): content="Paris", reasoning_key="unrelated_key", reasoning_value="ignored" ) await backend.processing(mot, stream_chunk) - assert mot._thinking == "" + assert mot.thinking == "" assert mot._underlying_value == "Paris" @@ -220,7 +220,7 @@ async def test_processing_streaming_empty_reasoning_content_does_not_fall_back( id="test", choices=[chunk_choice], created=0, model="openai/qwen3" ) await backend.processing(mot, stream_chunk) - assert mot._thinking == "" + assert mot.thinking == "" # --------------------------------------------------------------------------- diff --git a/test/backends/test_openai_unit.py b/test/backends/test_openai_unit.py index 03bd70df0..7ee7d84ea 100644 --- a/test/backends/test_openai_unit.py +++ b/test/backends/test_openai_unit.py @@ -193,7 +193,7 @@ def _vllm_chat_completion(reasoning: str, content: str | None) -> ChatCompletion async def test_processing_captures_vllm_reasoning_field(backend): - """Non-streaming: mot._thinking captures the raw ``reasoning`` key from vLLM.""" + """Non-streaming: mot.thinking captures the raw ``reasoning`` key from vLLM.""" mot: ModelOutputThunk = ModelOutputThunk(value=None) chunk = _vllm_chat_completion(reasoning="2 + 2 equals 4.", content="4") # Sanity check: the SDK object does not expose reasoning_content @@ -201,7 +201,7 @@ async def test_processing_captures_vllm_reasoning_field(backend): await backend.processing(mot, chunk) - assert mot._thinking == "2 + 2 equals 4." + assert mot.thinking == "2 + 2 equals 4." 
assert mot._underlying_value == "4" @@ -212,12 +212,12 @@ async def test_processing_vllm_reasoning_with_null_content(backend): await backend.processing(mot, chunk) - assert mot._thinking == "some thinking" + assert mot.thinking == "some thinking" assert mot._underlying_value == "" async def test_processing_streaming_captures_vllm_reasoning_field(backend): - """Streaming: per-chunk ``reasoning`` deltas accumulate into mot._thinking.""" + """Streaming: per-chunk ``reasoning`` deltas accumulate into mot.thinking.""" mot: ModelOutputThunk = ModelOutputThunk(value=None) chunk_a = ChatCompletionChunk.model_validate( { @@ -257,7 +257,7 @@ async def test_processing_streaming_captures_vllm_reasoning_field(backend): await backend.processing(mot, chunk_a) await backend.processing(mot, chunk_b) - assert mot._thinking == "first second" + assert mot.thinking == "first second" assert mot._underlying_value == "ans" @@ -287,7 +287,7 @@ async def test_processing_reasoning_content_still_used(backend): mot: ModelOutputThunk = ModelOutputThunk(value=None) await backend.processing(mot, chunk) - assert mot._thinking == "attribute-style trace" + assert mot.thinking == "attribute-style trace" assert mot._underlying_value == "answer" @@ -311,7 +311,7 @@ async def test_processing_reasoning_content_takes_precedence_over_reasoning(back mot: ModelOutputThunk = ModelOutputThunk(value=None) await backend.processing(mot, chunk) - assert mot._thinking == "attr-trace" + assert mot.thinking == "attr-trace" assert mot._underlying_value == "answer" diff --git a/test/core/test_base.py b/test/core/test_base.py index a3424d0f4..89e04ef1b 100644 --- a/test/core/test_base.py +++ b/test/core/test_base.py @@ -275,6 +275,19 @@ def test_mot_error_carried_by_copy_methods() -> None: assert target.error is err +def test_mot_thinking_public_field_round_trip(): + mot = ModelOutputThunk(value="x") + mot.thinking = "reasoning trace" + assert mot.thinking == "reasoning trace" + + +def 
test_mot__thinking_deprecated_alias_warns_on_read(): + mot = ModelOutputThunk(value="x") + mot.thinking = "reasoning trace" + with pytest.warns(DeprecationWarning, match="ModelOutputThunk._thinking"): + assert mot._thinking == "reasoning trace" + + if __name__ == "__main__": pytest.main([__file__]) From 837fa073fa393d82995e6ecd978f03b2edfc52e9 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Mon, 22 Jun 2026 14:42:13 -0400 Subject: [PATCH 2/2] Add coverage for changes Signed-off-by: Phil Williams --- test/backends/test_ollama_unit.py | 12 +++++++ test/backends/test_stop_sequences_unit.py | 35 +++++++++++++++++++ test/core/test_base.py | 6 ++++ .../requirements/test_guardian_check_unit.py | 32 +++++++++++++++++ 4 files changed, 85 insertions(+) create mode 100644 test/stdlib/requirements/test_guardian_check_unit.py diff --git a/test/backends/test_ollama_unit.py b/test/backends/test_ollama_unit.py index 7d0c4b03f..992a7e709 100644 --- a/test/backends/test_ollama_unit.py +++ b/test/backends/test_ollama_unit.py @@ -204,6 +204,18 @@ def test_delta_merge_thinking_concatenated(): assert mot._meta["chat_response"].message.thinking == "step 1 step 2" +@pytest.mark.asyncio +async def test_processing_initializes_and_accumulates_thinking( + backend: OllamaModelBackend, +): + """processing() initializes thinking and accumulates chunk thinking text.""" + mot = ModelOutputThunk(value=None) + await backend.processing(mot, _make_delta("answer", thinking="step 1"), {}) + + assert mot.thinking == "step 1" + assert mot._underlying_value == "answer" + + # --- timeout wiring --- diff --git a/test/backends/test_stop_sequences_unit.py b/test/backends/test_stop_sequences_unit.py index d4e200f7e..06ee57a89 100644 --- a/test/backends/test_stop_sequences_unit.py +++ b/test/backends/test_stop_sequences_unit.py @@ -11,6 +11,7 @@ from mellea.backends import ModelOption from mellea.backends.ollama import OllamaModelBackend from mellea.backends.openai import OpenAIBackend +from mellea.core 
import ModelOutputThunk # --- OpenAI --- @@ -127,6 +128,40 @@ def test_watsonx_stop_sequences_round_trip(is_chat, native_key): assert ModelOption.STOP_SEQUENCES not in backend_specific +@pytest.mark.asyncio +async def test_watsonx_processing_non_streaming_captures_reasoning_content(): + backend = _make_watsonx_backend() + mot = ModelOutputThunk(value=None) + + chunk = { + "choices": [ + {"message": {"reasoning_content": "trace", "content": "answer content"}} + ] + } + await backend.processing(mot, chunk) + + assert mot.thinking == "trace" + assert mot._underlying_value == "answer content" + assert mot._meta["oai_chat_response_choice"] == chunk["choices"][0] + + +@pytest.mark.asyncio +async def test_watsonx_processing_streaming_captures_reasoning_content(): + backend = _make_watsonx_backend() + mot = ModelOutputThunk(value=None) + + await backend.processing( + mot, {"choices": [{"delta": {"reasoning_content": "a", "content": "x"}}]} + ) + await backend.processing( + mot, {"choices": [{"delta": {"reasoning_content": "b", "content": "y"}}]} + ) + + assert mot.thinking == "ab" + assert mot._underlying_value == "xy" + assert len(mot._meta["oai_chat_response_streamed"]) == 2 + + # --- HuggingFace --- diff --git a/test/core/test_base.py b/test/core/test_base.py index 89e04ef1b..99daa5dcc 100644 --- a/test/core/test_base.py +++ b/test/core/test_base.py @@ -288,6 +288,12 @@ def test_mot__thinking_deprecated_alias_warns_on_read(): assert mot._thinking == "reasoning trace" +def test_mot__thinking_deprecated_alias_write_sets_public_field(): + mot = ModelOutputThunk(value="x") + mot._thinking = "reasoning trace" + assert mot.thinking == "reasoning trace" + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/test/stdlib/requirements/test_guardian_check_unit.py b/test/stdlib/requirements/test_guardian_check_unit.py new file mode 100644 index 000000000..d34e26e5d --- /dev/null +++ b/test/stdlib/requirements/test_guardian_check_unit.py @@ -0,0 +1,32 @@ +"""Unit tests 
for GuardianCheck requirement behavior.""" + +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from mellea.core import ModelOutputThunk +from mellea.stdlib.components import Message +from mellea.stdlib.context import ChatContext +from mellea.stdlib.requirements.safety.guardian import GuardianCheck + + +@pytest.mark.asyncio +async def test_guardian_validate_uses_thinking_trace_in_reason() -> None: + """validate() should include explicit mot.thinking content in the reason.""" + mot = ModelOutputThunk(value="no") + mot.thinking = "grounded in provided content" + + backend = MagicMock() + backend.generate_from_context = AsyncMock(return_value=(mot, ChatContext())) + + with pytest.warns(DeprecationWarning, match="GuardianCheck is deprecated"): + req = GuardianCheck(risk="harm", backend=backend, backend_type="ollama") + + ctx = ChatContext().add(Message("user", "Is this safe?")).add( + Message("assistant", "Yes.") + ) + result = await req.validate(backend, ctx) + + assert result.as_bool() is True + assert result.reason is not None + assert "Reasoning: grounded in provided content" in result.reason