diff --git a/src/forge/clients/ollama.py b/src/forge/clients/ollama.py index 29c44ca..920c2a7 100644 --- a/src/forge/clients/ollama.py +++ b/src/forge/clients/ollama.py @@ -12,6 +12,7 @@ from forge.clients.sampling_defaults import apply_sampling_defaults from forge.core.workflow import LLMResponse, TextResponse, ToolCall, ToolSpec from forge.errors import BackendError, ThinkingNotSupportedError +from forge.prompts.think_tags import extract_think_tags _THINK_HEURISTIC_KEYWORDS = ("reason", "think") @@ -120,12 +121,17 @@ def _resolve_reasoning( ) -> str | None: """Gate reasoning capture on _think flag. - When _think is False, discard all reasoning. - When True: prefer thinking field, fall back to content. + When _think is False, discard all reasoning. When True: prefer the + structured ``thinking`` field; if absent, extract ```` tags from + content; finally fall back to the raw content (an instruct model + narrating before its tool call). Mirrors LlamafileClient. """ if not self._think: return None - return thinking or content or None + if thinking: + return thinking + think, _ = extract_think_tags(content) + return think or content or None def _record_usage(self, data: dict[str, Any]) -> None: """Extract token usage from an Ollama response.""" @@ -214,7 +220,10 @@ async def send( for i, tc in enumerate(tool_calls) ] - return TextResponse(content=msg.get("content", "")) + # No tool calls: strip inline thinking so the TextResponse carries + # clean content (parity with LlamafileClient). + _, content = extract_think_tags(msg.get("content", "")) + return TextResponse(content=content) async def send_stream( self, @@ -321,7 +330,8 @@ async def _iter_stream( content = msg.get("content", "") if content: accumulated_content += content - final = TextResponse(content=accumulated_content) + _, text = extract_think_tags(accumulated_content) + final = TextResponse(content=text) yield StreamChunk(type=ChunkType.FINAL, response=final) else: tool_calls = msg.get("tool_calls") diff --git a/src/forge/clients/vllm.py b/src/forge/clients/vllm.py index 6e64c94..a3da820 100644 --- a/src/forge/clients/vllm.py +++ b/src/forge/clients/vllm.py @@ -29,6 +29,7 @@ from forge.clients.sampling_defaults import apply_sampling_defaults from forge.core.workflow import LLMResponse, TextResponse, ToolCall, ToolSpec from forge.errors import BackendError +from forge.prompts.think_tags import extract_think_tags class VLLMClient: @@ -163,23 +164,26 @@ def _record_usage(self, data: dict[str, Any]) -> None: total_tokens=usage.get("total_tokens", 0), ) - def _resolve_reasoning( - self, message_or_accum: dict[str, Any] | str, accumulated_content: str = "", - ) -> str | None: - """Extract reasoning, gated on _think. - - vLLM 0.21 returns reasoning in the ``reasoning`` field of the - assistant message when ``--reasoning-parser`` is enabled at server - boot. If thinking is disabled or the field is empty, return None. - - Accepts either a message dict (from send()) or an accumulated - reasoning string (from send_stream()). + def _resolve_reasoning(self, reasoning: str, content: str) -> str | None: + """Build final reasoning from the structured field and content, gated + on _think. + + vLLM 0.21 returns reasoning in the ``reasoning`` field of the assistant + message when ``--reasoning-parser`` is enabled at server boot. When + that parser is absent — or doesn't split a given model's output — the + thinking instead arrives inline in ``content`` (often wrapped in + ``...``). To avoid silently dropping it (issue #110) and + to keep send() and send_stream() in lockstep with LlamafileClient, fall + back to ````-tag extraction and then to the raw content when the + structured field is empty. Both call sites pass the same (reasoning, + content) pair, so the two paths resolve identically. """ if not self._think: return None - if isinstance(message_or_accum, dict): - return message_or_accum.get("reasoning") or None - return message_or_accum or accumulated_content or None + if reasoning: + return reasoning + think, _ = extract_think_tags(content) + return think or content or None async def send( self, @@ -227,10 +231,16 @@ async def send( if tool_calls: return self._parse_tool_calls( tool_calls, - reasoning=self._resolve_reasoning(message), + reasoning=self._resolve_reasoning( + message.get("reasoning") or "", message.get("content") or "", + ), ) - return TextResponse(content=message.get("content") or "") + # No tool calls: strip any inline thinking — reasoning is only useful + # attached to a ToolCall; a TextResponse carries clean content (parity + # with LlamafileClient.send()). + _, content = extract_think_tags(message.get("content") or "") + return TextResponse(content=content) async def send_stream( self, @@ -334,7 +344,9 @@ async def send_stream( ), ) else: - final = TextResponse(content=accumulated_content) + # Strip inline thinking from the final text for parity with send(). + _, text = extract_think_tags(accumulated_content) + final = TextResponse(content=text) yield StreamChunk(type=ChunkType.FINAL, response=final) @staticmethod diff --git a/tests/unit/test_ollama_client.py b/tests/unit/test_ollama_client.py index 5961bf0..e09c94a 100644 --- a/tests/unit/test_ollama_client.py +++ b/tests/unit/test_ollama_client.py @@ -225,6 +225,41 @@ async def test_think_false_discards_reasoning(self) -> None: assert isinstance(result, list) assert result[0].reasoning is None + @pytest.mark.asyncio + async def test_extracts_think_tags_from_content_with_tool_call(self) -> None: + """ tags inline in content are extracted (not the raw tagged + string), when there is no structured thinking field.""" + client = _make_client(think=True) + client._http.post.return_value = _mock_response({ + "message": { + "role": "assistant", + "content": "price first", + "tool_calls": [ + {"function": {"name": "get_pricing", "arguments": {"part": "X"}}} + ], + } + }) + result = await client.send( + [{"role": "user", "content": "test"}], tools=[_make_spec()] + ) + assert isinstance(result, list) + assert result[0].reasoning == "price first" + + @pytest.mark.asyncio + async def test_think_tags_stripped_from_text_response(self) -> None: + """A bare text reply has tags stripped from its content.""" + client = _make_client() + client._http.post.return_value = _mock_response({ + "message": { + "role": "assistant", + "content": "ponderingHello there.", + "tool_calls": [], + } + }) + result = await client.send([{"role": "user", "content": "test"}]) + assert isinstance(result, TextResponse) + assert result.content == "Hello there." + @pytest.mark.asyncio async def test_think_true_explicit(self) -> None: """think=True explicitly → always in request body.""" @@ -499,6 +534,52 @@ async def test_streaming_captures_reasoning_from_deltas(self) -> None: assert isinstance(final.response, list) assert final.response[0].reasoning == "Let me think..." + @pytest.mark.asyncio + async def test_streaming_extracts_think_tags_from_content_with_tool_call(self) -> None: + """#110 (streaming): inline in streamed content (no thinking + deltas) is extracted onto the FINAL tool call.""" + client = _make_client(think=True) + lines = [ + json.dumps({"message": {"role": "assistant", "content": "price "}, "done": False}), + json.dumps({"message": {"role": "assistant", "content": "first"}, "done": False}), + json.dumps({ + "message": { + "role": "assistant", + "content": "", + "tool_calls": [ + {"function": {"name": "get_pricing", "arguments": {"part": "X"}}} + ], + }, + "done": True, + }), + ] + client._http.stream.return_value = _MockStreamResponse(lines) + chunks = [] + async for chunk in client.send_stream( + [{"role": "user", "content": "test"}], tools=[_make_spec()] + ): + chunks.append(chunk) + final = [c for c in chunks if c.type == ChunkType.FINAL][0] + assert isinstance(final.response, list) + assert final.response[0].reasoning == "price first" + + @pytest.mark.asyncio + async def test_streaming_strips_think_tags_from_text_response(self) -> None: + """A streamed bare text reply has tags stripped from FINAL.""" + client = _make_client() + lines = [ + json.dumps({"message": {"role": "assistant", "content": "pondering"}, "done": False}), + json.dumps({"message": {"role": "assistant", "content": "Hello there."}, "done": False}), + json.dumps({"message": {"role": "assistant", "content": ""}, "done": True}), + ] + client._http.stream.return_value = _MockStreamResponse(lines) + chunks = [] + async for chunk in client.send_stream([{"role": "user", "content": "test"}]): + chunks.append(chunk) + final = [c for c in chunks if c.type == ChunkType.FINAL][0] + assert isinstance(final.response, TextResponse) + assert final.response.content == "Hello there." + @pytest.mark.asyncio async def test_streaming_thinking_preferred_over_content(self) -> None: """Streamed thinking tokens are preferred over content for reasoning.""" diff --git a/tests/unit/test_vllm_client.py b/tests/unit/test_vllm_client.py index d598137..c1fccfa 100644 --- a/tests/unit/test_vllm_client.py +++ b/tests/unit/test_vllm_client.py @@ -179,6 +179,58 @@ async def test_think_false_discards_reasoning(self) -> None: assert isinstance(result, list) assert result[0].reasoning is None + @pytest.mark.asyncio + async def test_extracts_think_tags_from_content_with_tool_call(self) -> None: + """#110: thinking inline in content (no `reasoning` field) is captured.""" + client = _make_client(think=True) + client._http.post.return_value = _mock_response( + _tool_call_response(content="check the weather first"), + ) + result = await client.send( + [{"role": "user", "content": "x"}], tools=[_make_spec()], + ) + assert isinstance(result, list) + assert result[0].reasoning == "check the weather first" + + @pytest.mark.asyncio + async def test_reasoning_field_preferred_over_content_tags(self) -> None: + """Structured `reasoning` field wins over tags in content.""" + client = _make_client(think=True) + client._http.post.return_value = _mock_response( + _tool_call_response(reasoning="structured", content="inline"), + ) + result = await client.send( + [{"role": "user", "content": "x"}], tools=[_make_spec()], + ) + assert isinstance(result, list) + assert result[0].reasoning == "structured" + + @pytest.mark.asyncio + async def test_think_tags_stripped_from_text_response(self) -> None: + """A bare text reply has tags stripped from its content.""" + client = _make_client() + client._http.post.return_value = _mock_response( + _text_response("ponderingThe answer is 42."), + ) + result = await client.send([{"role": "user", "content": "x"}]) + assert isinstance(result, TextResponse) + assert result.content == "The answer is 42." + + @pytest.mark.asyncio + async def test_thinking_only_text_response_empty_after_strip(self) -> None: + """Thinking-only reply (no answer, no tool call) strips to empty content. + + Matches LlamafileClient; the empty TextResponse then rides the existing + ResponseValidator retry path (covered in the validator tests). + """ + client = _make_client() + client._http.post.return_value = _mock_response( + _text_response("just thinking, no answer yet"), + ) + result = await client.send([{"role": "user", "content": "x"}]) + assert isinstance(result, TextResponse) + assert result.content == "" + @pytest.mark.asyncio async def test_usage_recorded(self) -> None: client = _make_client() @@ -381,6 +433,47 @@ async def test_accumulates_reasoning_across_deltas(self) -> None: assert isinstance(result, list) assert result[0].reasoning == "Let me think... " + @pytest.mark.asyncio + async def test_stream_extracts_think_tags_from_content_with_tool_call(self) -> None: + """#110 (streaming): inline in streamed content (no reasoning + deltas) is captured on the FINAL tool call.""" + client = _make_client(think=True) + client._http.stream.return_value = _MockStreamResponse([ + _sse({"choices": [{"delta": {"content": "inline "}}]}), + _sse({"choices": [{"delta": {"content": "plan"}}]}), + _sse({"choices": [{"delta": { + "tool_calls": [{ + "index": 0, + "function": {"name": "get_weather", "arguments": '{"city": "P"}'} + }], + }}]}), + "data: [DONE]", + ]) + chunks = [] + async for chunk in client.send_stream( + [{"role": "user", "content": "x"}], tools=[_make_spec()], + ): + chunks.append(chunk) + result = [c for c in chunks if c.type == ChunkType.FINAL][0].response + assert isinstance(result, list) + assert result[0].reasoning == "inline plan" + + @pytest.mark.asyncio + async def test_stream_strips_think_tags_from_text_response(self) -> None: + """A streamed bare text reply has tags stripped from FINAL.""" + client = _make_client() + client._http.stream.return_value = _MockStreamResponse([ + _sse({"choices": [{"delta": {"content": "pondering"}}]}), + _sse({"choices": [{"delta": {"content": "The answer is 42."}}]}), + "data: [DONE]", + ]) + chunks = [] + async for chunk in client.send_stream([{"role": "user", "content": "x"}]): + chunks.append(chunk) + result = [c for c in chunks if c.type == ChunkType.FINAL][0].response + assert isinstance(result, TextResponse) + assert result.content == "The answer is 42." + @pytest.mark.asyncio async def test_non_200_raises_backend_error(self) -> None: client = _make_client()