diff --git a/CHANGELOG.md b/CHANGELOG.md index a153e55e..080d7b76 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ ## [Unreleased] +### Fixed + +- **#2083** — Reasoning models (Qwen3-thinking via LM Studio, DeepSeek-R1, Kimi-K2, etc.) no longer trigger a budget-doubling retry on auto-title generation when the model emits hidden reasoning tokens but no visible content (`api/streaming.py:_extract_title_response` and `_title_retry_status`). Pre-fix: a reasoning model that burned its entire 512-token budget on hidden thinking returned `finish_reason: length` with non-empty `reasoning_content`. `_extract_title_response()` classified that as `llm_length`, which triggered the budget-doubling retry path — and since the next call produced the same empty-reasoning shape, the retry just doubled the GPU/credit burn. Repeated across the two prompts in `_title_prompts()` that was up to ~3000 reasoning tokens of GPU work per new chat, and on local LM Studio servers (where `is_lmstudio=False` for `custom:` providers means `reasoning_effort: "none"` never reaches the model) it presented as the GPU never going idle after a prompt. Fix: classify any reasoning-bearing empty response as `llm_empty_reasoning` regardless of `finish_reason`, and short-circuit both the within-prompt budget retry AND the cross-prompt iteration on that status. Length-truncated responses WITHOUT reasoning tokens still get the legitimate budget-doubling retry. Falls through to `_fallback_title_from_exchange` for a local-summary title. Reported by @darkopetrovic. Companion agent-side classifier work (matching LM Studio via `base_url` fingerprint for `custom:` providers) tracked separately on the hermes-agent side. 
+ ## [v0.51.46] — 2026-05-11 — Release V (5-PR contributor batch — CSP report-only + logs panel polish + plugin slash commands + turn-journal crash-safe writer + lifecycle events) ### Added diff --git a/api/streaming.py b/api/streaming.py index 2e54193a..aef684ab 100644 --- a/api/streaming.py +++ b/api/streaming.py @@ -877,9 +877,31 @@ def _title_retry_completion_budget(provider: str = '', model: str = '', base_url def _title_retry_status(status: str) -> bool: + # Whether to grant a second budget attempt within the same prompt+model + # combination. ``llm_length`` indicates the model would have produced + # content with more headroom, so doubling the budget can help. + # + # ``llm_empty_reasoning`` historically also triggered a retry, but for + # reasoning models (Qwen3-thinking, DeepSeek-R1, Kimi-K2, etc.) that + # status means the model burned its entire budget on hidden reasoning + # tokens and emitted nothing visible. Doubling the budget in that case + # just doubles the GPU/credit cost without changing the outcome — the + # next attempt produces the same shape. We skip the retry for empty- + # reasoning statuses and let the title path fall through to the local + # fallback summary. See issue #2083 for the LM Studio + Qwen3 repro. return status in { 'llm_length', 'llm_length_aux', + } + + +def _title_should_skip_remaining_attempts(status: str) -> bool: + # When a reasoning model burns its budget on hidden reasoning, + # additional prompts against the same model will hit the same wall. + # Short-circuit the prompt-iteration loop so we don't issue a second + # full-budget LLM call (and twice the GPU/credit burn) only to land in + # the same fallback path. See issue #2083. 
+ return status in { 'llm_empty_reasoning', 'llm_empty_reasoning_aux', } @@ -922,10 +944,16 @@ def _extract_title_response(resp, *, aux: bool = False) -> tuple[str, str]: or _safe_text_value(_safe_obj_value(message, 'reasoning_content')) or _safe_text_value(_safe_obj_value(message, 'thinking')) ) - if finish_reason == 'length': - return '', f'llm_length{suffix}' + # When the model emitted reasoning tokens but no visible content, it + # burned its budget on hidden thinking — retrying with a larger budget + # almost never recovers a useful title (see issue #2083: Qwen3-thinking + # via LM Studio loops indefinitely on auto-title generation). Report + # this case distinctly so callers can short-circuit instead of double- + # billing the GPU/credit on a near-certain repeat. if reasoning: return '', f'llm_empty_reasoning{suffix}' + if finish_reason == 'length': + return '', f'llm_length{suffix}' return '', f'llm_empty{suffix}' except Exception: return '', f'llm_empty{suffix}' @@ -978,6 +1006,11 @@ def generate_title_raw_via_aux( except Exception as e: last_status = 'llm_error_aux' logger.debug("Aux title generation attempt %s failed: %s", idx + 1, e) + # If the model just burned its budget on hidden reasoning, retrying + # the next prompt against the same model produces the same shape. + # Short-circuit to the local fallback path (#2083). + if _title_should_skip_remaining_attempts(last_status): + break return None, last_status except Exception as e: logger.debug("Aux title generation failed: %s", e) @@ -1077,6 +1110,11 @@ def generate_title_raw_via_agent(agent, user_text: str, assistant_text: str) -> getattr(agent, 'model', None), e, ) + # If the model just burned its budget on hidden reasoning, retrying + # the next prompt against the same model produces the same shape. + # Short-circuit to the local fallback path (#2083). 
+ if _title_should_skip_remaining_attempts(last_status): + break return None, last_status except Exception as e: logger.debug("Agent title generation failed: %s", e) diff --git a/tests/test_title_aux_routing.py b/tests/test_title_aux_routing.py index 3027aef7..373f0747 100644 --- a/tests/test_title_aux_routing.py +++ b/tests/test_title_aux_routing.py @@ -133,19 +133,48 @@ class TestReasoningModelTitleGeneration(unittest.TestCase): self.assertEqual(_title_completion_budget(), 512) self.assertEqual(_title_retry_completion_budget(), 1024) - def test_aux_retries_empty_reasoning_length_response_with_larger_budget(self): - """If a reasoning model returns empty content at finish_reason=length, retry once.""" + def test_aux_short_circuits_on_empty_reasoning_without_retrying(self): + """Regression for #2083: reasoning models that emit only hidden + reasoning tokens (no visible content) must NOT trigger a budget-doubling + retry — the second call invariably produces the same empty-reasoning + shape and just doubles the GPU/credit burn. Short-circuit to the local + fallback path instead.""" from api.streaming import generate_title_raw_via_aux - responses = [ - { + call_count = [0] + + def fake_call_llm(**kwargs): + call_count[0] += 1 + return { 'choices': [ { 'message': {'content': '', 'reasoning': 'long hidden reasoning'}, 'finish_reason': 'length', } ] - }, + } + + with _patch_tg_config({'provider': 'ollama', 'model': 'kimi-k2.6', 'base_url': 'https://ollama.com/v1'}): + with patch('agent.auxiliary_client.call_llm', side_effect=fake_call_llm, create=True): + result, status = generate_title_raw_via_aux( + user_text='Hey nur ein kurzer Test', + assistant_text='Alles klar, ich helfe dir dabei.', + ) + + self.assertIsNone(result) + self.assertEqual(status, 'llm_empty_reasoning_aux') + # Exactly one call in total at the base budget — no retry on prompt 0, no + # second-prompt attempt either (short-circuited). 
+ self.assertEqual(call_count[0], 1) + + def test_aux_still_retries_finish_length_without_reasoning(self): + """Length-truncated responses WITHOUT reasoning tokens still get the + budget-doubling retry — those are legitimately recoverable by giving + the model more headroom.""" + from api.streaming import generate_title_raw_via_aux + + responses = [ + {'choices': [{'message': {'content': ''}, 'finish_reason': 'length'}]}, {'choices': [{'message': {'content': 'Useful Session Title'}, 'finish_reason': 'stop'}]}, ] captured_budgets = [] @@ -187,21 +216,58 @@ class TestReasoningModelTitleGeneration(unittest.TestCase): ) self.assertIsNone(result) - self.assertEqual(status, 'llm_length_aux') + self.assertEqual(status, 'llm_empty_reasoning_aux') - def test_agent_route_retries_empty_reasoning_length_response(self): - """The active-agent route should get the same reasoning-model retry path as aux.""" + def test_agent_route_short_circuits_on_empty_reasoning_without_retrying(self): + """Regression for #2083 on the active-agent route: empty-reasoning + responses must NOT trigger a budget-doubling retry.""" from api.streaming import generate_title_raw_via_agent - responses = [ - { + call_count = [0] + + def fake_create(**kwargs): + call_count[0] += 1 + return { 'choices': [ { 'message': {'content': '', 'reasoning': 'long hidden reasoning'}, 'finish_reason': 'length', } ] - }, + } + + client = types.SimpleNamespace( + chat=types.SimpleNamespace( + completions=types.SimpleNamespace(create=fake_create) + ) + ) + agent = MagicMock() + agent.api_mode = 'openai' + agent.provider = 'ollama' + agent.model = 'kimi-k2.6' + agent.base_url = 'https://ollama.com/v1' + agent.reasoning_config = None + agent._build_api_kwargs.return_value = {} + agent._ensure_primary_openai_client.return_value = client + + result, status = generate_title_raw_via_agent( + agent, + user_text='Hey nur ein kurzer Test', + assistant_text='Alles klar, ich helfe dir dabei.', + ) + + self.assertIsNone(result) + 
self.assertEqual(status, 'llm_empty_reasoning') + # Exactly one call in total at the base budget — no retry, no second-prompt attempt. + self.assertEqual(call_count[0], 1) + self.assertIsNone(agent.reasoning_config) + + def test_agent_route_still_retries_finish_length_without_reasoning(self): + """The active-agent route should preserve retry-on-length-no-reasoning.""" + from api.streaming import generate_title_raw_via_agent + + responses = [ + {'choices': [{'message': {'content': ''}, 'finish_reason': 'length'}]}, {'choices': [{'message': {'content': 'Agent Session Title'}, 'finish_reason': 'stop'}]}, ] captured_budgets = []