mirror of
https://github.com/nesquena/hermes-webui.git
synced 2026-05-29 13:10:17 +00:00
fix: skip budget-doubling title retry for reasoning-only responses (#2083)
Reasoning models (Qwen3-thinking via LM Studio, DeepSeek-R1, Kimi-K2,
etc.) can burn their entire output budget on hidden reasoning tokens and
emit no visible content. The previous title-generation retry path
classified that as llm_length and doubled the budget — but the second
call produces the same shape, so the retry only doubled the GPU/credit
burn. Repeated across the two prompts in _title_prompts() this came to
~3000 reasoning tokens of GPU work per new chat. On local LM Studio
servers behind a custom: provider (where is_lmstudio=False means
reasoning_effort: none never reaches the model) it manifested as the GPU
never going idle after a prompt.
Fix:
- _extract_title_response: classify reasoning-bearing empty responses
  as llm_empty_reasoning regardless of finish_reason. The presence of
  reasoning_content is the diagnostic signal, not finish_reason.
- _title_retry_status: drop llm_empty_reasoning from the retry set.
  Length-truncated responses WITHOUT reasoning still retry (those are
  legitimately recoverable by a larger budget).
- Add _title_should_skip_remaining_attempts() and break out of the
  prompt-iteration loop on empty-reasoning. A second prompt against
  the same model would produce the same shape.
- After the short-circuit, title generation falls through to
  _fallback_title_from_exchange for a local-summary title.
Tests updated to invert the previous reasoning-retry assertions:
- test_aux_short_circuits_on_empty_reasoning_without_retrying
- test_aux_still_retries_finish_length_without_reasoning
- test_agent_route_short_circuits_on_empty_reasoning_without_retrying
- test_agent_route_still_retries_finish_length_without_reasoning
Companion agent-side work (LM Studio classifier for custom: providers)
is tracked separately on the hermes-agent side; this WebUI fix is the
belt-and-braces guard so the loop stops regardless of agent classifier
state.
Reported by @darkopetrovic. Closes #2083.
Co-authored-by: darkopetrovic <darkopetrovic@users.noreply.github.com>
This commit is contained in:
@@ -2,6 +2,10 @@
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
### Fixed
|
||||
|
||||
- **#2083** — Reasoning models (Qwen3-thinking via LM Studio, DeepSeek-R1, Kimi-K2, etc.) no longer trigger a budget-doubling retry on auto-title generation when the model emits hidden reasoning tokens but no visible content (`api/streaming.py:_extract_title_response` and `_title_retry_status`). Pre-fix: a reasoning model that burned its entire 512-token budget on hidden thinking returned `finish_reason: length` with non-empty `reasoning_content`. `_extract_title_response()` classified that as `llm_length`, which triggered the budget-doubling retry path — and since the next call produced the same empty-reasoning shape, the retry just doubled the GPU/credit burn. Repeated across the two prompts in `_title_prompts()` that was up to ~3000 reasoning tokens of GPU work per new chat, and on local LM Studio servers (where `is_lmstudio=False` for `custom:` providers means `reasoning_effort: "none"` never reaches the model) it presented as the GPU never going idle after a prompt. Fix: classify any reasoning-bearing empty response as `llm_empty_reasoning` regardless of `finish_reason`, and short-circuit both the within-prompt budget retry AND the cross-prompt iteration on that status. Length-truncated responses WITHOUT reasoning tokens still get the legitimate budget-doubling retry. Falls through to `_fallback_title_from_exchange` for a local-summary title. Reported by @darkopetrovic. Companion agent-side classifier work (matching LM Studio via `base_url` fingerprint for `custom:` providers) tracked separately on the hermes-agent side.
|
||||
|
||||
## [v0.51.46] — 2026-05-11 — Release V (5-PR contributor batch — CSP report-only + logs panel polish + plugin slash commands + turn-journal crash-safe writer + lifecycle events)
|
||||
|
||||
### Added
|
||||
|
||||
+40
-2
@@ -877,9 +877,31 @@ def _title_retry_completion_budget(provider: str = '', model: str = '', base_url
|
||||
|
||||
|
||||
def _title_retry_status(status: str) -> bool:
|
||||
# Whether to grant a second budget attempt within the same prompt+model
|
||||
# combination. ``llm_length`` indicates the model would have produced
|
||||
# content with more headroom, so doubling the budget can help.
|
||||
#
|
||||
# ``llm_empty_reasoning`` historically also triggered a retry, but for
|
||||
# reasoning models (Qwen3-thinking, DeepSeek-R1, Kimi-K2, etc.) that
|
||||
# status means the model burned its entire budget on hidden reasoning
|
||||
# tokens and emitted nothing visible. Doubling the budget in that case
|
||||
# just doubles the GPU/credit cost without changing the outcome — the
|
||||
# next attempt produces the same shape. We skip the retry for empty-
|
||||
# reasoning statuses and let the title path fall through to the local
|
||||
# fallback summary. See issue #2083 for the LM Studio + Qwen3 repro.
|
||||
return status in {
|
||||
'llm_length',
|
||||
'llm_length_aux',
|
||||
}
|
||||
|
||||
|
||||
def _title_should_skip_remaining_attempts(status: str) -> bool:
|
||||
# When a reasoning model burns its budget on hidden reasoning,
|
||||
# additional prompts against the same model will hit the same wall.
|
||||
# Short-circuit the prompt-iteration loop so we don't issue a second
|
||||
# full-budget LLM call (and twice the GPU/credit burn) only to land in
|
||||
# the same fallback path. See issue #2083.
|
||||
return status in {
|
||||
'llm_empty_reasoning',
|
||||
'llm_empty_reasoning_aux',
|
||||
}
|
||||
@@ -922,10 +944,16 @@ def _extract_title_response(resp, *, aux: bool = False) -> tuple[str, str]:
|
||||
or _safe_text_value(_safe_obj_value(message, 'reasoning_content'))
|
||||
or _safe_text_value(_safe_obj_value(message, 'thinking'))
|
||||
)
|
||||
if finish_reason == 'length':
|
||||
return '', f'llm_length{suffix}'
|
||||
# When the model emitted reasoning tokens but no visible content, it
|
||||
# burned its budget on hidden thinking — retrying with a larger budget
|
||||
# almost never recovers a useful title (see issue #2083: Qwen3-thinking
|
||||
# via LM Studio loops indefinitely on auto-title generation). Report
|
||||
# this case distinctly so callers can short-circuit instead of double-
|
||||
# billing the GPU/credit on a near-certain repeat.
|
||||
if reasoning:
|
||||
return '', f'llm_empty_reasoning{suffix}'
|
||||
if finish_reason == 'length':
|
||||
return '', f'llm_length{suffix}'
|
||||
return '', f'llm_empty{suffix}'
|
||||
except Exception:
|
||||
return '', f'llm_empty{suffix}'
|
||||
@@ -978,6 +1006,11 @@ def generate_title_raw_via_aux(
|
||||
except Exception as e:
|
||||
last_status = 'llm_error_aux'
|
||||
logger.debug("Aux title generation attempt %s failed: %s", idx + 1, e)
|
||||
# If the model just burned its budget on hidden reasoning, retrying
|
||||
# the next prompt against the same model produces the same shape.
|
||||
# Short-circuit to the local fallback path (#2083).
|
||||
if _title_should_skip_remaining_attempts(last_status):
|
||||
break
|
||||
return None, last_status
|
||||
except Exception as e:
|
||||
logger.debug("Aux title generation failed: %s", e)
|
||||
@@ -1077,6 +1110,11 @@ def generate_title_raw_via_agent(agent, user_text: str, assistant_text: str) ->
|
||||
getattr(agent, 'model', None),
|
||||
e,
|
||||
)
|
||||
# If the model just burned its budget on hidden reasoning, retrying
|
||||
# the next prompt against the same model produces the same shape.
|
||||
# Short-circuit to the local fallback path (#2083).
|
||||
if _title_should_skip_remaining_attempts(last_status):
|
||||
break
|
||||
return None, last_status
|
||||
except Exception as e:
|
||||
logger.debug("Agent title generation failed: %s", e)
|
||||
|
||||
@@ -133,19 +133,48 @@ class TestReasoningModelTitleGeneration(unittest.TestCase):
|
||||
self.assertEqual(_title_completion_budget(), 512)
|
||||
self.assertEqual(_title_retry_completion_budget(), 1024)
|
||||
|
||||
def test_aux_retries_empty_reasoning_length_response_with_larger_budget(self):
|
||||
"""If a reasoning model returns empty content at finish_reason=length, retry once."""
|
||||
def test_aux_short_circuits_on_empty_reasoning_without_retrying(self):
|
||||
"""Regression for #2083: reasoning models that emit only hidden
|
||||
reasoning tokens (no visible content) must NOT trigger a budget-doubling
|
||||
retry — the second call invariably produces the same empty-reasoning
|
||||
shape and just doubles the GPU/credit burn. Short-circuit to the local
|
||||
fallback path instead."""
|
||||
from api.streaming import generate_title_raw_via_aux
|
||||
|
||||
responses = [
|
||||
{
|
||||
call_count = [0]
|
||||
|
||||
def fake_call_llm(**kwargs):
|
||||
call_count[0] += 1
|
||||
return {
|
||||
'choices': [
|
||||
{
|
||||
'message': {'content': '', 'reasoning': 'long hidden reasoning'},
|
||||
'finish_reason': 'length',
|
||||
}
|
||||
]
|
||||
},
|
||||
}
|
||||
|
||||
with _patch_tg_config({'provider': 'ollama', 'model': 'kimi-k2.6', 'base_url': 'https://ollama.com/v1'}):
|
||||
with patch('agent.auxiliary_client.call_llm', side_effect=fake_call_llm, create=True):
|
||||
result, status = generate_title_raw_via_aux(
|
||||
user_text='Hey nur ein kurzer Test',
|
||||
assistant_text='Alles klar, ich helfe dir dabei.',
|
||||
)
|
||||
|
||||
self.assertIsNone(result)
|
||||
self.assertEqual(status, 'llm_empty_reasoning_aux')
|
||||
# Exactly one call in total at the base budget — no retry on prompt 0, no
|
||||
# second-prompt attempt either (short-circuited).
|
||||
self.assertEqual(call_count[0], 1)
|
||||
|
||||
def test_aux_still_retries_finish_length_without_reasoning(self):
|
||||
"""Length-truncated responses WITHOUT reasoning tokens still get the
|
||||
budget-doubling retry — those are legitimately recoverable by giving
|
||||
the model more headroom."""
|
||||
from api.streaming import generate_title_raw_via_aux
|
||||
|
||||
responses = [
|
||||
{'choices': [{'message': {'content': ''}, 'finish_reason': 'length'}]},
|
||||
{'choices': [{'message': {'content': 'Useful Session Title'}, 'finish_reason': 'stop'}]},
|
||||
]
|
||||
captured_budgets = []
|
||||
@@ -187,21 +216,58 @@ class TestReasoningModelTitleGeneration(unittest.TestCase):
|
||||
)
|
||||
|
||||
self.assertIsNone(result)
|
||||
self.assertEqual(status, 'llm_length_aux')
|
||||
self.assertEqual(status, 'llm_empty_reasoning_aux')
|
||||
|
||||
def test_agent_route_retries_empty_reasoning_length_response(self):
|
||||
"""The active-agent route should get the same reasoning-model retry path as aux."""
|
||||
def test_agent_route_short_circuits_on_empty_reasoning_without_retrying(self):
|
||||
"""Regression for #2083 on the active-agent route: empty-reasoning
|
||||
responses must NOT trigger a budget-doubling retry."""
|
||||
from api.streaming import generate_title_raw_via_agent
|
||||
|
||||
responses = [
|
||||
{
|
||||
call_count = [0]
|
||||
|
||||
def fake_create(**kwargs):
|
||||
call_count[0] += 1
|
||||
return {
|
||||
'choices': [
|
||||
{
|
||||
'message': {'content': '', 'reasoning': 'long hidden reasoning'},
|
||||
'finish_reason': 'length',
|
||||
}
|
||||
]
|
||||
},
|
||||
}
|
||||
|
||||
client = types.SimpleNamespace(
|
||||
chat=types.SimpleNamespace(
|
||||
completions=types.SimpleNamespace(create=fake_create)
|
||||
)
|
||||
)
|
||||
agent = MagicMock()
|
||||
agent.api_mode = 'openai'
|
||||
agent.provider = 'ollama'
|
||||
agent.model = 'kimi-k2.6'
|
||||
agent.base_url = 'https://ollama.com/v1'
|
||||
agent.reasoning_config = None
|
||||
agent._build_api_kwargs.return_value = {}
|
||||
agent._ensure_primary_openai_client.return_value = client
|
||||
|
||||
result, status = generate_title_raw_via_agent(
|
||||
agent,
|
||||
user_text='Hey nur ein kurzer Test',
|
||||
assistant_text='Alles klar, ich helfe dir dabei.',
|
||||
)
|
||||
|
||||
self.assertIsNone(result)
|
||||
self.assertEqual(status, 'llm_empty_reasoning')
|
||||
# Exactly one call in total at base budget — no retry, no second-prompt attempt.
|
||||
self.assertEqual(call_count[0], 1)
|
||||
self.assertIsNone(agent.reasoning_config)
|
||||
|
||||
def test_agent_route_still_retries_finish_length_without_reasoning(self):
|
||||
"""The active-agent route should preserve retry-on-length-no-reasoning."""
|
||||
from api.streaming import generate_title_raw_via_agent
|
||||
|
||||
responses = [
|
||||
{'choices': [{'message': {'content': ''}, 'finish_reason': 'length'}]},
|
||||
{'choices': [{'message': {'content': 'Agent Session Title'}, 'finish_reason': 'stop'}]},
|
||||
]
|
||||
captured_budgets = []
|
||||
|
||||
Reference in New Issue
Block a user