diff --git a/CHANGELOG.md b/CHANGELOG.md index a1f805e7..64fa994f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,10 @@ ## [Unreleased] +### Fixed + +- Clarify `Response interrupted` recovery markers so they report that the live response stream stopped instead of asserting that the WebUI process restarted. The same stale-recovery path also covers browser/SSE disconnects and lost worker bookkeeping, so the marker now matches systemd evidence instead of implying a restart that did not happen. + ## [v0.51.131] — 2026-05-24 — Release DC (stage-batch13 — 6-PR notes-drawer + context-parity + PWA-swipe + locale polish) ### Added @@ -33,8 +37,6 @@ - **Opus Advisor verdict: SHIP-AS-IS.** Zero MUST-FIX. Three SHOULD-FIX items filed as follow-up issues (incomplete locale coverage for notes-drawer i18n keys, `_joplin_api_get` URL-token defense-in-depth, prefill `setattr` cache-reuse safety net). - **#2527 i18n coverage**: 10 of the 11 non-en locales currently ship the English string `'Third-party notes'` for the drawer header. Since the drawer is default-off, user impact is zero today; follow-up issue tracks proper translations before any default-on transition. - - ## [v0.51.130] — 2026-05-24 — Release DB (stage-batch12 — 3-PR profile-isolation + boot-precedence + workspace Artifacts tab) ### Fixed @@ -143,6 +145,7 @@ - UX evidence for #2812 captured at 1280/1440/1920/mobile (iPhone 14 emulation); Telegram-approved. - File a follow-up issue for pdeathsig-on-supervisor-thread hardening (#2854 deferred Option B) and French-locale `open_in_vscode` parity gap (predates this batch, Opus advisor flagged). + ## [v0.51.126] — 2026-05-24 — Release CX (stage-batch8 — 2-PR low-risk batch — kanban markdown + live activity timeline) ### Added @@ -326,7 +329,6 @@ - **PR #2738** by @weidzhou — `_write_session_index()` full-rebuild path now deduplicates entries by `session_id`. 
When old-format `session_*.json` files coexist with WebUI-format `xxx.json` files sharing the same `session_id`, the index produced duplicate Vue `:key` entries and crashed the frontend with a blank page. The lazy rebuild now uses `dict[session_id → compact_entry]` keyed on session_id, with the higher `message_count` entry winning on conflict. - **PR #2730** by @ashbuildslife — Sanitize git fetch diagnostics before returning update-check errors to the browser. New `_sanitize_git_diagnostic()` in `api/updates.py` strips credentialed URL userinfo (`user:token@host`), GitHub token shapes (`ghp_*`, `gho_*`, `github_pat_*`), and secret-looking query parameters (`?access_token=`, `?token=`, `?password=`, `?auth=`, `?key=`), then caps the message at 300 characters. Empirically verified that plain `https://github.com/owner/repo.git` URLs and SSH-style `git@host:owner/repo` remotes pass through untouched — only credentialed shapes are redacted. Update-check failure context (e.g. `Authentication failed`, network errors) is preserved. - **PR #2742** by @Isla-Liu — Per-turn SQLite connection leak in handoff-summary path (#2233). Two functions on the `/api/session/handoff-summary` hot path were opening `sqlite3.connect(...)` inside a bare `with` statement, which commits the transaction at scope exit but does NOT close the connection. Per-turn invocations accumulated `state.db`/`state.db-wal` file descriptors and CPython heap pages on long-lived worker threads, surfacing as multi-GB VmRSS / 6× duplicated state.db fds on long-running installs. Wrapped both call sites with `contextlib.closing(...)` (already imported and used at 7 other sites in the same files) so the connection is closed deterministically: `api/models.py::count_conversation_rounds` and `api/routes.py::_persist_handoff_summary_to_state_db`. Regression test loops both functions 20× against a tmp `state.db` and asserts `/proc/<pid>/fd` count does not grow more than 2. 
Live soak: fd growth = 0, VmRSS growth = 0 KB across 20 POSTs. - ## [v0.51.107] — 2026-05-21 — Release CE (stage-400 — 8-PR batch — pinned-sessions-limit getter rename + uploaded-file user-turn dedupe + active-run repair guard + incremental KaTeX streaming + profile default model on fresh boot + French locale completion + update-check error surfacing + release-update apply path) ### Fixed diff --git a/api/models.py b/api/models.py index 0a734bc5..1317427b 100644 --- a/api/models.py +++ b/api/models.py @@ -764,18 +764,18 @@ def _get_profile_home(profile) -> Path: _INTERRUPTED_RECOVERED_WORDING = ( '**Response interrupted.**\n\n' - 'The WebUI process restarted before this turn finished. ' + 'The live response stream stopped before this turn finished. ' 'The partial output above was recovered from the run journal, ' 'but the interrupted agent process could not continue.' ) _INTERRUPTED_NO_OUTPUT_WORDING = ( '**Response interrupted.**\n\n' - 'The WebUI process restarted before this turn finished. ' + 'The live response stream stopped before this turn finished. ' 'The user message above was preserved, but no agent output was recovered.' ) _INTERRUPTED_PENDING_RETRY_WORDING = ( '**Response interrupted.**\n\n' - 'The WebUI process restarted before this turn finished. ' + 'The live response stream stopped before this turn finished. ' 'Recovering the partial output from the run journal — ' 'reload this session to retry.' ) @@ -783,7 +783,7 @@ _INTERRUPTED_PENDING_RETRY_WORDING = ( # or the marker has been pending longer than _JOURNAL_RETRY_GIVEUP_SECONDS). _INTERRUPTED_NEUTRAL_WORDING = ( '**Response interrupted.**\n\n' - 'The WebUI process restarted before this turn finished. ' + 'The live response stream stopped before this turn finished. ' 'Partial output may have been lost.' 
) diff --git a/api/run_journal.py b/api/run_journal.py index 0a0f42ff..92b42e50 100644 --- a/api/run_journal.py +++ b/api/run_journal.py @@ -262,7 +262,7 @@ def stale_interrupted_event(session_id: str, run_id: str, *, after_seq: int | No return None payload = { "type": "interrupted", - "message": "WebUI restarted or lost the live worker before this run finished.", + "message": "The live worker stopped before this run finished.", "hint": "The transcript was restored to the last journaled event. Start a new turn if you still need the task to continue.", "session_id": session_id, "stream_id": run_id, diff --git a/api/streaming.py b/api/streaming.py index d319d9d7..db648803 100644 --- a/api/streaming.py +++ b/api/streaming.py @@ -2613,7 +2613,7 @@ def _stream_writeback_can_supersede_recovery_marker(session, msg_text): if last.get('type') != 'interrupted': return False content = str(last.get('content') or '') - if 'Response interrupted' not in content or 'WebUI process restarted' not in content: + if 'Response interrupted' not in content or 'before this turn finished' not in content: return False expected = ' '.join(str(msg_text or '').split()) diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 78f5f4ec..7464284e 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -87,9 +87,9 @@ If after running steps 1-4 the import still fails *and* `pip install -e .` succe ## "Response interrupted." marker keeps saying "no agent output was recovered" -**Symptom.** After the WebUI process restarts mid-turn (manual restart, OOM, crash, …), the affected chat shows an `**Response interrupted.**` marker with the wording *"The user message above was preserved, but no agent output was recovered."*, even though the run-journal for that turn is present on disk and contains the partial tokens the agent had already streamed. 
+**Symptom.** After a live response stream stops before a turn completes (manual restart, OOM, crash, browser/SSE disconnect, lost worker bookkeeping, …), the affected chat shows a `**Response interrupted.**` marker with the wording *"The user message above was preserved, but no agent output was recovered."*, even though the run-journal for that turn is present on disk and contains the partial tokens the agent had already streamed. -**Why.** Sidecar repair re-checks the run-journal at restart and uses the result as a one-shot signal. On WSL2 (9p / DrvFs) and on some network-backed setups, the run-journal `.jsonl` is written by the dead worker but the WebUI process reads it through a page-cache state that has not yet seen those writes — recovery returns "empty" and the marker is baked permanently. The fix introduces a *lazy* retry path: when sidecar repair cannot read visible output but knows the stream id, it stores a `_pending_journal_recovery` flag on the marker and re-attempts recovery from `get_session()` until the journal becomes readable (or the retry budget is exhausted). +**Why.** Sidecar repair re-checks the run-journal after it detects a stale stream and uses the result as a one-shot signal. On WSL2 (9p / DrvFs) and on some network-backed setups, the run-journal `.jsonl` is written by the stopped worker but the WebUI process reads it through a page-cache state that has not yet seen those writes — recovery returns "empty" and the marker is baked permanently. The fix introduces a *lazy* retry path: when sidecar repair cannot read visible output but knows the stream id, it stores a `_pending_journal_recovery` flag on the marker and re-attempts recovery from `get_session()` until the journal becomes readable (or the retry budget is exhausted). 
**Diagnostic.** diff --git a/tests/test_run_journal.py b/tests/test_run_journal.py index b7fb0b2d..3fd96ac9 100644 --- a/tests/test_run_journal.py +++ b/tests/test_run_journal.py @@ -112,12 +112,16 @@ def test_stale_interrupted_event_reports_non_terminal_journal(tmp_path, monkeypa monkeypatch.setattr("api.run_journal._default_session_dir", lambda: tmp_path) event = stale_interrupted_event("session_1", "run_1") + assert event is not None assert event["event"] == "apperror" assert event["seq"] == 2 assert event["terminal_state"] == "stale-from-restart" assert event["payload"]["type"] == "interrupted" assert "last journaled event" in event["payload"]["hint"] + assert "process restarted" not in event["payload"]["message"] + assert "lost the live worker" not in event["payload"]["message"] + assert "live worker stopped" in event["payload"]["message"] def test_stale_interrupted_event_skips_terminal_journal(tmp_path, monkeypatch): diff --git a/tests/test_session_lost_response_regression.py b/tests/test_session_lost_response_regression.py index 3349652d..04bddea6 100644 --- a/tests/test_session_lost_response_regression.py +++ b/tests/test_session_lost_response_regression.py @@ -2,8 +2,8 @@ The scenario this test pins down: -1. A WebUI process restarts mid-stream. On the first sidecar repair attempt - the run-journal for the dead stream is NOT visible yet (page-cache loss, +1. A WebUI live response stream stops mid-turn. On the first sidecar repair + attempt the run-journal for the dead stream is NOT visible yet (page-cache loss, un-fsynced writes, slow network FS, etc.) so `_append_journaled_partial_output` returns False. 2. 
Pre-fix the repair path baked a permanent "no agent output was recovered" @@ -251,6 +251,27 @@ def test_state_db_middle_segment_replay_does_not_append_after_sidecar_tail(): assert merged[-1]["content"] == "opened browser preview" +def test_interrupted_recovery_markers_do_not_claim_restart_as_fact(): + """A stale live worker is not always a WebUI process restart. + + Broken SSE connections, browser disconnects, lost worker bookkeeping, and + real restarts all enter the same recovery marker path. User-visible wording + must describe the generic interruption instead of asserting a process + restart that systemd evidence may later disprove. + """ + marker_texts = [ + models._INTERRUPTED_RECOVERED_WORDING, + models._INTERRUPTED_NO_OUTPUT_WORDING, + models._INTERRUPTED_PENDING_RETRY_WORDING, + models._INTERRUPTED_NEUTRAL_WORDING, + ] + + for text in marker_texts: + assert "Response interrupted" in text + assert "process restarted" not in text + assert "before this turn finished" in text + + def test_lost_response_recovered_on_second_read(hermes_home): sid = "9f14583f0e4e4444aaaa111122223333" stream_id = "7c8b4108d52b4aba9af362d3a54f47ac" diff --git a/tests/test_session_sidecar_repair.py b/tests/test_session_sidecar_repair.py index cae192f5..44a6edb4 100644 --- a/tests/test_session_sidecar_repair.py +++ b/tests/test_session_sidecar_repair.py @@ -319,7 +319,8 @@ class TestDraftRecovery: f"Error marker should not say 'preserved as a draft', got: {content}" ) assert "Response interrupted" in content - assert "WebUI process restarted" in content + assert "live response stream stopped" in content + assert "WebUI process restarted" not in content # The marker now arms the lazy-retry hook when a stream id is known # ("Recovering the partial output… reload to retry."). 
The legacy # "user message above was preserved" wording is reserved for the @@ -624,7 +625,8 @@ class TestNonEmptyMessagesPendingCleared: error_msgs = [m for m in s.messages if m.get("_error")] assert len(error_msgs) == 1 assert "Response interrupted" in error_msgs[0]["content"] - assert "WebUI process restarted" in error_msgs[0]["content"] + assert "live response stream stopped" in error_msgs[0]["content"] + assert "WebUI process restarted" not in error_msgs[0]["content"] assert error_msgs[0].get("type") == "interrupted" # Pending fields fully cleared