From b34ce63c978ed12238e8cd4d407204a92e99789d Mon Sep 17 00:00:00 2001 From: Nathan Esquenazi Date: Mon, 4 May 2026 14:49:38 -0700 Subject: [PATCH 1/4] fix(oauth): honor cancel during Codex device-token exchange (follow-up to #1652) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Codex OAuth onboarding worker introduced in #1652 had a cancel-vs-worker race: a `cancel_onboarding_oauth_flow` request that arrived while the worker was mid-network-call (between the `live = dict(...)` snapshot and the next status check) would be silently overridden: 1. User clicks Cancel → server sets flow.status = "cancelled" and drops sensitive lifecycle fields under the lock. 2. Worker is mid-`_poll_codex_authorization` / `_exchange_codex_authorization` using the local `live` snapshot it captured before the cancel. 3. Worker calls `_persist_codex_credentials(...)` — auth.json gets written. 4. Worker calls `_set_flow_status(flow_id, "success")` — overrides the cancelled status. Net effect: the user's explicit cancel is ignored, credentials are persisted, and the UI reports success. Reproduced with a behavioural harness that drove a real worker thread against patched network helpers and confirmed: pre-fix : flow status `success`, auth.json written despite cancel post-fix: flow status `cancelled`, auth.json NOT written The fix re-checks the flow status under `_OAUTH_FLOWS_LOCK` after the token exchange completes and before persisting. If the status is no longer `pending`, the worker exits without persisting credentials and without overwriting the terminal status. Regression test `test_cancel_during_token_exchange_does_not_persist_credentials` drives the worker against threading.Event-gated network stubs to reproduce the race deterministically and lock the new invariant. Trace verified against fresh hermes-agent tarball — credential_pool entry shape (`auth_type=oauth`, `source=manual:device_code`, `priority=0`, base_url) remains compatible with `agent.credential_pool.load_pool("openai-codex")` and the agent CLI's `_save_codex_tokens` legacy fallback path. Tests: - 10/10 in tests/test_issue1362_codex_oauth_onboarding.py - Full suite: 4230 passed, 57 skipped, 3 xpassed, 0 failed in 33.82s Co-Authored-By: Claude Opus 4.7 (1M context) --- api/oauth.py | 7 +++ .../test_issue1362_codex_oauth_onboarding.py | 54 +++++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/api/oauth.py b/api/oauth.py index 621a4814..e70f2a6b 100644 --- a/api/oauth.py +++ b/api/oauth.py @@ -347,6 +347,13 @@ def _run_codex_oauth_worker(flow_id: str) -> None: if not authorization_code or not code_verifier: raise RuntimeError("Device auth response missing authorization_code or code_verifier") tokens = _exchange_codex_authorization(authorization_code, code_verifier) + # Re-check status under lock before persisting: a cancel/expire that + # raced with the device-token + token-exchange network calls must + # win, so we don't persist credentials the user explicitly aborted. + with _OAUTH_FLOWS_LOCK: + current = _OAUTH_FLOWS.get(flow_id) + if not current or current.get("status") != "pending": + return _persist_codex_credentials(Path(live["hermes_home"]), tokens) _set_flow_status(flow_id, "success") return diff --git a/tests/test_issue1362_codex_oauth_onboarding.py b/tests/test_issue1362_codex_oauth_onboarding.py index 669fd90e..bc8d1149 100644 --- a/tests/test_issue1362_codex_oauth_onboarding.py +++ b/tests/test_issue1362_codex_oauth_onboarding.py @@ -111,6 +111,60 @@ def test_cancel_marks_flow_cancelled_and_poll_stops(tmp_path): assert polled["status"] == "cancelled" +def test_cancel_during_token_exchange_does_not_persist_credentials(monkeypatch, tmp_path): + """Cancel arriving while the worker is mid-network-call must win. + + Without the post-exchange status re-check, the worker would proceed to + persist credentials to auth.json AND override the cancelled status with + "success" — silently storing tokens the user explicitly aborted. + """ + import threading + import api.oauth as oauth + + oauth._OAUTH_FLOWS.clear() + + poll_started = threading.Event() + poll_continue = threading.Event() + + def _slow_poll(device_auth_id, user_code): + poll_started.set() + assert poll_continue.wait(timeout=5) + return {"authorization_code": "auth-code", "code_verifier": "verifier"} + + def _exchange(authorization_code, code_verifier): + return {"access_token": "ACCESS", "refresh_token": "REFRESH"} + + monkeypatch.setattr(oauth, "_poll_codex_authorization", _slow_poll) + monkeypatch.setattr(oauth, "_exchange_codex_authorization", _exchange) + + flow_id = "race-flow" + oauth._OAUTH_FLOWS[flow_id] = { + "provider": "openai-codex", + "status": "pending", + "device_auth_id": "device-secret", + "user_code": "ABCD-EFGH", + "expires_at": time.time() + 600, + "poll_interval_seconds": 1, + "hermes_home": str(tmp_path), + "created_at": time.time(), + "updated_at": time.time(), + } + + worker = threading.Thread(target=oauth._run_codex_oauth_worker, args=(flow_id,), daemon=True) + worker.start() + assert poll_started.wait(timeout=5) + + oauth.cancel_onboarding_oauth_flow({"flow_id": flow_id}) + assert oauth._OAUTH_FLOWS[flow_id]["status"] == "cancelled" + + poll_continue.set() + worker.join(timeout=5) + assert not worker.is_alive() + + assert oauth._OAUTH_FLOWS[flow_id]["status"] == "cancelled" + assert not (tmp_path / "auth.json").exists() + + def test_expired_flow_reports_expired_and_drops_sensitive_lifecycle(tmp_path): import api.oauth as oauth From ca135c2015ad09d11c0177b1439a2d569c89886e Mon Sep 17 00:00:00 2001 From: Michael Lam Date: Mon, 4 May 2026 15:30:37 -0700 Subject: [PATCH 2/4] fix: harden persistent WebUI health checks --- api/routes.py | 140 ++++++++++++++++++-- docs/supervisor.md | 41 ++++++ server.py | 60 +++++++++ tests/test_issue1458_stability_hardening.py | 66 +++++++++ 4 files changed, 296 insertions(+), 11 deletions(-) create mode 100644 tests/test_issue1458_stability_hardening.py diff --git a/api/routes.py b/api/routes.py index 8ef98b63..42a90bbd 100644 --- a/api/routes.py +++ b/api/routes.py @@ -12,6 +12,7 @@ import queue import re import platform import shutil +import sqlite3 import subprocess import sys import threading @@ -19,6 +20,7 @@ import time import uuid import re from pathlib import Path +from contextlib import closing from urllib.parse import parse_qs from api.agent_sessions import MESSAGING_SOURCES @@ -1222,6 +1224,7 @@ from api.models import ( title_from, _write_session_index, SESSION_INDEX_FILE, + _active_state_db_path, load_projects, save_projects, import_cli_session, @@ -1658,6 +1661,131 @@ def _handle_insights(handler, parsed) -> bool: # ── GET routes ──────────────────────────────────────────────────────────────── +def _accept_loop_health(handler) -> dict: + server = getattr(handler, "server", None) + return { + "requests_total": int(getattr(server, "accept_loop_requests_total", 0) or 0), + "last_request_at": round(float(getattr(server, "accept_loop_last_request_at", 0.0) or 0.0), 3), + } + + +def _streams_lock_health(timeout_seconds: float = 0.5) -> dict: + t0 = time.time() + acquired = STREAMS_LOCK.acquire(timeout=timeout_seconds) + elapsed_ms = round((time.time() - t0) * 1000, 1) + if not acquired: + return { + "status": "blocked", + "timeout_seconds": timeout_seconds, + "ms": elapsed_ms, + } + try: + return { + "status": "ok", + "active_streams": len(STREAMS), + "ms": elapsed_ms, + } + finally: + STREAMS_LOCK.release() + + +def _deep_health_checks() -> tuple[dict, bool]: + """Run cheap probes that exercise the state paths used by the UI shell. + + Plain /health intentionally stays tiny. /health?deep=1 is for supervisors + and watchdogs that need to know whether the process can still touch the + shared stream map, sidebar/session path, project state, and Hermes state.db + without hitting the RST-before-write failure mode from #1458. + """ + checks: dict[str, dict] = {} + + checks["streams_lock"] = _streams_lock_health() + if checks["streams_lock"].get("status") != "ok": + return checks, False + + t0 = time.time() + try: + sessions = all_sessions() + checks["sessions"] = { + "status": "ok", + "count": len(sessions), + "ms": round((time.time() - t0) * 1000, 1), + } + except Exception as exc: + checks["sessions"] = { + "status": "error", + "error": type(exc).__name__, + "ms": round((time.time() - t0) * 1000, 1), + } + + t0 = time.time() + try: + projects = load_projects(_migrate=False) + checks["projects"] = { + "status": "ok", + "count": len(projects), + "ms": round((time.time() - t0) * 1000, 1), + } + except Exception as exc: + checks["projects"] = { + "status": "error", + "error": type(exc).__name__, + "ms": round((time.time() - t0) * 1000, 1), + } + + t0 = time.time() + try: + db_path = _active_state_db_path() + if not db_path.exists(): + checks["state_db"] = { + "status": "missing", + "ms": round((time.time() - t0) * 1000, 1), + } + else: + with closing(sqlite3.connect(str(db_path))) as conn: + conn.execute("PRAGMA schema_version").fetchone() + checks["state_db"] = { + "status": "ok", + "ms": round((time.time() - t0) * 1000, 1), + } + except Exception as exc: + checks["state_db"] = { + "status": "error", + "error": type(exc).__name__, + "ms": round((time.time() - t0) * 1000, 1), + } + + healthy = all( + check.get("status") in {"ok", "missing"} + for check in checks.values() + ) + return checks, healthy + + +def _handle_health(handler, parsed): + deep = parse_qs(parsed.query or "").get("deep", [""])[0].lower() in {"1", "true", "yes", "on"} + stream_check = _streams_lock_health() + payload = { + "status": "ok" if stream_check.get("status") == "ok" else "degraded", + "sessions": len(SESSIONS), + "active_streams": int(stream_check.get("active_streams") or 0), + "uptime_seconds": round(time.time() - SERVER_START_TIME, 1), + "accept_loop": _accept_loop_health(handler), + } + if deep: + if stream_check.get("status") != "ok": + payload["checks"] = {"streams_lock": stream_check} + return j(handler, payload, status=503) + checks, healthy = _deep_health_checks() + payload["checks"] = checks + if not healthy: + payload["status"] = "degraded" + return j(handler, payload, status=503) + if payload["status"] != "ok": + return j(handler, payload, status=503) + return j(handler, payload) + + def handle_get(handler, parsed) -> bool: """Handle all GET routes. Returns True if handled, False for 404.""" @@ -1776,17 +1904,7 @@ def handle_get(handler, parsed) -> bool: return _handle_insights(handler, parsed) if parsed.path == "/health": - with STREAMS_LOCK: - n_streams = len(STREAMS) - return j( - handler, - { - "status": "ok", - "sessions": len(SESSIONS), - "active_streams": n_streams, - "uptime_seconds": round(time.time() - SERVER_START_TIME, 1), - }, - ) + return _handle_health(handler, parsed) if parsed.path == "/api/models": return j(handler, get_available_models()) diff --git a/docs/supervisor.md b/docs/supervisor.md index 4ec433b0..85821a69 100644 --- a/docs/supervisor.md +++ b/docs/supervisor.md @@ -235,3 +235,44 @@ PID PPID CMD If PPID is ``1`` (init) when it should be the supervisor, the orphan-server loop is happening — re-check that ``--foreground`` (or one of the env vars) is reaching the process. + +## HTTP watchdog / deep health + +``KeepAlive`` / ``Restart=always`` only recover a process that exits. If the +process is still listening on the port but request handling is wedged, pair your +supervisor with an HTTP probe and force a restart when the probe fails. + +Hermes Web UI exposes two health levels: + +- ``/health`` — cheap liveness probe with ``active_streams``, uptime, and an + ``accept_loop`` heartbeat counter. +- ``/health?deep=1`` — readiness probe that briefly acquires the stream lock, + reads the sidebar/session path, reads projects state, and touches Hermes + ``state.db`` if it exists. Use this for watchdogs. + +At startup the server also tries to raise its file-descriptor soft limit to +4096 on platforms that support ``RLIMIT_NOFILE``. That is defense in depth for +persistent hosts: leaks should still be fixed, but a higher soft limit gives +you more diagnostic headroom before request handling falls over. + +Minimal macOS launchd watchdog script: + +```bash +#!/usr/bin/env bash +set -euo pipefail +LABEL="com.example.hermes-webui" +BASE="http://127.0.0.1:8787" + +if ! curl -fsS --max-time 10 "$BASE/health?deep=1" >/dev/null; then + launchctl kickstart -k "gui/$(id -u)/$LABEL" +fi +``` + +Run it every few minutes from a separate ``StartInterval`` LaunchAgent. For +systemd, prefer a timer/service pair that runs the same curl probe and +``systemctl --user restart hermes-webui.service`` on failure. + +The ``accept_loop.requests_total`` value should increase when probes arrive. If +it stays flat while the process is still alive, the server accept loop is not +making progress; capture logs/thread samples before restarting if you are +collecting diagnostics for a bug report. diff --git a/server.py b/server.py index 16691bb4..5b9a7062 100644 --- a/server.py +++ b/server.py @@ -9,6 +9,11 @@ import sys import time import traceback from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer + +try: + import resource +except ImportError: # pragma: no cover - resource is Unix-only + resource = None from urllib.parse import urlparse logger = logging.getLogger(__name__) @@ -26,6 +31,22 @@ class QuietHTTPServer(ThreadingHTTPServer): """Custom HTTP server that silently handles common network errors.""" daemon_threads = True request_queue_size = 64 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.accept_loop_requests_total = 0 + self.accept_loop_last_request_at = 0.0 + + def _handle_request_noblock(self): + """Record accept-loop progress before dispatching a request handler. + + A process can be alive and still stop accepting/dispatching requests. + Exposing this heartbeat on /health gives supervisors and watchdogs a + cheap signal that the accept loop is still moving. + """ + self.accept_loop_requests_total += 1 + self.accept_loop_last_request_at = time.time() + return super()._handle_request_noblock() def handle_error(self, request, client_address): """Override to suppress logging for common client disconnect errors.""" @@ -129,11 +150,50 @@ class Handler(BaseHTTPRequestHandler): clear_request_profile() +def _raise_fd_soft_limit(target: int = 4096) -> dict: + """Best-effort raise of RLIMIT_NOFILE for persistent WebUI hosts. + + macOS launchd jobs often start with a 256 soft limit. If a future FD leak + regresses, that low ceiling turns a leak into a hard HTTP wedge quickly. + Raising the soft limit does not hide leaks; it buys enough headroom for + diagnostics and watchdog recovery. + """ + if resource is None: + return {"status": "unsupported"} + try: + soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE) + except Exception as exc: + return {"status": "error", "error": str(exc)} + + # On Unix, RLIM_INFINITY is commonly a large int; keep the logic explicit + # so tests can use ordinary integers without depending on platform values. + desired = int(target) + if hard not in (-1, getattr(resource, "RLIM_INFINITY", object())): + desired = min(desired, int(hard)) + if soft >= desired: + return {"status": "unchanged", "soft": soft, "hard": hard} + try: + resource.setrlimit(resource.RLIMIT_NOFILE, (desired, hard)) + except Exception as exc: + return {"status": "error", "soft": soft, "hard": hard, "error": str(exc)} + return {"status": "raised", "soft": desired, "hard": hard, "previous_soft": soft} + + def main() -> None: from api.config import print_startup_config, verify_hermes_imports, _HERMES_FOUND print_startup_config() + fd_limit = _raise_fd_soft_limit() + if fd_limit.get("status") == "raised": + print( + f"[ok] Raised file descriptor soft limit " + f"{fd_limit.get('previous_soft')} -> {fd_limit.get('soft')}", + flush=True, + ) + elif fd_limit.get("status") == "error": + print(f"[!!] WARNING: Could not raise file descriptor limit: {fd_limit.get('error')}", flush=True) + # Fix sensitive file permissions before doing anything else fix_credential_permissions() diff --git a/tests/test_issue1458_stability_hardening.py b/tests/test_issue1458_stability_hardening.py new file mode 100644 index 00000000..63dd29c6 --- /dev/null +++ b/tests/test_issue1458_stability_hardening.py @@ -0,0 +1,66 @@ +"""Regression coverage for issue #1458 persistent-host hardening.""" +import json +import urllib.request + +from tests._pytest_port import BASE + + +def _get(path): + with urllib.request.urlopen(BASE + path, timeout=10) as r: + return json.loads(r.read()), r.status + + +def test_health_exposes_accept_loop_heartbeat(): + data, status = _get("/health") + + assert status == 200 + heartbeat = data.get("accept_loop") + assert isinstance(heartbeat, dict) + assert isinstance(heartbeat.get("requests_total"), int) + assert heartbeat["requests_total"] >= 1 + assert isinstance(heartbeat.get("last_request_at"), (int, float)) + assert heartbeat["last_request_at"] > 0 + + +def test_deep_health_exercises_session_project_and_sqlite_paths(): + data, status = _get("/health?deep=1") + + assert status == 200 + assert data["status"] == "ok" + checks = data.get("checks") + assert isinstance(checks, dict) + assert checks["streams_lock"]["status"] == "ok" + assert isinstance(checks["streams_lock"].get("active_streams"), int) + assert checks["sessions"]["status"] == "ok" + assert isinstance(checks["sessions"].get("count"), int) + assert checks["projects"]["status"] == "ok" + assert isinstance(checks["projects"].get("count"), int) + # The isolated test home may not have a Hermes state.db yet. Deep health + # should still report the state-db probe explicitly so watchdogs can tell + # whether sqlite was checked or absent. + assert checks["state_db"]["status"] in {"ok", "missing"} + + +def test_server_raises_fd_soft_limit_when_resource_allows(monkeypatch): + import server + + calls = [] + + class FakeResource: + RLIMIT_NOFILE = object() + + @staticmethod + def getrlimit(which): + return (256, 8192) + + @staticmethod + def setrlimit(which, limits): + calls.append((which, limits)) + + monkeypatch.setattr(server, "resource", FakeResource, raising=False) + + result = server._raise_fd_soft_limit(target=4096) + + assert result["status"] == "raised" + assert result["soft"] == 4096 + assert calls == [(FakeResource.RLIMIT_NOFILE, (4096, 8192))] From d4385f8aa2b97c0b7677c2d9cea2d42b93e6c04e Mon Sep 17 00:00:00 2001 From: bergeouss Date: Mon, 4 May 2026 22:37:55 +0000 Subject: [PATCH 3/4] fix: false read-only detection in docker_init.bash (#1470 follow-up) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The read-only rootfs guard added in PR #1635 (issue #1470) checks [ ! -w /etc/group ] as the current user (hermeswebuitoo, non-root). On a normal writable rootfs this always fails because /etc/group is owned by root — causing a false positive that crashes the container with "Cannot modify /etc/group or /etc/passwd (read-only root fs)". Fix: use sudo to test writability, since groupmod/usermod already use sudo a few lines below. If sudo can write, the fs is not read-only and the guard should not trigger. Refs #1470 --- docker_init.bash | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docker_init.bash b/docker_init.bash index 750f31d2..dc77b246 100644 --- a/docker_init.bash +++ b/docker_init.bash @@ -189,10 +189,13 @@ if [ "A${whoami}" == "Ahermeswebuitoo" ]; then # using usermod for the already create hermeswebui user, knowing it is not already in use # per usermod manual: "You must make certain that the named user is not executing any processes when this command is being executed" # Guard for read-only root filesystem (podman with read_only=true, issue #1470). + # The script runs as hermeswebuitoo (non-root), but groupmod/usermod use sudo. + # So we must check writability via sudo — a non-root user cannot write /etc/group + # even on a normal writable rootfs, which caused a false positive (issue #1658). _readonly_root=false - if [ ! -w /etc/group ] || [ ! -w /etc/passwd ]; then + if ! sudo sh -c 'test -w /etc/group && test -w /etc/passwd' 2>/dev/null; then _readonly_root=true - echo " !! Detected read-only root filesystem — /etc/group or /etc/passwd is not writable" + echo " !! Detected read-only root filesystem — /etc/group or /etc/passwd is not writable (even via sudo)" fi if [ "A${_readonly_root}" == "Atrue" ]; then _current_hermeswebui_gid=$(id -g hermeswebui 2>/dev/null || echo "") From 3005bfc4912f7bf940ed6666803c9e1e213413b1 Mon Sep 17 00:00:00 2001 From: Hermes Agent Date: Mon, 4 May 2026 22:50:57 +0000 Subject: [PATCH 4/4] =?UTF-8?q?chore(release):=20stamp=20v0.50.297=20?= =?UTF-8?q?=E2=80=94=203-PR=20batch=20+=20Opus=20pass=20+=202=20follow-ups?= =?UTF-8?q?=20absorbed?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Constituent PRs: #1659 by @bergeouss — Docker readonly false-positive (closes #1658, fixes v0.50.295 regression) #1653 by @nesquena — OAuth cancel race fix (follow-up to v0.50.296 #1652) #1657 by @Michaelyklam — health diagnostics + watchdog hardening (refs #1458 Bug #3) Opus advisor SHIP verdict on stage-297. Two follow-ups absorbed in-release: - _deep_health_checks(stream_check=...) reuses pre-computed lock probe - _handle_request_noblock docstring documents single-thread safety PR #1656 closed as superseded by #1657 (same author, both target #1458, #1657 is functional superset). 4284 → 4288 tests passing (+4). --- CHANGELOG.md | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ ROADMAP.md | 2 +- TESTING.md | 4 ++-- api/routes.py | 10 +++++++--- server.py | 6 ++++++ 5 files changed, 65 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e46b02c9..db622883 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,54 @@ # Hermes Web UI -- Changelog +## [v0.50.297] — 2026-05-04 + +### Fixed (3 PRs — closes #1658; refs #1458, #1652) + +- **Docker container no longer enters a crash loop on every normal Docker setup** (#1659 by @bergeouss, closes #1658) — PR #1635 (v0.50.295) added a writability guard `[ ! -w /etc/group ] || [ ! -w /etc/passwd ]` for podman `read_only=true` containers. Bug: the script runs as the non-root `hermeswebuitoo` user, so `/etc/group` (owned by root) is **always** non-writable from that user — guard fires on EVERY normal Docker setup, container enters a crash loop with `!! ERROR: Cannot modify /etc/group or /etc/passwd (read-only root fs)`. Affects all users running standard Docker after upgrading to v0.50.295. **Fix:** replace `[ ! -w ]` with `! sudo sh -c 'test -w /etc/group && test -w /etc/passwd' 2>/dev/null` — matches the fact that `groupmod`/`usermod` already use sudo a few lines below. Truly read-only rootfs (podman) → sudo can't write → guard fires correctly. Writable rootfs (normal Docker) → sudo can write → guard doesn't fire → groupmod/usermod runs normally. **3 LOC `docker_init.bash` change.** P0 regression fix. + +- **OAuth Cancel during Codex device-token exchange now wins the race** (#1653 by @nesquena, follow-up to #1652 / refs #1362) — race in v0.50.296's Codex OAuth onboarding flow where a `POST /api/onboarding/oauth/cancel` arriving while the worker was mid-network-call would be silently overridden: credentials would still get persisted to `auth.json` and the flow status would flip from `cancelled` → `success`. Net effect: the user's explicit cancel was ignored, credentials persisted, UI reported success. **Fix:** re-check `_OAUTH_FLOWS[flow_id].status` under `_OAUTH_FLOWS_LOCK` immediately AFTER `_exchange_codex_authorization()` returns and BEFORE writing `auth.json`. If status is no longer `pending`, return cleanly — no persistence, no status overwrite. Behavioral test using `threading.Event` deterministically reproduces the race. UX-inconsistency severity, not a security bug (the credentials that get persisted ARE tokens the user authorized in their browser), but the cancel button stops doing what it says, violating the design intent of #1650's server-owned lifecycle. + +- **Persistent-host health diagnostics + watchdog hardening** (#1657 by @Michaelyklam, refs #1458) — addresses the residual #1458 Bug #3 failure mode (process alive + port listening but HTTP requests not advancing), the wedge that survives after v0.50.275's FD-leak fix and v0.50.269's bootstrap fix. Adds three signals process supervisors can use to distinguish "process exists" from "request handling is still advancing": + - **Accept-loop heartbeat**: `QuietHTTPServer.accept_loop_requests_total` + `accept_loop_last_request_at` instance attributes, incremented in `_handle_request_noblock()` (single `serve_forever()` thread, un-locked `+=` is safe). Surfaced in `/health` as `accept_loop: {requests_total, last_request_at}`. + - **`/health?deep=1` readiness probe**: bounded `STREAMS_LOCK.acquire(timeout=0.5)` + `all_sessions()` walk + `load_projects(_migrate=False)` + `sqlite3.connect(state.db) + PRAGMA schema_version`. Returns 503 with `status: degraded` when streams lock blocks or any deep check errors. Watchdogs polling `/health?deep=1` every 30s open-and-close 2880 short-lived sqlite connections per day per probe — bounded FD usage, no leak surface. + - **`RLIMIT_NOFILE` raise to 4096** at startup (best-effort, defense in depth for macOS launchd jobs that start at 256). Doesn't hide future FD leaks; gives diagnostic headroom before request handling falls over. + - **`docs/supervisor.md` updates**: launchd/systemd HTTP watchdog recipe using `curl -fsS --max-time 10 /health?deep=1` + `launchctl kickstart -k`. Notes `accept_loop.requests_total` should advance — if it stays flat while the process is alive, the accept loop is wedged. + + Per Opus advisor on stage-297: refactored `_deep_health_checks(stream_check=...)` to accept the pre-computed stream check from `_handle_health()` so we don't acquire `STREAMS_LOCK` twice on the same `/health?deep=1` request (cosmetic inefficiency, not a correctness bug — but also could false-fail when the second acquire times out under contention). Plus a docstring note on `_handle_request_noblock` documenting why the un-locked `+=` is safe (single-thread-only call site in CPython socketserver). + + PR #1656 by the same author (smaller, module-level globals approach) was closed as superseded by #1657 (instance-level + state.db check + projects check + supervisor.md docs). + +### Tests + +4284 → **4288 passing** (+4 regression tests across `tests/test_issue1458_stability_hardening.py` (3) + `tests/test_issue1362_codex_oauth_onboarding.py::test_cancel_during_token_exchange_does_not_persist_credentials` (1)). 0 regressions. Full suite ~118s. + +### Pre-release verification + +- **Opus advisor on stage-297 combined diff: SHIP verdict.** All 9 verification questions cleared: + - `_active_state_db_path()` verified at `api/models.py:924`, returns Path without opening connection + - 500ms `STREAMS_LOCK.acquire(timeout=...)` ceiling reasonable for watchdog timeouts (10s curl `--max-time` typical) + - `with closing(sqlite3.connect(...))` deterministically releases FD, `PRAGMA schema_version` is read-only + - `_handle_request_noblock` heartbeat increment is BEFORE super() — counter advances even if request handling raises, correct accept-loop semantics + - `_raise_fd_soft_limit()` correctly clamps to hard limit, only RAISES soft limit (won't lower below launchd's `LimitNOFILE` setting) + - OAuth fix narrows race window from "seconds-long network call" to "microseconds-long file write" — minimal correct change at the right layer + - Docker fix `sudo sh -c 'test -w'` correctly handles all 3 cases (writable+sudo / readonly+sudo / no-sudo) +- **Two minor Opus follow-ups absorbed in-release**: + - `_deep_health_checks(stream_check=...)` reuses pre-computed stream check from `_handle_health()` — saves redundant lock acquisition + - Docstring note on `_handle_request_noblock` documenting single-thread safety of un-locked `+=` +- **Self-built #1653** has thorough `threading.Event`-gated behavioral test demonstrating the race exists pre-fix and is fixed post-fix. +- **Browser API sanity**: 11/11 endpoints OK on stage server. +- **Conflict resolution**: zero file overlap across all 3 PRs (#1659 → docker_init.bash; #1653 → api/oauth.py; #1657 → api/routes.py + server.py + docs/supervisor.md). Auto-merged clean. + +### Authors + +- @bergeouss — 1 PR (#1659, AI-assisted via Hermes Agent) — fixing their own v0.50.295 #1635 regression +- @nesquena (self-built) — 1 PR (#1653, follow-up to v0.50.296 #1652) +- @Michaelyklam — 1 PR (#1657, hardening for #1458 Bug #3) + +### Note on closed-as-superseded + +PR #1656 (also @Michaelyklam) was closed as superseded by #1657. Both target #1458 Bug #3, both add accept-loop heartbeat + `/health?deep=1` + 503-on-degraded. #1657 adds beyond #1656: state.db connectivity check, projects state check, FD soft-limit raise, and `docs/supervisor.md` watchdog recipe. Same author iterated; the second PR was the keeper. + ## [v0.50.296] — 2026-05-04 ### Fixed (3 PRs — closes #1406, #1617; refs #1362) diff --git a/ROADMAP.md b/ROADMAP.md index 54a10f45..2db1b4a9 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -2,7 +2,7 @@ > Web companion to the Hermes Agent CLI. Same workflows, browser-native. > -> Last updated: v0.50.296 (May 04, 2026) — 4284 tests collected +> Last updated: v0.50.297 (May 04, 2026) — 4288 tests collected > Test source: `pytest tests/ --collect-only -q` > Per-version detail: see [CHANGELOG.md](./CHANGELOG.md) diff --git a/TESTING.md b/TESTING.md index eeeb72d1..749af862 100644 --- a/TESTING.md +++ b/TESTING.md @@ -1835,8 +1835,8 @@ Bridged CLI sessions: --- -*Last updated: v0.50.296, May 04, 2026* -*Total automated tests collected: 4284* +*Last updated: v0.50.297, May 04, 2026* +*Total automated tests collected: 4288* *Regression gate: tests/test_regressions.py* *Run: pytest tests/ -v --timeout=60* *Source: /* diff --git a/api/routes.py b/api/routes.py index 42a90bbd..92109eec 100644 --- a/api/routes.py +++ b/api/routes.py @@ -1689,17 +1689,21 @@ def _streams_lock_health(timeout_seconds: float = 0.5) -> dict: STREAMS_LOCK.release() -def _deep_health_checks() -> tuple[dict, bool]: +def _deep_health_checks(stream_check: dict | None = None) -> tuple[dict, bool]: """Run cheap probes that exercise the state paths used by the UI shell. Plain /health intentionally stays tiny. /health?deep=1 is for supervisors and watchdogs that need to know whether the process can still touch the shared stream map, sidebar/session path, project state, and Hermes state.db without hitting the RST-before-write failure mode from #1458. + + `stream_check` is the result from a prior `_streams_lock_health()` call; + if provided, it's reused so we don't acquire `STREAMS_LOCK` twice on the + same /health?deep=1 request (per Opus advisor on stage-297). """ checks: dict[str, dict] = {} - checks["streams_lock"] = _streams_lock_health() + checks["streams_lock"] = stream_check if stream_check is not None else _streams_lock_health() if checks["streams_lock"].get("status") != "ok": return checks, False @@ -1776,7 +1780,7 @@ def _handle_health(handler, parsed): if stream_check.get("status") != "ok": payload["checks"] = {"streams_lock": stream_check} return j(handler, payload, status=503) - checks, healthy = _deep_health_checks() + checks, healthy = _deep_health_checks(stream_check=stream_check) payload["checks"] = checks if not healthy: payload["status"] = "degraded" diff --git a/server.py b/server.py index 5b9a7062..52a6dd28 100644 --- a/server.py +++ b/server.py @@ -43,6 +43,12 @@ class QuietHTTPServer(ThreadingHTTPServer): A process can be alive and still stop accepting/dispatching requests. Exposing this heartbeat on /health gives supervisors and watchdogs a cheap signal that the accept loop is still moving. + + Note: this method is called only from the single ``serve_forever()`` + thread in CPython socketserver, so the un-locked ``+=`` increment is + safe — there is no other thread mutating these counters. The /health + readers may see a stale value momentarily but never an inconsistent + one (Python int reads are atomic). Per Opus advisor on stage-297. """ self.accept_loop_requests_total += 1 self.accept_loop_last_request_at = time.time()