From b34ce63c978ed12238e8cd4d407204a92e99789d Mon Sep 17 00:00:00 2001
From: Nathan Esquenazi <nesquena@gmail.com>
Date: Mon, 4 May 2026 14:49:38 -0700
Subject: [PATCH 1/4] fix(oauth): honor cancel during Codex device-token
 exchange (follow-up to #1652)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Codex OAuth onboarding worker introduced in #1652 had a cancel-vs-worker
race: a `cancel_onboarding_oauth_flow` request that arrived while the worker
was mid-network-call (between the `live = dict(...)` snapshot and the next
status check) would be silently overridden:

  1. User clicks Cancel → server sets flow.status = "cancelled" and drops
     sensitive lifecycle fields under the lock.
  2. Worker is mid-`_poll_codex_authorization` / `_exchange_codex_authorization`
     using the local `live` snapshot it captured before the cancel.
  3. Worker calls `_persist_codex_credentials(...)` — auth.json gets written.
  4. Worker calls `_set_flow_status(flow_id, "success")` — overrides the
     cancelled status.

Net effect: the user's explicit cancel is ignored, credentials are persisted,
and the UI reports success. Reproduced with a behavioural harness that drove
a real worker thread against patched network helpers and confirmed:

  pre-fix : flow status `success`, auth.json written despite cancel
  post-fix: flow status `cancelled`, auth.json NOT written

The fix re-checks the flow status under `_OAUTH_FLOWS_LOCK` after the token
exchange completes and before persisting. If the status is no longer
`pending`, the worker exits without persisting credentials and without
overwriting the terminal status.

Regression test `test_cancel_during_token_exchange_does_not_persist_credentials`
drives the worker against threading.Event-gated network stubs to reproduce
the race deterministically and lock the new invariant.

Trace verified against fresh hermes-agent tarball — credential_pool entry
shape (`auth_type=oauth`, `source=manual:device_code`, `priority=0`, base_url)
remains compatible with `agent.credential_pool.load_pool("openai-codex")` and
the agent CLI's `_save_codex_tokens` legacy fallback path.

Tests:
- 10/10 in tests/test_issue1362_codex_oauth_onboarding.py
- Full suite: 4230 passed, 57 skipped, 3 xpassed, 0 failed in 33.82s

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 api/oauth.py                                  |  7 +++
 .../test_issue1362_codex_oauth_onboarding.py  | 54 +++++++++++++++++++
 2 files changed, 61 insertions(+)

diff --git a/api/oauth.py b/api/oauth.py
index 621a4814..e70f2a6b 100644
--- a/api/oauth.py
+++ b/api/oauth.py
@@ -347,6 +347,13 @@ def _run_codex_oauth_worker(flow_id: str) -> None:
             if not authorization_code or not code_verifier:
                 raise RuntimeError("Device auth response missing authorization_code or code_verifier")
             tokens = _exchange_codex_authorization(authorization_code, code_verifier)
+            # Re-check status under lock before persisting: a cancel/expire that
+            # raced with the device-token + token-exchange network calls must
+            # win, so we don't persist credentials the user explicitly aborted.
+            with _OAUTH_FLOWS_LOCK:
+                current = _OAUTH_FLOWS.get(flow_id)
+                if not current or current.get("status") != "pending":
+                    return
             _persist_codex_credentials(Path(live["hermes_home"]), tokens)
             _set_flow_status(flow_id, "success")
             return
diff --git a/tests/test_issue1362_codex_oauth_onboarding.py b/tests/test_issue1362_codex_oauth_onboarding.py
index 669fd90e..bc8d1149 100644
--- a/tests/test_issue1362_codex_oauth_onboarding.py
+++ b/tests/test_issue1362_codex_oauth_onboarding.py
@@ -111,6 +111,60 @@ def test_cancel_marks_flow_cancelled_and_poll_stops(tmp_path):
     assert polled["status"] == "cancelled"
 
 
+def test_cancel_during_token_exchange_does_not_persist_credentials(monkeypatch, tmp_path):
+    """Cancel arriving while the worker is mid-network-call must win.
+
+    Without the post-exchange status re-check, the worker would proceed to
+    persist credentials to auth.json AND override the cancelled status with
+    "success" — silently storing tokens the user explicitly aborted.
+    """
+    import threading
+    import api.oauth as oauth
+
+    oauth._OAUTH_FLOWS.clear()
+
+    poll_started = threading.Event()
+    poll_continue = threading.Event()
+
+    def _slow_poll(device_auth_id, user_code):
+        poll_started.set()
+        assert poll_continue.wait(timeout=5)
+        return {"authorization_code": "auth-code", "code_verifier": "verifier"}
+
+    def _exchange(authorization_code, code_verifier):
+        return {"access_token": "ACCESS", "refresh_token": "REFRESH"}
+
+    monkeypatch.setattr(oauth, "_poll_codex_authorization", _slow_poll)
+    monkeypatch.setattr(oauth, "_exchange_codex_authorization", _exchange)
+
+    flow_id = "race-flow"
+    oauth._OAUTH_FLOWS[flow_id] = {
+        "provider": "openai-codex",
+        "status": "pending",
+        "device_auth_id": "device-secret",
+        "user_code": "ABCD-EFGH",
+        "expires_at": time.time() + 600,
+        "poll_interval_seconds": 1,
+        "hermes_home": str(tmp_path),
+        "created_at": time.time(),
+        "updated_at": time.time(),
+    }
+
+    worker = threading.Thread(target=oauth._run_codex_oauth_worker, args=(flow_id,), daemon=True)
+    worker.start()
+    assert poll_started.wait(timeout=5)
+
+    oauth.cancel_onboarding_oauth_flow({"flow_id": flow_id})
+    assert oauth._OAUTH_FLOWS[flow_id]["status"] == "cancelled"
+
+    poll_continue.set()
+    worker.join(timeout=5)
+    assert not worker.is_alive()
+
+    assert oauth._OAUTH_FLOWS[flow_id]["status"] == "cancelled"
+    assert not (tmp_path / "auth.json").exists()
+
+
 def test_expired_flow_reports_expired_and_drops_sensitive_lifecycle(tmp_path):
     import api.oauth as oauth
 

From ca135c2015ad09d11c0177b1439a2d569c89886e Mon Sep 17 00:00:00 2001
From: Michael Lam <Michaelyklam1@gmail.com>
Date: Mon, 4 May 2026 15:30:37 -0700
Subject: [PATCH 2/4] fix: harden persistent WebUI health checks

---
 api/routes.py                               | 140 ++++++++++++++++++--
 docs/supervisor.md                          |  41 ++++++
 server.py                                   |  60 +++++++++
 tests/test_issue1458_stability_hardening.py |  66 +++++++++
 4 files changed, 296 insertions(+), 11 deletions(-)
 create mode 100644 tests/test_issue1458_stability_hardening.py

diff --git a/api/routes.py b/api/routes.py
index 8ef98b63..42a90bbd 100644
--- a/api/routes.py
+++ b/api/routes.py
@@ -12,6 +12,7 @@ import queue
 import re
 import platform
 import shutil
+import sqlite3
 import subprocess
 import sys
 import threading
@@ -19,6 +20,7 @@ import time
 import uuid
 import re
 from pathlib import Path
+from contextlib import closing
 from urllib.parse import parse_qs
 from api.agent_sessions import MESSAGING_SOURCES
 
@@ -1222,6 +1224,7 @@ from api.models import (
     title_from,
     _write_session_index,
     SESSION_INDEX_FILE,
+    _active_state_db_path,
     load_projects,
     save_projects,
     import_cli_session,
@@ -1658,6 +1661,131 @@ def _handle_insights(handler, parsed) -> bool:
 # ── GET routes ────────────────────────────────────────────────────────────────
 
 
+def _accept_loop_health(handler) -> dict:
+    server = getattr(handler, "server", None)
+    return {
+        "requests_total": int(getattr(server, "accept_loop_requests_total", 0) or 0),
+        "last_request_at": round(float(getattr(server, "accept_loop_last_request_at", 0.0) or 0.0), 3),
+    }
+
+
+def _streams_lock_health(timeout_seconds: float = 0.5) -> dict:
+    t0 = time.time()
+    acquired = STREAMS_LOCK.acquire(timeout=timeout_seconds)
+    elapsed_ms = round((time.time() - t0) * 1000, 1)
+    if not acquired:
+        return {
+            "status": "blocked",
+            "timeout_seconds": timeout_seconds,
+            "ms": elapsed_ms,
+        }
+    try:
+        return {
+            "status": "ok",
+            "active_streams": len(STREAMS),
+            "ms": elapsed_ms,
+        }
+    finally:
+        STREAMS_LOCK.release()
+
+
+def _deep_health_checks() -> tuple[dict, bool]:
+    """Run cheap probes that exercise the state paths used by the UI shell.
+
+    Plain /health intentionally stays tiny. /health?deep=1 is for supervisors
+    and watchdogs that need to know whether the process can still touch the
+    shared stream map, sidebar/session path, project state, and Hermes state.db
+    without hitting the RST-before-write failure mode from #1458.
+    """
+    checks: dict[str, dict] = {}
+
+    checks["streams_lock"] = _streams_lock_health()
+    if checks["streams_lock"].get("status") != "ok":
+        return checks, False
+
+    t0 = time.time()
+    try:
+        sessions = all_sessions()
+        checks["sessions"] = {
+            "status": "ok",
+            "count": len(sessions),
+            "ms": round((time.time() - t0) * 1000, 1),
+        }
+    except Exception as exc:
+        checks["sessions"] = {
+            "status": "error",
+            "error": type(exc).__name__,
+            "ms": round((time.time() - t0) * 1000, 1),
+        }
+
+    t0 = time.time()
+    try:
+        projects = load_projects(_migrate=False)
+        checks["projects"] = {
+            "status": "ok",
+            "count": len(projects),
+            "ms": round((time.time() - t0) * 1000, 1),
+        }
+    except Exception as exc:
+        checks["projects"] = {
+            "status": "error",
+            "error": type(exc).__name__,
+            "ms": round((time.time() - t0) * 1000, 1),
+        }
+
+    t0 = time.time()
+    try:
+        db_path = _active_state_db_path()
+        if not db_path.exists():
+            checks["state_db"] = {
+                "status": "missing",
+                "ms": round((time.time() - t0) * 1000, 1),
+            }
+        else:
+            with closing(sqlite3.connect(str(db_path))) as conn:
+                conn.execute("PRAGMA schema_version").fetchone()
+            checks["state_db"] = {
+                "status": "ok",
+                "ms": round((time.time() - t0) * 1000, 1),
+            }
+    except Exception as exc:
+        checks["state_db"] = {
+            "status": "error",
+            "error": type(exc).__name__,
+            "ms": round((time.time() - t0) * 1000, 1),
+        }
+
+    healthy = all(
+        check.get("status") in {"ok", "missing"}
+        for check in checks.values()
+    )
+    return checks, healthy
+
+
+def _handle_health(handler, parsed):
+    deep = parse_qs(parsed.query or "").get("deep", [""])[0].lower() in {"1", "true", "yes", "on"}
+    stream_check = _streams_lock_health()
+    payload = {
+        "status": "ok" if stream_check.get("status") == "ok" else "degraded",
+        "sessions": len(SESSIONS),
+        "active_streams": int(stream_check.get("active_streams") or 0),
+        "uptime_seconds": round(time.time() - SERVER_START_TIME, 1),
+        "accept_loop": _accept_loop_health(handler),
+    }
+    if deep:
+        if stream_check.get("status") != "ok":
+            payload["checks"] = {"streams_lock": stream_check}
+            return j(handler, payload, status=503)
+        checks, healthy = _deep_health_checks()
+        payload["checks"] = checks
+        if not healthy:
+            payload["status"] = "degraded"
+            return j(handler, payload, status=503)
+    if payload["status"] != "ok":
+        return j(handler, payload, status=503)
+    return j(handler, payload)
+
+
 def handle_get(handler, parsed) -> bool:
     """Handle all GET routes. Returns True if handled, False for 404."""
 
@@ -1776,17 +1904,7 @@ def handle_get(handler, parsed) -> bool:
         return _handle_insights(handler, parsed)
 
     if parsed.path == "/health":
-        with STREAMS_LOCK:
-            n_streams = len(STREAMS)
-        return j(
-            handler,
-            {
-                "status": "ok",
-                "sessions": len(SESSIONS),
-                "active_streams": n_streams,
-                "uptime_seconds": round(time.time() - SERVER_START_TIME, 1),
-            },
-        )
+        return _handle_health(handler, parsed)
 
     if parsed.path == "/api/models":
         return j(handler, get_available_models())
diff --git a/docs/supervisor.md b/docs/supervisor.md
index 4ec433b0..85821a69 100644
--- a/docs/supervisor.md
+++ b/docs/supervisor.md
@@ -235,3 +235,44 @@ PID    PPID  CMD
 If PPID is ``1`` (init) when it should be the supervisor, the orphan-server
 loop is happening — re-check that ``--foreground`` (or one of the env vars)
 is reaching the process.
+
+## HTTP watchdog / deep health
+
+``KeepAlive`` / ``Restart=always`` only recover a process that exits. If the
+process is still listening on the port but request handling is wedged, pair your
+supervisor with an HTTP probe and force a restart when the probe fails.
+
+Hermes Web UI exposes two health levels:
+
+- ``/health`` — cheap liveness probe with ``active_streams``, uptime, and an
+  ``accept_loop`` heartbeat counter.
+- ``/health?deep=1`` — readiness probe that briefly acquires the stream lock,
+  reads the sidebar/session path, reads projects state, and touches Hermes
+  ``state.db`` if it exists. Use this for watchdogs.
+
+At startup the server also tries to raise its file-descriptor soft limit to
+4096 on platforms that support ``RLIMIT_NOFILE``. That is defense in depth for
+persistent hosts: leaks should still be fixed, but a higher soft limit gives
+you more diagnostic headroom before request handling falls over.
+
+Minimal macOS launchd watchdog script:
+
+```bash
+#!/usr/bin/env bash
+set -euo pipefail
+LABEL="com.example.hermes-webui"
+BASE="http://127.0.0.1:8787"
+
+if ! curl -fsS --max-time 10 "$BASE/health?deep=1" >/dev/null; then
+  launchctl kickstart -k "gui/$(id -u)/$LABEL"
+fi
+```
+
+Run it every few minutes from a separate ``StartInterval`` LaunchAgent. For
+systemd, prefer a timer/service pair that runs the same curl probe and
+``systemctl --user restart hermes-webui.service`` on failure.
+
+The ``accept_loop.requests_total`` value should increase when probes arrive. If
+it stays flat while the process is still alive, the server accept loop is not
+making progress; capture logs/thread samples before restarting if you are
+collecting diagnostics for a bug report.
diff --git a/server.py b/server.py
index 16691bb4..5b9a7062 100644
--- a/server.py
+++ b/server.py
@@ -9,6 +9,11 @@ import sys
 import time
 import traceback
 from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
+
+try:
+    import resource
+except ImportError:  # pragma: no cover - resource is Unix-only
+    resource = None
 from urllib.parse import urlparse
 
 logger = logging.getLogger(__name__)
@@ -26,6 +31,22 @@ class QuietHTTPServer(ThreadingHTTPServer):
     """Custom HTTP server that silently handles common network errors."""
     daemon_threads = True
     request_queue_size = 64
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.accept_loop_requests_total = 0
+        self.accept_loop_last_request_at = 0.0
+
+    def _handle_request_noblock(self):
+        """Record accept-loop progress before dispatching a request handler.
+
+        A process can be alive and still stop accepting/dispatching requests.
+        Exposing this heartbeat on /health gives supervisors and watchdogs a
+        cheap signal that the accept loop is still moving.
+        """
+        self.accept_loop_requests_total += 1
+        self.accept_loop_last_request_at = time.time()
+        return super()._handle_request_noblock()
     
     def handle_error(self, request, client_address):
         """Override to suppress logging for common client disconnect errors."""
@@ -129,11 +150,50 @@ class Handler(BaseHTTPRequestHandler):
             clear_request_profile()
 
 
+def _raise_fd_soft_limit(target: int = 4096) -> dict:
+    """Best-effort raise of RLIMIT_NOFILE for persistent WebUI hosts.
+
+    macOS launchd jobs often start with a 256 soft limit. If a future FD leak
+    regresses, that low ceiling turns a leak into a hard HTTP wedge quickly.
+    Raising the soft limit does not hide leaks; it buys enough headroom for
+    diagnostics and watchdog recovery.
+    """
+    if resource is None:
+        return {"status": "unsupported"}
+    try:
+        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
+    except Exception as exc:
+        return {"status": "error", "error": str(exc)}
+
+    # On Unix, RLIM_INFINITY is commonly a large int; keep the logic explicit
+    # so tests can use ordinary integers without depending on platform values.
+    desired = int(target)
+    if hard not in (-1, getattr(resource, "RLIM_INFINITY", object())):
+        desired = min(desired, int(hard))
+    if soft >= desired:
+        return {"status": "unchanged", "soft": soft, "hard": hard}
+    try:
+        resource.setrlimit(resource.RLIMIT_NOFILE, (desired, hard))
+    except Exception as exc:
+        return {"status": "error", "soft": soft, "hard": hard, "error": str(exc)}
+    return {"status": "raised", "soft": desired, "hard": hard, "previous_soft": soft}
+
+
 def main() -> None:
     from api.config import print_startup_config, verify_hermes_imports, _HERMES_FOUND
 
     print_startup_config()
 
+    fd_limit = _raise_fd_soft_limit()
+    if fd_limit.get("status") == "raised":
+        print(
+            f"[ok] Raised file descriptor soft limit "
+            f"{fd_limit.get('previous_soft')} -> {fd_limit.get('soft')}",
+            flush=True,
+        )
+    elif fd_limit.get("status") == "error":
+        print(f"[!!] WARNING: Could not raise file descriptor limit: {fd_limit.get('error')}", flush=True)
+
     # Fix sensitive file permissions before doing anything else
     fix_credential_permissions()
 
diff --git a/tests/test_issue1458_stability_hardening.py b/tests/test_issue1458_stability_hardening.py
new file mode 100644
index 00000000..63dd29c6
--- /dev/null
+++ b/tests/test_issue1458_stability_hardening.py
@@ -0,0 +1,66 @@
+"""Regression coverage for issue #1458 persistent-host hardening."""
+import json
+import urllib.request
+
+from tests._pytest_port import BASE
+
+
+def _get(path):
+    with urllib.request.urlopen(BASE + path, timeout=10) as r:
+        return json.loads(r.read()), r.status
+
+
+def test_health_exposes_accept_loop_heartbeat():
+    data, status = _get("/health")
+
+    assert status == 200
+    heartbeat = data.get("accept_loop")
+    assert isinstance(heartbeat, dict)
+    assert isinstance(heartbeat.get("requests_total"), int)
+    assert heartbeat["requests_total"] >= 1
+    assert isinstance(heartbeat.get("last_request_at"), (int, float))
+    assert heartbeat["last_request_at"] > 0
+
+
+def test_deep_health_exercises_session_project_and_sqlite_paths():
+    data, status = _get("/health?deep=1")
+
+    assert status == 200
+    assert data["status"] == "ok"
+    checks = data.get("checks")
+    assert isinstance(checks, dict)
+    assert checks["streams_lock"]["status"] == "ok"
+    assert isinstance(checks["streams_lock"].get("active_streams"), int)
+    assert checks["sessions"]["status"] == "ok"
+    assert isinstance(checks["sessions"].get("count"), int)
+    assert checks["projects"]["status"] == "ok"
+    assert isinstance(checks["projects"].get("count"), int)
+    # The isolated test home may not have a Hermes state.db yet. Deep health
+    # should still report the state-db probe explicitly so watchdogs can tell
+    # whether sqlite was checked or absent.
+    assert checks["state_db"]["status"] in {"ok", "missing"}
+
+
+def test_server_raises_fd_soft_limit_when_resource_allows(monkeypatch):
+    import server
+
+    calls = []
+
+    class FakeResource:
+        RLIMIT_NOFILE = object()
+
+        @staticmethod
+        def getrlimit(which):
+            return (256, 8192)
+
+        @staticmethod
+        def setrlimit(which, limits):
+            calls.append((which, limits))
+
+    monkeypatch.setattr(server, "resource", FakeResource, raising=False)
+
+    result = server._raise_fd_soft_limit(target=4096)
+
+    assert result["status"] == "raised"
+    assert result["soft"] == 4096
+    assert calls == [(FakeResource.RLIMIT_NOFILE, (4096, 8192))]

From d4385f8aa2b97c0b7677c2d9cea2d42b93e6c04e Mon Sep 17 00:00:00 2001
From: bergeouss <bergeouss@users.noreply.github.com>
Date: Mon, 4 May 2026 22:37:55 +0000
Subject: [PATCH 3/4] fix: false read-only detection in docker_init.bash (#1470
 follow-up)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The read-only rootfs guard added in PR #1635 (issue #1470) checks
[ ! -w /etc/group ] as the current user (hermeswebuitoo, non-root).
On a normal writable rootfs this always fails because /etc/group is
owned by root — causing a false positive that crashes the container
with "Cannot modify /etc/group or /etc/passwd (read-only root fs)".

Fix: use sudo to test writability, since groupmod/usermod already
use sudo a few lines below. If sudo can write, the fs is not
read-only and the guard should not trigger.

Refs #1470
---
 docker_init.bash | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/docker_init.bash b/docker_init.bash
index 750f31d2..dc77b246 100644
--- a/docker_init.bash
+++ b/docker_init.bash
@@ -189,10 +189,13 @@ if [ "A${whoami}" == "Ahermeswebuitoo" ]; then
   # using usermod for the already create hermeswebui user, knowing it is not already in use
   # per usermod manual: "You must make certain that the named user is not executing any processes when this command is being executed"
   # Guard for read-only root filesystem (podman with read_only=true, issue #1470).
+  # The script runs as hermeswebuitoo (non-root), but groupmod/usermod use sudo.
+  # So we must check writability via sudo — a non-root user cannot write /etc/group
+  # even on a normal writable rootfs, which caused a false positive (issue #1658).
   _readonly_root=false
-  if [ ! -w /etc/group ] || [ ! -w /etc/passwd ]; then
+  if ! sudo sh -c 'test -w /etc/group && test -w /etc/passwd' 2>/dev/null; then
     _readonly_root=true
-    echo "  !! Detected read-only root filesystem — /etc/group or /etc/passwd is not writable"
+    echo "  !! Detected read-only root filesystem — /etc/group or /etc/passwd is not writable (even via sudo)"
   fi
   if [ "A${_readonly_root}" == "Atrue" ]; then
     _current_hermeswebui_gid=$(id -g hermeswebui 2>/dev/null || echo "")

From 3005bfc4912f7bf940ed6666803c9e1e213413b1 Mon Sep 17 00:00:00 2001
From: Hermes Agent <hermes-agent@nesquena-hermes.local>
Date: Mon, 4 May 2026 22:50:57 +0000
Subject: [PATCH 4/4] =?UTF-8?q?chore(release):=20stamp=20v0.50.297=20?=
 =?UTF-8?q?=E2=80=94=203-PR=20batch=20+=20Opus=20pass=20+=202=20follow-ups?=
 =?UTF-8?q?=20absorbed?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Constituent PRs:
  #1659 by @bergeouss — Docker readonly false-positive (closes #1658, fixes v0.50.295 regression)
  #1653 by @nesquena — OAuth cancel race fix (follow-up to v0.50.296 #1652)
  #1657 by @Michaelyklam — health diagnostics + watchdog hardening (refs #1458 Bug #3)

Opus advisor SHIP verdict on stage-297. Two follow-ups absorbed in-release:
- _deep_health_checks(stream_check=...) reuses pre-computed lock probe
- _handle_request_noblock docstring documents single-thread safety

PR #1656 closed as superseded by #1657 (same author, both target #1458,
#1657 is functional superset).

4284 → 4288 tests passing (+4).
---
 CHANGELOG.md  | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 ROADMAP.md    |  2 +-
 TESTING.md    |  4 ++--
 api/routes.py | 10 +++++++---
 server.py     |  6 ++++++
 5 files changed, 65 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e46b02c9..db622883 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,54 @@
 # Hermes Web UI -- Changelog
 
+## [v0.50.297] — 2026-05-04
+
+### Fixed (3 PRs — closes #1658; refs #1458, #1652)
+
+- **Docker container no longer enters a crash loop on every normal Docker setup** (#1659 by @bergeouss, closes #1658) — PR #1635 (v0.50.295) added a writability guard `[ ! -w /etc/group ] || [ ! -w /etc/passwd ]` for podman `read_only=true` containers. Bug: the script runs as the non-root `hermeswebuitoo` user, so `/etc/group` (owned by root) is **always** non-writable from that user — guard fires on EVERY normal Docker setup, container enters a crash loop with `!! ERROR: Cannot modify /etc/group or /etc/passwd (read-only root fs)`. Affects all users running standard Docker after upgrading to v0.50.295. **Fix:** replace `[ ! -w ]` with `! sudo sh -c 'test -w /etc/group && test -w /etc/passwd' 2>/dev/null` — matches the fact that `groupmod`/`usermod` already use sudo a few lines below. Truly read-only rootfs (podman) → sudo can't write → guard fires correctly. Writable rootfs (normal Docker) → sudo can write → guard doesn't fire → groupmod/usermod runs normally. **3 LOC `docker_init.bash` change.** P0 regression fix.
+
+- **OAuth Cancel during Codex device-token exchange now wins the race** (#1653 by @nesquena, follow-up to #1652 / refs #1362) — race in v0.50.296's Codex OAuth onboarding flow where a `POST /api/onboarding/oauth/cancel` arriving while the worker was mid-network-call would be silently overridden: credentials would still get persisted to `auth.json` and the flow status would flip from `cancelled` → `success`. Net effect: the user's explicit cancel was ignored, credentials persisted, UI reported success. **Fix:** re-check `_OAUTH_FLOWS[flow_id].status` under `_OAUTH_FLOWS_LOCK` immediately AFTER `_exchange_codex_authorization()` returns and BEFORE writing `auth.json`. If status is no longer `pending`, return cleanly — no persistence, no status overwrite. Behavioral test using `threading.Event` deterministically reproduces the race. UX-inconsistency severity, not a security bug (the credentials that get persisted ARE tokens the user authorized in their browser), but the cancel button stops doing what it says, violating the design intent of #1650's server-owned lifecycle.
+
+- **Persistent-host health diagnostics + watchdog hardening** (#1657 by @Michaelyklam, refs #1458) — addresses the residual #1458 Bug #3 failure mode (process alive + port listening but HTTP requests not advancing), the wedge that survives after v0.50.275's FD-leak fix and v0.50.269's bootstrap fix. Adds three signals process supervisors can use to distinguish "process exists" from "request handling is still advancing":
+  - **Accept-loop heartbeat**: `QuietHTTPServer.accept_loop_requests_total` + `accept_loop_last_request_at` instance attributes, incremented in `_handle_request_noblock()` (single `serve_forever()` thread, un-locked `+=` is safe). Surfaced in `/health` as `accept_loop: {requests_total, last_request_at}`.
+  - **`/health?deep=1` readiness probe**: bounded `STREAMS_LOCK.acquire(timeout=0.5)` + `all_sessions()` walk + `load_projects(_migrate=False)` + `sqlite3.connect(state.db) + PRAGMA schema_version`. Returns 503 with `status: degraded` when streams lock blocks or any deep check errors. Watchdogs polling `/health?deep=1` every 30s open-and-close 2880 short-lived sqlite connections per day per probe — bounded FD usage, no leak surface.
+  - **`RLIMIT_NOFILE` raise to 4096** at startup (best-effort, defense in depth for macOS launchd jobs that start at 256). Doesn't hide future FD leaks; gives diagnostic headroom before request handling falls over.
+  - **`docs/supervisor.md` updates**: launchd/systemd HTTP watchdog recipe using `curl -fsS --max-time 10 /health?deep=1` + `launchctl kickstart -k`. Notes `accept_loop.requests_total` should advance — if it stays flat while the process is alive, the accept loop is wedged.
+
+  Per Opus advisor on stage-297: refactored `_deep_health_checks(stream_check=...)` to accept the pre-computed stream check from `_handle_health()` so we don't acquire `STREAMS_LOCK` twice on the same `/health?deep=1` request (cosmetic inefficiency, not a correctness bug — but also could false-fail when the second acquire times out under contention). Plus a docstring note on `_handle_request_noblock` documenting why the un-locked `+=` is safe (single-thread-only call site in CPython socketserver).
+
+  PR #1656 by the same author (smaller, module-level globals approach) was closed as superseded by #1657 (instance-level + state.db check + projects check + supervisor.md docs).
+
+### Tests
+
+4284 → **4288 passing** (+4 regression tests across `tests/test_issue1458_stability_hardening.py` (3) + `tests/test_issue1362_codex_oauth_onboarding.py::test_cancel_during_token_exchange_does_not_persist_credentials` (1)). 0 regressions. Full suite ~118s.
+
+### Pre-release verification
+
+- **Opus advisor on stage-297 combined diff: SHIP verdict.** All 9 verification questions cleared:
+  - `_active_state_db_path()` verified at `api/models.py:924`, returns Path without opening connection
+  - 500ms `STREAMS_LOCK.acquire(timeout=...)` ceiling reasonable for watchdog timeouts (10s curl `--max-time` typical)
+  - `with closing(sqlite3.connect(...))` deterministically releases FD, `PRAGMA schema_version` is read-only
+  - `_handle_request_noblock` heartbeat increment is BEFORE super() — counter advances even if request handling raises, correct accept-loop semantics
+  - `_raise_fd_soft_limit()` correctly clamps to hard limit, only RAISES soft limit (won't lower below launchd's `LimitNOFILE` setting)
+  - OAuth fix narrows race window from "seconds-long network call" to "microseconds-long file write" — minimal correct change at the right layer
+  - Docker fix `sudo sh -c 'test -w'` correctly handles all 3 cases (writable+sudo / readonly+sudo / no-sudo)
+- **Two minor Opus follow-ups absorbed in-release**:
+  - `_deep_health_checks(stream_check=...)` reuses pre-computed stream check from `_handle_health()` — saves redundant lock acquisition
+  - Docstring note on `_handle_request_noblock` documenting single-thread safety of un-locked `+=`
+- **Self-built #1653** has thorough `threading.Event`-gated behavioral test demonstrating the race exists pre-fix and is fixed post-fix.
+- **Browser API sanity**: 11/11 endpoints OK on stage server.
+- **Conflict resolution**: zero file overlap across all 3 PRs (#1659 → docker_init.bash; #1653 → api/oauth.py; #1657 → api/routes.py + server.py + docs/supervisor.md). Auto-merged clean.
+
+### Authors
+
+- @bergeouss — 1 PR (#1659, AI-assisted via Hermes Agent) — fixing their own v0.50.295 #1635 regression
+- @nesquena (self-built) — 1 PR (#1653, follow-up to v0.50.296 #1652)
+- @Michaelyklam — 1 PR (#1657, hardening for #1458 Bug #3)
+
+### Note on closed-as-superseded
+
+PR #1656 (also @Michaelyklam) was closed as superseded by #1657. Both target #1458 Bug #3, both add accept-loop heartbeat + `/health?deep=1` + 503-on-degraded. #1657 adds beyond #1656: state.db connectivity check, projects state check, FD soft-limit raise, and `docs/supervisor.md` watchdog recipe. Same author iterated; the second PR was the keeper.
+
 ## [v0.50.296] — 2026-05-04
 
 ### Fixed (3 PRs — closes #1406, #1617; refs #1362)
diff --git a/ROADMAP.md b/ROADMAP.md
index 54a10f45..2db1b4a9 100644
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -2,7 +2,7 @@
 
 > Web companion to the Hermes Agent CLI. Same workflows, browser-native.
 >
-> Last updated: v0.50.296 (May 04, 2026) — 4284 tests collected
+> Last updated: v0.50.297 (May 04, 2026) — 4288 tests collected
 > Test source: `pytest tests/ --collect-only -q`
 > Per-version detail: see [CHANGELOG.md](./CHANGELOG.md)
 
diff --git a/TESTING.md b/TESTING.md
index eeeb72d1..749af862 100644
--- a/TESTING.md
+++ b/TESTING.md
@@ -1835,8 +1835,8 @@ Bridged CLI sessions:
 
 ---
 
-*Last updated: v0.50.296, May 04, 2026*
-*Total automated tests collected: 4284*
+*Last updated: v0.50.297, May 04, 2026*
+*Total automated tests collected: 4288*
 *Regression gate: tests/test_regressions.py*
 *Run: pytest tests/ -v --timeout=60*
 *Source: <repo>/*
diff --git a/api/routes.py b/api/routes.py
index 42a90bbd..92109eec 100644
--- a/api/routes.py
+++ b/api/routes.py
@@ -1689,17 +1689,21 @@ def _streams_lock_health(timeout_seconds: float = 0.5) -> dict:
         STREAMS_LOCK.release()
 
 
-def _deep_health_checks() -> tuple[dict, bool]:
+def _deep_health_checks(stream_check: dict | None = None) -> tuple[dict, bool]:
     """Run cheap probes that exercise the state paths used by the UI shell.
 
     Plain /health intentionally stays tiny. /health?deep=1 is for supervisors
     and watchdogs that need to know whether the process can still touch the
     shared stream map, sidebar/session path, project state, and Hermes state.db
     without hitting the RST-before-write failure mode from #1458.
+
+    `stream_check` is the result from a prior `_streams_lock_health()` call;
+    if provided, it's reused so we don't acquire `STREAMS_LOCK` twice on the
+    same /health?deep=1 request (per Opus advisor on stage-297).
     """
     checks: dict[str, dict] = {}
 
-    checks["streams_lock"] = _streams_lock_health()
+    checks["streams_lock"] = stream_check if stream_check is not None else _streams_lock_health()
     if checks["streams_lock"].get("status") != "ok":
         return checks, False
 
@@ -1776,7 +1780,7 @@ def _handle_health(handler, parsed):
         if stream_check.get("status") != "ok":
             payload["checks"] = {"streams_lock": stream_check}
             return j(handler, payload, status=503)
-        checks, healthy = _deep_health_checks()
+        checks, healthy = _deep_health_checks(stream_check=stream_check)
         payload["checks"] = checks
         if not healthy:
             payload["status"] = "degraded"
diff --git a/server.py b/server.py
index 5b9a7062..52a6dd28 100644
--- a/server.py
+++ b/server.py
@@ -43,6 +43,12 @@ class QuietHTTPServer(ThreadingHTTPServer):
         A process can be alive and still stop accepting/dispatching requests.
         Exposing this heartbeat on /health gives supervisors and watchdogs a
         cheap signal that the accept loop is still moving.
+
+        Note: this method is called only from the single ``serve_forever()``
+        thread in CPython socketserver, so the un-locked ``+=`` increment is
+        safe — there is no other thread mutating these counters. The /health
+        readers may see a stale value momentarily but never an inconsistent
+        one (Python int reads are atomic). Per Opus advisor on stage-297.
         """
         self.accept_loop_requests_total += 1
         self.accept_loop_last_request_at = time.time()