def _runtime_status_is_stale_stopped(
    runtime_status: dict[str, Any] | None,
    *,
    now: datetime | None = None,
    threshold_s: float = GATEWAY_FRESHNESS_THRESHOLD_S,
) -> bool:
    """Tell whether ``runtime_status`` is an old, cleanly stopped gateway record.

    Background (#1944): a root ``gateway_state.json`` left behind by a gateway
    that was stopped on purpose can linger on disk while the user only runs
    profile-scoped gateways. Callers use a ``True`` result to treat such a
    record as "no root gateway configured" rather than as an outage; a
    recently written stopped record yields ``False`` and is still reported as
    down.

    Args:
        runtime_status: Parsed runtime status mapping, or ``None``.
        now: Reference instant used to compute the record's age. Defaults to
            the current UTC time.
        threshold_s: Age in seconds that the record must exceed to be
            considered stale.

    Returns:
        ``True`` only when ``gateway_state`` equals ``"stopped"`` and
        ``updated_at`` is a timezone-aware ISO-8601 string older than
        ``threshold_s`` seconds. Any other input (not a dict, a different
        state, a missing, empty, non-string, unparsable or naive timestamp)
        yields ``False``.
    """
    if (
        not isinstance(runtime_status, dict)
        or runtime_status.get("gateway_state") != "stopped"
    ):
        return False

    stamp = runtime_status.get("updated_at")
    if not (isinstance(stamp, str) and stamp):
        return False

    try:
        stopped_at = datetime.fromisoformat(stamp)
    except (TypeError, ValueError):
        return False

    # A naive timestamp cannot be compared against an aware reference time,
    # so it is never classified as stale.
    if stopped_at.tzinfo is None:
        return False

    if now is None:
        moment = datetime.now(timezone.utc)
    else:
        moment = now
    elapsed = moment - stopped_at
    return elapsed.total_seconds() > threshold_s
def test_stale_stopped_runtime_status_reports_unknown_not_down(monkeypatch):
    """#1944: a week-old "stopped" root state yields unknown, not down.

    Profile-scoped gateways can run without any root gateway. A leftover root
    gateway_state.json that says "stopped" must not keep the heartbeat banner
    firing forever for a service nobody configured.
    """
    from api import agent_health

    week_ago = datetime.now(timezone.utc) - timedelta(days=7)
    stopped_status = _runtime_status(
        _iso(week_ago), gateway_state="stopped", active_agents=0
    )

    def _stub_gateway_module():
        # Build a new fake per call, mirroring what a lazy loader would return.
        return _FakeGatewayStatus(stopped_status, running_pid=None)

    monkeypatch.setattr(agent_health, "_gateway_status_module", _stub_gateway_module)

    result = agent_health.build_agent_health_payload()
    details = result["details"]

    assert result["alive"] is None
    assert details["state"] == "unknown"
    assert details["reason"] == "gateway_stale_stopped_state"
    assert details["gateway_state"] == "stopped"


def test_fresh_stopped_runtime_status_still_reports_down(monkeypatch):
    """A "stopped" state written seconds ago still counts as a down gateway."""
    from api import agent_health

    moments_ago = datetime.now(timezone.utc) - timedelta(seconds=10)
    stopped_status = _runtime_status(
        _iso(moments_ago), gateway_state="stopped", active_agents=0
    )

    def _stub_gateway_module():
        # Build a new fake per call, mirroring what a lazy loader would return.
        return _FakeGatewayStatus(stopped_status, running_pid=None)

    monkeypatch.setattr(agent_health, "_gateway_status_module", _stub_gateway_module)

    result = agent_health.build_agent_health_payload()
    details = result["details"]

    assert result["alive"] is False
    assert details["state"] == "down"
    assert details["reason"] == "gateway_not_running"