Stage 326: PR #1950 — Mute stale stopped gateway heartbeat by @franksong2702

This commit is contained in:
nesquena-hermes
2026-05-09 18:16:16 +00:00
3 changed files with 104 additions and 0 deletions
+10
View File
@@ -1,5 +1,15 @@
# Hermes Web UI -- Changelog
## Unreleased
- **Gateway heartbeat stale stopped state** — treat an old root
`gateway_state.json` with `gateway_state: "stopped"` as an unknown /
unconfigured root gateway instead of a live outage, so users running only
profile-scoped gateways do not get a persistent heartbeat-down alert from a
fossilized clean-stop file. Fresh stopped states still report down. Closes
#1944. (`api/agent_health.py`,
`tests/test_issue1879_cross_container_gateway_liveness.py`)
## [v0.51.30] — 2026-05-08 — 3-PR contributor batch (Release G: offline recovery + PWA hardening + opt-in session jump buttons + opt-in endless-scroll)
### Added (3 PRs, all from @ai-ag2026)
+46
View File
@@ -91,6 +91,41 @@ def _runtime_status_is_fresh(
return age_s <= threshold_s
def _runtime_status_is_stale_stopped(
    runtime_status: dict[str, Any] | None,
    *,
    now: datetime | None = None,
    threshold_s: float = GATEWAY_FRESHNESS_THRESHOLD_S,
) -> bool:
    """Tell whether ``runtime_status`` is an old, cleanly stopped gateway state.

    Background (#1944): a root ``gateway_state.json`` left behind by a gateway
    that was intentionally stopped long ago can stay on disk while the user
    only runs profile-scoped gateways. Such a record should be handled like
    "no root gateway configured" so the heartbeat banner does not keep
    warning about a service nobody is running. A recently written stopped
    state is *not* stale and must still be reported as down by the caller.

    Args:
        runtime_status: Parsed runtime status mapping, or ``None``.
        now: Reference instant for the age computation; defaults to the
            current UTC time when omitted.
        threshold_s: Age in seconds that the record must strictly exceed to
            count as stale.

    Returns:
        ``True`` only when ``gateway_state`` equals ``"stopped"`` and
        ``updated_at`` is a non-empty, parseable, timezone-aware ISO-8601
        string strictly older than ``threshold_s``. Every other input
        (non-dict, other state, missing/malformed/naive timestamp, timestamp
        in the future) yields ``False``.
    """
    if (
        not isinstance(runtime_status, dict)
        or runtime_status.get("gateway_state") != "stopped"
    ):
        return False

    stamp = runtime_status.get("updated_at")
    if not (isinstance(stamp, str) and stamp):
        return False

    try:
        stopped_at = datetime.fromisoformat(stamp)
    except (TypeError, ValueError):
        # Unparseable timestamp: never classify the record as stale.
        return False

    # A naive timestamp cannot be compared against an aware reference time.
    if stopped_at.tzinfo is None:
        return False

    current = datetime.now(timezone.utc) if now is None else now
    elapsed_s = (current - stopped_at).total_seconds()
    return elapsed_s > threshold_s
def _gateway_status_module():
    """Import and return the ``gateway.status`` module on demand.

    The import happens at call time rather than at module import time so
    tests and WebUI-only installs stay isolated from the gateway package.
    """
    status_module = importlib.import_module("gateway.status")
    return status_module
@@ -263,6 +298,17 @@ def build_agent_health_payload() -> dict[str, Any]:
},
}
if _runtime_status_is_stale_stopped(runtime_status):
return {
"alive": None,
"checked_at": checked_at,
"details": {
"state": "unknown",
"reason": "gateway_stale_stopped_state",
**safe_details,
},
}
if isinstance(runtime_status, dict):
return {
"alive": False,
@@ -17,6 +17,8 @@ These tests pin every behavior the fix promises:
* fresh + running gateway_state, no PID → alive (cross-container path)
* stale updated_at + running → down (no false positives)
* fresh updated_at + non-running state → down (crash-without-cleanup case)
* stale updated_at + stopped state → unknown (old root gateway was
intentionally stopped; do not nag profile-gateway users)
* malformed / missing / naive timestamp → down (no parser-quirk false alive)
* future timestamp within threshold → alive (clock skew tolerance)
* future timestamp beyond threshold → down (broken clock rejected)
@@ -152,6 +154,52 @@ def test_fresh_updated_at_with_non_running_state_reports_down(monkeypatch):
assert payload["details"]["state"] == "down"
def test_stale_stopped_runtime_status_reports_unknown_not_down(monkeypatch):
    """#1944: an old clean-stop root state must surface as unknown, not down.

    Profile-scoped gateways can run without any root gateway. If a leftover
    root gateway_state.json saying "stopped" were reported as down, the
    heartbeat banner would fire forever even though no root gateway is
    configured.
    """
    from api import agent_health

    week_ago = datetime.now(timezone.utc) - timedelta(days=7)
    stopped_runtime = _runtime_status(
        _iso(week_ago),
        gateway_state="stopped",
        active_agents=0,
    )

    def _fake_status_module():
        # Build a new fake on every call, reporting no running PID.
        return _FakeGatewayStatus(stopped_runtime, running_pid=None)

    monkeypatch.setattr(
        agent_health, "_gateway_status_module", _fake_status_module
    )

    payload = agent_health.build_agent_health_payload()

    assert payload["alive"] is None
    details = payload["details"]
    assert details["state"] == "unknown"
    assert details["reason"] == "gateway_stale_stopped_state"
    assert details["gateway_state"] == "stopped"
def test_fresh_stopped_runtime_status_still_reports_down(monkeypatch):
    """A stopped state written moments ago still counts as a down gateway."""
    from api import agent_health

    ten_seconds_ago = datetime.now(timezone.utc) - timedelta(seconds=10)
    stopped_runtime = _runtime_status(
        _iso(ten_seconds_ago),
        gateway_state="stopped",
        active_agents=0,
    )

    def _fake_status_module():
        # Build a new fake on every call, reporting no running PID.
        return _FakeGatewayStatus(stopped_runtime, running_pid=None)

    monkeypatch.setattr(
        agent_health, "_gateway_status_module", _fake_status_module
    )

    payload = agent_health.build_agent_health_payload()

    assert payload["alive"] is False
    details = payload["details"]
    assert details["state"] == "down"
    assert details["reason"] == "gateway_not_running"
@pytest.mark.parametrize(
"broken_value",
[