diff --git a/api/agent_health.py b/api/agent_health.py
new file mode 100644
index 00000000..cc3d354a
--- /dev/null
+++ b/api/agent_health.py
@@ -0,0 +1,147 @@
+"""Hermes agent/gateway heartbeat payload helpers (#716).
+
+The WebUI process is not always paired with a long-running Hermes gateway. Some
+setups use WebUI only, while self-hosted messaging deployments run a separate
+Hermes gateway daemon that records runtime metadata in the Hermes Agent home.
+This module turns those existing safe runtime signals into a small UI-facing
+heartbeat without shelling out or adding psutil as a hard dependency.
+"""
+
+from __future__ import annotations
+
+import importlib
+from datetime import datetime, timezone
+from typing import Any
+
+
+def _checked_at() -> str:
+    """Return the current UTC time as an ISO-8601 string."""
+    return datetime.now(timezone.utc).isoformat()
+
+
+def _gateway_status_module():
+    """Load gateway.status lazily so tests and WebUI-only installs stay isolated."""
+    return importlib.import_module("gateway.status")
+
+
+def _runtime_detail_subset(runtime_status: dict[str, Any] | None) -> dict[str, Any]:
+    """Return only non-sensitive runtime fields for the browser.
+
+    gateway.status records argv/PID metadata so the CLI can validate process
+    identity. The WebUI alert only needs health semantics, never raw command
+    lines, paths, environment, or tokens.
+
+    Fields are copied through an explicit allow-list. A missing or malformed
+    value is omitted instead of raised, so bad runtime metadata cannot break
+    the health endpoint.
+    """
+    if not isinstance(runtime_status, dict):
+        return {}
+
+    details: dict[str, Any] = {}
+    gateway_state = runtime_status.get("gateway_state")
+    if isinstance(gateway_state, str) and gateway_state:
+        details["gateway_state"] = gateway_state
+
+    updated_at = runtime_status.get("updated_at")
+    if isinstance(updated_at, str) and updated_at:
+        details["updated_at"] = updated_at
+
+    try:
+        active_agents = int(runtime_status.get("active_agents") or 0)
+    except (TypeError, ValueError, OverflowError):
+        # OverflowError covers int(float("inf")), which json.loads produces for
+        # a bare `Infinity` token. Drop the field rather than fail the request.
+        pass
+    else:
+        details["active_agents"] = max(0, active_agents)
+
+    platforms = runtime_status.get("platforms")
+    if isinstance(platforms, dict):
+        details["platform_count"] = len(platforms)
+        states: dict[str, int] = {}
+        for payload in platforms.values():
+            if not isinstance(payload, dict):
+                continue
+            state = payload.get("state")
+            if isinstance(state, str) and state:
+                states[state] = states.get(state, 0) + 1
+        if states:
+            details["platform_states"] = states
+
+    return details
+
+
+def build_agent_health_payload() -> dict[str, Any]:
+    """Return `{alive, checked_at, details}` for the Hermes gateway/agent.
+
+    `alive` is intentionally tri-state:
+    * True: a gateway runtime signal says the process is alive.
+    * False: gateway metadata exists, but no live gateway process owns it.
+    * None: no gateway metadata/status is available, so this WebUI setup is
+      probably not configured with a separate gateway process.
+
+    Failures while importing or querying gateway.status are swallowed and
+    reported through the payload instead of being raised to the route handler.
+    """
+    checked_at = _checked_at()
+    try:
+        gateway_status = _gateway_status_module()
+    except Exception as exc:
+        return {
+            "alive": None,
+            "checked_at": checked_at,
+            "details": {
+                "state": "unknown",
+                "reason": "gateway_status_unavailable",
+                "error": type(exc).__name__,
+            },
+        }
+
+    runtime_status = None
+    try:
+        runtime_status = gateway_status.read_runtime_status()
+    except Exception:
+        runtime_status = None
+
+    try:
+        try:
+            running_pid = gateway_status.get_running_pid(cleanup_stale=False)
+        except TypeError:
+            # Older agent versions may not expose cleanup_stale. Keep compatibility.
+            running_pid = gateway_status.get_running_pid()
+    except Exception:
+        # The outer guard also covers the legacy call above: an exception raised
+        # inside an `except` block is not caught by sibling clauses of that `try`.
+        running_pid = None
+
+    safe_details = _runtime_detail_subset(runtime_status)
+    if running_pid is not None:
+        return {
+            "alive": True,
+            "checked_at": checked_at,
+            "details": {
+                "state": "alive",
+                **safe_details,
+            },
+        }
+
+    if isinstance(runtime_status, dict):
+        return {
+            "alive": False,
+            "checked_at": checked_at,
+            "details": {
+                "state": "down",
+                "reason": "gateway_not_running",
+                **safe_details,
+            },
+        }
+
+    return {
+        "alive": None,
+        "checked_at": checked_at,
+        "details": {
+            "state": "unknown",
+            "reason": "gateway_not_configured",
+        },
+    }
diff --git a/api/routes.py b/api/routes.py
index 85bb6627..ae146eab 100644
--- a/api/routes.py
+++ b/api/routes.py
@@ -479,6 +479,7 @@ from api.helpers import (
     redact_session_data,
     _redact_text,
 )
+from api.agent_health import build_agent_health_payload
 
 
 def _clear_stale_stream_state(session) -> bool:
@@ -2487,6 +2488,9 @@ def handle_get(handler, parsed) -> bool:
     if parsed.path == "/health":
         return _handle_health(handler, parsed)
 
+    if parsed.path == "/api/health/agent":
+        return j(handler, build_agent_health_payload())
+
     if parsed.path == "/api/models":
         return j(handler, get_available_models())
 
diff --git a/docs/pr-media/716/agent-health-alert.png b/docs/pr-media/716/agent-health-alert.png
new file mode 100644
index 00000000..86b9be89
Binary files /dev/null and b/docs/pr-media/716/agent-health-alert.png differ
diff --git a/static/index.html b/static/index.html
index bf5ae23a..0903badd 100644
--- a/static/index.html
+++ b/static/index.html
@@ -342,6 +342,13 @@
+
diff --git a/static/style.css b/static/style.css
index 3022b147..12a4222d 100644
--- a/static/style.css
+++ b/static/style.css
@@ -519,6 +519,13 @@
   .reconnect-banner.visible{display:flex;}
   .reconnect-btn{padding:6px 12px;border-radius:8px;font-size:12px;font-weight:600;background:var(--accent-bg-strong);border:1px solid var(--accent-bg-strong);color:var(--accent-text);cursor:pointer;}
   .reconnect-btn:hover{background:var(--accent-bg-strong);}
+  .agent-health-banner{position:sticky;bottom:0;z-index:4;display:none;align-items:center;justify-content:space-between;gap:12px;margin:10px auto 0;max-width:var(--msg-max);width:calc(100% - 40px);padding:12px 16px;border:1px solid color-mix(in srgb,var(--error) 55%,var(--surface));border-radius:12px;background:color-mix(in srgb,var(--error) 14%,var(--surface));color:var(--text);box-shadow:0 10px 32px rgba(0,0,0,.16);}
+  .agent-health-banner.visible{display:flex;}
+  .agent-health-copy{display:flex;flex-direction:column;gap:3px;min-width:0;font-size:13px;line-height:1.35;}
+  .agent-health-copy strong{color:var(--error);font-size:13px;}
+  .agent-health-copy span{color:var(--muted);}
+  .agent-health-dismiss{flex-shrink:0;padding:6px 12px;border-radius:8px;border:1px solid color-mix(in srgb,var(--error) 45%,var(--surface));background:color-mix(in srgb,var(--error) 10%,var(--surface));color:var(--error);font-size:12px;font-weight:600;cursor:pointer;}
+  .agent-health-dismiss:hover{background:color-mix(in srgb,var(--error) 18%,var(--surface));}
   /* ── Update banner ── */
   .update-banner{display:none;background:var(--surface);border:1px solid var(--accent);border-radius:10px;padding:10px 16px;margin:10px auto;max-width:780px;font-size:13px;color:var(--accent-text);align-items:center;justify-content:space-between;gap:12px;}
   .update-banner.visible{display:flex;}
diff --git a/static/ui.js b/static/ui.js
index 3d7a7fcd..d6e5ace0 100644
--- a/static/ui.js
+++ b/static/ui.js
@@ -3021,6 +3021,85 @@ function dismissReconnect() {
   $('reconnectBanner').classList.remove('visible');
   clearInflight();
 }
+
+// ── Hermes agent/gateway heartbeat alert (#716) ──
+const AGENT_HEALTH_INTERVAL_MS=30000;
+const AGENT_HEALTH_DISMISSED_KEY='agent-health-dismissed';
+let _agentHealthTimer=null;
+let _agentHealthLastState='unknown';
+// Dismissal is persisted in localStorage; storage errors read as "not dismissed".
+function _agentHealthDismissed(){
+  try{return localStorage.getItem(AGENT_HEALTH_DISMISSED_KEY)==='1';}
+  catch(_){return false;}
+}
+function _setAgentHealthDismissed(value){
+  try{
+    if(value)localStorage.setItem(AGENT_HEALTH_DISMISSED_KEY,'1');
+    else localStorage.removeItem(AGENT_HEALTH_DISMISSED_KEY);
+  }catch(_){ }
+}
+function _hideAgentHealthAlert(){
+  const banner=$('agentHealthBanner');
+  if(banner){banner.classList.remove('visible');banner.hidden=true;}
+}
+function _showAgentHealthAlert(payload){
+  if(_agentHealthDismissed()) return;
+  const banner=$('agentHealthBanner');
+  const title=$('agentHealthTitle');
+  const details=$('agentHealthDetails');
+  if(!banner) return;
+  if(title) title.textContent='Hermes agent is not responding';
+  const state=payload&&payload.details&&payload.details.gateway_state?` State: ${payload.details.gateway_state}.`:'';
+  if(details) details.textContent=`Gateway heartbeat failed.${state} Messages may not be delivered until it comes back.`;
+  banner.hidden=false;
+  banner.classList.add('visible');
+}
+function dismissAgentHealthAlert(){
+  _setAgentHealthDismissed(true);
+  _hideAgentHealthAlert();
+}
+// alive===true clears the dismissal and hides the banner, alive===false shows it, anything else (or a failed request) hides it.
+async function pollAgentHealth(){
+  if(document.visibilityState !== 'visible') return;
+  try{
+    const payload=await api('/api/health/agent');
+    if(payload.alive === true){
+      _agentHealthLastState='alive';
+      _setAgentHealthDismissed(false);
+      _hideAgentHealthAlert();
+      return;
+    }
+    if(payload.alive === false){
+      _agentHealthLastState='down';
+      _showAgentHealthAlert(payload);
+      return;
+    }
+    if(payload.alive == null){
+      _agentHealthLastState='unknown';
+      _hideAgentHealthAlert();
+    }
+  }catch(_){
+    _agentHealthLastState='unknown';
+    _hideAgentHealthAlert();
+  }
+}
+// Poll once immediately, then every AGENT_HEALTH_INTERVAL_MS; no-op while the tab is hidden or a timer already exists.
+function startAgentHealthMonitor(){
+  if(document.visibilityState !== 'visible') return;
+  if(_agentHealthTimer) return;
+  void pollAgentHealth();
+  _agentHealthTimer=setInterval(pollAgentHealth, AGENT_HEALTH_INTERVAL_MS);
+}
+function stopAgentHealthMonitor(){
+  if(_agentHealthTimer){clearInterval(_agentHealthTimer);_agentHealthTimer=null;}
+}
+function _syncAgentHealthMonitorVisibility(){
+  if(document.visibilityState === 'visible') startAgentHealthMonitor();
+  else stopAgentHealthMonitor();
+}
+document.addEventListener('visibilitychange',_syncAgentHealthMonitorVisibility);
+if(document.readyState==='loading') document.addEventListener('DOMContentLoaded',startAgentHealthMonitor);
+else startAgentHealthMonitor();
 async function refreshSession() {
   // When the banner is in post-update restart mode, the "Reload" button
   // should do a full page reload — a session refresh would just 502 while
diff --git a/tests/test_issue716_agent_heartbeat.py b/tests/test_issue716_agent_heartbeat.py
new file mode 100644
index 00000000..2a2033cb
--- /dev/null
+++ b/tests/test_issue716_agent_heartbeat.py
@@ -0,0 +1,199 @@
+"""Regression coverage for #716 Hermes agent/gateway heartbeat monitor."""
+
+from __future__ import annotations
+
+import pathlib
+
+
+REPO_ROOT = pathlib.Path(__file__).parent.parent
+UI_JS = (REPO_ROOT / "static" / "ui.js").read_text(encoding="utf-8")
+INDEX_HTML = (REPO_ROOT / "static" / "index.html").read_text(encoding="utf-8")
+STYLE_CSS = (REPO_ROOT / "static" / "style.css").read_text(encoding="utf-8")
+ROUTES_PY = (REPO_ROOT / "api" / "routes.py").read_text(encoding="utf-8")
+
+
+class _FakeGatewayStatus:
+    """Stand-in for gateway.status that returns canned values."""
+
+    def __init__(self, runtime_status, running_pid):
+        self._runtime_status = runtime_status
+        self._running_pid = running_pid
+
+    def read_runtime_status(self):
+        return self._runtime_status
+
+    def get_running_pid(self, cleanup_stale=False):
+        assert cleanup_stale is False
+        return self._running_pid
+
+
+def _runtime_status(**overrides):
+    """Build a runtime-status dict with safe and sensitive fields, then apply overrides."""
+    payload = {
+        "gateway_state": "running",
+        "updated_at": "2026-05-04T12:00:00+00:00",
+        "active_agents": 2,
+        "platforms": {
+            "discord": {"state": "connected"},
+            "telegram": {"state": "starting"},
+        },
+        # Sensitive/raw process fields that must never reach the browser.
+        "pid": 12345,
+        "argv": ["hermes", "gateway", "--token", "secret-token"],
+        "command": "hermes gateway --token secret-token",
+        "executable": "/home/user/.hermes/hermes-agent/venv/bin/python",
+        "env": {"API_KEY": "secret"},
+    }
+    payload.update(overrides)
+    return payload
+
+
+def test_agent_health_payload_alive_uses_safe_runtime_details(monkeypatch):
+    from api import agent_health
+
+    monkeypatch.setattr(
+        agent_health,
+        "_gateway_status_module",
+        lambda: _FakeGatewayStatus(_runtime_status(), running_pid=12345),
+    )
+
+    payload = agent_health.build_agent_health_payload()
+
+    assert payload["alive"] is True
+    assert payload["checked_at"]
+    assert payload["details"] == {
+        "state": "alive",
+        "gateway_state": "running",
+        "updated_at": "2026-05-04T12:00:00+00:00",
+        "active_agents": 2,
+        "platform_count": 2,
+        "platform_states": {"connected": 1, "starting": 1},
+    }
+    rendered = repr(payload)
+    assert "secret-token" not in rendered
+    assert "API_KEY" not in rendered
+    assert "argv" not in rendered
+    assert "command" not in rendered
+    assert "executable" not in rendered
+    assert "pid" not in payload["details"]
+
+
+def test_agent_health_payload_down_when_gateway_metadata_exists_but_no_process(monkeypatch):
+    from api import agent_health
+
+    monkeypatch.setattr(
+        agent_health,
+        "_gateway_status_module",
+        lambda: _FakeGatewayStatus(_runtime_status(gateway_state="stale"), running_pid=None),
+    )
+
+    payload = agent_health.build_agent_health_payload()
+
+    assert payload["alive"] is False
+    assert payload["details"]["state"] == "down"
+    assert payload["details"]["reason"] == "gateway_not_running"
+    assert payload["details"]["gateway_state"] == "stale"
+
+
+def test_agent_health_payload_unknown_when_gateway_is_not_configured(monkeypatch):
+    from api import agent_health
+
+    monkeypatch.setattr(
+        agent_health,
+        "_gateway_status_module",
+        lambda: _FakeGatewayStatus(runtime_status=None, running_pid=None),
+    )
+
+    payload = agent_health.build_agent_health_payload()
+
+    assert payload["alive"] is None
+    assert payload["details"] == {"state": "unknown", "reason": "gateway_not_configured"}
+
+
+class _LegacyGatewayStatus:
+    """Stand-in for an older gateway.status whose get_running_pid() takes no kwargs."""
+
+    def read_runtime_status(self):
+        return _runtime_status()
+
+    def get_running_pid(self):
+        return 4321
+
+
+def test_agent_health_payload_unknown_when_gateway_status_module_is_unavailable(monkeypatch):
+    from api import agent_health
+
+    def _missing_gateway_status():
+        raise ModuleNotFoundError("No module named 'gateway'")
+
+    monkeypatch.setattr(agent_health, "_gateway_status_module", _missing_gateway_status)
+
+    payload = agent_health.build_agent_health_payload()
+
+    assert payload["alive"] is None
+    assert payload["details"] == {
+        "state": "unknown",
+        "reason": "gateway_status_unavailable",
+        "error": "ModuleNotFoundError",
+    }
+
+
+def test_agent_health_payload_alive_with_legacy_get_running_pid_signature(monkeypatch):
+    from api import agent_health
+
+    monkeypatch.setattr(agent_health, "_gateway_status_module", _LegacyGatewayStatus)
+
+    payload = agent_health.build_agent_health_payload()
+
+    assert payload["alive"] is True
+    assert payload["details"]["state"] == "alive"
+    assert "pid" not in payload["details"]
+
+
+def test_agent_health_route_is_registered_with_tri_state_payload_shape():
+    assert 'parsed.path == "/api/health/agent"' in ROUTES_PY
+    assert "build_agent_health_payload()" in ROUTES_PY
+    src = (REPO_ROOT / "api" / "agent_health.py").read_text(encoding="utf-8")
+    assert '"alive"' in src
+    assert '"checked_at"' in src
+    assert '"details"' in src
+
+
+def test_agent_health_banner_markup_and_styles_exist():
+    assert 'id="agentHealthBanner"' in INDEX_HTML
+    assert 'role="alert"' in INDEX_HTML
+    assert 'aria-live="assertive"' in INDEX_HTML
+    assert 'onclick="dismissAgentHealthAlert()"' in INDEX_HTML
+    assert ".agent-health-banner" in STYLE_CSS
+    assert ".agent-health-banner.visible" in STYLE_CSS
+    assert ".agent-health-dismiss" in STYLE_CSS
+
+
+def test_agent_health_frontend_polls_only_visible_and_distinguishes_states():
+    assert "const AGENT_HEALTH_INTERVAL_MS=30000" in UI_JS
+    assert "api('/api/health/agent')" in UI_JS
+    assert "document.visibilityState !== 'visible'" in UI_JS
+    assert "document.addEventListener('visibilitychange',_syncAgentHealthMonitorVisibility)" in UI_JS
+    assert "if(payload.alive === true)" in UI_JS
+    assert "if(payload.alive === false)" in UI_JS
+    assert "if(payload.alive == null)" in UI_JS
+    assert "_showAgentHealthAlert(payload)" in UI_JS
+    assert "_hideAgentHealthAlert()" in UI_JS
+
+
+def test_agent_health_dismiss_persists_until_recovery():
+    assert "const AGENT_HEALTH_DISMISSED_KEY='agent-health-dismissed'" in UI_JS
+    assert "localStorage.setItem(AGENT_HEALTH_DISMISSED_KEY,'1')" in UI_JS
+    assert "localStorage.removeItem(AGENT_HEALTH_DISMISSED_KEY)" in UI_JS
+    assert "function dismissAgentHealthAlert()" in UI_JS
+    assert "if(_agentHealthDismissed()) return;" in UI_JS
+    assert "_setAgentHealthDismissed(false)" in UI_JS
+
+
+def test_agent_health_backend_does_not_use_shell_or_expose_raw_process_fields():
+    src = (REPO_ROOT / "api" / "agent_health.py").read_text(encoding="utf-8")
+    assert "import subprocess" not in src
+    assert "import psutil" not in src
+    for private_field in ("argv", "command", "executable", "env"):
+        assert f'details["{private_field}"]' not in src
+        assert f"details['{private_field}']" not in src