mirror of
https://github.com/nesquena/hermes-webui.git
synced 2026-05-25 19:20:16 +00:00
5b41f03a92
Two functions on the /api/session/handoff-summary hot path were opening sqlite3.connect(...) inside a bare `with` statement, which commits the transaction at scope exit but does NOT close the connection. Per-turn invocations accumulated state.db / state.db-wal file descriptors and CPython heap pages on long-lived worker threads, surfacing as the multi-GB VmRSS and 6x duplicated state.db fds observed on the live instance (D0 pre-restart baseline: VmRSS 1,334,248 kB, 55 fds; cold baseline after restart: VmRSS 136,668 kB, 10 fds).

Wrap both call sites with contextlib.closing(...) (already imported and used at seven other sites in the same files) so the connection is closed deterministically:

- api/models.py :: count_conversation_rounds
- api/routes.py :: _persist_handoff_summary_to_state_db

Regression test: tests/test_issue2233_sqlite_connection_leak.py loops both functions 20 times against a tmp state.db and asserts the /proc/<pid>/fd count does not grow by more than 2. Linux-only via a sys.platform skip.

D1 live soak against a freshly built worktree server (port 8799, isolated HERMES_HOME / HERMES_WEBUI_STATE_DIR) hitting /api/session/handoff-summary 20 times:

    fd_before    = 5
    fd_after     = 5         (growth 0, threshold < 5)
    vmrss_before = 52636 kB
    vmrss_after  = 52636 kB  (growth 0 kB, threshold < 30 MB)

The patched curve trends below the leak curve.

Rollback: a single `git revert <this-sha>` reverts both file edits.

Refs #2233.
108 lines
3.4 KiB
Python
108 lines
3.4 KiB
Python
"""Regression test for Issue #2233: per-turn SQLite connection leak.
|
|
|
|
Two functions on the /api/session/handoff-summary hot path were opening
|
|
``sqlite3.connect(...)`` inside a bare ``with`` statement, which commits
|
|
the transaction at scope exit but does NOT close the connection. Looping
|
|
those calls per chat turn accumulated file descriptors (state.db and
|
|
state.db-wal) and CPython heap pages on long-lived worker threads.
|
|
|
|
The fix wraps both connect() calls with ``contextlib.closing(...)`` so
|
|
the connection is closed deterministically:
|
|
|
|
* api/models.py :: count_conversation_rounds
|
|
* api/routes.py :: _persist_handoff_summary_to_state_db
|
|
|
|
This test loops the two patched functions 20 times against a tmp state.db
and asserts the test process's open-fd count grows by at most 2.
|
|
|
|
Linux-only because the check reads ``/proc/<pid>/fd`` directly. Skipped
|
|
on macOS/Windows.
|
|
"""
|
|
import os
|
|
import sqlite3
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
# True only when running on Linux; the fd-count test below is skipped elsewhere
# because it reads /proc/<pid>/fd.
_IS_LINUX: bool = sys.platform.startswith("linux")
|
|
|
|
|
|
def _open_fd_count() -> int:
    """Return the number of entries in ``/proc/<pid>/fd`` for this process."""
    fd_dir = f"/proc/{os.getpid()}/fd"
    entries = os.listdir(fd_dir)
    return len(entries)
|
|
|
|
|
|
def _make_state_db(path: Path) -> None:
    """Build a state.db containing only the schema the two patched functions touch.

    Creates the ``messages`` and ``sessions`` tables at *path*, then seeds one
    session row and two messages (one ``user``, one ``agent``) for that session.
    The connection is always closed, even if a statement fails.
    """
    session_key = ("20260101_000000_abcdef",)

    # (SQL, parameters) pairs, executed in order: schema first, then seed rows.
    statements = [
        (
            "CREATE TABLE messages ("
            " session_id TEXT,"
            " role TEXT,"
            " content TEXT,"
            " timestamp REAL"
            ")",
            (),
        ),
        (
            "CREATE TABLE sessions ("
            " id TEXT PRIMARY KEY,"
            " message_count INTEGER"
            ")",
            (),
        ),
        (
            "INSERT INTO sessions (id, message_count) VALUES (?, 0)",
            session_key,
        ),
        (
            "INSERT INTO messages (session_id, role, content, timestamp) "
            "VALUES (?, 'user', 'hi', 1.0)",
            session_key,
        ),
        (
            "INSERT INTO messages (session_id, role, content, timestamp) "
            "VALUES (?, 'agent', 'hello', 2.0)",
            session_key,
        ),
    ]

    db = sqlite3.connect(str(path))
    try:
        for sql, params in statements:
            db.execute(sql, params)
        db.commit()
    finally:
        db.close()
|
|
|
|
|
|
@pytest.mark.skipif(not _IS_LINUX, reason="fd counting via /proc only available on Linux")
def test_handoff_summary_path_does_not_leak_fds(tmp_path, monkeypatch):
    """Call both patched functions repeatedly and check the fd count stays bounded."""
    home_dir = tmp_path / "hermes"
    home_dir.mkdir()
    _make_state_db(home_dir / "state.db")

    # HERMES_HOME is set before the api imports below; presumably the api
    # modules use it to locate state.db -- confirm against api/models.py.
    monkeypatch.setenv("HERMES_HOME", str(home_dir))

    from api.models import count_conversation_rounds
    from api.routes import _persist_handoff_summary_to_state_db

    session_id = "20260101_000000_abcdef"
    handoff_marker = {
        "role": "tool",
        "content": "{\"handoff_summary\": \"test\"}",
        "timestamp": 3.0,
    }

    def run_once() -> None:
        # One simulated turn: both functions named in the issue, in order.
        count_conversation_rounds(session_id)
        _persist_handoff_summary_to_state_db(session_id, handoff_marker)

    # Warm-up pass before the baseline is taken; presumably this keeps
    # one-time descriptors (lazy imports, caches) out of the measured growth.
    run_once()

    fd_before = _open_fd_count()
    for _ in range(20):
        run_once()
    fd_after = _open_fd_count()

    growth = fd_after - fd_before
    assert growth <= 2, (
        f"open fd count grew by {growth} (before={fd_before}, after={fd_after}); "
        "suggests sqlite3 connections from the handoff-summary path are not being closed"
    )
|