Files
hermes-webui/api/terminal.py
T
nesquena-hermes 71ba863ce5 fix(terminal): drop PR_SET_PDEATHSIG preexec_fn that killed every Linux shell
Fixes #2853. The `_terminal_shell_preexec_fn` added in `71d8a8fb` called
`prctl(PR_SET_PDEATHSIG, SIGTERM)` so orphaned PTY shells would die when
the WebUI process crashed. But that signal is **per-thread**, not
per-process, and WebUI runs `ThreadingHTTPServer`: every HTTP request is
handled in its own short-lived worker thread.

Flow that broke every Linux user:

1. User clicks the terminal toggle → frontend hits `POST /api/terminal/start`.
2. ThreadingHTTPServer spins up a worker thread to handle that one request.
3. The worker thread calls `subprocess.Popen(..., preexec_fn=...)`.
4. The shell calls `prctl(PR_SET_PDEATHSIG, SIGTERM)` in its preexec_fn.
   Its registered "parent" is now the WebUI worker thread that called Popen.
5. The handler returns its JSON response and the worker thread exits.
6. The kernel sees the pdeathsig-parent thread has died and sends SIGTERM
   to the PTY shell. The shell dies within ~10 ms of being created.
7. The reader loop sees EIO on the master FD, emits `terminal_closed`, and
   the frontend writes `[terminal closed]`.

macOS users were unaffected because `libc.prctl` doesn't exist there —
`ctypes.CDLL(None)` returns a libc handle, `libc.prctl` raises
`AttributeError`, the bare-`except` swallows it, and the shell starts
with no pdeathsig configured.

Empirical verification on this Linux host (real PTY + `subprocess.Popen`
inside a `threading.Thread` that joins immediately):

  with    preexec_fn → proc.poll() == -15 (SIGTERM), master FD returns EIO
  without preexec_fn → proc.poll() == None (alive), master FD returns "HELLO\\r\\n"

Same shell, same PTY, same threading topology as WebUI.

Fix
---

Drop the `preexec_fn` entirely. The orphan-shell-on-crash case the original
PR was navigating is rare for self-hosted single-user installs, and the
existing `atexit.register(close_all_terminals)` + explicit `close_terminal`
paths cover graceful shutdown. A future fix (option B in the issue) can
re-introduce pdeathsig pinned to a long-lived supervisor thread, but that
is a follow-up — this PR is the smallest unbricks-Linux-today change.

Tests
-----

- Invert `test_terminal_shell_uses_parent_death_signal_preexec` →
  `test_terminal_shell_does_not_use_pdeathsig_preexec`: asserts
  `preexec_fn` is NOT in the Popen kwargs.
- Add `test_pty_shell_survives_when_spawning_thread_exits`: spawns a
  real PTY shell via `start_terminal` from a worker thread, waits for
  the worker to join, asserts the shell is still alive after a half-second
  grace window. This is the contract the original tests never exercised.
- Update `test_terminal_module_registers_graceful_shutdown_reaper` to
  refuse re-introduction of the preexec_fn or the `libc.prctl(1, SIGTERM)`
  call (treats either as a regression).

All 27 terminal-related tests pass locally.

Refs #2853
2026-05-24 17:13:31 +00:00

278 lines
8.8 KiB
Python

"""Embedded workspace terminal support for Hermes Web UI.
The terminal is intentionally independent from the agent execution path. It
starts a shell with an explicit cwd/env per process and never mutates
process-global os.environ, which avoids expanding the session-env race tracked
in the agent execution layer.
"""
from __future__ import annotations
import errno
import atexit
import codecs
import fcntl
import os
import queue
import select
import shutil
import signal
import struct
import subprocess
import termios
import threading
from dataclasses import dataclass, field
from pathlib import Path
def _set_nonblocking(fd: int) -> None:
flags = fcntl.fcntl(fd, fcntl.F_GETFL)
fcntl.fcntl(fd, fcntl.F_SETFL, flags | os.O_NONBLOCK)
def _winsize(rows: int, cols: int) -> bytes:
rows = max(8, min(int(rows or 24), 80))
cols = max(20, min(int(cols or 80), 240))
return struct.pack("HHHH", rows, cols, 0, 0)
@dataclass
class TerminalSession:
session_id: str
workspace: str
proc: subprocess.Popen
master_fd: int
rows: int = 24
cols: int = 80
output: queue.Queue = field(default_factory=lambda: queue.Queue(maxsize=2000))
closed: threading.Event = field(default_factory=threading.Event)
reader: threading.Thread | None = None
def is_alive(self) -> bool:
return not self.closed.is_set() and self.proc.poll() is None
def put_output(self, event: str, payload: dict) -> None:
try:
self.output.put_nowait((event, payload))
except queue.Full:
# Keep the terminal responsive by dropping the oldest queued chunk.
try:
self.output.get_nowait()
except queue.Empty:
pass
try:
self.output.put_nowait((event, payload))
except queue.Full:
pass
_TERMINALS: dict[str, TerminalSession] = {}
_LOCK = threading.RLock()
# NOTE on parent-death-signal: a previous version of this module set
# PR_SET_PDEATHSIG via a preexec_fn to terminate orphaned PTY shells when the
# WebUI process crashed. That broke every Linux user (#2853): WebUI runs a
# ThreadingHTTPServer, so the Popen call happens on a short-lived per-request
# thread, and PR_SET_PDEATHSIG is per-thread. The PTY shell registered the
# spawning thread as its "parent" and was killed with SIGTERM the instant that
# thread joined — within ~10 ms of opening the terminal — surfacing as the
# `[terminal closed]` banner. The graceful path is covered by
# `atexit.register(close_all_terminals)` and the explicit `close_terminal`
# call sites; hard kills of the WebUI process leak the shell, which is the
# tradeoff for working on Linux at all.
def _decode_terminal_output(decoder, data: bytes) -> str:
"""Decode PTY bytes without stripping terminal control sequences."""
return decoder.decode(data)
def _shell_path() -> str:
shell = os.environ.get("SHELL") or ""
if shell and Path(shell).exists():
return shell
return shutil.which("zsh") or shutil.which("bash") or shutil.which("sh") or "/bin/sh"
def _shell_argv(shell: str) -> list[str]:
name = Path(shell).name
if name in {"zsh", "bash", "sh"}:
return [shell, "-i"]
return [shell]
def _reader_loop(term: TerminalSession) -> None:
decoder = codecs.getincrementaldecoder("utf-8")("replace")
try:
while not term.closed.is_set():
if term.proc.poll() is not None:
break
try:
ready, _, _ = select.select([term.master_fd], [], [], 0.25)
except (OSError, ValueError):
break
if not ready:
continue
try:
data = os.read(term.master_fd, 8192)
except OSError as exc:
if exc.errno in (errno.EIO, errno.EBADF):
break
raise
if not data:
break
text = _decode_terminal_output(decoder, data)
if text:
term.put_output("output", {"text": text})
except Exception as exc:
term.put_output("terminal_error", {"error": str(exc)})
finally:
term.closed.set()
code = term.proc.poll()
term.put_output("terminal_closed", {"exit_code": code})
def _set_size(term: TerminalSession, rows: int, cols: int) -> None:
term.rows = max(8, min(int(rows or term.rows or 24), 80))
term.cols = max(20, min(int(cols or term.cols or 80), 240))
try:
fcntl.ioctl(term.master_fd, termios.TIOCSWINSZ, _winsize(term.rows, term.cols))
except OSError:
pass
try:
if term.proc.poll() is None:
os.killpg(term.proc.pid, signal.SIGWINCH)
except (OSError, ProcessLookupError):
pass
def start_terminal(session_id: str, workspace: Path, rows: int = 24, cols: int = 80, restart: bool = False) -> TerminalSession:
"""Start or return the embedded terminal for a WebUI session."""
sid = str(session_id or "").strip()
if not sid:
raise ValueError("session_id is required")
cwd = str(Path(workspace).expanduser().resolve())
if not Path(cwd).is_dir():
raise ValueError("workspace is not a directory")
with _LOCK:
current = _TERMINALS.get(sid)
if current and current.is_alive() and not restart and current.workspace == cwd:
_set_size(current, rows, cols)
return current
if current:
close_terminal(sid)
master_fd, slave_fd = os.openpty()
# Build a safe env: allowlist common shell vars, strip API keys and secrets.
# The PTY shell is an interactive UI surface — do not leak server credentials.
_SAFE_ENV_KEYS = {
"PATH", "HOME", "USER", "LOGNAME", "SHELL", "LANG", "LC_ALL",
"LC_CTYPE", "LC_MESSAGES", "LANGUAGE", "TZ", "TMPDIR", "TEMP",
"XDG_RUNTIME_DIR", "XDG_CONFIG_HOME", "XDG_DATA_HOME",
}
env = {k: v for k, v in os.environ.items() if k in _SAFE_ENV_KEYS}
env.update(
{
"TERM": "xterm-256color",
"COLORTERM": "truecolor",
"COLUMNS": str(cols),
"LINES": str(rows),
"PWD": cwd,
"HERMES_WEBUI_TERMINAL": "1",
}
)
shell = _shell_path()
proc = subprocess.Popen(
_shell_argv(shell),
cwd=cwd,
env=env,
stdin=slave_fd,
stdout=slave_fd,
stderr=slave_fd,
close_fds=True,
start_new_session=True,
)
os.close(slave_fd)
_set_nonblocking(master_fd)
term = TerminalSession(
session_id=sid,
workspace=cwd,
proc=proc,
master_fd=master_fd,
rows=rows,
cols=cols,
)
_set_size(term, rows, cols)
term.reader = threading.Thread(target=_reader_loop, args=(term,), daemon=True)
term.reader.start()
_TERMINALS[sid] = term
return term
def get_terminal(session_id: str) -> TerminalSession | None:
with _LOCK:
term = _TERMINALS.get(str(session_id or ""))
if term and term.is_alive():
return term
return term
def write_terminal(session_id: str, data: str) -> None:
term = get_terminal(session_id)
if not term or not term.is_alive():
raise KeyError("terminal not running")
os.write(term.master_fd, str(data or "").encode("utf-8", errors="replace"))
def resize_terminal(session_id: str, rows: int, cols: int) -> None:
term = get_terminal(session_id)
if not term:
raise KeyError("terminal not running")
_set_size(term, rows, cols)
def close_terminal(session_id: str) -> bool:
sid = str(session_id or "")
with _LOCK:
term = _TERMINALS.pop(sid, None)
if not term:
return False
term.closed.set()
try:
if term.proc.poll() is None:
try:
os.killpg(term.proc.pid, signal.SIGHUP)
except ProcessLookupError:
pass
try:
term.proc.wait(timeout=1.5)
except subprocess.TimeoutExpired:
try:
os.killpg(term.proc.pid, signal.SIGKILL)
except ProcessLookupError:
pass
try:
term.proc.wait(timeout=1.0)
except (subprocess.TimeoutExpired, ProcessLookupError):
pass
finally:
try:
os.close(term.master_fd)
except OSError:
pass
return True
def close_all_terminals() -> None:
"""Best-effort reap of embedded shells during graceful WebUI shutdown."""
with _LOCK:
session_ids = list(_TERMINALS)
for session_id in session_ids:
close_terminal(session_id)
atexit.register(close_all_terminals)