mirror of
https://github.com/nesquena/hermes-webui.git
synced 2026-05-25 11:10:18 +00:00
71ba863ce5
Fixes #2853. The `_terminal_shell_preexec_fn` added in `71d8a8fb` called `prctl(PR_SET_PDEATHSIG, SIGTERM)` so orphaned PTY shells would die when the WebUI process crashed. But that signal is **per-thread**, not per-process, and WebUI runs `ThreadingHTTPServer`: every HTTP request is handled in its own short-lived worker thread. Flow that broke every Linux user: 1. User clicks the terminal toggle → frontend hits `POST /api/terminal/start`. 2. ThreadingHTTPServer spins up a worker thread to handle that one request. 3. The worker thread calls `subprocess.Popen(..., preexec_fn=...)`. 4. The shell calls `prctl(PR_SET_PDEATHSIG, SIGTERM)` in its preexec_fn. Its registered "parent" is now the WebUI worker thread that called Popen. 5. The handler returns its JSON response and the worker thread exits. 6. The kernel sees the pdeathsig-parent thread has died and sends SIGTERM to the PTY shell. The shell dies within ~10 ms of being created. 7. The reader loop sees EIO on the master FD, emits `terminal_closed`, and the frontend writes `[terminal closed]`. macOS users were unaffected because `libc.prctl` doesn't exist there — `ctypes.CDLL(None)` returns a libc handle, `libc.prctl` raises `AttributeError`, the bare-`except` swallows it, and the shell starts with no pdeathsig configured. Empirical verification on this Linux host (real PTY + `subprocess.Popen` inside a `threading.Thread` that joins immediately): with preexec_fn → proc.poll() == -15 (SIGTERM), master FD returns EIO without preexec_fn → proc.poll() == None (alive), master FD returns "HELLO\\r\\n" Same shell, same PTY, same threading topology as WebUI. Fix --- Drop the `preexec_fn` entirely. The orphan-shell-on-crash case the original PR was navigating is rare for self-hosted single-user installs, and the existing `atexit.register(close_all_terminals)` + explicit `close_terminal` paths cover graceful shutdown. A future fix (option B in the issue) can re-introduce pdeathsig pinned to a long-lived supervisor thread, but that is a follow-up — this PR is the smallest unbricks-Linux-today change. Tests ----- - Invert `test_terminal_shell_uses_parent_death_signal_preexec` → `test_terminal_shell_does_not_use_pdeathsig_preexec`: asserts `preexec_fn` is NOT in the Popen kwargs. - Add `test_pty_shell_survives_when_spawning_thread_exits`: spawns a real PTY shell via `start_terminal` from a worker thread, waits for the worker to join, asserts the shell is still alive after a half-second grace window. This is the contract the original tests never exercised. - Update `test_terminal_module_registers_graceful_shutdown_reaper` to refuse re-introduction of the preexec_fn or the `libc.prctl(1, SIGTERM)` call (treats either as a regression). All 27 terminal-related tests pass locally. Refs #2853
278 lines
8.8 KiB
Python
278 lines
8.8 KiB
Python
"""Embedded workspace terminal support for Hermes Web UI.
|
|
|
|
The terminal is intentionally independent from the agent execution path. It
|
|
starts a shell with an explicit cwd/env per process and never mutates
|
|
process-global os.environ, which avoids expanding the session-env race tracked
|
|
in the agent execution layer.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import errno
|
|
import atexit
|
|
import codecs
|
|
import fcntl
|
|
import os
|
|
import queue
|
|
import select
|
|
import shutil
|
|
import signal
|
|
import struct
|
|
import subprocess
|
|
import termios
|
|
import threading
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
|
|
|
|
def _set_nonblocking(fd: int) -> None:
|
|
flags = fcntl.fcntl(fd, fcntl.F_GETFL)
|
|
fcntl.fcntl(fd, fcntl.F_SETFL, flags | os.O_NONBLOCK)
|
|
|
|
|
|
def _winsize(rows: int, cols: int) -> bytes:
|
|
rows = max(8, min(int(rows or 24), 80))
|
|
cols = max(20, min(int(cols or 80), 240))
|
|
return struct.pack("HHHH", rows, cols, 0, 0)
|
|
|
|
|
|
@dataclass
|
|
class TerminalSession:
|
|
session_id: str
|
|
workspace: str
|
|
proc: subprocess.Popen
|
|
master_fd: int
|
|
rows: int = 24
|
|
cols: int = 80
|
|
output: queue.Queue = field(default_factory=lambda: queue.Queue(maxsize=2000))
|
|
closed: threading.Event = field(default_factory=threading.Event)
|
|
reader: threading.Thread | None = None
|
|
|
|
def is_alive(self) -> bool:
|
|
return not self.closed.is_set() and self.proc.poll() is None
|
|
|
|
def put_output(self, event: str, payload: dict) -> None:
|
|
try:
|
|
self.output.put_nowait((event, payload))
|
|
except queue.Full:
|
|
# Keep the terminal responsive by dropping the oldest queued chunk.
|
|
try:
|
|
self.output.get_nowait()
|
|
except queue.Empty:
|
|
pass
|
|
try:
|
|
self.output.put_nowait((event, payload))
|
|
except queue.Full:
|
|
pass
|
|
|
|
|
|
_TERMINALS: dict[str, TerminalSession] = {}
|
|
_LOCK = threading.RLock()
|
|
|
|
|
|
# NOTE on parent-death-signal: a previous version of this module set
|
|
# PR_SET_PDEATHSIG via a preexec_fn to terminate orphaned PTY shells when the
|
|
# WebUI process crashed. That broke every Linux user (#2853): WebUI runs a
|
|
# ThreadingHTTPServer, so the Popen call happens on a short-lived per-request
|
|
# thread, and PR_SET_PDEATHSIG is per-thread. The PTY shell registered the
|
|
# spawning thread as its "parent" and was killed with SIGTERM the instant that
|
|
# thread joined — within ~10 ms of opening the terminal — surfacing as the
|
|
# `[terminal closed]` banner. The graceful path is covered by
|
|
# `atexit.register(close_all_terminals)` and the explicit `close_terminal`
|
|
# call sites; hard kills of the WebUI process leak the shell, which is the
|
|
# tradeoff for working on Linux at all.
|
|
|
|
|
|
def _decode_terminal_output(decoder, data: bytes) -> str:
|
|
"""Decode PTY bytes without stripping terminal control sequences."""
|
|
return decoder.decode(data)
|
|
|
|
|
|
def _shell_path() -> str:
|
|
shell = os.environ.get("SHELL") or ""
|
|
if shell and Path(shell).exists():
|
|
return shell
|
|
return shutil.which("zsh") or shutil.which("bash") or shutil.which("sh") or "/bin/sh"
|
|
|
|
|
|
def _shell_argv(shell: str) -> list[str]:
|
|
name = Path(shell).name
|
|
if name in {"zsh", "bash", "sh"}:
|
|
return [shell, "-i"]
|
|
return [shell]
|
|
|
|
|
|
def _reader_loop(term: TerminalSession) -> None:
|
|
decoder = codecs.getincrementaldecoder("utf-8")("replace")
|
|
try:
|
|
while not term.closed.is_set():
|
|
if term.proc.poll() is not None:
|
|
break
|
|
try:
|
|
ready, _, _ = select.select([term.master_fd], [], [], 0.25)
|
|
except (OSError, ValueError):
|
|
break
|
|
if not ready:
|
|
continue
|
|
try:
|
|
data = os.read(term.master_fd, 8192)
|
|
except OSError as exc:
|
|
if exc.errno in (errno.EIO, errno.EBADF):
|
|
break
|
|
raise
|
|
if not data:
|
|
break
|
|
text = _decode_terminal_output(decoder, data)
|
|
if text:
|
|
term.put_output("output", {"text": text})
|
|
except Exception as exc:
|
|
term.put_output("terminal_error", {"error": str(exc)})
|
|
finally:
|
|
term.closed.set()
|
|
code = term.proc.poll()
|
|
term.put_output("terminal_closed", {"exit_code": code})
|
|
|
|
|
|
def _set_size(term: TerminalSession, rows: int, cols: int) -> None:
|
|
term.rows = max(8, min(int(rows or term.rows or 24), 80))
|
|
term.cols = max(20, min(int(cols or term.cols or 80), 240))
|
|
try:
|
|
fcntl.ioctl(term.master_fd, termios.TIOCSWINSZ, _winsize(term.rows, term.cols))
|
|
except OSError:
|
|
pass
|
|
try:
|
|
if term.proc.poll() is None:
|
|
os.killpg(term.proc.pid, signal.SIGWINCH)
|
|
except (OSError, ProcessLookupError):
|
|
pass
|
|
|
|
|
|
def start_terminal(session_id: str, workspace: Path, rows: int = 24, cols: int = 80, restart: bool = False) -> TerminalSession:
|
|
"""Start or return the embedded terminal for a WebUI session."""
|
|
sid = str(session_id or "").strip()
|
|
if not sid:
|
|
raise ValueError("session_id is required")
|
|
cwd = str(Path(workspace).expanduser().resolve())
|
|
if not Path(cwd).is_dir():
|
|
raise ValueError("workspace is not a directory")
|
|
|
|
with _LOCK:
|
|
current = _TERMINALS.get(sid)
|
|
if current and current.is_alive() and not restart and current.workspace == cwd:
|
|
_set_size(current, rows, cols)
|
|
return current
|
|
if current:
|
|
close_terminal(sid)
|
|
|
|
master_fd, slave_fd = os.openpty()
|
|
# Build a safe env: allowlist common shell vars, strip API keys and secrets.
|
|
# The PTY shell is an interactive UI surface — do not leak server credentials.
|
|
_SAFE_ENV_KEYS = {
|
|
"PATH", "HOME", "USER", "LOGNAME", "SHELL", "LANG", "LC_ALL",
|
|
"LC_CTYPE", "LC_MESSAGES", "LANGUAGE", "TZ", "TMPDIR", "TEMP",
|
|
"XDG_RUNTIME_DIR", "XDG_CONFIG_HOME", "XDG_DATA_HOME",
|
|
}
|
|
env = {k: v for k, v in os.environ.items() if k in _SAFE_ENV_KEYS}
|
|
env.update(
|
|
{
|
|
"TERM": "xterm-256color",
|
|
"COLORTERM": "truecolor",
|
|
"COLUMNS": str(cols),
|
|
"LINES": str(rows),
|
|
"PWD": cwd,
|
|
"HERMES_WEBUI_TERMINAL": "1",
|
|
}
|
|
)
|
|
shell = _shell_path()
|
|
proc = subprocess.Popen(
|
|
_shell_argv(shell),
|
|
cwd=cwd,
|
|
env=env,
|
|
stdin=slave_fd,
|
|
stdout=slave_fd,
|
|
stderr=slave_fd,
|
|
close_fds=True,
|
|
start_new_session=True,
|
|
)
|
|
os.close(slave_fd)
|
|
_set_nonblocking(master_fd)
|
|
|
|
term = TerminalSession(
|
|
session_id=sid,
|
|
workspace=cwd,
|
|
proc=proc,
|
|
master_fd=master_fd,
|
|
rows=rows,
|
|
cols=cols,
|
|
)
|
|
_set_size(term, rows, cols)
|
|
term.reader = threading.Thread(target=_reader_loop, args=(term,), daemon=True)
|
|
term.reader.start()
|
|
_TERMINALS[sid] = term
|
|
return term
|
|
|
|
|
|
def get_terminal(session_id: str) -> TerminalSession | None:
|
|
with _LOCK:
|
|
term = _TERMINALS.get(str(session_id or ""))
|
|
if term and term.is_alive():
|
|
return term
|
|
return term
|
|
|
|
|
|
def write_terminal(session_id: str, data: str) -> None:
|
|
term = get_terminal(session_id)
|
|
if not term or not term.is_alive():
|
|
raise KeyError("terminal not running")
|
|
os.write(term.master_fd, str(data or "").encode("utf-8", errors="replace"))
|
|
|
|
|
|
def resize_terminal(session_id: str, rows: int, cols: int) -> None:
|
|
term = get_terminal(session_id)
|
|
if not term:
|
|
raise KeyError("terminal not running")
|
|
_set_size(term, rows, cols)
|
|
|
|
|
|
def close_terminal(session_id: str) -> bool:
|
|
sid = str(session_id or "")
|
|
with _LOCK:
|
|
term = _TERMINALS.pop(sid, None)
|
|
if not term:
|
|
return False
|
|
term.closed.set()
|
|
try:
|
|
if term.proc.poll() is None:
|
|
try:
|
|
os.killpg(term.proc.pid, signal.SIGHUP)
|
|
except ProcessLookupError:
|
|
pass
|
|
try:
|
|
term.proc.wait(timeout=1.5)
|
|
except subprocess.TimeoutExpired:
|
|
try:
|
|
os.killpg(term.proc.pid, signal.SIGKILL)
|
|
except ProcessLookupError:
|
|
pass
|
|
try:
|
|
term.proc.wait(timeout=1.0)
|
|
except (subprocess.TimeoutExpired, ProcessLookupError):
|
|
pass
|
|
finally:
|
|
try:
|
|
os.close(term.master_fd)
|
|
except OSError:
|
|
pass
|
|
return True
|
|
|
|
|
|
def close_all_terminals() -> None:
|
|
"""Best-effort reap of embedded shells during graceful WebUI shutdown."""
|
|
with _LOCK:
|
|
session_ids = list(_TERMINALS)
|
|
for session_id in session_ids:
|
|
close_terminal(session_id)
|
|
|
|
|
|
atexit.register(close_all_terminals)
|