fix: isolate background worker profile env

This commit is contained in:
Michael Lam
2026-05-15 12:41:29 -07:00
parent 2170846031
commit ff9e20f132
6 changed files with 103 additions and 26 deletions
+2
View File
@@ -4,6 +4,8 @@
### Fixed
- **PR #TBD** (refs #2321) — Background worker profile routing now applies profile-specific runtime env through the same thread-local channel used by foreground stream runs instead of temporarily mutating `os.environ`. This removes a cross-profile race where overlapping title/compression/update-summary workers could snapshot and restore another worker's profile env while preserving the existing short lock around process-global skill-home module cache patching.
- **PR #2279** by @franksong2702 (closes #2262 + refs #2168) — WebUI stream completion recovery gaps closed for both `notify_on_complete` background tasks and the preserved-task-list compression marker UI. Pre-fix, completions held in the agent process registry were never drained by the WebUI gateway session because the gateway session platform was unset. The fix routes the completion queue by process session key before injecting any notification into a WebUI turn. Separately, the preserved-task-list compression marker — an internal sentinel — was sometimes the only assistant text rendered after a context compression turn timed out, leaving a confusing "preserved tasks" message with no actual response. The frontend now suppresses the marker when it's the only assistant content and the run state is terminal.
- **PR #2299** by @starship-s — Background workers (title generation, manual session compression, update-summary generation) now correctly inherit profile-scoped configuration when a profile-scoped chat triggers them. Pre-fix, those workers read default-profile configuration instead of the session/request profile, so auxiliary model routing silently used the wrong configured model or failed provider resolution entirely. The fix threads the active profile context through `_run_background_title_update`, `_run_background_title_refresh`, and the manual compression and update-summary helpers, with regression tests covering all three paths.
+12 -19
View File
@@ -712,24 +712,21 @@ def profile_env_for_background_worker(
yield
return
env_keys = set(runtime_env.keys()) | {"HERMES_HOME"}
# Stage-360 maintainer fix: narrow the _ENV_LOCK critical section to just
# the env mutation (and the env restoration). Pre-fix, this held _ENV_LOCK
# for the entire `yield` duration — i.e. the whole background worker's
# runtime (title generation, compression, update summary). That caused
# _ENV_LOCK to be held for many seconds, blocking ALL other sessions and
# surfacing as the QA `test_third_message_completes` timeout. The fix
# mirrors the narrow-lock pattern in _run_agent_streaming: acquire briefly
# to set env, run worker without holding the lock, reacquire to restore.
# See also QA `test_finally_restores_env_with_lock`.
# Route profile-specific env through the same thread-local channel used by
# foreground streaming runs. Mutating os.environ for detached background
# workers creates a cross-profile race: worker B can snapshot and later
# restore worker A's temporary env while A is still running. Keep
# os.environ unchanged here and use the short global lock only for the
# remaining process-global skill-home module cache patch.
from api.config import _set_thread_env, _clear_thread_env
thread_env = dict(runtime_env)
thread_env["HERMES_HOME"] = str(profile_home_path)
skill_home_snapshot = None
old_env = {}
try:
_set_thread_env(**thread_env)
with _ENV_LOCK:
old_env = {key: os.environ.get(key) for key in env_keys}
skill_home_snapshot = snapshot_skill_home_modules()
os.environ.update(runtime_env)
os.environ["HERMES_HOME"] = str(profile_home_path)
try:
patch_skill_home_modules(profile_home_path)
except Exception:
@@ -741,12 +738,8 @@ def profile_env_for_background_worker(
)
yield
finally:
_clear_thread_env()
with _ENV_LOCK:
for key, old_value in old_env.items():
if old_value is None:
os.environ.pop(key, None)
else:
os.environ[key] = old_value
if skill_home_snapshot is not None:
restore_skill_home_modules(skill_home_snapshot)
+62
View File
@@ -1,5 +1,8 @@
import os
import re
import sys
import threading
import types
from pathlib import Path
import yaml
@@ -95,3 +98,62 @@ def test_streaming_thread_env_allows_profile_terminal_cwd_override():
assert env["HERMES_SESSION_PLATFORM"] == "webui"
assert env["HERMES_HOME"] == "/active/profile/home"
assert env["TERMINAL_ENV"] == "ssh"
def test_background_worker_profile_env_uses_thread_local_without_process_env(monkeypatch, tmp_path):
from api import config as config_api
from api import profiles
home = tmp_path / "profiles" / "ops"
home.mkdir(parents=True)
(home / "config.yaml").write_text(
yaml.safe_dump({"terminal": {"backend": "ssh", "cwd": "/profile/cwd"}}),
encoding="utf-8",
)
(home / ".env").write_text("HERMES_MAX_ITERATIONS=42\n", encoding="utf-8")
monkeypatch.setattr(profiles, "get_hermes_home_for_profile", lambda profile: home)
monkeypatch.setattr(profiles, "snapshot_skill_home_modules", lambda: {"snapshot": True})
patched_homes = []
restored = []
monkeypatch.setattr(profiles, "patch_skill_home_modules", lambda path: patched_homes.append(path))
monkeypatch.setattr(profiles, "restore_skill_home_modules", lambda snapshot: restored.append(snapshot))
monkeypatch.setitem(sys.modules, "api.streaming", types.SimpleNamespace(_ENV_LOCK=threading.Lock()))
monkeypatch.setenv("HERMES_HOME", "/root/default")
monkeypatch.delenv("TERMINAL_ENV", raising=False)
monkeypatch.delenv("TERMINAL_CWD", raising=False)
monkeypatch.delenv("HERMES_MAX_ITERATIONS", raising=False)
config_api._clear_thread_env()
with profiles.profile_env_for_background_worker("ops"):
assert os.environ["HERMES_HOME"] == "/root/default"
assert "TERMINAL_ENV" not in os.environ
assert "TERMINAL_CWD" not in os.environ
assert "HERMES_MAX_ITERATIONS" not in os.environ
assert config_api._thread_ctx.env["HERMES_HOME"] == str(home)
assert config_api._thread_ctx.env["TERMINAL_ENV"] == "ssh"
assert config_api._thread_ctx.env["TERMINAL_CWD"] == "/profile/cwd"
assert config_api._thread_ctx.env["HERMES_MAX_ITERATIONS"] == "42"
assert patched_homes == [home]
assert os.environ["HERMES_HOME"] == "/root/default"
assert getattr(config_api._thread_ctx, "env", {}) == {}
assert restored == [{"snapshot": True}]
def test_background_worker_profile_env_source_avoids_os_environ_mutation():
src = Path("api/profiles.py").read_text(encoding="utf-8")
match = re.search(
r"(@contextmanager\ndef profile_env_for_background_worker\(.*?\n)(?=\ndef |\nclass )",
src,
re.DOTALL,
)
assert match, "profile_env_for_background_worker not found"
body = match.group(1)
assert "from api.config import _set_thread_env, _clear_thread_env" in body
assert "_set_thread_env(**thread_env)" in body
assert "_clear_thread_env()" in body
assert "os.environ.update(runtime_env)" not in body
assert 'os.environ["HERMES_HOME"] = str(profile_home_path)' not in body
+8 -2
View File
@@ -408,6 +408,7 @@ def test_session_compress_async_reports_stream_state_guard(monkeypatch, cleanup_
def test_manual_compress_worker_uses_session_profile_env(monkeypatch, tmp_path, cleanup_test_sessions):
import api.config as config_api
import api.profiles as profiles
import api.routes as routes
@@ -416,9 +417,12 @@ def test_manual_compress_worker_uses_session_profile_env(monkeypatch, tmp_path,
def __init__(self, **kwargs):
skill_module = sys.modules.get("tools.skills_tool")
thread_env = getattr(config_api._thread_ctx, "env", {})
EnvAssertingAgent.seen_env = {
"HERMES_HOME": os.environ.get("HERMES_HOME"),
"HERMES_TEST_PROFILE_ENV": os.environ.get("HERMES_TEST_PROFILE_ENV"),
"HERMES_HOME": thread_env.get("HERMES_HOME"),
"HERMES_TEST_PROFILE_ENV": thread_env.get("HERMES_TEST_PROFILE_ENV"),
"PROCESS_HERMES_HOME": os.environ.get("HERMES_HOME"),
"PROCESS_HERMES_TEST_PROFILE_ENV": os.environ.get("HERMES_TEST_PROFILE_ENV"),
"SKILL_MODULE_HOME": getattr(skill_module, "HERMES_HOME", None),
"SKILL_MODULE_DIR": getattr(skill_module, "SKILLS_DIR", None),
}
@@ -461,6 +465,8 @@ def test_manual_compress_worker_uses_session_profile_env(monkeypatch, tmp_path,
assert EnvAssertingAgent.seen_env == {
"HERMES_HOME": str(profile_home),
"HERMES_TEST_PROFILE_ENV": "work-runtime",
"PROCESS_HERMES_HOME": "default-home",
"PROCESS_HERMES_TEST_PROFILE_ENV": None,
"SKILL_MODULE_HOME": profile_home,
"SKILL_MODULE_DIR": profile_home / "skills",
}
+5 -1
View File
@@ -448,6 +448,7 @@ class TestBackgroundTitleProfileRouting(unittest.TestCase):
self, mock_get_session, mock_configured,
):
"""A background title worker for a non-default profile must resolve aux config from that profile."""
from api import config as config_api
from api.streaming import _run_background_title_update
mock_session = MagicMock()
@@ -469,7 +470,9 @@ class TestBackgroundTitleProfileRouting(unittest.TestCase):
sys.modules['tools.skills_tool'] = fake_skill_module
def fake_aux_title(*args, **kwargs):
captured['hermes_home'] = os.environ.get('HERMES_HOME')
thread_env = getattr(config_api._thread_ctx, 'env', {})
captured['hermes_home'] = thread_env.get('HERMES_HOME')
captured['process_hermes_home'] = os.environ.get('HERMES_HOME')
captured['skill_module_home'] = getattr(fake_skill_module, 'HERMES_HOME')
captured['skill_module_dir'] = getattr(fake_skill_module, 'SKILLS_DIR')
return ('Profile Routed Title', 'llm_aux', '')
@@ -495,6 +498,7 @@ class TestBackgroundTitleProfileRouting(unittest.TestCase):
sys.modules['tools.skills_tool'] = original_skill_module
self.assertEqual(captured.get('hermes_home'), 'profile-home')
self.assertEqual(captured.get('process_hermes_home'), 'default-home')
self.assertEqual(str(captured.get('skill_module_home')), 'profile-home')
self.assertEqual(str(captured.get('skill_module_dir')), 'profile-home/skills')
self.assertEqual(captured.get('restored_hermes_home'), 'default-home')
+14 -4
View File
@@ -476,9 +476,12 @@ class TestUpdateSummaryRouteModelSelection:
monkeypatch.setattr(cfg, 'get_effective_default_model', lambda: 'openai/test-main')
def fake_resolve_model_provider(model):
thread_env = getattr(cfg._thread_ctx, 'env', {})
captured['model_resolution_env'] = {
'HERMES_HOME': os.environ.get('HERMES_HOME'),
'HERMES_TEST_PROFILE_ENV': os.environ.get('HERMES_TEST_PROFILE_ENV'),
'HERMES_HOME': thread_env.get('HERMES_HOME'),
'HERMES_TEST_PROFILE_ENV': thread_env.get('HERMES_TEST_PROFILE_ENV'),
'PROCESS_HERMES_HOME': os.environ.get('HERMES_HOME'),
'PROCESS_HERMES_TEST_PROFILE_ENV': os.environ.get('HERMES_TEST_PROFILE_ENV'),
}
return model, 'openai', 'https://example.test/v1'
@@ -514,9 +517,12 @@ class TestUpdateSummaryRouteModelSelection:
)
def fake_get_text_auxiliary_client(task, main_runtime=None):
thread_env = getattr(cfg._thread_ctx, 'env', {})
captured['aux_env'] = {
'HERMES_HOME': os.environ.get('HERMES_HOME'),
'HERMES_TEST_PROFILE_ENV': os.environ.get('HERMES_TEST_PROFILE_ENV'),
'HERMES_HOME': thread_env.get('HERMES_HOME'),
'HERMES_TEST_PROFILE_ENV': thread_env.get('HERMES_TEST_PROFILE_ENV'),
'PROCESS_HERMES_HOME': os.environ.get('HERMES_HOME'),
'PROCESS_HERMES_TEST_PROFILE_ENV': os.environ.get('HERMES_TEST_PROFILE_ENV'),
'SKILL_MODULE_HOME': getattr(fake_skill_module, 'HERMES_HOME'),
'SKILL_MODULE_DIR': getattr(fake_skill_module, 'SKILLS_DIR'),
}
@@ -564,10 +570,14 @@ class TestUpdateSummaryRouteModelSelection:
assert captured['model_resolution_env'] == {
'HERMES_HOME': str(profile_home),
'HERMES_TEST_PROFILE_ENV': 'work-runtime',
'PROCESS_HERMES_HOME': 'default-home',
'PROCESS_HERMES_TEST_PROFILE_ENV': 'default-runtime',
}
assert captured['aux_env'] == {
'HERMES_HOME': str(profile_home),
'HERMES_TEST_PROFILE_ENV': 'work-runtime',
'PROCESS_HERMES_HOME': 'default-home',
'PROCESS_HERMES_TEST_PROFILE_ENV': 'default-runtime',
'SKILL_MODULE_HOME': profile_home,
'SKILL_MODULE_DIR': profile_home / 'skills',
}