diff --git a/CHANGELOG.md b/CHANGELOG.md index 674486f1..fd1ffe32 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,8 @@ ### Fixed +- **PR #TBD** (refs #2321) — Background worker profile routing now applies profile-specific runtime env through the same thread-local channel used by foreground stream runs instead of temporarily mutating `os.environ`. This removes a cross-profile race where overlapping title/compression/update-summary workers could snapshot and restore another worker's profile env while preserving the existing short lock around process-global skill-home module cache patching. + - **PR #2279** by @franksong2702 (closes #2262 + refs #2168) — WebUI stream completion recovery gaps closed for both `notify_on_complete` background tasks and the preserved-task-list compression marker UI. Pre-fix, completions held in the agent process registry were never drained by the WebUI gateway session because the gateway session platform was unset. The fix routes the completion queue by process session key before injecting any notification into a WebUI turn. Separately, the preserved-task-list compression marker — an internal sentinel — was sometimes the only assistant text rendered after a context compression turn timed out, leaving a confusing "preserved tasks" message with no actual response. The frontend now suppresses the marker when it's the only assistant content and the run state is terminal. - **PR #2299** by @starship-s — Background workers (title generation, manual session compression, update-summary generation) now correctly inherit profile-scoped configuration when a profile-scoped chat triggers them. Pre-fix, those workers read default-profile configuration instead of the session/request profile, so auxiliary model routing silently used the wrong configured model or failed provider resolution entirely. The fix threads the active profile context through `_run_background_title_update`, `_run_background_title_refresh`, and the manual compression and update-summary helpers, with regression tests covering all three paths. diff --git a/api/profiles.py b/api/profiles.py index 3d1ecb2c..7761b377 100644 --- a/api/profiles.py +++ b/api/profiles.py @@ -712,24 +712,21 @@ def profile_env_for_background_worker( yield return - env_keys = set(runtime_env.keys()) | {"HERMES_HOME"} - # Stage-360 maintainer fix: narrow the _ENV_LOCK critical section to just - # the env mutation (and the env restoration). Pre-fix, this held _ENV_LOCK - # for the entire `yield` duration — i.e. the whole background worker's - # runtime (title generation, compression, update summary). That caused - # _ENV_LOCK to be held for many seconds, blocking ALL other sessions and - # surfacing as the QA `test_third_message_completes` timeout. The fix - # mirrors the narrow-lock pattern in _run_agent_streaming: acquire briefly - # to set env, run worker without holding the lock, reacquire to restore. - # See also QA `test_finally_restores_env_with_lock`. + # Route profile-specific env through the same thread-local channel used by + # foreground streaming runs. Mutating os.environ for detached background + # workers creates a cross-profile race: worker B can snapshot and later + # restore worker A's temporary env while A is still running. Keep + # os.environ unchanged here and use the short global lock only for the + # remaining process-global skill-home module cache patch. + from api.config import _set_thread_env, _clear_thread_env + + thread_env = dict(runtime_env) + thread_env["HERMES_HOME"] = str(profile_home_path) skill_home_snapshot = None - old_env = {} try: + _set_thread_env(**thread_env) with _ENV_LOCK: - old_env = {key: os.environ.get(key) for key in env_keys} skill_home_snapshot = snapshot_skill_home_modules() - os.environ.update(runtime_env) - os.environ["HERMES_HOME"] = str(profile_home_path) try: patch_skill_home_modules(profile_home_path) except Exception: @@ -741,12 +738,8 @@ def profile_env_for_background_worker( ) yield finally: + _clear_thread_env() with _ENV_LOCK: - for key, old_value in old_env.items(): - if old_value is None: - os.environ.pop(key, None) - else: - os.environ[key] = old_value if skill_home_snapshot is not None: restore_skill_home_modules(skill_home_snapshot) diff --git a/tests/test_profile_terminal_env.py b/tests/test_profile_terminal_env.py index 1614d6b6..3d8c8fe8 100644 --- a/tests/test_profile_terminal_env.py +++ b/tests/test_profile_terminal_env.py @@ -1,5 +1,8 @@ import os import re +import sys +import threading +import types from pathlib import Path import yaml @@ -95,3 +98,62 @@ def test_streaming_thread_env_allows_profile_terminal_cwd_override(): assert env["HERMES_SESSION_PLATFORM"] == "webui" assert env["HERMES_HOME"] == "/active/profile/home" assert env["TERMINAL_ENV"] == "ssh" + + +def test_background_worker_profile_env_uses_thread_local_without_process_env(monkeypatch, tmp_path): + from api import config as config_api + from api import profiles + + home = tmp_path / "profiles" / "ops" + home.mkdir(parents=True) + (home / "config.yaml").write_text( + yaml.safe_dump({"terminal": {"backend": "ssh", "cwd": "/profile/cwd"}}), + encoding="utf-8", + ) + (home / ".env").write_text("HERMES_MAX_ITERATIONS=42\n", encoding="utf-8") + + monkeypatch.setattr(profiles, "get_hermes_home_for_profile", lambda profile: home) + monkeypatch.setattr(profiles, "snapshot_skill_home_modules", lambda: {"snapshot": True}) + patched_homes = [] + restored = [] + monkeypatch.setattr(profiles, "patch_skill_home_modules", lambda path: patched_homes.append(path)) + monkeypatch.setattr(profiles, "restore_skill_home_modules", lambda snapshot: restored.append(snapshot)) + monkeypatch.setitem(sys.modules, "api.streaming", types.SimpleNamespace(_ENV_LOCK=threading.Lock())) + + monkeypatch.setenv("HERMES_HOME", "/root/default") + monkeypatch.delenv("TERMINAL_ENV", raising=False) + monkeypatch.delenv("TERMINAL_CWD", raising=False) + monkeypatch.delenv("HERMES_MAX_ITERATIONS", raising=False) + config_api._clear_thread_env() + + with profiles.profile_env_for_background_worker("ops"): + assert os.environ["HERMES_HOME"] == "/root/default" + assert "TERMINAL_ENV" not in os.environ + assert "TERMINAL_CWD" not in os.environ + assert "HERMES_MAX_ITERATIONS" not in os.environ + assert config_api._thread_ctx.env["HERMES_HOME"] == str(home) + assert config_api._thread_ctx.env["TERMINAL_ENV"] == "ssh" + assert config_api._thread_ctx.env["TERMINAL_CWD"] == "/profile/cwd" + assert config_api._thread_ctx.env["HERMES_MAX_ITERATIONS"] == "42" + assert patched_homes == [home] + + assert os.environ["HERMES_HOME"] == "/root/default" + assert getattr(config_api._thread_ctx, "env", {}) == {} + assert restored == [{"snapshot": True}] + + +def test_background_worker_profile_env_source_avoids_os_environ_mutation(): + src = Path("api/profiles.py").read_text(encoding="utf-8") + match = re.search( + r"(@contextmanager\ndef profile_env_for_background_worker\(.*?\n)(?=\ndef |\nclass )", + src, + re.DOTALL, + ) + assert match, "profile_env_for_background_worker not found" + body = match.group(1) + + assert "from api.config import _set_thread_env, _clear_thread_env" in body + assert "_set_thread_env(**thread_env)" in body + assert "_clear_thread_env()" in body + assert "os.environ.update(runtime_env)" not in body + assert 'os.environ["HERMES_HOME"] = str(profile_home_path)' not in body diff --git a/tests/test_sprint46.py b/tests/test_sprint46.py index 0db0ffa2..f98897f5 100644 --- a/tests/test_sprint46.py +++ b/tests/test_sprint46.py @@ -408,6 +408,7 @@ def test_session_compress_async_reports_stream_state_guard(monkeypatch, cleanup_ def test_manual_compress_worker_uses_session_profile_env(monkeypatch, tmp_path, cleanup_test_sessions): + import api.config as config_api import api.profiles as profiles import api.routes as routes @@ -416,9 +417,12 @@ def test_manual_compress_worker_uses_session_profile_env(monkeypatch, tmp_path, def __init__(self, **kwargs): skill_module = sys.modules.get("tools.skills_tool") + thread_env = getattr(config_api._thread_ctx, "env", {}) EnvAssertingAgent.seen_env = { - "HERMES_HOME": os.environ.get("HERMES_HOME"), - "HERMES_TEST_PROFILE_ENV": os.environ.get("HERMES_TEST_PROFILE_ENV"), + "HERMES_HOME": thread_env.get("HERMES_HOME"), + "HERMES_TEST_PROFILE_ENV": thread_env.get("HERMES_TEST_PROFILE_ENV"), + "PROCESS_HERMES_HOME": os.environ.get("HERMES_HOME"), + "PROCESS_HERMES_TEST_PROFILE_ENV": os.environ.get("HERMES_TEST_PROFILE_ENV"), "SKILL_MODULE_HOME": getattr(skill_module, "HERMES_HOME", None), "SKILL_MODULE_DIR": getattr(skill_module, "SKILLS_DIR", None), } @@ -461,6 +465,8 @@ def test_manual_compress_worker_uses_session_profile_env(monkeypatch, tmp_path, assert EnvAssertingAgent.seen_env == { "HERMES_HOME": str(profile_home), "HERMES_TEST_PROFILE_ENV": "work-runtime", + "PROCESS_HERMES_HOME": "default-home", + "PROCESS_HERMES_TEST_PROFILE_ENV": None, "SKILL_MODULE_HOME": profile_home, "SKILL_MODULE_DIR": profile_home / "skills", } diff --git a/tests/test_title_aux_routing.py b/tests/test_title_aux_routing.py index a125d361..2d18e4a8 100644 --- a/tests/test_title_aux_routing.py +++ b/tests/test_title_aux_routing.py @@ -448,6 +448,7 @@ class TestBackgroundTitleProfileRouting(unittest.TestCase): self, mock_get_session, mock_configured, ): """A background title worker for a non-default profile must resolve aux config from that profile.""" + from api import config as config_api from api.streaming import _run_background_title_update mock_session = MagicMock() @@ -469,7 +470,9 @@ class TestBackgroundTitleProfileRouting(unittest.TestCase): sys.modules['tools.skills_tool'] = fake_skill_module def fake_aux_title(*args, **kwargs): - captured['hermes_home'] = os.environ.get('HERMES_HOME') + thread_env = getattr(config_api._thread_ctx, 'env', {}) + captured['hermes_home'] = thread_env.get('HERMES_HOME') + captured['process_hermes_home'] = os.environ.get('HERMES_HOME') captured['skill_module_home'] = getattr(fake_skill_module, 'HERMES_HOME') captured['skill_module_dir'] = getattr(fake_skill_module, 'SKILLS_DIR') return ('Profile Routed Title', 'llm_aux', '') @@ -495,6 +498,7 @@ class TestBackgroundTitleProfileRouting(unittest.TestCase): sys.modules['tools.skills_tool'] = original_skill_module self.assertEqual(captured.get('hermes_home'), 'profile-home') + self.assertEqual(captured.get('process_hermes_home'), 'default-home') self.assertEqual(str(captured.get('skill_module_home')), 'profile-home') self.assertEqual(str(captured.get('skill_module_dir')), 'profile-home/skills') self.assertEqual(captured.get('restored_hermes_home'), 'default-home') diff --git a/tests/test_update_banner_fixes.py b/tests/test_update_banner_fixes.py index bc3a2cba..fadb5235 100644 --- a/tests/test_update_banner_fixes.py +++ b/tests/test_update_banner_fixes.py @@ -476,9 +476,12 @@ class TestUpdateSummaryRouteModelSelection: monkeypatch.setattr(cfg, 'get_effective_default_model', lambda: 'openai/test-main') def fake_resolve_model_provider(model): + thread_env = getattr(cfg._thread_ctx, 'env', {}) captured['model_resolution_env'] = { - 'HERMES_HOME': os.environ.get('HERMES_HOME'), - 'HERMES_TEST_PROFILE_ENV': os.environ.get('HERMES_TEST_PROFILE_ENV'), + 'HERMES_HOME': thread_env.get('HERMES_HOME'), + 'HERMES_TEST_PROFILE_ENV': thread_env.get('HERMES_TEST_PROFILE_ENV'), + 'PROCESS_HERMES_HOME': os.environ.get('HERMES_HOME'), + 'PROCESS_HERMES_TEST_PROFILE_ENV': os.environ.get('HERMES_TEST_PROFILE_ENV'), } return model, 'openai', 'https://example.test/v1' @@ -514,9 +517,12 @@ class TestUpdateSummaryRouteModelSelection: ) def fake_get_text_auxiliary_client(task, main_runtime=None): + thread_env = getattr(cfg._thread_ctx, 'env', {}) captured['aux_env'] = { - 'HERMES_HOME': os.environ.get('HERMES_HOME'), - 'HERMES_TEST_PROFILE_ENV': os.environ.get('HERMES_TEST_PROFILE_ENV'), + 'HERMES_HOME': thread_env.get('HERMES_HOME'), + 'HERMES_TEST_PROFILE_ENV': thread_env.get('HERMES_TEST_PROFILE_ENV'), + 'PROCESS_HERMES_HOME': os.environ.get('HERMES_HOME'), + 'PROCESS_HERMES_TEST_PROFILE_ENV': os.environ.get('HERMES_TEST_PROFILE_ENV'), 'SKILL_MODULE_HOME': getattr(fake_skill_module, 'HERMES_HOME'), 'SKILL_MODULE_DIR': getattr(fake_skill_module, 'SKILLS_DIR'), } @@ -564,10 +570,14 @@ class TestUpdateSummaryRouteModelSelection: assert captured['model_resolution_env'] == { 'HERMES_HOME': str(profile_home), 'HERMES_TEST_PROFILE_ENV': 'work-runtime', + 'PROCESS_HERMES_HOME': 'default-home', + 'PROCESS_HERMES_TEST_PROFILE_ENV': 'default-runtime', } assert captured['aux_env'] == { 'HERMES_HOME': str(profile_home), 'HERMES_TEST_PROFILE_ENV': 'work-runtime', + 'PROCESS_HERMES_HOME': 'default-home', + 'PROCESS_HERMES_TEST_PROFILE_ENV': 'default-runtime', 'SKILL_MODULE_HOME': profile_home, 'SKILL_MODULE_DIR': profile_home / 'skills', }