fix: isolate background worker profile env

2026-05-25 11:10:18 +00:00 · 2026-05-15 12:41:29 -07:00
parent 2170846031
commit ff9e20f132
6 changed files with 103 additions and 26 deletions
@@ -4,6 +4,8 @@

 ### Fixed

+- **PR #TBD** (refs #2321) — Background worker profile routing now applies profile-specific runtime env through the same thread-local channel used by foreground stream runs instead of temporarily mutating `os.environ`. This removes a cross-profile race where overlapping title/compression/update-summary workers could snapshot and restore another worker's profile env while preserving the existing short lock around process-global skill-home module cache patching.
+
 - **PR #2279** by @franksong2702 (closes #2262 + refs #2168) — WebUI stream completion recovery gaps closed for both `notify_on_complete` background tasks and the preserved-task-list compression marker UI. Pre-fix, completions held in the agent process registry were never drained by the WebUI gateway session because the gateway session platform was unset. The fix routes the completion queue by process session key before injecting any notification into a WebUI turn. Separately, the preserved-task-list compression marker — an internal sentinel — was sometimes the only assistant text rendered after a context compression turn timed out, leaving a confusing "preserved tasks" message with no actual response. The frontend now suppresses the marker when it's the only assistant content and the run state is terminal.

 - **PR #2299** by @starship-s — Background workers (title generation, manual session compression, update-summary generation) now correctly inherit profile-scoped configuration when a profile-scoped chat triggers them. Pre-fix, those workers read default-profile configuration instead of the session/request profile, so auxiliary model routing silently used the wrong configured model or failed provider resolution entirely. The fix threads the active profile context through `_run_background_title_update`, `_run_background_title_refresh`, and the manual compression and update-summary helpers, with regression tests covering all three paths.
@@ -712,24 +712,21 @@ def profile_env_for_background_worker(
        yield
        return

-    env_keys = set(runtime_env.keys()) | {"HERMES_HOME"}
-    # Stage-360 maintainer fix: narrow the _ENV_LOCK critical section to just
-    # the env mutation (and the env restoration). Pre-fix, this held _ENV_LOCK
-    # for the entire `yield` duration — i.e. the whole background worker's
-    # runtime (title generation, compression, update summary). That caused
-    # _ENV_LOCK to be held for many seconds, blocking ALL other sessions and
-    # surfacing as the QA `test_third_message_completes` timeout. The fix
-    # mirrors the narrow-lock pattern in _run_agent_streaming: acquire briefly
-    # to set env, run worker without holding the lock, reacquire to restore.
-    # See also QA `test_finally_restores_env_with_lock`.
+    # Route profile-specific env through the same thread-local channel used by
+    # foreground streaming runs. Mutating os.environ for detached background
+    # workers creates a cross-profile race: worker B can snapshot and later
+    # restore worker A's temporary env while A is still running. Keep
+    # os.environ unchanged here and use the short global lock only for the
+    # remaining process-global skill-home module cache patch.
+    from api.config import _set_thread_env, _clear_thread_env
+
+    thread_env = dict(runtime_env)
+    thread_env["HERMES_HOME"] = str(profile_home_path)
    skill_home_snapshot = None
-    old_env = {}
    try:
+        _set_thread_env(**thread_env)
        with _ENV_LOCK:
-            old_env = {key: os.environ.get(key) for key in env_keys}
            skill_home_snapshot = snapshot_skill_home_modules()
-            os.environ.update(runtime_env)
-            os.environ["HERMES_HOME"] = str(profile_home_path)
            try:
                patch_skill_home_modules(profile_home_path)
            except Exception:
@@ -741,12 +738,8 @@ def profile_env_for_background_worker(
                )
        yield
    finally:
+        _clear_thread_env()
        with _ENV_LOCK:
-            for key, old_value in old_env.items():
-                if old_value is None:
-                    os.environ.pop(key, None)
-                else:
-                    os.environ[key] = old_value
            if skill_home_snapshot is not None:
                restore_skill_home_modules(skill_home_snapshot)

@@ -1,5 +1,8 @@
 import os
 import re
+import sys
+import threading
+import types
 from pathlib import Path

 import yaml
@@ -95,3 +98,62 @@ def test_streaming_thread_env_allows_profile_terminal_cwd_override():
    assert env["HERMES_SESSION_PLATFORM"] == "webui"
    assert env["HERMES_HOME"] == "/active/profile/home"
    assert env["TERMINAL_ENV"] == "ssh"
+
+
+def test_background_worker_profile_env_uses_thread_local_without_process_env(monkeypatch, tmp_path):
+    from api import config as config_api
+    from api import profiles
+
+    home = tmp_path / "profiles" / "ops"
+    home.mkdir(parents=True)
+    (home / "config.yaml").write_text(
+        yaml.safe_dump({"terminal": {"backend": "ssh", "cwd": "/profile/cwd"}}),
+        encoding="utf-8",
+    )
+    (home / ".env").write_text("HERMES_MAX_ITERATIONS=42\n", encoding="utf-8")
+
+    monkeypatch.setattr(profiles, "get_hermes_home_for_profile", lambda profile: home)
+    monkeypatch.setattr(profiles, "snapshot_skill_home_modules", lambda: {"snapshot": True})
+    patched_homes = []
+    restored = []
+    monkeypatch.setattr(profiles, "patch_skill_home_modules", lambda path: patched_homes.append(path))
+    monkeypatch.setattr(profiles, "restore_skill_home_modules", lambda snapshot: restored.append(snapshot))
+    monkeypatch.setitem(sys.modules, "api.streaming", types.SimpleNamespace(_ENV_LOCK=threading.Lock()))
+
+    monkeypatch.setenv("HERMES_HOME", "/root/default")
+    monkeypatch.delenv("TERMINAL_ENV", raising=False)
+    monkeypatch.delenv("TERMINAL_CWD", raising=False)
+    monkeypatch.delenv("HERMES_MAX_ITERATIONS", raising=False)
+    config_api._clear_thread_env()
+
+    with profiles.profile_env_for_background_worker("ops"):
+        assert os.environ["HERMES_HOME"] == "/root/default"
+        assert "TERMINAL_ENV" not in os.environ
+        assert "TERMINAL_CWD" not in os.environ
+        assert "HERMES_MAX_ITERATIONS" not in os.environ
+        assert config_api._thread_ctx.env["HERMES_HOME"] == str(home)
+        assert config_api._thread_ctx.env["TERMINAL_ENV"] == "ssh"
+        assert config_api._thread_ctx.env["TERMINAL_CWD"] == "/profile/cwd"
+        assert config_api._thread_ctx.env["HERMES_MAX_ITERATIONS"] == "42"
+        assert patched_homes == [home]
+
+    assert os.environ["HERMES_HOME"] == "/root/default"
+    assert getattr(config_api._thread_ctx, "env", {}) == {}
+    assert restored == [{"snapshot": True}]
+
+
+def test_background_worker_profile_env_source_avoids_os_environ_mutation():
+    src = Path("api/profiles.py").read_text(encoding="utf-8")
+    match = re.search(
+        r"(@contextmanager\ndef profile_env_for_background_worker\(.*?\n)(?=\ndef |\nclass )",
+        src,
+        re.DOTALL,
+    )
+    assert match, "profile_env_for_background_worker not found"
+    body = match.group(1)
+
+    assert "from api.config import _set_thread_env, _clear_thread_env" in body
+    assert "_set_thread_env(**thread_env)" in body
+    assert "_clear_thread_env()" in body
+    assert "os.environ.update(runtime_env)" not in body
+    assert 'os.environ["HERMES_HOME"] = str(profile_home_path)' not in body
@@ -408,6 +408,7 @@ def test_session_compress_async_reports_stream_state_guard(monkeypatch, cleanup_


 def test_manual_compress_worker_uses_session_profile_env(monkeypatch, tmp_path, cleanup_test_sessions):
+    import api.config as config_api
    import api.profiles as profiles
    import api.routes as routes

@@ -416,9 +417,12 @@ def test_manual_compress_worker_uses_session_profile_env(monkeypatch, tmp_path,

        def __init__(self, **kwargs):
            skill_module = sys.modules.get("tools.skills_tool")
+            thread_env = getattr(config_api._thread_ctx, "env", {})
            EnvAssertingAgent.seen_env = {
-                "HERMES_HOME": os.environ.get("HERMES_HOME"),
-                "HERMES_TEST_PROFILE_ENV": os.environ.get("HERMES_TEST_PROFILE_ENV"),
+                "HERMES_HOME": thread_env.get("HERMES_HOME"),
+                "HERMES_TEST_PROFILE_ENV": thread_env.get("HERMES_TEST_PROFILE_ENV"),
+                "PROCESS_HERMES_HOME": os.environ.get("HERMES_HOME"),
+                "PROCESS_HERMES_TEST_PROFILE_ENV": os.environ.get("HERMES_TEST_PROFILE_ENV"),
                "SKILL_MODULE_HOME": getattr(skill_module, "HERMES_HOME", None),
                "SKILL_MODULE_DIR": getattr(skill_module, "SKILLS_DIR", None),
            }
@@ -461,6 +465,8 @@ def test_manual_compress_worker_uses_session_profile_env(monkeypatch, tmp_path,
    assert EnvAssertingAgent.seen_env == {
        "HERMES_HOME": str(profile_home),
        "HERMES_TEST_PROFILE_ENV": "work-runtime",
+        "PROCESS_HERMES_HOME": "default-home",
+        "PROCESS_HERMES_TEST_PROFILE_ENV": None,
        "SKILL_MODULE_HOME": profile_home,
        "SKILL_MODULE_DIR": profile_home / "skills",
    }
@@ -448,6 +448,7 @@ class TestBackgroundTitleProfileRouting(unittest.TestCase):
        self, mock_get_session, mock_configured,
    ):
        """A background title worker for a non-default profile must resolve aux config from that profile."""
+        from api import config as config_api
        from api.streaming import _run_background_title_update

        mock_session = MagicMock()
@@ -469,7 +470,9 @@ class TestBackgroundTitleProfileRouting(unittest.TestCase):
        sys.modules['tools.skills_tool'] = fake_skill_module

        def fake_aux_title(*args, **kwargs):
-            captured['hermes_home'] = os.environ.get('HERMES_HOME')
+            thread_env = getattr(config_api._thread_ctx, 'env', {})
+            captured['hermes_home'] = thread_env.get('HERMES_HOME')
+            captured['process_hermes_home'] = os.environ.get('HERMES_HOME')
            captured['skill_module_home'] = getattr(fake_skill_module, 'HERMES_HOME')
            captured['skill_module_dir'] = getattr(fake_skill_module, 'SKILLS_DIR')
            return ('Profile Routed Title', 'llm_aux', '')
@@ -495,6 +498,7 @@ class TestBackgroundTitleProfileRouting(unittest.TestCase):
                sys.modules['tools.skills_tool'] = original_skill_module

        self.assertEqual(captured.get('hermes_home'), 'profile-home')
+        self.assertEqual(captured.get('process_hermes_home'), 'default-home')
        self.assertEqual(str(captured.get('skill_module_home')), 'profile-home')
        self.assertEqual(str(captured.get('skill_module_dir')), 'profile-home/skills')
        self.assertEqual(captured.get('restored_hermes_home'), 'default-home')
@@ -476,9 +476,12 @@ class TestUpdateSummaryRouteModelSelection:
        monkeypatch.setattr(cfg, 'get_effective_default_model', lambda: 'openai/test-main')

        def fake_resolve_model_provider(model):
+            thread_env = getattr(cfg._thread_ctx, 'env', {})
            captured['model_resolution_env'] = {
-                'HERMES_HOME': os.environ.get('HERMES_HOME'),
-                'HERMES_TEST_PROFILE_ENV': os.environ.get('HERMES_TEST_PROFILE_ENV'),
+                'HERMES_HOME': thread_env.get('HERMES_HOME'),
+                'HERMES_TEST_PROFILE_ENV': thread_env.get('HERMES_TEST_PROFILE_ENV'),
+                'PROCESS_HERMES_HOME': os.environ.get('HERMES_HOME'),
+                'PROCESS_HERMES_TEST_PROFILE_ENV': os.environ.get('HERMES_TEST_PROFILE_ENV'),
            }
            return model, 'openai', 'https://example.test/v1'

@@ -514,9 +517,12 @@ class TestUpdateSummaryRouteModelSelection:
                        )

        def fake_get_text_auxiliary_client(task, main_runtime=None):
+            thread_env = getattr(cfg._thread_ctx, 'env', {})
            captured['aux_env'] = {
-                'HERMES_HOME': os.environ.get('HERMES_HOME'),
-                'HERMES_TEST_PROFILE_ENV': os.environ.get('HERMES_TEST_PROFILE_ENV'),
+                'HERMES_HOME': thread_env.get('HERMES_HOME'),
+                'HERMES_TEST_PROFILE_ENV': thread_env.get('HERMES_TEST_PROFILE_ENV'),
+                'PROCESS_HERMES_HOME': os.environ.get('HERMES_HOME'),
+                'PROCESS_HERMES_TEST_PROFILE_ENV': os.environ.get('HERMES_TEST_PROFILE_ENV'),
                'SKILL_MODULE_HOME': getattr(fake_skill_module, 'HERMES_HOME'),
                'SKILL_MODULE_DIR': getattr(fake_skill_module, 'SKILLS_DIR'),
            }
@@ -564,10 +570,14 @@ class TestUpdateSummaryRouteModelSelection:
        assert captured['model_resolution_env'] == {
            'HERMES_HOME': str(profile_home),
            'HERMES_TEST_PROFILE_ENV': 'work-runtime',
+            'PROCESS_HERMES_HOME': 'default-home',
+            'PROCESS_HERMES_TEST_PROFILE_ENV': 'default-runtime',
        }
        assert captured['aux_env'] == {
            'HERMES_HOME': str(profile_home),
            'HERMES_TEST_PROFILE_ENV': 'work-runtime',
+            'PROCESS_HERMES_HOME': 'default-home',
+            'PROCESS_HERMES_TEST_PROFILE_ENV': 'default-runtime',
            'SKILL_MODULE_HOME': profile_home,
            'SKILL_MODULE_DIR': profile_home / 'skills',
        }