Merge pull request #2323 into stage-362

fix: isolate background worker profile env (Michaelyklam, closes #2321)

# Conflicts:
#	CHANGELOG.md
This commit is contained in:
Hermes Agent
2026-05-15 22:55:39 +00:00
5 changed files with 101 additions and 26 deletions
+12 -19
View File
@@ -712,24 +712,21 @@ def profile_env_for_background_worker(
yield
return
env_keys = set(runtime_env.keys()) | {"HERMES_HOME"}
# Stage-360 maintainer fix: narrow the _ENV_LOCK critical section to just
# the env mutation (and the env restoration). Pre-fix, this held _ENV_LOCK
# for the entire `yield` duration — i.e. the whole background worker's
# runtime (title generation, compression, update summary). That caused
# _ENV_LOCK to be held for many seconds, blocking ALL other sessions and
# surfacing as the QA `test_third_message_completes` timeout. The fix
# mirrors the narrow-lock pattern in _run_agent_streaming: acquire briefly
# to set env, run worker without holding the lock, reacquire to restore.
# See also QA `test_finally_restores_env_with_lock`.
# Route profile-specific env through the same thread-local channel used by
# foreground streaming runs. Mutating os.environ for detached background
# workers creates a cross-profile race: worker B can snapshot and later
# restore worker A's temporary env while A is still running. Keep
# os.environ unchanged here and use the short global lock only for the
# remaining process-global skill-home module cache patch.
from api.config import _set_thread_env, _clear_thread_env
thread_env = dict(runtime_env)
thread_env["HERMES_HOME"] = str(profile_home_path)
skill_home_snapshot = None
old_env = {}
try:
_set_thread_env(**thread_env)
with _ENV_LOCK:
old_env = {key: os.environ.get(key) for key in env_keys}
skill_home_snapshot = snapshot_skill_home_modules()
os.environ.update(runtime_env)
os.environ["HERMES_HOME"] = str(profile_home_path)
try:
patch_skill_home_modules(profile_home_path)
except Exception:
@@ -741,12 +738,8 @@ def profile_env_for_background_worker(
)
yield
finally:
_clear_thread_env()
with _ENV_LOCK:
for key, old_value in old_env.items():
if old_value is None:
os.environ.pop(key, None)
else:
os.environ[key] = old_value
if skill_home_snapshot is not None:
restore_skill_home_modules(skill_home_snapshot)
+62
View File
@@ -1,5 +1,8 @@
import os
import re
import sys
import threading
import types
from pathlib import Path
import yaml
@@ -95,3 +98,62 @@ def test_streaming_thread_env_allows_profile_terminal_cwd_override():
assert env["HERMES_SESSION_PLATFORM"] == "webui"
assert env["HERMES_HOME"] == "/active/profile/home"
assert env["TERMINAL_ENV"] == "ssh"
def test_background_worker_profile_env_uses_thread_local_without_process_env(monkeypatch, tmp_path):
from api import config as config_api
from api import profiles
home = tmp_path / "profiles" / "ops"
home.mkdir(parents=True)
(home / "config.yaml").write_text(
yaml.safe_dump({"terminal": {"backend": "ssh", "cwd": "/profile/cwd"}}),
encoding="utf-8",
)
(home / ".env").write_text("HERMES_MAX_ITERATIONS=42\n", encoding="utf-8")
monkeypatch.setattr(profiles, "get_hermes_home_for_profile", lambda profile: home)
monkeypatch.setattr(profiles, "snapshot_skill_home_modules", lambda: {"snapshot": True})
patched_homes = []
restored = []
monkeypatch.setattr(profiles, "patch_skill_home_modules", lambda path: patched_homes.append(path))
monkeypatch.setattr(profiles, "restore_skill_home_modules", lambda snapshot: restored.append(snapshot))
monkeypatch.setitem(sys.modules, "api.streaming", types.SimpleNamespace(_ENV_LOCK=threading.Lock()))
monkeypatch.setenv("HERMES_HOME", "/root/default")
monkeypatch.delenv("TERMINAL_ENV", raising=False)
monkeypatch.delenv("TERMINAL_CWD", raising=False)
monkeypatch.delenv("HERMES_MAX_ITERATIONS", raising=False)
config_api._clear_thread_env()
with profiles.profile_env_for_background_worker("ops"):
assert os.environ["HERMES_HOME"] == "/root/default"
assert "TERMINAL_ENV" not in os.environ
assert "TERMINAL_CWD" not in os.environ
assert "HERMES_MAX_ITERATIONS" not in os.environ
assert config_api._thread_ctx.env["HERMES_HOME"] == str(home)
assert config_api._thread_ctx.env["TERMINAL_ENV"] == "ssh"
assert config_api._thread_ctx.env["TERMINAL_CWD"] == "/profile/cwd"
assert config_api._thread_ctx.env["HERMES_MAX_ITERATIONS"] == "42"
assert patched_homes == [home]
assert os.environ["HERMES_HOME"] == "/root/default"
assert getattr(config_api._thread_ctx, "env", {}) == {}
assert restored == [{"snapshot": True}]
def test_background_worker_profile_env_source_avoids_os_environ_mutation():
src = Path("api/profiles.py").read_text(encoding="utf-8")
match = re.search(
r"(@contextmanager\ndef profile_env_for_background_worker\(.*?\n)(?=\ndef |\nclass )",
src,
re.DOTALL,
)
assert match, "profile_env_for_background_worker not found"
body = match.group(1)
assert "from api.config import _set_thread_env, _clear_thread_env" in body
assert "_set_thread_env(**thread_env)" in body
assert "_clear_thread_env()" in body
assert "os.environ.update(runtime_env)" not in body
assert 'os.environ["HERMES_HOME"] = str(profile_home_path)' not in body
+8 -2
View File
@@ -408,6 +408,7 @@ def test_session_compress_async_reports_stream_state_guard(monkeypatch, cleanup_
def test_manual_compress_worker_uses_session_profile_env(monkeypatch, tmp_path, cleanup_test_sessions):
import api.config as config_api
import api.profiles as profiles
import api.routes as routes
@@ -416,9 +417,12 @@ def test_manual_compress_worker_uses_session_profile_env(monkeypatch, tmp_path,
def __init__(self, **kwargs):
skill_module = sys.modules.get("tools.skills_tool")
thread_env = getattr(config_api._thread_ctx, "env", {})
EnvAssertingAgent.seen_env = {
"HERMES_HOME": os.environ.get("HERMES_HOME"),
"HERMES_TEST_PROFILE_ENV": os.environ.get("HERMES_TEST_PROFILE_ENV"),
"HERMES_HOME": thread_env.get("HERMES_HOME"),
"HERMES_TEST_PROFILE_ENV": thread_env.get("HERMES_TEST_PROFILE_ENV"),
"PROCESS_HERMES_HOME": os.environ.get("HERMES_HOME"),
"PROCESS_HERMES_TEST_PROFILE_ENV": os.environ.get("HERMES_TEST_PROFILE_ENV"),
"SKILL_MODULE_HOME": getattr(skill_module, "HERMES_HOME", None),
"SKILL_MODULE_DIR": getattr(skill_module, "SKILLS_DIR", None),
}
@@ -461,6 +465,8 @@ def test_manual_compress_worker_uses_session_profile_env(monkeypatch, tmp_path,
assert EnvAssertingAgent.seen_env == {
"HERMES_HOME": str(profile_home),
"HERMES_TEST_PROFILE_ENV": "work-runtime",
"PROCESS_HERMES_HOME": "default-home",
"PROCESS_HERMES_TEST_PROFILE_ENV": None,
"SKILL_MODULE_HOME": profile_home,
"SKILL_MODULE_DIR": profile_home / "skills",
}
+5 -1
View File
@@ -448,6 +448,7 @@ class TestBackgroundTitleProfileRouting(unittest.TestCase):
self, mock_get_session, mock_configured,
):
"""A background title worker for a non-default profile must resolve aux config from that profile."""
from api import config as config_api
from api.streaming import _run_background_title_update
mock_session = MagicMock()
@@ -469,7 +470,9 @@ class TestBackgroundTitleProfileRouting(unittest.TestCase):
sys.modules['tools.skills_tool'] = fake_skill_module
def fake_aux_title(*args, **kwargs):
captured['hermes_home'] = os.environ.get('HERMES_HOME')
thread_env = getattr(config_api._thread_ctx, 'env', {})
captured['hermes_home'] = thread_env.get('HERMES_HOME')
captured['process_hermes_home'] = os.environ.get('HERMES_HOME')
captured['skill_module_home'] = getattr(fake_skill_module, 'HERMES_HOME')
captured['skill_module_dir'] = getattr(fake_skill_module, 'SKILLS_DIR')
return ('Profile Routed Title', 'llm_aux', '')
@@ -495,6 +498,7 @@ class TestBackgroundTitleProfileRouting(unittest.TestCase):
sys.modules['tools.skills_tool'] = original_skill_module
self.assertEqual(captured.get('hermes_home'), 'profile-home')
self.assertEqual(captured.get('process_hermes_home'), 'default-home')
self.assertEqual(str(captured.get('skill_module_home')), 'profile-home')
self.assertEqual(str(captured.get('skill_module_dir')), 'profile-home/skills')
self.assertEqual(captured.get('restored_hermes_home'), 'default-home')
+14 -4
View File
@@ -476,9 +476,12 @@ class TestUpdateSummaryRouteModelSelection:
monkeypatch.setattr(cfg, 'get_effective_default_model', lambda: 'openai/test-main')
def fake_resolve_model_provider(model):
thread_env = getattr(cfg._thread_ctx, 'env', {})
captured['model_resolution_env'] = {
'HERMES_HOME': os.environ.get('HERMES_HOME'),
'HERMES_TEST_PROFILE_ENV': os.environ.get('HERMES_TEST_PROFILE_ENV'),
'HERMES_HOME': thread_env.get('HERMES_HOME'),
'HERMES_TEST_PROFILE_ENV': thread_env.get('HERMES_TEST_PROFILE_ENV'),
'PROCESS_HERMES_HOME': os.environ.get('HERMES_HOME'),
'PROCESS_HERMES_TEST_PROFILE_ENV': os.environ.get('HERMES_TEST_PROFILE_ENV'),
}
return model, 'openai', 'https://example.test/v1'
@@ -514,9 +517,12 @@ class TestUpdateSummaryRouteModelSelection:
)
def fake_get_text_auxiliary_client(task, main_runtime=None):
thread_env = getattr(cfg._thread_ctx, 'env', {})
captured['aux_env'] = {
'HERMES_HOME': os.environ.get('HERMES_HOME'),
'HERMES_TEST_PROFILE_ENV': os.environ.get('HERMES_TEST_PROFILE_ENV'),
'HERMES_HOME': thread_env.get('HERMES_HOME'),
'HERMES_TEST_PROFILE_ENV': thread_env.get('HERMES_TEST_PROFILE_ENV'),
'PROCESS_HERMES_HOME': os.environ.get('HERMES_HOME'),
'PROCESS_HERMES_TEST_PROFILE_ENV': os.environ.get('HERMES_TEST_PROFILE_ENV'),
'SKILL_MODULE_HOME': getattr(fake_skill_module, 'HERMES_HOME'),
'SKILL_MODULE_DIR': getattr(fake_skill_module, 'SKILLS_DIR'),
}
@@ -564,10 +570,14 @@ class TestUpdateSummaryRouteModelSelection:
assert captured['model_resolution_env'] == {
'HERMES_HOME': str(profile_home),
'HERMES_TEST_PROFILE_ENV': 'work-runtime',
'PROCESS_HERMES_HOME': 'default-home',
'PROCESS_HERMES_TEST_PROFILE_ENV': 'default-runtime',
}
assert captured['aux_env'] == {
'HERMES_HOME': str(profile_home),
'HERMES_TEST_PROFILE_ENV': 'work-runtime',
'PROCESS_HERMES_HOME': 'default-home',
'PROCESS_HERMES_TEST_PROFILE_ENV': 'default-runtime',
'SKILL_MODULE_HOME': profile_home,
'SKILL_MODULE_DIR': profile_home / 'skills',
}