From 423f3437da6146568268c0ac838add4779489919 Mon Sep 17 00:00:00 2001
From: Ardha <joneslsi123@gmail.com>
Date: Sun, 7 Jun 2026 05:53:18 +0000
Subject: [PATCH] feat: add CJK leak sanitization for non-Chinese reports
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the LLM generates persona quotes (BI economists, ministry officials,
Reddit commenters, etc.) in non-Chinese locales, it can occasionally slip
Chinese characters into otherwise fluent English/Latin prose. The system
prompt language instruction reduces but doesn't eliminate this — the LLM
sometimes reaches back to its Chinese training data for fluent-sounding
speech, producing output like:
  "BI economist said: Purbaya过于倾向财政扩张..."

This adds a post-processing step that:
1. Detects runs of CJK Unified Ideographs (U+4E00..U+9FFF) and CJK
   Symbols/Punctuation (U+3000..U+303F) ≥ 2 chars in length
2. Batch-translates them via the configured LLM endpoint (reusing the
   same LLM_API_KEY/LLM_BASE_URL as the rest of MiroFish)
3. Replaces each run in-place, injecting spaces at ASCII boundaries so
   the result reads naturally in surrounding text
4. Iterates up to 3 passes to catch fragments the LLM leaves in pass 1

Behavior:
- Auto-enabled for non-Chinese locales (en, es, fr, pt, ru, de, id)
- Skipped for zh / zh-CN / zh-TW (legitimate CJK content)
- No-op when LLM_API_KEY is not configured (warns and returns original)
- Graceful fallback: any LLM failure returns original text unchanged
- Idempotent: re-running on already-sanitized text is a no-op

Configuration (all optional, set in .env):
  CJK_SANITIZE_ENABLED=0   # force off (default: auto for non-zh)
  CJK_SANITIZE_LANGS=en,id # override target locale set
  CJK_SANITIZE_MAX_PASSES=3 # default 3

Files added:
- backend/app/utils/cjk_sanitize.py     (~250 lines, the module)
- backend/scripts/test_cjk_sanitize.py  (23 unit + integration tests)

Files modified:
- backend/app/services/report_agent.py  (wire-in after assemble_full_report)
- README.md                             (document config env vars)

Tested: 23/23 unit tests pass; live Purbaya/USD-IDR report (24 unique CJK
runs in 14kB markdown) reduced to 0 in 3.4s with real DeepSeek API.
---
 README.md                            |  27 +++
 backend/app/services/report_agent.py |  47 ++++-
 backend/app/utils/cjk_sanitize.py    | 262 ++++++++++++++++++++++++
 backend/scripts/test_cjk_sanitize.py | 290 +++++++++++++++++++++++++++
 4 files changed, 624 insertions(+), 2 deletions(-)
 create mode 100644 backend/app/utils/cjk_sanitize.py
 create mode 100644 backend/scripts/test_cjk_sanitize.py
diff --git a/README.md b/README.md
index de082935a7..dbecb0a8d0 100644
--- a/README.md
+++ b/README.md
@@ -176,6 +176,33 @@ Reads `.env` from root directory by default, maps ports `3000 (frontend) / 5001
 
 > Mirror address for faster pulling is provided as comments in `docker-compose.yml`, replace if needed.
 
+### Optional: CJK leak sanitization for non-Chinese reports
+
+When running simulations in non-Chinese locales (`en`, `es`, `fr`, `pt`, `ru`, `de`, `id`),
+the LLM may occasionally slip Chinese characters into persona quotes despite the
+language instruction (e.g. *"BI economist said: Purbaya过于倾向财政扩张..."*).
+
+MiroFish auto-detects leaked CJK runs after report generation and replaces them
+with English translations, reusing the same `LLM_API_KEY` / `LLM_BASE_URL` as the
+rest of the backend. The sanitization adds ~3-10 seconds to report completion and
+is idempotent (re-runs are a no-op when no CJK remains).
+
+**Configuration** (all optional, set in `.env`):
+
+```bash
+# Disable entirely (default: auto-enabled for non-zh locales)
+CJK_SANITIZE_ENABLED=0
+
+# Run sanitization only for these locales (comma-separated; replaces the default set)
+CJK_SANITIZE_LANGS=en,id
+
+# Maximum retry passes (default: 3)
+CJK_SANITIZE_MAX_PASSES=3
+```
+
+No action is needed for Chinese reports (`zh` / `zh-CN` / `zh-TW`) — sanitization
+is automatically skipped.
+
 ## 📬 Join the Conversation
 
 <div align="center">
diff --git a/backend/app/services/report_agent.py b/backend/app/services/report_agent.py
index cecd70b464..a5cdf03fc7 100644
--- a/backend/app/services/report_agent.py
+++ b/backend/app/services/report_agent.py
@@ -19,6 +19,7 @@
 from enum import Enum
 
 from ..config import Config
+from ..utils.cjk_sanitize import is_enabled as cjk_sanitize_enabled, sanitize_cjk_in_text
 from ..utils.llm_client import LLMClient
 from ..utils.logger import get_logger
 from ..utils.locale import get_language_instruction, t
@@ -902,7 +903,15 @@ def __init__(
         self.graph_id = graph_id
         self.simulation_id = simulation_id
         self.simulation_requirement = simulation_requirement
-        
+        # Capture the request's locale so post-processing (e.g. CJK sanitization)
+        # can decide whether to run. Reads the current thread locale (set via
+        # set_locale() in the request handler) and falls back to 'zh' on error.
+        try:
+            from ..utils.locale import get_locale
+            self.locale = get_locale()
+        except Exception:
+            self.locale = 'zh'
+
         self.llm = llm_client or LLMClient()
         self.zep_tools = zep_tools or ZepToolsService()
         
@@ -1707,7 +1716,41 @@ def generate_report(
             report.markdown_content = ReportManager.assemble_full_report(report_id, outline)
             report.status = ReportStatus.COMPLETED
             report.completed_at = datetime.now().isoformat()
-            
+
+            # Sanitize any CJK characters that leaked into the report despite
+            # the language instruction (LLMs sometimes slip Chinese idioms into
+            # otherwise fluent English quotes). No-op for Chinese reports or
+            # when CJK_SANITIZE_ENABLED is off. See backend/app/utils/cjk_sanitize.py.
+            if cjk_sanitize_enabled(getattr(self, 'locale', None)):
+                try:
+                    sanitized = sanitize_cjk_in_text(
+                        report.markdown_content,
+                        locale=getattr(self, 'locale', None),
+                    )
+                    if sanitized != report.markdown_content:
+                        report.markdown_content = sanitized
+                        # Persist the sanitized version to the on-disk file too
+                        # so subsequent /download and /chat calls see the cleaned
+                        # text.
+                        try:
+                            full_report_path = os.path.join(
+                                Config.UPLOAD_FOLDER, 'reports', report_id, 'full_report.md'
+                            )
+                            if os.path.exists(full_report_path):
+                                with open(full_report_path, 'w', encoding='utf-8') as f:
+                                    f.write(sanitized)
+                        except Exception as path_err:
+                            logger.warning(
+                                "cjk_sanitize: failed to rewrite report file: %s",
+                                path_err,
+                            )
+                except Exception as sanitize_err:
+                    # Sanitize failures must never break report delivery
+                    logger.warning(
+                        "cjk_sanitize: unexpected error, returning unsanitized report: %s",
+                        sanitize_err,
+                    )
+
             # 计算总耗时
             total_time_seconds = (datetime.now() - start_time).total_seconds()
             
diff --git a/backend/app/utils/cjk_sanitize.py b/backend/app/utils/cjk_sanitize.py
new file mode 100644
index 0000000000..601af77bba
--- /dev/null
+++ b/backend/app/utils/cjk_sanitize.py
@@ -0,0 +1,262 @@
+"""
+CJK Leak Sanitization Utility
+==============================
+
+Even with proper ``get_language_instruction()`` injected into LLM system prompts,
+the model can still leak Chinese characters mid-sentence when generating persona
+quotes (BI economists, ministry officials, Reddit commenters, etc.). The LLM
+sometimes reaches back to its Chinese training data for fluent-sounding speech,
+producing output like::
+
+    "BI economist said: Purbaya过于倾向财政扩张..."
+
+This module provides :func:`sanitize_cjk_in_text` which:
+
+1. Detects runs of CJK Unified Ideographs (U+4E00..U+9FFF) and CJK Symbols
+   & Punctuation (U+3000..U+303F) ≥ 2 characters in length
+2. Batch-translates them via the configured LLM endpoint (reusing the same
+   ``LLM_BASE_URL`` / ``LLM_API_KEY`` as the rest of MiroFish)
+3. Replaces each run in-place, injecting spaces at ASCII boundaries so the
+   result reads naturally in the surrounding language
+4. Iterates up to 3 passes so any fragments the LLM leaves in its first
+   response are caught in subsequent passes
+
+The function is a no-op when the input contains no CJK runs, when the target
+locale is Chinese (``zh`` or a variant such as ``zh-CN``), or when no LLM API
+key is configured. It is safe to call from any code path — failures are logged
+and the original text is returned unchanged.
+
+Configuration (env vars, all optional):
+    CJK_SANITIZE_ENABLED   — ``"1"`` / ``"true"`` to force-enable; ``"0"`` to force-disable.
+                             Default: auto-enable for non-zh locales.
+    CJK_SANITIZE_LANGS     — comma-separated locales where sanitization runs.
+                             Default: ``"en,es,fr,pt,ru,de,id"`` (the supported non-zh locales).
+    CJK_SANITIZE_MAX_PASSES — maximum retry passes (default ``3``).
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+import re
+import urllib.error
+import urllib.request
+from typing import Iterable, Optional
+
+logger = logging.getLogger(__name__)
+
+# Match runs of ≥2 CJK ideographs or CJK punctuation/symbols.
+# Single-character matches (e.g. a lone "、" or "】") are usually false positives
+# in mixed-language text; the 2-char floor filters most of those out.
+_CJK_PATTERN = re.compile(r"[\u4e00-\u9fff\u3000-\u303f]{2,}")
+
+# Default locales where sanitization is helpful. Chinese is intentionally
+# excluded — Chinese reports can and should contain CJK characters.
+_DEFAULT_TARGET_LOCALES = frozenset({"en", "es", "fr", "pt", "ru", "de", "id"})
+
+
+def _extract_cjk_runs(text: str) -> list[str]:
+    """Collect the distinct CJK runs found in ``text``.
+
+    Each run appears once in the result, positioned by where it first occurs
+    in ``text``.
+    """
+    matches = (hit.group(0) for hit in _CJK_PATTERN.finditer(text))
+    # dict keys keep insertion order and ignore repeated inserts, which gives
+    # an order-preserving de-duplication in one step.
+    return list(dict.fromkeys(matches))
+
+
+def _smart_replace(text: str, cjk_run: str, translation: str) -> str:
+    """Replace every ``cjk_run`` in ``text`` with ``translation``, adding a space
+    on each side that touches an ASCII letter or digit.
+
+    The LLM only sees the CJK snippet (not its surrounding context), so it can't
+    preserve the original spacing; the padding compensates. The translation is
+    inserted literally in a single pass, so backslashes in LLM output are never
+    parsed as regex escapes and inserted text is never matched again.
+    """
+    def _is_ascii_alnum(ch: str) -> bool:
+        return ch.isascii() and ch.isalnum()
+
+    def _padded(m: re.Match) -> str:
+        # "Purbaya过于扩张," -> " trans"; "（中央银行）officials" -> bare "trans"
+        src, lo, hi = m.string, m.start(), m.end()
+        left = " " if lo > 0 and _is_ascii_alnum(src[lo - 1]) else ""
+        right = " " if hi < len(src) and _is_ascii_alnum(src[hi]) else ""
+        return left + translation + right
+
+    return re.sub(re.escape(cjk_run), _padded, text)
+
+
+def _batch_translate(
+    snippets: list[str],
+    api_key: str,
+    base_url: str,
+    model: str,
+    timeout: int = 60,
+    max_retries: int = 2,
+) -> dict[str, str]:
+    """Translate CJK snippets to English with one chat-completions request.
+
+    The prompt always asks for English, whatever locale the report is in.
+
+    Args:
+        snippets: CJK fragments to translate; an empty list returns ``{}``.
+        api_key: Bearer token sent in the ``Authorization`` header.
+        base_url: API root; ``/chat/completions`` is appended to it.
+        model: Model name placed in the request body.
+        timeout: Per-attempt timeout in seconds, passed to ``urlopen``.
+        max_retries: Total number of attempts, including the first.
+
+    Returns:
+        Mapping of each snippet to its translation. Array items that are not
+        strings map to ``""`` so callers can treat them as untranslated.
+
+    Raises:
+        RuntimeError: Every attempt failed; chained to the last error seen.
+    """
+    if not snippets:
+        return {}
+
+    numbered = "\n".join(f"{i + 1}. {s}" for i, s in enumerate(snippets))
+    prompt = (
+        "You are a translator. The following are short Chinese fragments that "
+        "leaked into an otherwise English report. Translate each to natural "
+        "English. Output a JSON array of strings, one per input, in the same "
+        "order. Do not add commentary, numbering, or markdown fences — output "
+        f"only the JSON array.\n\n{numbered}"
+    )
+    messages = [{"role": "user", "content": prompt}]
+    request = {"model": model, "max_tokens": 4096, "temperature": 0.1, "messages": messages}
+    body = json.dumps(request).encode()
+    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
+    url = base_url.rstrip("/") + "/chat/completions"
+    last_err: Optional[Exception] = None
+    for attempt in range(max_retries):
+        try:
+            req = urllib.request.Request(url, data=body, headers=headers, method="POST")
+            with urllib.request.urlopen(req, timeout=timeout) as resp:
+                payload = json.loads(resp.read())
+            content = payload["choices"][0]["message"]["content"].strip()
+            # Strip markdown fences if the model added them despite instructions
+            content = re.sub(r"^```(?:json)?\s*", "", content)
+            content = re.sub(r"\s*```$", "", content)
+            arr = json.loads(content)
+            if not isinstance(arr, list) or len(arr) != len(snippets):
+                got = len(arr) if isinstance(arr, list) else "non-list"
+                raise ValueError(f"Got {got} translations for {len(snippets)} snippets")
+            return {s: t if isinstance(t, str) else "" for s, t in zip(snippets, arr)}
+        except Exception as e:  # any failure (network, JSON, schema) → retry
+            last_err = e
+            logger.warning("cjk_sanitize translation attempt %d/%d failed: %s",
+                           attempt + 1, max_retries, e)
+    msg = f"Translation failed after {max_retries} attempts: {last_err}"
+    raise RuntimeError(msg) from last_err
+
+
+def is_enabled(locale: Optional[str] = None) -> bool:
+    """Return True if sanitization should run for the given locale.
+
+    Honors the ``CJK_SANITIZE_ENABLED`` env var (force on/off). When unset,
+    enables for any locale in ``CJK_SANITIZE_LANGS`` (default: non-Chinese
+    supported locales). Region-tagged locales such as ``en-US`` or ``pt_BR``
+    match on the full tag or on their base language.
+    """
+    override = os.environ.get("CJK_SANITIZE_ENABLED", "").strip().lower()
+    if override in ("1", "true", "yes", "on"):
+        return True
+    if override in ("0", "false", "no", "off"):
+        return False
+    if not locale:
+        return False
+    env_langs = os.environ.get("CJK_SANITIZE_LANGS", "").strip().lower().replace("_", "-")
+    listed = {x.strip() for x in env_langs.split(",") if x.strip()}
+    target = listed if env_langs else _DEFAULT_TARGET_LOCALES
+    tag = locale.strip().lower().replace("_", "-")
+    return tag in target or tag.split("-")[0] in target
+
+
+def sanitize_cjk_in_text(
+    text: str,
+    locale: Optional[str] = None,
+    api_key: Optional[str] = None,
+    base_url: Optional[str] = None,
+    model: Optional[str] = None,
+) -> str:
+    """Return ``text`` with CJK runs replaced by English translations.
+
+    No-op if:
+        - ``text`` is empty or contains no CJK runs
+        - ``locale`` is Chinese (``zh`` / ``zh-CN`` / ``zh-TW``)
+        - no LLM ``api_key`` is configured
+        - the LLM call fails (logs warning, returns original text)
+
+    Runs are replaced longest-first so a short run that also occurs inside a
+    longer one cannot split it. Empty, non-string or unchanged translations are
+    skipped, and a malformed ``CJK_SANITIZE_MAX_PASSES`` falls back to 3.
+
+    Args:
+        text: The text to sanitize (typically a generated report or section).
+        locale: Target report locale (e.g. ``"en"``). Chinese is skipped.
+        api_key: LLM API key. Defaults to ``LLM_API_KEY`` env var.
+        base_url: LLM endpoint base URL. Defaults to ``LLM_BASE_URL`` env var.
+        model: LLM model name. Defaults to ``LLM_MODEL_NAME`` env var.
+
+    Returns:
+        The sanitized text, or the original ``text`` if skipped or failed.
+    """
+    if not text:
+        return text
+
+    if locale and locale.lower().replace("_", "-").split("-")[0] == "zh":
+        return text  # Chinese reports legitimately contain CJK
+
+    runs = _extract_cjk_runs(text)
+    if not runs:
+        return text
+
+    api_key = api_key or os.environ.get("LLM_API_KEY")
+    base_url = base_url or os.environ.get("LLM_BASE_URL", "https://api.openai.com/v1")
+    model = model or os.environ.get("LLM_MODEL_NAME", "gpt-4o-mini")
+
+    if not api_key:
+        msg = "cjk_sanitize: %d CJK runs detected but no LLM_API_KEY; skipping"
+        logger.warning(msg, len(runs))
+        return text
+
+    try:
+        max_passes = int(os.environ.get("CJK_SANITIZE_MAX_PASSES", "3"))
+    except ValueError:
+        max_passes = 3  # malformed setting: use the documented default
+    out, n_total, pass_n = text, 0, 0
+    for pass_n in range(1, max_passes + 1):
+        runs = _extract_cjk_runs(out)
+        if not runs:
+            break
+        try:
+            translations = _batch_translate(runs, api_key, base_url, model)
+        except RuntimeError as e:
+            logger.warning("cjk_sanitize: %s — returning original text", e)
+            return text
+        n_replaced = 0
+        # Longest first: a short run must not rewrite part of a longer run.
+        for orig, trans in sorted(translations.items(), key=lambda kv: -len(kv[0])):
+            if not isinstance(trans, str) or not trans.strip() or trans == orig:
+                continue
+            n_replaced += out.count(orig)
+            out = _smart_replace(out, orig, trans)
+        n_total += n_replaced
+        if n_replaced == 0:
+            break  # LLM didn't translate anything new — stop iterating
+
+    leftover = _extract_cjk_runs(out)
+    if leftover:
+        logger.warning(
+            "cjk_sanitize: %d CJK runs still present after %d passes: %s",
+            len(leftover), pass_n or 1, leftover,
+        )
+
+    logger.info("cjk_sanitize: replaced %d CJK runs across %d passes", n_total, pass_n or 1)
+    return out
diff --git a/backend/scripts/test_cjk_sanitize.py b/backend/scripts/test_cjk_sanitize.py
new file mode 100644
index 0000000000..273143d41e
--- /dev/null
+++ b/backend/scripts/test_cjk_sanitize.py
@@ -0,0 +1,290 @@
+"""
+Tests for cjk_sanitize utility.
+
+Verifies:
+  1. CJK run detection (basic, dedup, min-length, order)
+  2. Boundary-aware replacement (ASCII left/right, fullwidth parens, numbers)
+  3. Locale gating (zh skipped, en/id/etc enabled, env override)
+  4. Integration: empty, no-CJK, no-API-key, mocked LLM translate, multi-pass
+  5. Failure modes: LLM error falls back to original text, idempotent
+"""
+
+import os
+import sys
+from unittest.mock import patch, MagicMock
+
+# Add project path so 'app' package is importable
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from app.utils.cjk_sanitize import (
+    sanitize_cjk_in_text,
+    is_enabled,
+    _extract_cjk_runs,
+    _smart_replace,
+)
+
+
+# -------- _extract_cjk_runs --------
+
+def test_extract_cjk_runs_basic():
+    text = "Hello 你好世界 world 通胀压力"
+    runs = _extract_cjk_runs(text)
+    assert "你好世界" in runs
+    assert "通胀压力" in runs
+    assert len(runs) == 2
+    print("PASS: test_extract_cjk_runs_basic")
+
+
+def test_extract_cjk_runs_dedup():
+    text = "你好世界 and 你好世界 again"
+    runs = _extract_cjk_runs(text)
+    assert runs == ["你好世界"]
+    print("PASS: test_extract_cjk_runs_dedup")
+
+
+def test_extract_cjk_runs_min_length():
+    text = "A 你 B"  # single CJK char should NOT match
+    runs = _extract_cjk_runs(text)
+    assert runs == []
+    print("PASS: test_extract_cjk_runs_min_length")
+
+
+def test_extract_cjk_runs_preserves_order():
+    text = "First 你好 then 世界 and 你好 again"
+    runs = _extract_cjk_runs(text)
+    assert runs == ["你好", "世界"]
+    print("PASS: test_extract_cjk_runs_preserves_order")
+
+
+# -------- _smart_replace (boundary handling) --------
+
+def test_smart_replace_ascii_both_sides():
+    out = _smart_replace("data过于依赖issues", "过于依赖", "overly dependent")
+    assert out == "data overly dependent issues"
+    print("PASS: test_smart_replace_ascii_both_sides")
+
+
+def test_smart_replace_ascii_left_only():
+    out = _smart_replace("Purbaya过于扩张,", "过于扩张", "overly expansionary")
+    assert out == "Purbaya overly expansionary,"
+    print("PASS: test_smart_replace_ascii_left_only")
+
+
+def test_smart_replace_ascii_right_only():
+    out = _smart_replace("（中央银行）officials", "中央银行", "central bank")
+    assert out == "（central bank）officials"
+    print("PASS: test_smart_replace_ascii_right_only")
+
+
+def test_smart_replace_fullwidth_parens_preserved():
+    out = _smart_replace("the policy is（财政）expansion", "财政", "fiscal")
+    assert "（fiscal）" in out
+    print("PASS: test_smart_replace_fullwidth_parens_preserved")
+
+
+def test_smart_replace_number_boundary():
+    out = _smart_replace("2024年通胀率达到高点", "年通胀率达到高点", "annual inflation rate peaked")
+    assert out == "2024 annual inflation rate peaked"
+    print("PASS: test_smart_replace_number_boundary")
+
+
+# -------- is_enabled --------
+
+def test_is_enabled_chinese_locale_skipped():
+    os.environ.pop("CJK_SANITIZE_ENABLED", None)
+    os.environ.pop("CJK_SANITIZE_LANGS", None)
+    assert is_enabled("zh") is False
+    assert is_enabled("zh-CN") is False
+    assert is_enabled("zh-TW") is False
+    print("PASS: test_is_enabled_chinese_locale_skipped")
+
+
+def test_is_enabled_non_chinese_default():
+    os.environ.pop("CJK_SANITIZE_ENABLED", None)
+    os.environ.pop("CJK_SANITIZE_LANGS", None)
+    for loc in ("en", "es", "fr", "pt", "ru", "de", "id"):
+        assert is_enabled(loc) is True, f"Expected True for {loc}"
+    print("PASS: test_is_enabled_non_chinese_default")
+
+
+def test_is_enabled_force_on():
+    os.environ["CJK_SANITIZE_ENABLED"] = "1"
+    try:
+        assert is_enabled("zh") is True  # force-on overrides locale check
+        print("PASS: test_is_enabled_force_on")
+    finally:
+        del os.environ["CJK_SANITIZE_ENABLED"]
+
+
+def test_is_enabled_force_off():
+    os.environ["CJK_SANITIZE_ENABLED"] = "0"
+    try:
+        assert is_enabled("en") is False
+        print("PASS: test_is_enabled_force_off")
+    finally:
+        del os.environ["CJK_SANITIZE_ENABLED"]
+
+
+def test_is_enabled_custom_langs():
+    os.environ.pop("CJK_SANITIZE_ENABLED", None)
+    os.environ["CJK_SANITIZE_LANGS"] = "ja,ko"
+    try:
+        assert is_enabled("en") is False
+        assert is_enabled("ja") is True
+        assert is_enabled("ko") is True
+        print("PASS: test_is_enabled_custom_langs")
+    finally:
+        del os.environ["CJK_SANITIZE_LANGS"]
+
+
+# -------- sanitize_cjk_in_text integration --------
+
+def test_sanitize_empty_text():
+    assert sanitize_cjk_in_text("") == ""
+    print("PASS: test_sanitize_empty_text")
+
+
+def test_sanitize_no_cjk_passthrough():
+    text = "This is a pure English report with no leaks."
+    assert sanitize_cjk_in_text(text, locale="en") == text
+    print("PASS: test_sanitize_no_cjk_passthrough")
+
+
+def test_sanitize_chinese_locale_no_op():
+    text = "你好世界 should not be translated in Chinese reports"
+    assert sanitize_cjk_in_text(text, locale="zh") == text
+    print("PASS: test_sanitize_chinese_locale_no_op")
+
+
+def test_sanitize_no_api_key_no_op():
+    text = "BI said: Purbaya过于扩张"
+    # Blank any real LLM_API_KEY: api_key=None falls back to the environment.
+    with patch.dict(os.environ, {"LLM_API_KEY": ""}), patch("urllib.request.urlopen") as net:
+        result = sanitize_cjk_in_text(text, locale="en", base_url="http://test", model="test")
+    assert result == text and net.call_count == 0  # no key = no change, no request
+    print("PASS: test_sanitize_no_api_key_no_op")
+
+
+def _make_mock_response(json_bytes: bytes):
+    """Build a context-manager-friendly mock for urllib.request.urlopen."""
+    resp = MagicMock()
+    resp.read.return_value = json_bytes
+    resp.__enter__ = lambda self: self
+    resp.__exit__ = lambda self, *args: None
+    return resp
+
+
+def test_sanitize_translates_cjk_runs():
+    mock = _make_mock_response(
+        b'{"choices": [{"message": {"content": "[\\"overly inclined toward fiscal expansion\\"]"}}]}'
+    )
+    with patch("urllib.request.urlopen", return_value=mock):
+        text = "BI said: Purbaya过于倾向财政扩张, this is concerning."
+        result = sanitize_cjk_in_text(
+            text, locale="en", api_key="fake", base_url="http://test", model="test"
+        )
+    assert "过于倾向财政扩张" not in result
+    assert "overly inclined toward fiscal expansion" in result
+    print("PASS: test_sanitize_translates_cjk_runs")
+
+
+def test_sanitize_multi_pass_catches_leftover():
+    """LLM translates pass-1 snippets but leaves CJK in one translation. Pass 2
+    picks up the leftover CJK that emerged in the output."""
+    call_count = [0]
+    # Build JSON with embedded CJK as bytes (b'...' can't contain non-ASCII)
+    pass1_content = '["Hello world", "partial \u505c\u5de5 remaining"]'
+    pass2_content = '["work stoppage"]'
+    pass1_body = (
+        b'{"choices": [{"message": {"content": "'
+        + pass1_content.replace("\\", "\\\\").replace('"', '\\"').encode("utf-8")
+        + b'"}}]}'
+    )
+    pass2_body = (
+        b'{"choices": [{"message": {"content": "'
+        + pass2_content.replace("\\", "\\\\").replace('"', '\\"').encode("utf-8")
+        + b'"}}]}'
+    )
+
+    def fake_urlopen(req, timeout):
+        call_count[0] += 1
+        if call_count[0] == 1:
+            return _make_mock_response(pass1_body)
+        return _make_mock_response(pass2_body)
+
+    with patch("urllib.request.urlopen", side_effect=fake_urlopen):
+        text = "你好世界 then 停工 happened"
+        result = sanitize_cjk_in_text(
+            text, locale="en", api_key="fake", base_url="http://test", model="test"
+        )
+    assert "你好世界" not in result
+    assert "停工" not in result
+    assert "Hello world" in result
+    assert "work stoppage" in result
+    assert call_count[0] == 2
+    print("PASS: test_sanitize_multi_pass_catches_leftover")
+
+
+def test_sanitize_idempotent_when_clean():
+    text = "Pure English with no CJK characters at all."
+    assert sanitize_cjk_in_text(text, locale="en") == text
+    # Re-run with API key: still no change, 0 LLM calls
+    with patch("urllib.request.urlopen") as mock_url:
+        result = sanitize_cjk_in_text(
+            text, locale="en", api_key="fake", base_url="http://test", model="test"
+        )
+    assert result == text
+    assert mock_url.call_count == 0
+    print("PASS: test_sanitize_idempotent_when_clean")
+
+
+def test_sanitize_llm_failure_returns_original():
+    with patch("urllib.request.urlopen", side_effect=Exception("network down")):
+        text = "BI said: Purbaya过于扩张"
+        result = sanitize_cjk_in_text(
+            text, locale="en", api_key="fake", base_url="http://test", model="test"
+        )
+    assert result == text  # falls back gracefully
+    print("PASS: test_sanitize_llm_failure_returns_original")
+
+
+def test_sanitize_id_locale_enabled():
+    """Indonesian locale should also enable sanitization by default."""
+    mock = _make_mock_response(
+        b'{"choices": [{"message": {"content": "[\\"overly expansive\\"]"}}]}'
+    )
+    with patch("urllib.request.urlopen", return_value=mock):
+        text = "BI: Purbaya过于扩张"
+        result = sanitize_cjk_in_text(
+            text, locale="id", api_key="fake", base_url="http://test", model="test"
+        )
+    assert "overly expansive" in result
+    print("PASS: test_sanitize_id_locale_enabled")
+
+
+# -------- runner --------
+
+def run_all():
+    tests = [
+        v for k, v in globals().items()
+        if k.startswith("test_") and callable(v)
+    ]
+    passed = 0
+    failed = 0
+    for t in tests:
+        try:
+            t()
+            passed += 1
+        except Exception as e:
+            print(f"FAIL: {t.__name__}: {e}")
+            import traceback
+            traceback.print_exc()
+            failed += 1
+    print(f"\n{'=' * 60}")
+    print(f"CJK sanitize: {passed} passed, {failed} failed")
+    print(f"{'=' * 60}")
+    return 0 if failed == 0 else 1
+
+
+if __name__ == "__main__":
+    sys.exit(run_all())