diff --git a/README.md b/README.md
index de082935a7..dbecb0a8d0 100644
--- a/README.md
+++ b/README.md
@@ -176,6 +176,33 @@ Reads `.env` from root directory by default, maps ports `3000 (frontend) / 5001
> Mirror address for faster pulling is provided as comments in `docker-compose.yml`, replace if needed.
+### Optional: CJK leak sanitization for non-Chinese reports
+
+When running simulations in non-Chinese locales (`en`, `es`, `fr`, `pt`, `ru`, `de`, `id`),
+the LLM may occasionally slip Chinese characters into persona quotes despite the
+language instruction (e.g. *"BI economist said: Purbaya过于倾向财政扩张..."*).
+
+MiroFish auto-detects leaked CJK runs (two or more consecutive characters) after
+report generation and replaces them with English translations, reusing the same
+`LLM_API_KEY` / `LLM_BASE_URL` as the rest of the backend. The sanitization adds
+~3-10 seconds to report completion and is idempotent (a re-run is a no-op when
+no CJK remains).
+
+**Configuration** (all optional, set in `.env`):
+
+```bash
+# Disable entirely (default: auto-enabled for non-zh locales)
+CJK_SANITIZE_ENABLED=0
+
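+# Run sanitization only for specific locales (comma-separated; replaces the
+# default list en,es,fr,pt,ru,de,id). Avoid locales whose text legitimately
+# contains CJK ideographs (e.g. ja), or real content will be translated away.
+CJK_SANITIZE_LANGS=en,id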
+
+# Maximum retry passes (default: 3)
+CJK_SANITIZE_MAX_PASSES=3
+```
+
+No action is needed for Chinese reports (`zh` / `zh-CN` / `zh-TW`) — sanitization
+is automatically skipped.
+
## 📬 Join the Conversation
diff --git a/backend/app/services/report_agent.py b/backend/app/services/report_agent.py
index cecd70b464..a5cdf03fc7 100644
--- a/backend/app/services/report_agent.py
+++ b/backend/app/services/report_agent.py
@@ -19,6 +19,7 @@
from enum import Enum
from ..config import Config
+from ..utils.cjk_sanitize import is_enabled as cjk_sanitize_enabled, sanitize_cjk_in_text
from ..utils.llm_client import LLMClient
from ..utils.logger import get_logger
from ..utils.locale import get_language_instruction, t
@@ -902,7 +903,15 @@ def __init__(
self.graph_id = graph_id
self.simulation_id = simulation_id
self.simulation_requirement = simulation_requirement
-
+ # Capture the request's locale so post-processing (e.g. CJK sanitization)
+ # can decide whether to run. Falls back to the current thread locale
+ # (set via set_locale() in the request handler) or 'zh' default.
+ try:
+ from ..utils.locale import get_locale
+ self.locale = get_locale()
+ except Exception:
+ self.locale = 'zh'
+
self.llm = llm_client or LLMClient()
self.zep_tools = zep_tools or ZepToolsService()
@@ -1707,7 +1716,41 @@ def generate_report(
report.markdown_content = ReportManager.assemble_full_report(report_id, outline)
report.status = ReportStatus.COMPLETED
report.completed_at = datetime.now().isoformat()
-
+
+ # Sanitize any CJK characters that leaked into the report despite
+ # the language instruction (LLMs sometimes slip Chinese idioms into
+ # otherwise fluent English quotes). No-op for Chinese reports or
+ # when CJK_SANITIZE_ENABLED is off. See backend/app/utils/cjk_sanitize.py.
+ if cjk_sanitize_enabled(getattr(self, 'locale', None)):
+ try:
+ sanitized = sanitize_cjk_in_text(
+ report.markdown_content,
+ locale=getattr(self, 'locale', None),
+ )
+ if sanitized != report.markdown_content:
+ report.markdown_content = sanitized
+ # Persist the sanitized version to the on-disk file too
+ # so subsequent /download and /chat calls see the cleaned
+ # text.
+ try:
+ full_report_path = os.path.join(
+ Config.UPLOAD_FOLDER, 'reports', report_id, 'full_report.md'
+ )
+ if os.path.exists(full_report_path):
+ with open(full_report_path, 'w', encoding='utf-8') as f:
+ f.write(sanitized)
+ except Exception as path_err:
+ logger.warning(
+ "cjk_sanitize: failed to rewrite report file: %s",
+ path_err,
+ )
+ except Exception as sanitize_err:
+ # Sanitize failures must never break report delivery
+ logger.warning(
+ "cjk_sanitize: unexpected error, returning unsanitized report: %s",
+ sanitize_err,
+ )
+
# 计算总耗时
total_time_seconds = (datetime.now() - start_time).total_seconds()
diff --git a/backend/app/utils/cjk_sanitize.py b/backend/app/utils/cjk_sanitize.py
new file mode 100644
index 0000000000..601af77bba
--- /dev/null
+++ b/backend/app/utils/cjk_sanitize.py
@@ -0,0 +1,262 @@
+"""
+CJK Leak Sanitization Utility
+==============================
+
+Even with proper ``get_language_instruction()`` injected into LLM system prompts,
+the model can still leak Chinese characters mid-sentence when generating persona
+quotes (BI economists, ministry officials, Reddit commenters, etc.). The LLM
+sometimes reaches back to its Chinese training data for fluent-sounding speech,
+producing output like::
+
+ "BI economist said: Purbaya过于倾向财政扩张..."
+
+This module provides :func:`sanitize_cjk_in_text` which:
+
+1. Detects runs of CJK Unified Ideographs (U+4E00..U+9FFF) and CJK Symbols
+ & Punctuation (U+3000..U+303F) ≥ 2 characters in length
+2. Batch-translates them via the configured LLM endpoint (reusing the same
+ ``LLM_BASE_URL`` / ``LLM_API_KEY`` as the rest of MiroFish)
+3. Replaces each run in-place, injecting spaces at ASCII boundaries so the
+ result reads naturally in the surrounding language
+4. Iterates up to 3 passes so any fragments the LLM leaves in its first
+ response are caught in subsequent passes
+
+The function is a no-op when the input contains no CJK runs, when the target
+locale is Chinese (e.g. ``zh``, ``zh-CN``, ``zh-TW``), or when no LLM API key is
+configured. If the translation request fails, the failure is logged and the
+original text is returned unchanged.
+
+Configuration (env vars, all optional):
+ CJK_SANITIZE_ENABLED — ``"1"`` / ``"true"`` to force-enable; ``"0"`` to force-disable.
+ Default: auto-enable for non-zh locales.
+ CJK_SANITIZE_LANGS — comma-separated locales where sanitization runs.
+ Default: ``"en,es,fr,pt,ru,de,id"`` (any non-zh locale).
+ CJK_SANITIZE_MAX_PASSES — maximum retry passes (default ``3``).
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+import re
+import urllib.error
+import urllib.request
+from typing import Iterable, Optional
+
+logger = logging.getLogger(__name__)
+
+# Match runs of ≥2 CJK ideographs or CJK punctuation/symbols.
+# Single-character matches (e.g. a lone "、" or "】") are usually false positives
+# in mixed-language text; the 2-char floor filters most of those out.
+_CJK_PATTERN = re.compile(r"[\u4e00-\u9fff\u3000-\u303f]{2,}")
+
+# Default locales where sanitization is helpful. Chinese is intentionally
+# excluded — Chinese reports can and should contain CJK characters.
+_DEFAULT_TARGET_LOCALES = frozenset({"en", "es", "fr", "pt", "ru", "de", "id"})
+
+
def _extract_cjk_runs(text: str) -> list[str]:
    """Collect the distinct CJK runs found in ``text``.

    Every match of ``_CJK_PATTERN`` is considered; repeated runs are reported
    once, and the result keeps the order in which each run first occurs.
    """
    # dict keys preserve insertion order, so fromkeys() de-duplicates while
    # keeping first-appearance ordering.
    matched = (found.group(0) for found in _CJK_PATTERN.finditer(text))
    return list(dict.fromkeys(matched))
+
+
+def _smart_replace(text: str, cjk_run: str, translation: str) -> str:
+ """Replace a CJK run with its English translation, adding spaces at ASCII
+ boundaries so the result reads naturally in surrounding English/Latin text.
+
+ The LLM only sees the CJK snippet (not its surrounding context), so it can't
+ preserve the original spacing. We compensate with boundary lookbehind/ahead.
+ """
+ out = text
+ # ASCII on both sides → " trans "
+ pat = re.compile(r"(?<=[A-Za-z0-9])" + re.escape(cjk_run) + r"(?=[A-Za-z0-9])")
+ out = pat.sub(" " + translation + " ", out)
+ # ASCII on left only (e.g. "Purbaya过于扩张,") → " trans"
+ pre = re.compile(r"(?<=[A-Za-z0-9])" + re.escape(cjk_run))
+ out = pre.sub(" " + translation, out)
+ # ASCII on right only (e.g. "(中央银行)officials") → "trans "
+ post = re.compile(re.escape(cjk_run) + r"(?=[A-Za-z0-9])")
+ out = post.sub(translation + " ", out)
+ # Surrounded by non-ASCII (e.g. fullwidth parens) → bare translation
+ out = out.replace(cjk_run, translation)
+ return out
+
+
+def _batch_translate(
+ snippets: list[str],
+ api_key: str,
+ base_url: str,
+ model: str,
+ timeout: int = 60,
+ max_retries: int = 2,
+) -> dict[str, str]:
+ """Translate a list of CJK snippets to English via the configured LLM."""
+ if not snippets:
+ return {}
+
+ numbered = "\n".join(f"{i + 1}. {s}" for i, s in enumerate(snippets))
+ prompt = (
+ "You are a translator. The following are short Chinese fragments that "
+ "leaked into an otherwise English report. Translate each to natural "
+ "English. Output a JSON array of strings, one per input, in the same "
+ "order. Do not add commentary, numbering, or markdown fences — output "
+ f"only the JSON array.\n\n{numbered}"
+ )
+ body = json.dumps(
+ {
+ "model": model,
+ "max_tokens": 4096,
+ "temperature": 0.1,
+ "messages": [{"role": "user", "content": prompt}],
+ }
+ ).encode()
+
+ url = base_url.rstrip("/") + "/chat/completions"
+ last_err: Optional[Exception] = None
+ for attempt in range(max_retries):
+ try:
+ req = urllib.request.Request(
+ url,
+ data=body,
+ headers={
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {api_key}",
+ },
+ method="POST",
+ )
+ with urllib.request.urlopen(req, timeout=timeout) as resp:
+ payload = json.loads(resp.read())
+ content = payload["choices"][0]["message"]["content"].strip()
+ # Strip markdown fences if the model added them despite instructions
+ content = re.sub(r"^```(?:json)?\s*", "", content)
+ content = re.sub(r"\s*```$", "", content)
+ arr = json.loads(content)
+ if not isinstance(arr, list) or len(arr) != len(snippets):
+ raise ValueError(
+ f"Got {len(arr) if isinstance(arr, list) else 'non-list'} "
+ f"translations for {len(snippets)} snippets"
+ )
+ return {orig: trans for orig, trans in zip(snippets, arr)}
+ except Exception as e: # any failure (network, JSON, schema) → retry
+ last_err = e
+ logger.warning(
+ "cjk_sanitize translation attempt %d/%d failed: %s",
+ attempt + 1,
+ max_retries,
+ e,
+ )
+ raise RuntimeError(f"Translation failed after {max_retries} attempts: {last_err}")
+
+
def is_enabled(locale: Optional[str] = None) -> bool:
    """Return True if sanitization should run for the given locale.

    Decision order:

    1. ``CJK_SANITIZE_ENABLED`` set to ``1/true/yes/on`` forces True and
       ``0/false/no/off`` forces False, regardless of ``locale``.
    2. With no override, a missing/empty ``locale`` yields False.
    3. Otherwise the locale is compared against ``CJK_SANITIZE_LANGS``
       (comma-separated) or, when that is unset, the built-in default set.

    Locale tags are compared case-insensitively, ``_`` and ``-`` are treated
    alike, and a regional tag also matches through its primary subtag, so
    ``en-US`` and ``pt_BR`` are enabled whenever ``en`` / ``pt`` are targeted.

    Args:
        locale: Report locale such as ``"en"`` or ``"en-US"``.

    Returns:
        Whether CJK sanitization should be applied.
    """
    override = os.environ.get("CJK_SANITIZE_ENABLED", "").strip().lower()
    if override in ("1", "true", "yes", "on"):
        return True
    if override in ("0", "false", "no", "off"):
        return False
    if not locale:
        return False

    def _normalize(tag: str) -> str:
        return tag.strip().lower().replace("_", "-")

    target_langs_env = os.environ.get("CJK_SANITIZE_LANGS", "").strip()
    if target_langs_env:
        target = {_normalize(x) for x in target_langs_env.split(",") if x.strip()}
    else:
        target = _DEFAULT_TARGET_LOCALES

    tag = _normalize(locale)
    # Exact tag first (lets users list e.g. "pt-br"), then the bare language.
    return tag in target or tag.split("-")[0] in target
+
+
def sanitize_cjk_in_text(
    text: str,
    locale: Optional[str] = None,
    api_key: Optional[str] = None,
    base_url: Optional[str] = None,
    model: Optional[str] = None,
) -> str:
    """Return ``text`` with CJK runs replaced by English translations.

    No-op if:
      - ``text`` is empty or contains no CJK runs
      - ``locale`` is Chinese (``zh`` / ``zh-CN`` / ``zh_TW`` ...)
      - no LLM ``api_key`` is configured
      - the LLM call fails (logs warning, returns original text)

    Args:
        text: The text to sanitize (typically a generated report or section).
        locale: Target report locale (e.g. ``"en"``). Chinese is skipped.
        api_key: LLM API key. Defaults to ``LLM_API_KEY`` env var.
        base_url: LLM endpoint base URL. Defaults to ``LLM_BASE_URL`` env var.
        model: LLM model name. Defaults to ``LLM_MODEL_NAME`` env var.

    Returns:
        Sanitized text. If sanitization is skipped or fails, returns the
        original ``text`` unchanged.
    """
    if not text:
        return text

    # Accept both "zh-CN" and "zh_CN" spellings when detecting Chinese.
    if locale and locale.strip().lower().replace("_", "-").split("-")[0] == "zh":
        return text  # Chinese reports legitimately contain CJK

    runs = _extract_cjk_runs(text)
    if not runs:
        return text

    api_key = api_key or os.environ.get("LLM_API_KEY")
    base_url = base_url or os.environ.get("LLM_BASE_URL", "https://api.openai.com/v1")
    model = model or os.environ.get("LLM_MODEL_NAME", "gpt-4o-mini")

    if not api_key:
        logger.warning(
            "cjk_sanitize: %d CJK runs detected but no LLM_API_KEY; skipping",
            len(runs),
        )
        return text

    # A typo in the env var must not turn into an exception for the caller.
    raw_max_passes = os.environ.get("CJK_SANITIZE_MAX_PASSES", "3")
    try:
        max_passes = int(raw_max_passes)
    except ValueError:
        logger.warning(
            "cjk_sanitize: invalid CJK_SANITIZE_MAX_PASSES=%r; using 3",
            raw_max_passes,
        )
        max_passes = 3

    out = text
    n_total = 0
    passes_run = 0
    for _ in range(max_passes):
        runs = _extract_cjk_runs(out)
        if not runs:
            break
        passes_run += 1
        try:
            translations = _batch_translate(runs, api_key, base_url, model)
        except RuntimeError as e:
            logger.warning("cjk_sanitize: %s — returning original text", e)
            return text
        n_replaced = 0
        # Longest runs first: replacing a short run that is a substring of a
        # longer one would otherwise break the longer run before its own
        # translation can be applied.
        for orig in sorted(translations, key=len, reverse=True):
            trans = translations[orig]
            if not isinstance(trans, str) or not trans.strip() or trans == orig:
                continue
            occurrences = out.count(orig)
            if not occurrences:
                continue
            out = _smart_replace(out, orig, trans)
            n_replaced += occurrences
        n_total += n_replaced
        if n_replaced == 0:
            break  # LLM didn't translate anything new — stop iterating

    leftover = _extract_cjk_runs(out)
    if leftover:
        logger.warning(
            "cjk_sanitize: %d CJK runs still present after %d passes: %s",
            len(leftover),
            passes_run,
            leftover,
        )

    logger.info("cjk_sanitize: replaced %d CJK runs across %d passes", n_total, passes_run)
    return out
diff --git a/backend/scripts/test_cjk_sanitize.py b/backend/scripts/test_cjk_sanitize.py
new file mode 100644
index 0000000000..273143d41e
--- /dev/null
+++ b/backend/scripts/test_cjk_sanitize.py
@@ -0,0 +1,290 @@
+"""
+Tests for cjk_sanitize utility.
+
+Verifies:
+ 1. CJK run detection (basic, dedup, min-length, order)
+ 2. Boundary-aware replacement (ASCII left/right, fullwidth parens, numbers)
+ 3. Locale gating (zh skipped, en/id/etc enabled, env override)
+ 4. Integration: empty, no-CJK, no-API-key, mocked LLM translate, multi-pass
+ 5. Failure modes: LLM error falls back to original text, idempotent
+"""
+
+import os
+import sys
+from unittest.mock import patch, MagicMock
+
+# Add project path so 'app' package is importable
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from app.utils.cjk_sanitize import (
+ sanitize_cjk_in_text,
+ is_enabled,
+ _extract_cjk_runs,
+ _smart_replace,
+)
+
+
+# -------- _extract_cjk_runs --------
+
def test_extract_cjk_runs_basic():
    """Two distinct CJK runs embedded in English text are both detected."""
    found = _extract_cjk_runs("Hello 你好世界 world 通胀压力")
    for expected in ("你好世界", "通胀压力"):
        assert expected in found
    assert len(found) == 2
    print("PASS: test_extract_cjk_runs_basic")
+
+
def test_extract_cjk_runs_dedup():
    """A run that occurs twice is reported only once."""
    sample = "你好世界 and 你好世界 again"
    assert _extract_cjk_runs(sample) == ["你好世界"]
    print("PASS: test_extract_cjk_runs_dedup")
+
+
def test_extract_cjk_runs_min_length():
    """An isolated single CJK character is below the two-character floor."""
    found = _extract_cjk_runs("A 你 B")
    assert found == []
    print("PASS: test_extract_cjk_runs_min_length")
+
+
def test_extract_cjk_runs_preserves_order():
    """Runs come back in first-appearance order, repeats ignored."""
    sample = "First 你好 then 世界 and 你好 again"
    expected_order = ["你好", "世界"]
    assert _extract_cjk_runs(sample) == expected_order
    print("PASS: test_extract_cjk_runs_preserves_order")
+
+
+# -------- _smart_replace (boundary handling) --------
+
def test_smart_replace_ascii_both_sides():
    """ASCII letters on both sides get a space inserted on each side."""
    source, run, english = "data过于依赖issues", "过于依赖", "overly dependent"
    assert _smart_replace(source, run, english) == "data overly dependent issues"
    print("PASS: test_smart_replace_ascii_both_sides")
+
+
def test_smart_replace_ascii_left_only():
    """An ASCII letter on the left only yields a leading space."""
    source, run, english = "Purbaya过于扩张,", "过于扩张", "overly expansionary"
    assert _smart_replace(source, run, english) == "Purbaya overly expansionary,"
    print("PASS: test_smart_replace_ascii_left_only")
+
+
def test_smart_replace_ascii_right_only():
    """A run wrapped in parentheses is swapped without extra spacing."""
    source, run, english = "(中央银行)officials", "中央银行", "central bank"
    assert _smart_replace(source, run, english) == "(central bank)officials"
    print("PASS: test_smart_replace_ascii_right_only")
+
+
def test_smart_replace_fullwidth_parens_preserved():
    """Parentheses around the run survive the replacement."""
    replaced = _smart_replace("the policy is(财政)expansion", "财政", "fiscal")
    assert "(fiscal)" in replaced
    print("PASS: test_smart_replace_fullwidth_parens_preserved")
+
+
def test_smart_replace_number_boundary():
    """A digit on the left counts as an ASCII boundary and gets a space."""
    run = "年通胀率达到高点"
    english = "annual inflation rate peaked"
    assert _smart_replace("2024年通胀率达到高点", run, english) == "2024 annual inflation rate peaked"
    print("PASS: test_smart_replace_number_boundary")
+
+
+# -------- is_enabled --------
+
def test_is_enabled_chinese_locale_skipped():
    """With no env overrides, every Chinese locale variant is disabled."""
    for var in ("CJK_SANITIZE_ENABLED", "CJK_SANITIZE_LANGS"):
        os.environ.pop(var, None)
    for zh_locale in ("zh", "zh-CN", "zh-TW"):
        assert is_enabled(zh_locale) is False
    print("PASS: test_is_enabled_chinese_locale_skipped")
+
+
def test_is_enabled_non_chinese_default():
    """With no env overrides, each default non-Chinese locale is enabled."""
    for var in ("CJK_SANITIZE_ENABLED", "CJK_SANITIZE_LANGS"):
        os.environ.pop(var, None)
    for code in "en es fr pt ru de id".split():
        assert is_enabled(code) is True, f"Expected True for {code}"
    print("PASS: test_is_enabled_non_chinese_default")
+
+
def test_is_enabled_force_on():
    """CJK_SANITIZE_ENABLED=1 wins over the locale check, even for zh."""
    os.environ["CJK_SANITIZE_ENABLED"] = "1"
    try:
        forced = is_enabled("zh")
        assert forced is True  # force-on overrides locale check
        print("PASS: test_is_enabled_force_on")
    finally:
        del os.environ["CJK_SANITIZE_ENABLED"]
+
+
def test_is_enabled_force_off():
    """CJK_SANITIZE_ENABLED=0 disables sanitization for a default locale."""
    os.environ["CJK_SANITIZE_ENABLED"] = "0"
    try:
        forced = is_enabled("en")
        assert forced is False
        print("PASS: test_is_enabled_force_off")
    finally:
        del os.environ["CJK_SANITIZE_ENABLED"]
+
+
def test_is_enabled_custom_langs():
    """CJK_SANITIZE_LANGS replaces the default locale list."""
    os.environ.pop("CJK_SANITIZE_ENABLED", None)
    os.environ["CJK_SANITIZE_LANGS"] = "ja,ko"
    try:
        expectations = (("en", False), ("ja", True), ("ko", True))
        for code, expected in expectations:
            assert is_enabled(code) is expected
        print("PASS: test_is_enabled_custom_langs")
    finally:
        del os.environ["CJK_SANITIZE_LANGS"]
+
+
+# -------- sanitize_cjk_in_text integration --------
+
def test_sanitize_empty_text():
    """Empty input comes back as the empty string."""
    cleaned = sanitize_cjk_in_text("")
    assert cleaned == ""
    print("PASS: test_sanitize_empty_text")
+
+
def test_sanitize_no_cjk_passthrough():
    """Text without any CJK is returned as-is."""
    english_only = "This is a pure English report with no leaks."
    cleaned = sanitize_cjk_in_text(english_only, locale="en")
    assert cleaned == english_only
    print("PASS: test_sanitize_no_cjk_passthrough")
+
+
def test_sanitize_chinese_locale_no_op():
    """For locale zh the CJK content is left untouched."""
    chinese_report = "你好世界 should not be translated in Chinese reports"
    cleaned = sanitize_cjk_in_text(chinese_report, locale="zh")
    assert cleaned == chinese_report
    print("PASS: test_sanitize_chinese_locale_no_op")
+
+
def test_sanitize_no_api_key_no_op():
    """Passing api_key=None is expected to leave the text unchanged."""
    leaked = "BI said: Purbaya过于扩张"
    call_kwargs = dict(locale="en", api_key=None, base_url="http://test", model="test")
    cleaned = sanitize_cjk_in_text(leaked, **call_kwargs)
    assert cleaned == leaked  # no LLM = no change
    print("PASS: test_sanitize_no_api_key_no_op")
+
+
+def _make_mock_response(json_bytes: bytes):
+ """Build a context-manager-friendly mock for urllib.request.urlopen."""
+ resp = MagicMock()
+ resp.read.return_value = json_bytes
+ resp.__enter__ = lambda self: self
+ resp.__exit__ = lambda self, *args: None
+ return resp
+
+
def test_sanitize_translates_cjk_runs():
    """A leaked run is swapped for the translation returned by the mocked LLM."""
    leaked = "BI said: Purbaya过于倾向财政扩张, this is concerning."
    fake_response = _make_mock_response(
        b'{"choices": [{"message": {"content": "[\\"overly inclined toward fiscal expansion\\"]"}}]}'
    )
    with patch("urllib.request.urlopen", return_value=fake_response):
        cleaned = sanitize_cjk_in_text(
            leaked, locale="en", api_key="fake", base_url="http://test", model="test"
        )
    assert "过于倾向财政扩张" not in cleaned
    assert "overly inclined toward fiscal expansion" in cleaned
    print("PASS: test_sanitize_translates_cjk_runs")
+
+
def test_sanitize_multi_pass_catches_leftover():
    """The first LLM reply still contains CJK inside one translation; a second
    pass is expected to pick up and translate that leftover run."""

    def _as_response_body(content: str) -> bytes:
        # Escape the JSON-array text so it can sit inside the "content" string
        # of the outer response document, then encode as UTF-8 because a
        # b'...' literal cannot hold the non-ASCII characters directly.
        escaped = content.replace("\\", "\\\\").replace('"', '\\"')
        return (
            b'{"choices": [{"message": {"content": "'
            + escaped.encode("utf-8")
            + b'"}}]}'
        )

    first_reply = _as_response_body('["Hello world", "partial \u505c\u5de5 remaining"]')
    later_reply = _as_response_body('["work stoppage"]')
    seen_calls = []

    def fake_urlopen(req, timeout):
        seen_calls.append(req)
        body = first_reply if len(seen_calls) == 1 else later_reply
        return _make_mock_response(body)

    with patch("urllib.request.urlopen", side_effect=fake_urlopen):
        cleaned = sanitize_cjk_in_text(
            "你好世界 then 停工 happened",
            locale="en",
            api_key="fake",
            base_url="http://test",
            model="test",
        )
    for gone in ("你好世界", "停工"):
        assert gone not in cleaned
    for present in ("Hello world", "work stoppage"):
        assert present in cleaned
    assert len(seen_calls) == 2
    print("PASS: test_sanitize_multi_pass_catches_leftover")
+
+
def test_sanitize_idempotent_when_clean():
    """Clean text is returned unchanged and triggers no LLM request."""
    clean = "Pure English with no CJK characters at all."
    assert sanitize_cjk_in_text(clean, locale="en") == clean
    # Re-run with API key: still no change, 0 LLM calls
    with patch("urllib.request.urlopen") as urlopen_spy:
        second = sanitize_cjk_in_text(
            clean, locale="en", api_key="fake", base_url="http://test", model="test"
        )
    assert second == clean
    assert urlopen_spy.call_count == 0
    print("PASS: test_sanitize_idempotent_when_clean")
+
+
def test_sanitize_llm_failure_returns_original():
    """When every LLM request raises, the input text comes back unchanged."""
    leaked = "BI said: Purbaya过于扩张"
    with patch("urllib.request.urlopen", side_effect=Exception("network down")):
        cleaned = sanitize_cjk_in_text(
            leaked, locale="en", api_key="fake", base_url="http://test", model="test"
        )
    assert cleaned == leaked  # falls back gracefully
    print("PASS: test_sanitize_llm_failure_returns_original")
+
+
def test_sanitize_id_locale_enabled():
    """Sanitization also runs when the report locale is Indonesian (``id``)."""
    leaked = "BI: Purbaya过于扩张"
    fake_response = _make_mock_response(
        b'{"choices": [{"message": {"content": "[\\"overly expansive\\"]"}}]}'
    )
    with patch("urllib.request.urlopen", return_value=fake_response):
        cleaned = sanitize_cjk_in_text(
            leaked, locale="id", api_key="fake", base_url="http://test", model="test"
        )
    assert "overly expansive" in cleaned
    print("PASS: test_sanitize_id_locale_enabled")
+
+
+# -------- runner --------
+
def run_all():
    """Run every module-level callable named ``test_*`` and print a summary.

    Returns:
        ``0`` when no test raised, ``1`` otherwise (suitable as an exit code).
    """
    discovered = []
    for name, obj in globals().items():
        if name.startswith("test_") and callable(obj):
            discovered.append(obj)

    passed = failed = 0
    for case in discovered:
        try:
            case()
        except Exception as e:
            print(f"FAIL: {case.__name__}: {e}")
            import traceback
            traceback.print_exc()
            failed += 1
        else:
            passed += 1

    banner = "=" * 60
    print(f"\n{banner}")
    print(f"CJK sanitize: {passed} passed, {failed} failed")
    print(f"{banner}")
    if failed == 0:
        return 0
    return 1
+
+
+if __name__ == "__main__":
+ sys.exit(run_all())