From 423f3437da6146568268c0ac838add4779489919 Mon Sep 17 00:00:00 2001 From: Ardha Date: Sun, 7 Jun 2026 05:53:18 +0000 Subject: [PATCH] feat: add CJK leak sanitization for non-Chinese reports MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the LLM generates persona quotes (BI economists, ministry officials, Reddit commenters, etc.) in non-Chinese locales, it can occasionally slip Chinese characters into otherwise fluent English/Latin prose. The system prompt language instruction reduces but doesn't eliminate this — the LLM sometimes reaches back to its Chinese training data for fluent-sounding speech, producing output like: "BI economist said: Purbaya过于倾向财政扩张..." This adds a post-processing step that: 1. Detects runs of CJK Unified Ideographs (U+4E00..U+9FFF) and CJK Symbols/Punctuation (U+3000..U+303F) ≥ 2 chars in length 2. Batch-translates them via the configured LLM endpoint (reusing the same LLM_API_KEY/LLM_BASE_URL as the rest of MiroFish) 3. Replaces each run in-place, injecting spaces at ASCII boundaries so the result reads naturally in surrounding text 4. 
Iterates up to 3 passes to catch fragments the LLM leaves in pass 1 Behavior: - Auto-enabled for non-Chinese locales (en, es, fr, pt, ru, de, id) - Skipped for zh / zh-CN / zh-TW (legitimate CJK content) - No-op when LLM_API_KEY is not configured (warns and returns original) - Graceful fallback: any LLM failure returns original text unchanged - Idempotent: re-running on already-sanitized text is a no-op Configuration (all optional, set in .env): CJK_SANITIZE_ENABLED=0 # force off (default: auto for non-zh) CJK_SANITIZE_LANGS=ja,ko # override target locale set CJK_SANITIZE_MAX_PASSES=3 # default 3 Files added: - backend/app/utils/cjk_sanitize.py (~250 lines, the module) - backend/scripts/test_cjk_sanitize.py (23 unit + integration tests) Files modified: - backend/app/services/report_agent.py (wire-in after assemble_full_report) - README.md (document config env vars) Tested: 23/23 unit tests pass; live Purbaya/USD-IDR report (24 unique CJK runs in 14kB markdown) reduced to 0 in 3.4s with real DeepSeek API. --- README.md | 27 +++ backend/app/services/report_agent.py | 47 ++++- backend/app/utils/cjk_sanitize.py | 262 ++++++++++++++++++++++++ backend/scripts/test_cjk_sanitize.py | 290 +++++++++++++++++++++++++++ 4 files changed, 624 insertions(+), 2 deletions(-) create mode 100644 backend/app/utils/cjk_sanitize.py create mode 100644 backend/scripts/test_cjk_sanitize.py diff --git a/README.md b/README.md index de082935a7..dbecb0a8d0 100644 --- a/README.md +++ b/README.md @@ -176,6 +176,33 @@ Reads `.env` from root directory by default, maps ports `3000 (frontend) / 5001 > Mirror address for faster pulling is provided as comments in `docker-compose.yml`, replace if needed. +### Optional: CJK leak sanitization for non-Chinese reports + +When running simulations in non-Chinese locales (`en`, `es`, `fr`, `pt`, `ru`, `de`, `id`), +the LLM may occasionally slip Chinese characters into persona quotes despite the +language instruction (e.g. 
*"BI economist said: Purbaya过于倾向财政扩张..."*). + +MiroFish auto-detects and re-translates any leaked CJK runs after report +generation, reusing the same `LLM_API_KEY` / `LLM_BASE_URL` as the rest of the +backend. The sanitization adds ~3-10 seconds to report completion and is +idempotent (re-runs are no-op when no CJK remains). + +**Configuration** (all optional, set in `.env`): + +```bash +# Disable entirely (default: auto-enabled for non-zh locales) +CJK_SANITIZE_ENABLED=0 + +# Run sanitization only for specific locales (comma-separated) +CJK_SANITIZE_LANGS=ja,ko + +# Maximum retry passes (default: 3) +CJK_SANITIZE_MAX_PASSES=3 +``` + +No action is needed for Chinese reports (`zh` / `zh-CN` / `zh-TW`) — sanitization +is automatically skipped. + ## 📬 Join the Conversation
diff --git a/backend/app/services/report_agent.py b/backend/app/services/report_agent.py index cecd70b464..a5cdf03fc7 100644 --- a/backend/app/services/report_agent.py +++ b/backend/app/services/report_agent.py @@ -19,6 +19,7 @@ from enum import Enum from ..config import Config +from ..utils.cjk_sanitize import is_enabled as cjk_sanitize_enabled, sanitize_cjk_in_text from ..utils.llm_client import LLMClient from ..utils.logger import get_logger from ..utils.locale import get_language_instruction, t @@ -902,7 +903,15 @@ def __init__( self.graph_id = graph_id self.simulation_id = simulation_id self.simulation_requirement = simulation_requirement - + # Capture the request's locale so post-processing (e.g. CJK sanitization) + # can decide whether to run. Falls back to the current thread locale + # (set via set_locale() in the request handler) or 'zh' default. + try: + from ..utils.locale import get_locale + self.locale = get_locale() + except Exception: + self.locale = 'zh' + self.llm = llm_client or LLMClient() self.zep_tools = zep_tools or ZepToolsService() @@ -1707,7 +1716,41 @@ def generate_report( report.markdown_content = ReportManager.assemble_full_report(report_id, outline) report.status = ReportStatus.COMPLETED report.completed_at = datetime.now().isoformat() - + + # Sanitize any CJK characters that leaked into the report despite + # the language instruction (LLMs sometimes slip Chinese idioms into + # otherwise fluent English quotes). No-op for Chinese reports or + # when CJK_SANITIZE_ENABLED is off. See backend/app/utils/cjk_sanitize.py. + if cjk_sanitize_enabled(getattr(self, 'locale', None)): + try: + sanitized = sanitize_cjk_in_text( + report.markdown_content, + locale=getattr(self, 'locale', None), + ) + if sanitized != report.markdown_content: + report.markdown_content = sanitized + # Persist the sanitized version to the on-disk file too + # so subsequent /download and /chat calls see the cleaned + # text. 
+ try: + full_report_path = os.path.join( + Config.UPLOAD_FOLDER, 'reports', report_id, 'full_report.md' + ) + if os.path.exists(full_report_path): + with open(full_report_path, 'w', encoding='utf-8') as f: + f.write(sanitized) + except Exception as path_err: + logger.warning( + "cjk_sanitize: failed to rewrite report file: %s", + path_err, + ) + except Exception as sanitize_err: + # Sanitize failures must never break report delivery + logger.warning( + "cjk_sanitize: unexpected error, returning unsanitized report: %s", + sanitize_err, + ) + # 计算总耗时 total_time_seconds = (datetime.now() - start_time).total_seconds() diff --git a/backend/app/utils/cjk_sanitize.py b/backend/app/utils/cjk_sanitize.py new file mode 100644 index 0000000000..601af77bba --- /dev/null +++ b/backend/app/utils/cjk_sanitize.py @@ -0,0 +1,262 @@ +""" +CJK Leak Sanitization Utility +============================== + +Even with proper ``get_language_instruction()`` injected into LLM system prompts, +the model can still leak Chinese characters mid-sentence when generating persona +quotes (BI economists, ministry officials, Reddit commenters, etc.). The LLM +sometimes reaches back to its Chinese training data for fluent-sounding speech, +producing output like:: + + "BI economist said: Purbaya过于倾向财政扩张..." + +This module provides :func:`sanitize_cjk_in_text` which: + +1. Detects runs of CJK Unified Ideographs (U+4E00..U+9FFF) and CJK Symbols + & Punctuation (U+3000..U+303F) ≥ 2 characters in length +2. Batch-translates them via the configured LLM endpoint (reusing the same + ``LLM_BASE_URL`` / ``LLM_API_KEY`` as the rest of MiroFish) +3. Replaces each run in-place, injecting spaces at ASCII boundaries so the + result reads naturally in the surrounding language +4. 
Iterates up to 3 passes so any fragments the LLM leaves in its first + response are caught in subsequent passes + +The function is a no-op when the input is short of CJK characters, when the +target locale is Chinese (``locale == 'zh'``), or when no LLM API key is +configured. It is safe to call from any code path — failures are logged and +the original text is returned unchanged. + +Configuration (env vars, all optional): + CJK_SANITIZE_ENABLED — ``"1"`` / ``"true"`` to force-enable; ``"0"`` to force-disable. + Default: auto-enable for non-zh locales. + CJK_SANITIZE_LANGS — comma-separated locales where sanitization runs. + Default: ``"en,es,fr,pt,ru,de,id"`` (any non-zh locale). + CJK_SANITIZE_MAX_PASSES — maximum retry passes (default ``3``). +""" + +from __future__ import annotations + +import json +import logging +import os +import re +import urllib.error +import urllib.request +from typing import Iterable, Optional + +logger = logging.getLogger(__name__) + +# Match runs of ≥2 CJK ideographs or CJK punctuation/symbols. +# Single-character matches (e.g. lone "(" or "】") are usually false positives +# in mixed-language text; the 2-char floor filters most of those out. +_CJK_PATTERN = re.compile(r"[\u4e00-\u9fff\u3000-\u303f]{2,}") + +# Default locales where sanitization is helpful. Chinese is intentionally +# excluded — Chinese reports can and should contain CJK characters. +_DEFAULT_TARGET_LOCALES = frozenset({"en", "es", "fr", "pt", "ru", "de", "id"}) + + +def _extract_cjk_runs(text: str) -> list[str]: + """Return unique CJK runs in order of first appearance.""" + seen: set[str] = set() + out: list[str] = [] + for m in _CJK_PATTERN.finditer(text): + s = m.group(0) + if s not in seen: + seen.add(s) + out.append(s) + return out + + +def _smart_replace(text: str, cjk_run: str, translation: str) -> str: + """Replace a CJK run with its English translation, adding spaces at ASCII + boundaries so the result reads naturally in surrounding English/Latin text. 
+ + The LLM only sees the CJK snippet (not its surrounding context), so it can't + preserve the original spacing. We compensate with boundary lookbehind/ahead. + """ + out = text + # ASCII on both sides → " trans " + pat = re.compile(r"(?<=[A-Za-z0-9])" + re.escape(cjk_run) + r"(?=[A-Za-z0-9])") + out = pat.sub(" " + translation + " ", out) + # ASCII on left only (e.g. "Purbaya过于扩张,") → " trans" + pre = re.compile(r"(?<=[A-Za-z0-9])" + re.escape(cjk_run)) + out = pre.sub(" " + translation, out) + # ASCII on right only (e.g. "(中央银行)officials") → "trans " + post = re.compile(re.escape(cjk_run) + r"(?=[A-Za-z0-9])") + out = post.sub(translation + " ", out) + # Surrounded by non-ASCII (e.g. fullwidth parens) → bare translation + out = out.replace(cjk_run, translation) + return out + + +def _batch_translate( + snippets: list[str], + api_key: str, + base_url: str, + model: str, + timeout: int = 60, + max_retries: int = 2, +) -> dict[str, str]: + """Translate a list of CJK snippets to English via the configured LLM.""" + if not snippets: + return {} + + numbered = "\n".join(f"{i + 1}. {s}" for i, s in enumerate(snippets)) + prompt = ( + "You are a translator. The following are short Chinese fragments that " + "leaked into an otherwise English report. Translate each to natural " + "English. Output a JSON array of strings, one per input, in the same " + "order. 
Do not add commentary, numbering, or markdown fences — output " + f"only the JSON array.\n\n{numbered}" + ) + body = json.dumps( + { + "model": model, + "max_tokens": 4096, + "temperature": 0.1, + "messages": [{"role": "user", "content": prompt}], + } + ).encode() + + url = base_url.rstrip("/") + "/chat/completions" + last_err: Optional[Exception] = None + for attempt in range(max_retries): + try: + req = urllib.request.Request( + url, + data=body, + headers={ + "Content-Type": "application/json", + "Authorization": f"Bearer {api_key}", + }, + method="POST", + ) + with urllib.request.urlopen(req, timeout=timeout) as resp: + payload = json.loads(resp.read()) + content = payload["choices"][0]["message"]["content"].strip() + # Strip markdown fences if the model added them despite instructions + content = re.sub(r"^```(?:json)?\s*", "", content) + content = re.sub(r"\s*```$", "", content) + arr = json.loads(content) + if not isinstance(arr, list) or len(arr) != len(snippets): + raise ValueError( + f"Got {len(arr) if isinstance(arr, list) else 'non-list'} " + f"translations for {len(snippets)} snippets" + ) + return {orig: trans for orig, trans in zip(snippets, arr)} + except Exception as e: # any failure (network, JSON, schema) → retry + last_err = e + logger.warning( + "cjk_sanitize translation attempt %d/%d failed: %s", + attempt + 1, + max_retries, + e, + ) + raise RuntimeError(f"Translation failed after {max_retries} attempts: {last_err}") + + +def is_enabled(locale: Optional[str] = None) -> bool: + """Return True if sanitization should run for the given locale. + + Honors the ``CJK_SANITIZE_ENABLED`` env var (force on/off). When unset, + enables for any locale in ``CJK_SANITIZE_LANGS`` (default: non-Chinese + supported locales). 
+ """ + override = os.environ.get("CJK_SANITIZE_ENABLED", "").strip().lower() + if override in ("1", "true", "yes", "on"): + return True + if override in ("0", "false", "no", "off"): + return False + if not locale: + return False + target_langs_env = os.environ.get("CJK_SANITIZE_LANGS", "").strip() + if target_langs_env: + target = {x.strip().lower() for x in target_langs_env.split(",") if x.strip()} + else: + target = _DEFAULT_TARGET_LOCALES + return locale.lower() in target + + +def sanitize_cjk_in_text( + text: str, + locale: Optional[str] = None, + api_key: Optional[str] = None, + base_url: Optional[str] = None, + model: Optional[str] = None, +) -> str: + """Return ``text`` with CJK runs replaced by English translations. + + No-op if: + - ``text`` is empty or contains no CJK runs + - ``locale`` is Chinese (``zh`` / ``zh-CN`` / ``zh-TW``) + - no LLM ``api_key`` is configured + - the LLM call fails (logs warning, returns original text) + + Args: + text: The text to sanitize (typically a generated report or section). + locale: Target report locale (e.g. ``"en"``). Chinese is skipped. + api_key: LLM API key. Defaults to ``LLM_API_KEY`` env var. + base_url: LLM endpoint base URL. Defaults to ``LLM_BASE_URL`` env var. + model: LLM model name. Defaults to ``LLM_MODEL_NAME`` env var. + + Returns: + Sanitized text. If sanitization is skipped or fails, returns the + original ``text`` unchanged. 
+ """ + if not text: + return text + + if locale and locale.lower().split("-")[0] == "zh": + return text # Chinese reports legitimately contain CJK + + runs = _extract_cjk_runs(text) + if not runs: + return text + + api_key = api_key or os.environ.get("LLM_API_KEY") + base_url = base_url or os.environ.get("LLM_BASE_URL", "https://api.openai.com/v1") + model = model or os.environ.get("LLM_MODEL_NAME", "gpt-4o-mini") + + if not api_key: + logger.warning( + "cjk_sanitize: %d CJK runs detected but no LLM_API_KEY; skipping", + len(runs), + ) + return text + + max_passes = int(os.environ.get("CJK_SANITIZE_MAX_PASSES", "3")) + out = text + n_total = 0 + pass_n = 0 + for pass_n in range(1, max_passes + 1): + runs = _extract_cjk_runs(out) + if not runs: + break + try: + translations = _batch_translate(runs, api_key, base_url, model) + except RuntimeError as e: + logger.warning("cjk_sanitize: %s — returning original text", e) + return text + n_replaced = 0 + for orig, trans in translations.items(): + if not trans or not trans.strip() or trans == orig: + continue + before = out.count(orig) + out = _smart_replace(out, orig, trans) + n_replaced += before + n_total += n_replaced + if n_replaced == 0: + break # LLM didn't translate anything new — stop iterating + + leftover = _extract_cjk_runs(out) + if leftover: + logger.warning( + "cjk_sanitize: %d CJK runs still present after %d passes: %s", + len(leftover), + pass_n or 1, + leftover, + ) + + logger.info("cjk_sanitize: replaced %d CJK runs across %d passes", n_total, pass_n or 1) + return out diff --git a/backend/scripts/test_cjk_sanitize.py b/backend/scripts/test_cjk_sanitize.py new file mode 100644 index 0000000000..273143d41e --- /dev/null +++ b/backend/scripts/test_cjk_sanitize.py @@ -0,0 +1,290 @@ +""" +Tests for cjk_sanitize utility. + +Verifies: + 1. CJK run detection (basic, dedup, min-length, order) + 2. Boundary-aware replacement (ASCII left/right, fullwidth parens, numbers) + 3. 
"""
Tests for cjk_sanitize utility.

Verifies:
  1. CJK run detection (basic, dedup, min-length, order)
  2. Boundary-aware replacement (ASCII left/right, parens, numbers)
  3. Locale gating (zh skipped, en/id/etc enabled, env override)
  4. Integration: empty, no-CJK, no-API-key, mocked LLM translate, multi-pass
  5. Failure modes: LLM error falls back to original text, idempotent

Every test that depends on environment variables runs inside ``_clean_env``,
which removes the CJK/LLM variables for the duration of the test and restores
the caller's environment afterwards. This keeps the suite hermetic: a
developer's real ``LLM_API_KEY`` can never trigger a live network call.
"""

import os
import sys
from contextlib import contextmanager
from unittest.mock import patch, MagicMock

# Add project path so 'app' package is importable
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from app.utils.cjk_sanitize import (
    sanitize_cjk_in_text,
    is_enabled,
    _extract_cjk_runs,
    _smart_replace,
)

# Environment variables read by cjk_sanitize; cleared for each env-aware test.
_MANAGED_ENV_VARS = (
    "CJK_SANITIZE_ENABLED",
    "CJK_SANITIZE_LANGS",
    "CJK_SANITIZE_MAX_PASSES",
    "LLM_API_KEY",
    "LLM_BASE_URL",
    "LLM_MODEL_NAME",
)


@contextmanager
def _clean_env(**overrides):
    """Run the body with the managed env vars unset, then apply ``overrides``.

    ``patch.dict`` snapshots ``os.environ`` on entry and restores it on exit,
    so pre-existing values are never lost.
    """
    with patch.dict(os.environ):
        for name in _MANAGED_ENV_VARS:
            os.environ.pop(name, None)
        os.environ.update(overrides)
        yield


# -------- _extract_cjk_runs --------

def test_extract_cjk_runs_basic():
    text = "Hello 你好世界 world 通胀压力"
    runs = _extract_cjk_runs(text)
    assert "你好世界" in runs
    assert "通胀压力" in runs
    assert len(runs) == 2
    print("PASS: test_extract_cjk_runs_basic")


def test_extract_cjk_runs_dedup():
    text = "你好世界 and 你好世界 again"
    runs = _extract_cjk_runs(text)
    assert runs == ["你好世界"]
    print("PASS: test_extract_cjk_runs_dedup")


def test_extract_cjk_runs_min_length():
    text = "A 你 B"  # single CJK char should NOT match
    runs = _extract_cjk_runs(text)
    assert runs == []
    print("PASS: test_extract_cjk_runs_min_length")


def test_extract_cjk_runs_preserves_order():
    text = "First 你好 then 世界 and 你好 again"
    runs = _extract_cjk_runs(text)
    assert runs == ["你好", "世界"]
    print("PASS: test_extract_cjk_runs_preserves_order")


# -------- _smart_replace (boundary handling) --------

def test_smart_replace_ascii_both_sides():
    out = _smart_replace("data过于依赖issues", "过于依赖", "overly dependent")
    assert out == "data overly dependent issues"
    print("PASS: test_smart_replace_ascii_both_sides")


def test_smart_replace_ascii_left_only():
    out = _smart_replace("Purbaya过于扩张,", "过于扩张", "overly expansionary")
    assert out == "Purbaya overly expansionary,"
    print("PASS: test_smart_replace_ascii_left_only")


def test_smart_replace_ascii_right_only():
    out = _smart_replace("(中央银行)officials", "中央银行", "central bank")
    assert out == "(central bank)officials"
    print("PASS: test_smart_replace_ascii_right_only")


def test_smart_replace_fullwidth_parens_preserved():
    out = _smart_replace("the policy is(财政)expansion", "财政", "fiscal")
    assert "(fiscal)" in out
    print("PASS: test_smart_replace_fullwidth_parens_preserved")


def test_smart_replace_number_boundary():
    out = _smart_replace("2024年通胀率达到高点", "年通胀率达到高点", "annual inflation rate peaked")
    assert out == "2024 annual inflation rate peaked"
    print("PASS: test_smart_replace_number_boundary")


# -------- is_enabled --------

def test_is_enabled_chinese_locale_skipped():
    with _clean_env():
        assert is_enabled("zh") is False
        assert is_enabled("zh-CN") is False
        assert is_enabled("zh-TW") is False
    print("PASS: test_is_enabled_chinese_locale_skipped")


def test_is_enabled_non_chinese_default():
    with _clean_env():
        for loc in ("en", "es", "fr", "pt", "ru", "de", "id"):
            assert is_enabled(loc) is True, f"Expected True for {loc}"
    print("PASS: test_is_enabled_non_chinese_default")


def test_is_enabled_force_on():
    with _clean_env(CJK_SANITIZE_ENABLED="1"):
        assert is_enabled("zh") is True  # force-on overrides locale check
    print("PASS: test_is_enabled_force_on")


def test_is_enabled_force_off():
    with _clean_env(CJK_SANITIZE_ENABLED="0"):
        assert is_enabled("en") is False
    print("PASS: test_is_enabled_force_off")


def test_is_enabled_custom_langs():
    with _clean_env(CJK_SANITIZE_LANGS="ja,ko"):
        assert is_enabled("en") is False
        assert is_enabled("ja") is True
        assert is_enabled("ko") is True
    print("PASS: test_is_enabled_custom_langs")


# -------- sanitize_cjk_in_text integration --------

def test_sanitize_empty_text():
    assert sanitize_cjk_in_text("") == ""
    print("PASS: test_sanitize_empty_text")


def test_sanitize_no_cjk_passthrough():
    text = "This is a pure English report with no leaks."
    with _clean_env():
        assert sanitize_cjk_in_text(text, locale="en") == text
    print("PASS: test_sanitize_no_cjk_passthrough")


def test_sanitize_chinese_locale_no_op():
    text = "你好世界 should not be translated in Chinese reports"
    with _clean_env():
        assert sanitize_cjk_in_text(text, locale="zh") == text
    print("PASS: test_sanitize_chinese_locale_no_op")


def test_sanitize_no_api_key_no_op():
    text = "BI said: Purbaya过于扩张"
    # LLM_API_KEY is removed by _clean_env, so api_key=None really means
    # "no key"; urlopen is patched to prove no request is attempted.
    with _clean_env(), patch("urllib.request.urlopen") as mock_url:
        result = sanitize_cjk_in_text(
            text, locale="en", api_key=None, base_url="http://test", model="test"
        )
        assert result == text  # no LLM = no change
        assert mock_url.call_count == 0
    print("PASS: test_sanitize_no_api_key_no_op")


def _make_mock_response(json_bytes: bytes):
    """Build a context-manager-friendly mock for urllib.request.urlopen."""
    resp = MagicMock()
    resp.read.return_value = json_bytes
    resp.__enter__ = lambda self: self
    resp.__exit__ = lambda self, *args: None
    return resp


def test_sanitize_translates_cjk_runs():
    mock = _make_mock_response(
        b'{"choices": [{"message": {"content": "[\\"overly inclined toward fiscal expansion\\"]"}}]}'
    )
    with _clean_env(), patch("urllib.request.urlopen", return_value=mock):
        text = "BI said: Purbaya过于倾向财政扩张, this is concerning."
        result = sanitize_cjk_in_text(
            text, locale="en", api_key="fake", base_url="http://test", model="test"
        )
    assert "过于倾向财政扩张" not in result
    assert "overly inclined toward fiscal expansion" in result
    print("PASS: test_sanitize_translates_cjk_runs")


def test_sanitize_multi_pass_catches_leftover():
    """LLM translates pass-1 snippets but leaves CJK in one translation. Pass 2
    picks up the leftover CJK that emerged in the output."""
    call_count = [0]
    # Build JSON with embedded CJK as bytes (b'...' can't contain non-ASCII)
    pass1_content = '["Hello world", "partial \u505c\u5de5 remaining"]'
    pass2_content = '["work stoppage"]'
    pass1_body = (
        b'{"choices": [{"message": {"content": "'
        + pass1_content.replace("\\", "\\\\").replace('"', '\\"').encode("utf-8")
        + b'"}}]}'
    )
    pass2_body = (
        b'{"choices": [{"message": {"content": "'
        + pass2_content.replace("\\", "\\\\").replace('"', '\\"').encode("utf-8")
        + b'"}}]}'
    )

    def fake_urlopen(req, timeout):
        call_count[0] += 1
        if call_count[0] == 1:
            return _make_mock_response(pass1_body)
        return _make_mock_response(pass2_body)

    with _clean_env(), patch("urllib.request.urlopen", side_effect=fake_urlopen):
        text = "你好世界 then 停工 happened"
        result = sanitize_cjk_in_text(
            text, locale="en", api_key="fake", base_url="http://test", model="test"
        )
    assert "你好世界" not in result
    assert "停工" not in result
    assert "Hello world" in result
    assert "work stoppage" in result
    assert call_count[0] == 2
    print("PASS: test_sanitize_multi_pass_catches_leftover")


def test_sanitize_idempotent_when_clean():
    text = "Pure English with no CJK characters at all."
    with _clean_env():
        assert sanitize_cjk_in_text(text, locale="en") == text
        # Re-run with API key: still no change, 0 LLM calls
        with patch("urllib.request.urlopen") as mock_url:
            result = sanitize_cjk_in_text(
                text, locale="en", api_key="fake", base_url="http://test", model="test"
            )
            assert result == text
            assert mock_url.call_count == 0
    print("PASS: test_sanitize_idempotent_when_clean")


def test_sanitize_llm_failure_returns_original():
    with _clean_env(), patch("urllib.request.urlopen", side_effect=Exception("network down")):
        text = "BI said: Purbaya过于扩张"
        result = sanitize_cjk_in_text(
            text, locale="en", api_key="fake", base_url="http://test", model="test"
        )
    assert result == text  # falls back gracefully
    print("PASS: test_sanitize_llm_failure_returns_original")


def test_sanitize_id_locale_enabled():
    """Indonesian locale should also enable sanitization by default."""
    mock = _make_mock_response(
        b'{"choices": [{"message": {"content": "[\\"overly expansive\\"]"}}]}'
    )
    with _clean_env(), patch("urllib.request.urlopen", return_value=mock):
        text = "BI: Purbaya过于扩张"
        result = sanitize_cjk_in_text(
            text, locale="id", api_key="fake", base_url="http://test", model="test"
        )
    assert "overly expansive" in result
    print("PASS: test_sanitize_id_locale_enabled")


# -------- runner --------

def run_all():
    """Run every ``test_*`` callable in this module; return a process exit code."""
    tests = [
        v for k, v in globals().items()
        if k.startswith("test_") and callable(v)
    ]
    passed = 0
    failed = 0
    for t in tests:
        try:
            t()
            passed += 1
        except Exception as e:
            print(f"FAIL: {t.__name__}: {e}")
            import traceback
            traceback.print_exc()
            failed += 1
    print(f"\n{'=' * 60}")
    print(f"CJK sanitize: {passed} passed, {failed} failed")
    print(f"{'=' * 60}")
    return 0 if failed == 0 else 1


if __name__ == "__main__":
    sys.exit(run_all())