From a837c85d47577f5f1799b85344d6bd43e9fb4e62 Mon Sep 17 00:00:00 2001
From: Shlok148Dev <Shlok148Dev@users.noreply.github.com>
Date: Fri, 5 Jun 2026 00:17:14 +0530
Subject: [PATCH 1/8] fix: sanitize qt-web-extractor table cells and add
 integration audit doc

---
 .../third-party-integration-audit.md          | 53 +++++++++++
 scholaraio/providers/webtools.py              | 90 ++++++++++++++-----
 tests/fixtures/wikipedia_infobox_bad.md       | 10 +++
 tests/fixtures/wikipedia_infobox_clean.md     |  4 +
 tests/test_webtools_source.py                 | 58 ++++++++++++
 5 files changed, 192 insertions(+), 23 deletions(-)
 create mode 100644 docs/development/third-party-integration-audit.md
 create mode 100644 tests/fixtures/wikipedia_infobox_bad.md
 create mode 100644 tests/fixtures/wikipedia_infobox_clean.md

diff --git a/docs/development/third-party-integration-audit.md b/docs/development/third-party-integration-audit.md
new file mode 100644
index 00000000..77a44b89
--- /dev/null
+++ b/docs/development/third-party-integration-audit.md
@@ -0,0 +1,53 @@
+# ScholarAIO Third-Party Integration Quality Audit
+
+This document records the quality, reachability, and output validation status of the third-party integrations, APIs, CLIs, and optional toolchains supported by ScholarAIO. 
+
+Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoints, provider implementations, setup diagnostics, output formatting, fallback behaviors, and failure handling.
+
+---
+
+## 1. Quality Matrix
+
+| Integration / Surface | Category | Status | Verification Path / Test Evidence | Observed Result / Boundaries |
+| :--- | :--- | :--- | :--- | :--- |
+| **qt-web-extractor (HTTP & MCP)** | Web / Agent | **needs-cleanup** | `extract_web` / `tests/test_webtools_source.py` | Sanitized output successfully resolves table-cell code fence corruption on Wikipedia. |
+| **GUILessBingSearch** | Web / Agent | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
+| **MinerU Local API** | Parsing | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
+| **MinerU Cloud CLI** | Parsing | **good** | `test_mineru.py` | Handles `mineru-open-api` subprocess calls; enforces filename constraints safely. |
+| **Docling Fallback** | Parsing | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
+| **PyMuPDF Fallback** | Parsing | **good** | `test_pdf_fallback.py` | Robust extraction fallback when default parser fails. |
+| **arXiv Search (Atom API)** | Discovery | **good** | `test_arxiv_source.py` | Atom XML parser is stable; query filters match client expectations. |
+| **arXiv PDF Download** | Discovery | **good** | `test_arxiv_source.py` | Enforces `RATE_LIMIT_DELAY = 3.0` between successive paper downloads. |
+| **OpenAlex Explore** | Discovery | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
+| **Crossref / Semantic Scholar** | Discovery | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
+| **Zotero SQLite Import** | Import/Export | **good** | `test_workspace.py` | Parsed SQLite columns correctly map to `PaperMetadata`. |
+| **Zotero Web API** | Import/Export | **usable-with-caveats** | `fetch_zotero_api` / `import-zotero` | pyzotero retrieves metadata; linked/external attachments are skipped by design. |
+| **EndNote / RIS** | Import/Export | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
+| **USPTO ODP / PPubs** | Patents | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
+| **OpenAI-compatible Chat API** | LLM Backend | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
+| **Anthropic Messages API** | LLM Backend | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
+| **Google Gemini API** | LLM Backend | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
+| **Zhipu API** | LLM Backend | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
+| **vLLM / Ollama Local** | LLM Backend | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
+| **Sentence-transformers Embeddings** | Vector/Embed | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
+| **FAISS Vector / BERTopic** | Vector/Embed | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
+| **MarkItDown Office Ingest** | Office/Output | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
+| **Office PPTX / DOCX Libraries** | Office/Output | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
+| **Mermaid / DOT Rendering** | Diagram | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
+| **Scientific Toolref (Quantum ESPRESSO, etc.)** | Toolref | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
+| **AmberTools / PyMOL** | Scientific | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
+| **rsync / SSH Backup** | System | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
+| **Setup Diagnostics** | System | **good** | `test_setup.py` | Reports dependency presence and credential state in bilingual strings. |
+
+---
+
+## 2. Seed Issue: qt-web-extractor Table Cell Corruption
+- **Symptom**: Unescaped block elements (e.g. ` ``` ` or `\n\n`) inside Wikipedia tables break Markdown parsing and cause garbled readback.
+- **Verification Fixtures**:
+  - Raw Input: [wikipedia_infobox_bad.md](file:///c:/Users/hp/Desktop/Scholara_oss/tests/fixtures/wikipedia_infobox_bad.md)
+  - Expected Output: [wikipedia_infobox_clean.md](file:///c:/Users/hp/Desktop/Scholara_oss/tests/fixtures/wikipedia_infobox_clean.md)
+- **Fix**: Added a regex sanitization helper in `scholaraio/providers/webtools.py` called `_clean_table_code_fences`. It scans the output Markdown for block elements bounded by table column pipes (`|`) and collapses them to inline code blocks:
+  ```python
+  res["text"] = _clean_table_code_fences(res["text"])
+  ```
+- **Scope**: Executed at the end of the `extract_web` function to clean both HTTP and MCP outputs prior to consumption by RAG and CLI workflows.
diff --git a/scholaraio/providers/webtools.py b/scholaraio/providers/webtools.py
index dc200685..df37751c 100644
--- a/scholaraio/providers/webtools.py
+++ b/scholaraio/providers/webtools.py
@@ -572,6 +572,44 @@ def _extract_web_mcp(url: str, *, cfg: Config | None, timeout: float) -> dict:
     }
 
 
+def _clean_table_code_fences(text: str) -> str:
+    """Sanitize Markdown table cells that contain block-level code blocks/fences.
+
+    Transforms:
+        | Col | ```\nval\n``` |
+    Into:
+        | Col | `val` |
+    """
+    if not text:
+        return ""
+
+    # Pattern to match a code block inside a table cell (bounded by pipes)
+    pattern = re.compile(
+        r"\|([^|]*?)```(?:[a-zA-Z0-9_-]*)\n(.*?)\n\s*```([^|]*?)\|",
+        re.DOTALL
+    )
+
+    def replace_match(match):
+        before = match.group(1).replace("\n", " ").strip()
+        code_content = match.group(2).replace("\n", " ").strip()
+        after = match.group(3).replace("\n", " ").strip()
+        
+        # Format the code content as inline code
+        inline_code = f"`{code_content}`" if code_content else ""
+        
+        # Assemble the cleaned cell components
+        parts = [p for p in (before, inline_code, after) if p]
+        cleaned_cell = " " + " ".join(parts) + " "
+        return f"|{cleaned_cell}|"
+
+    cleaned = text
+    prev = ""
+    while cleaned != prev:
+        prev = cleaned
+        cleaned = pattern.sub(replace_match, cleaned)
+    return cleaned
+
+
 def extract_web(
     url: str,
     *,
@@ -600,33 +638,39 @@ def extract_web(
     """
     transport = _get_webextract_transport(cfg)
     if transport == "mcp":
-        return _extract_web_mcp(url, cfg=cfg, timeout=timeout)
-    if transport != "http":
-        raise WebExtractError(f"未知 webextract transport: {transport}")
+        res = _extract_web_mcp(url, cfg=cfg, timeout=timeout)
+    else:
+        if transport != "http":
+            raise WebExtractError(f"未知 webextract transport: {transport}")
 
-    base_url = _get_webextract_base_url(cfg)
-    if not check_webextract_service(cfg, timeout=3.0):
-        raise WebExtractServiceUnavailableError(
-            f"提取服务未启动或不可达: {base_url}\n请确保 qt-web-extractor 服务已运行"
+        base_url = _get_webextract_base_url(cfg)
+        if not check_webextract_service(cfg, timeout=3.0):
+            raise WebExtractServiceUnavailableError(
+                f"提取服务未启动或不可达: {base_url}\n请确保 qt-web-extractor 服务已运行"
+            )
+
+        body: dict[str, object] = {"url": url}
+        if pdf is not None:
+            body["pdf"] = pdf
+        if include_html:
+            body["include_html"] = include_html
+
+        api_key = _get_webextract_api_key(cfg) or ""
+        req = Request(
+            f"{base_url}/extract",
+            data=json.dumps(body).encode("utf-8"),
+            headers=_headers(api_key),
+            method="POST",
         )
+        try:
+            res = _load_json_response(req, timeout=int(timeout), error_prefix="提取失败")
+        except RuntimeError as e:
+            raise WebExtractError(str(e)) from e
 
-    body: dict[str, object] = {"url": url}
-    if pdf is not None:
-        body["pdf"] = pdf
-    if include_html:
-        body["include_html"] = include_html
+    if isinstance(res, dict) and "text" in res and res["text"]:
+        res["text"] = _clean_table_code_fences(res["text"])
 
-    api_key = _get_webextract_api_key(cfg) or ""
-    req = Request(
-        f"{base_url}/extract",
-        data=json.dumps(body).encode("utf-8"),
-        headers=_headers(api_key),
-        method="POST",
-    )
-    try:
-        return _load_json_response(req, timeout=int(timeout), error_prefix="提取失败")
-    except RuntimeError as e:
-        raise WebExtractError(str(e)) from e
+    return res
 
 
 def extract_and_display(
diff --git a/tests/fixtures/wikipedia_infobox_bad.md b/tests/fixtures/wikipedia_infobox_bad.md
new file mode 100644
index 00000000..c568fd1a
--- /dev/null
+++ b/tests/fixtures/wikipedia_infobox_bad.md
@@ -0,0 +1,10 @@
+| 性别 | 男 |
+| 出生 | ```
+1902年8月28日
+``` |
+| 逝世 | ```
+1993年11月24日
+``` |
+| 国籍 | ```
+中华人民共和国
+``` |
diff --git a/tests/fixtures/wikipedia_infobox_clean.md b/tests/fixtures/wikipedia_infobox_clean.md
new file mode 100644
index 00000000..e718f65e
--- /dev/null
+++ b/tests/fixtures/wikipedia_infobox_clean.md
@@ -0,0 +1,4 @@
+| 性别 | 男 |
+| 出生 | `1902年8月28日` |
+| 逝世 | `1993年11月24日` |
+| 国籍 | `中华人民共和国` |
diff --git a/tests/test_webtools_source.py b/tests/test_webtools_source.py
index 8fa788aa..650573e0 100644
--- a/tests/test_webtools_source.py
+++ b/tests/test_webtools_source.py
@@ -781,3 +781,61 @@ def fake_urlopen(req, timeout=0):
         assert result["title"] == "Page"
         captured = capsys.readouterr()
         assert "markdown body" in captured.out
+
+    def test_clean_table_code_fences_with_fixtures(self):
+        import pathlib
+        from scholaraio.providers.webtools import _clean_table_code_fences
+
+        fixtures_dir = pathlib.Path(__file__).parent / "fixtures"
+        bad_path = fixtures_dir / "wikipedia_infobox_bad.md"
+        clean_path = fixtures_dir / "wikipedia_infobox_clean.md"
+
+        assert bad_path.exists()
+        assert clean_path.exists()
+
+        bad_text = bad_path.read_text(encoding="utf-8")
+        expected_clean_text = clean_path.read_text(encoding="utf-8")
+
+        cleaned_text = _clean_table_code_fences(bad_text)
+        assert cleaned_text.strip() == expected_clean_text.strip()
+
+    def test_clean_table_code_fences_ignores_normal_structures(self):
+        from scholaraio.providers.webtools import _clean_table_code_fences
+
+        # Test normal code block outside table should not be changed
+        normal_code = (
+            "Here is a code snippet:\n"
+            "```python\n"
+            "def test():\n"
+            "    return True\n"
+            "```\n"
+            "And here is normal text."
+        )
+        assert _clean_table_code_fences(normal_code) == normal_code
+
+        # Test normal table with inline code should not be changed
+        normal_table = (
+            "| Column 1 | Column 2 |\n"
+            "| --- | --- |\n"
+            "| `inline code` | value |\n"
+        )
+        assert _clean_table_code_fences(normal_table) == normal_table
+
+    def test_extract_web_applies_cleanup_http(self, monkeypatch):
+        # Verify that HTTP path runs the clean helper
+        def fake_urlopen(req, timeout=0):
+            return _FakeResponse({
+                "title": "Page",
+                "text": "| 性别 |\n| 出生 | ```\n1902\n``` |"
+            })
+
+        def fake_check_service(cfg, timeout=3.0):
+            return True
+
+        monkeypatch.setattr("scholaraio.providers.webtools.urlopen", fake_urlopen)
+        monkeypatch.setattr("scholaraio.providers.webtools.check_webextract_service", fake_check_service)
+
+        from scholaraio.providers.webtools import extract_web
+
+        res = extract_web("https://example.com")
+        assert res["text"] == "| 性别 |\n| 出生 | `1902` |"

From 91197b3a5addf57f0d95aaebf49f955b3e4a9db1 Mon Sep 17 00:00:00 2001
From: Shlok148Dev <Shlok148Dev@users.noreply.github.com>
Date: Fri, 5 Jun 2026 00:17:15 +0530
Subject: [PATCH 2/8] docs: detail integration audits at workflow boundary

---
 .../third-party-integration-audit.md          | 98 ++++++++++++++++---
 1 file changed, 87 insertions(+), 11 deletions(-)

diff --git a/docs/development/third-party-integration-audit.md b/docs/development/third-party-integration-audit.md
index 77a44b89..4050a525 100644
--- a/docs/development/third-party-integration-audit.md
+++ b/docs/development/third-party-integration-audit.md
@@ -2,7 +2,7 @@
 
 This document records the quality, reachability, and output validation status of the third-party integrations, APIs, CLIs, and optional toolchains supported by ScholarAIO. 
 
-Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoints, provider implementations, setup diagnostics, output formatting, fallback behaviors, and failure handling.
+Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoints, provider implementations, setup diagnostics, output formatting, fallback behaviors, and failure handling. A config test or a broad unit-test filename is not enough evidence to mark an integration surface as Good.
 
 ---
 
@@ -41,13 +41,89 @@ Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoi
 
 ---
 
-## 2. Seed Issue: qt-web-extractor Table Cell Corruption
-- **Symptom**: Unescaped block elements (e.g. ` ``` ` or `\n\n`) inside Wikipedia tables break Markdown parsing and cause garbled readback.
-- **Verification Fixtures**:
-  - Raw Input: [wikipedia_infobox_bad.md](file:///c:/Users/hp/Desktop/Scholara_oss/tests/fixtures/wikipedia_infobox_bad.md)
-  - Expected Output: [wikipedia_infobox_clean.md](file:///c:/Users/hp/Desktop/Scholara_oss/tests/fixtures/wikipedia_infobox_clean.md)
-- **Fix**: Added a regex sanitization helper in `scholaraio/providers/webtools.py` called `_clean_table_code_fences`. It scans the output Markdown for block elements bounded by table column pipes (`|`) and collapses them to inline code blocks:
-  ```python
-  res["text"] = _clean_table_code_fences(res["text"])
-  ```
-- **Scope**: Executed at the end of the `extract_web` function to clean both HTTP and MCP outputs prior to consumption by RAG and CLI workflows.
+## 2. Detailed Integration Audits (Workflow Boundary Analysis)
+
+### 2.1 qt-web-extractor (HTTP & MCP)
+* **CLI/Skill Entrypoint**:
+  * CLI: `scholaraio webextract <url>` (implemented in `cmd_webextract` inside [web.py](../../scholaraio/interfaces/cli/web.py))
+  * Skill: `.claude/skills/webextract`
+* **Provider/Service Implementation Path**:
+  * [webtools.py:extract_web](../../scholaraio/providers/webtools.py#L613-L673)
+* **Setup Diagnostics**:
+  * Tested via `scholaraio setup check` (calls `_optional_webtool_detail` inside [setup.py](../../scholaraio/services/setup.py#L617-L665)), which executes `check_webextract_service` to verify that the HTTP/MCP endpoint responds.
+* **Output Quality & Validation**:
+  * Outputs parsed GFM Markdown. Output quality is protected by `_clean_table_code_fences` to sanitize malformed block code fences in Wikipedia/infobox table cells, resolving broken table rendering.
+  * Verified via raw/cleaned fixtures: [wikipedia_infobox_bad.md](../../tests/fixtures/wikipedia_infobox_bad.md) and [wikipedia_infobox_clean.md](../../tests/fixtures/wikipedia_infobox_clean.md).
+* **Fallback Behavior**:
+  * Configured via `webextract.transport` (HTTP or MCP). When configured as HTTP, a failure to connect produces a hint to fall back to MCP or to run the setup checks.
+* **Failure Handling**:
+  * Unreachable HTTP endpoints raise `WebExtractServiceUnavailableError`, returning a clean user-facing hint with exit code `1`.
+  * API/Server errors raise `WebExtractError`, showing warnings/errors instead of generic crashes.
+
+### 2.2 MinerU Cloud CLI (`mineru-open-api`)
+* **CLI/Skill Entrypoint**:
+  * CLI: `scholaraio ingest <pdf>` or `scholaraio/providers/mineru.py` main parser CLI.
+  * Skill: `.claude/skills/ingest`
+* **Provider/Service Implementation Path**:
+  * [mineru.py:convert_pdf_cloud](../../scholaraio/providers/mineru.py#L702-L810)
+* **Setup Diagnostics**:
+  * Checked under `scholaraio setup check` via `_detect_mineru` which verifies presence of `mineru-open-api` in system path (`shutil.which`) and reads credential key values.
+* **Output Quality & Validation**:
+  * Translates PDF structures to Markdown with images/formulas.
+  * Sanitizes cloud upload filenames via `_cloud_safe_pdf_name` to prevent platform-specific characters from crashing the extraction.
+  * Handles chunk merging for multi-part large PDF parsing.
+* **Fallback Behavior**:
+  * When MinerU is missing or fails, it falls back to the list of alternatives defined in the configuration option `pdf_fallback_order` (e.g. `["docling", "pymupdf"]`).
+* **Failure Handling**:
+  * Subprocess timeouts (`subprocess.TimeoutExpired`) are caught.
+  * Non-zero return codes from `mineru-open-api` raise descriptive errors containing stderr output.
+  * Retries are handled with exponential backoff (`attempts` based on `mineru_upload_retries`).
+
+### 2.3 PyMuPDF Fallback (`fitz`)
+* **CLI/Skill Entrypoint**:
+  * CLI: Invoked automatically as part of PDF ingestion when MinerU fails, or manually by setting `pdf_preferred_parser: pymupdf`.
+* **Provider/Service Implementation Path**:
+  * [pdf_fallback.py:run_pymupdf](../../scholaraio/providers/pdf_fallback.py#L142-L160)
+* **Setup Diagnostics**:
+  * Checked in `scholaraio setup check` via `_check_dep_group("fitz")`.
+* **Output Quality & Validation**:
+  * Extracts page-by-page flat plaintext with page headers (`## Page N\n\n`). Lacks complex block structure formatting but acts as a highly reliable baseline.
+* **Fallback Behavior**:
+  * Represents the last-resort fallback in the fallback parser chain (since it has no model/server dependencies).
+* **Failure Handling**:
+  * Catches general exceptions and formats error messages, handling page-level crashes and file read errors gracefully without aborting the ingest execution pipeline.
+
+### 2.4 arXiv Search & PDF Download
+* **CLI/Skill Entrypoint**:
+  * CLI: `scholaraio search --arxiv` (runs `cmd_search` inside [search.py](../../scholaraio/interfaces/cli/search.py)) and `scholaraio paper fetch` to retrieve PDFs.
+  * Skill: `.claude/skills/search`, `.claude/skills/paper-guided-reading`
+* **Provider/Service Implementation Path**:
+  * [arxiv.py](../../scholaraio/providers/arxiv.py) (`_query_arxiv_api`, `download_arxiv_pdf`, and `batch_download`).
+* **Setup Diagnostics**:
+  * Setup checks verify internet connection and reachability of arXiv query export endpoints.
+* **Output Quality & Validation**:
+  * Parses response XML via `defusedxml.ElementTree` to prevent XML External Entity (XXE) vulnerabilities, mapping properties directly to `ArxivPaper` dataclasses.
+  * Performs client-side field filtration (`_filter_search_results`) on author, title, and abstract fields to tighten results returned by arXiv's loose matching API.
+* **Fallback Behavior**:
+  * Gracefully fails with standard warning logs if the arXiv endpoint is offline, returning empty results rather than hard crashes.
+* **Failure Handling**:
+  * A requests session is mounted with a custom `urllib3` retry adapter to handle transient `429`, `502`, `503`, and `504` status codes automatically.
+  * Enforces a polite rate limit delay `RATE_LIMIT_DELAY = 3.0` between successive paper downloads in batch modes.
+
+### 2.5 Zotero Integration (Web API & Local SQLite Import)
+* **CLI/Skill Entrypoint**:
+  * CLI: `scholaraio import-zotero` command (`cmd_import_zotero` inside [import_zotero.py](../../scholaraio/interfaces/cli/import_zotero.py)).
+  * Skill: `.claude/skills/import-zotero`
+* **Provider/Service Implementation Path**:
+  * [zotero.py](../../scholaraio/providers/zotero.py) (`fetch_zotero_api` for cloud Web API, `parse_zotero_local` for local SQLite databases).
+* **Setup Diagnostics**:
+  * Checked in `setup.py` by verifying presence of Zotero API credentials.
+* **Output Quality & Validation**:
+  * Maps Zotero types (e.g. `journalArticle`, `preprint`) to standard `PaperMetadata` types.
+  * Locates corresponding PDF attachments and copies them into the import directory.
+* **Fallback Behavior**:
+  * Supports local SQLite database import via `--local <path/to/sqlite>` if API keys are missing or the API is unreachable.
+  * Skips unresolvable attachments/links instead of failing the import.
+* **Failure Handling**:
+  * Catches `ImportError` on `pyzotero` to prompt users to install optional dependencies.
+  * Attachment download failures are caught per-item, logging warnings while continuing to parse the rest of the collection.

From a33d2295be645d3c5293e4c6f96adda4f79b00b2 Mon Sep 17 00:00:00 2001
From: Shlok148Dev <Shlok148Dev@users.noreply.github.com>
Date: Fri, 5 Jun 2026 00:17:15 +0530
Subject: [PATCH 3/8] docs: add Paper2Any explicitly as not-yet-reviewed in
 audit matrix

---
 docs/development/third-party-integration-audit.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/development/third-party-integration-audit.md b/docs/development/third-party-integration-audit.md
index 4050a525..9c99a408 100644
--- a/docs/development/third-party-integration-audit.md
+++ b/docs/development/third-party-integration-audit.md
@@ -14,6 +14,7 @@ Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoi
 | **GUILessBingSearch** | Web / Agent | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
 | **MinerU Local API** | Parsing | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
 | **MinerU Cloud CLI** | Parsing | **good** | `test_mineru.py` | Handles `mineru-open-api` subprocess calls; enforces filename constraints safely. |
+| **Paper2Any MCP Sidecar** | Parsing/MCP | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
 | **Docling Fallback** | Parsing | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
 | **PyMuPDF Fallback** | Parsing | **good** | `test_pdf_fallback.py` | Robust extraction fallback when default parser fails. |
 | **arXiv Search (Atom API)** | Discovery | **good** | `test_arxiv_source.py` | Atom XML parser is stable; query filters match client expectations. |

From 8c9836f89d68331b9583333940584b4b1e436cbc Mon Sep 17 00:00:00 2001
From: Shlok148Dev <Shlok148Dev@users.noreply.github.com>
Date: Fri, 5 Jun 2026 00:17:15 +0530
Subject: [PATCH 4/8] docs: document explicit config & version boundaries per
 matrix row

---
 .../third-party-integration-audit.md           | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/docs/development/third-party-integration-audit.md b/docs/development/third-party-integration-audit.md
index 9c99a408..fb6a6fc9 100644
--- a/docs/development/third-party-integration-audit.md
+++ b/docs/development/third-party-integration-audit.md
@@ -8,21 +8,21 @@ Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoi
 
 ## 1. Quality Matrix
 
-| Integration / Surface | Category | Status | Verification Path / Test Evidence | Observed Result / Boundaries |
+| Integration / Surface | Category | Status | Verification Path / Test Evidence | Observed Result / Config & Version Boundaries |
 | :--- | :--- | :--- | :--- | :--- |
-| **qt-web-extractor (HTTP & MCP)** | Web / Agent | **needs-cleanup** | `extract_web` / `tests/test_webtools_source.py` | Sanitized output successfully resolves table-cell code fence corruption on Wikipedia. |
+| **qt-web-extractor (HTTP & MCP)** | Web / Agent | **needs-cleanup** | `extract_web` / `tests/test_webtools_source.py` | Sanitized output successfully resolves table-cell code fence corruption. Boundaries: `webextract.transport` (HTTP/MCP), `webextract.base_url`, `webextract.mcp_url`, `webextract.api_key`. |
 | **GUILessBingSearch** | Web / Agent | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
 | **MinerU Local API** | Parsing | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
-| **MinerU Cloud CLI** | Parsing | **good** | `test_mineru.py` | Handles `mineru-open-api` subprocess calls; enforces filename constraints safely. |
+| **MinerU Cloud CLI** | Parsing | **good** | `test_mineru.py` | Handles `mineru-open-api` subprocess calls; enforces filename constraints safely. Boundaries: `ingest.mineru_api_key` / `MINERU_TOKEN`, `mineru-open-api` CLI package. |
 | **Paper2Any MCP Sidecar** | Parsing/MCP | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
 | **Docling Fallback** | Parsing | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
-| **PyMuPDF Fallback** | Parsing | **good** | `test_pdf_fallback.py` | Robust extraction fallback when default parser fails. |
-| **arXiv Search (Atom API)** | Discovery | **good** | `test_arxiv_source.py` | Atom XML parser is stable; query filters match client expectations. |
-| **arXiv PDF Download** | Discovery | **good** | `test_arxiv_source.py` | Enforces `RATE_LIMIT_DELAY = 3.0` between successive paper downloads. |
+| **PyMuPDF Fallback** | Parsing | **good** | `test_pdf_fallback.py` | Robust extraction fallback when default parser fails. Boundaries: `pymupdf` / `fitz` dependency presence. |
+| **arXiv Search (Atom API)** | Discovery | **good** | `test_arxiv_source.py` | Atom XML parser is stable; query filters match client expectations. Boundaries: HTTP requests to `export.arxiv.org`. |
+| **arXiv PDF Download** | Discovery | **good** | `test_arxiv_source.py` | Enforces `RATE_LIMIT_DELAY = 3.0` spacing in `batch_download`. Boundaries: `https://arxiv.org/pdf/` endpoint. |
 | **OpenAlex Explore** | Discovery | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
 | **Crossref / Semantic Scholar** | Discovery | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
-| **Zotero SQLite Import** | Import/Export | **good** | `test_workspace.py` | Parsed SQLite columns correctly map to `PaperMetadata`. |
-| **Zotero Web API** | Import/Export | **usable-with-caveats** | `fetch_zotero_api` / `import-zotero` | pyzotero retrieves metadata; linked/external attachments are skipped by design. |
+| **Zotero SQLite Import** | Import/Export | **good** | `test_workspace.py` | Parsed SQLite columns correctly map to `PaperMetadata`. Boundaries: Local `zotero.sqlite` database schema layout. |
+| **Zotero Web API** | Import/Export | **usable-with-caveats** | `fetch_zotero_api` / `import-zotero` | pyzotero retrieves metadata; linked/external attachments are skipped by design. Boundaries: `zotero.api_key`, `zotero.library_id`, `ZOTERO_API_KEY`, `ZOTERO_LIBRARY_ID`. |
 | **EndNote / RIS** | Import/Export | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
 | **USPTO ODP / PPubs** | Patents | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
 | **OpenAI-compatible Chat API** | LLM Backend | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
@@ -38,7 +38,7 @@ Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoi
 | **Scientific Toolref (Quantum ESPRESSO, etc.)** | Toolref | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
 | **AmberTools / PyMOL** | Scientific | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
 | **rsync / SSH Backup** | System | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
-| **Setup Diagnostics** | System | **good** | `test_setup.py` | Reports dependency presence and credential state in bilingual strings. |
+| **Setup Diagnostics** | System | **good** | `test_setup.py` | Reports dependency presence and credential state in bilingual strings. Boundaries: CLI `scholaraio setup check` / `run_check` path. |
 
 ---
 

From 80a584cfa8416ec7b40773ce3917ced7e13059a6 Mon Sep 17 00:00:00 2001
From: Shlok148Dev <Shlok148Dev@users.noreply.github.com>
Date: Fri, 5 Jun 2026 00:20:16 +0530
Subject: [PATCH 5/8] fix: constrain table code fence cleanup to avoid matching
 across empty lines

---
 scholaraio/providers/webtools.py |  4 ++++
 tests/test_webtools_source.py    | 14 ++++++++++++++
 2 files changed, 18 insertions(+)

diff --git a/scholaraio/providers/webtools.py b/scholaraio/providers/webtools.py
index df37751c..e9431f97 100644
--- a/scholaraio/providers/webtools.py
+++ b/scholaraio/providers/webtools.py
@@ -590,6 +590,10 @@ def _clean_table_code_fences(text: str) -> str:
     )
 
     def replace_match(match):
+        full_match = match.group(0)
+        if re.search(r"\n\s*\n", full_match):
+            return full_match
+
         before = match.group(1).replace("\n", " ").strip()
         code_content = match.group(2).replace("\n", " ").strip()
         after = match.group(3).replace("\n", " ").strip()
diff --git a/tests/test_webtools_source.py b/tests/test_webtools_source.py
index 650573e0..6c8447bd 100644
--- a/tests/test_webtools_source.py
+++ b/tests/test_webtools_source.py
@@ -821,6 +821,20 @@ def test_clean_table_code_fences_ignores_normal_structures(self):
         )
         assert _clean_table_code_fences(normal_table) == normal_table
 
+        # Test standalone code block between tables should not be changed
+        standalone_between_tables = (
+            "| A | B |\n"
+            "| --- | --- |\n"
+            "| one | two |\n\n"
+            "```python\n"
+            "print(1)\n"
+            "```\n\n"
+            "| C | D |\n"
+            "| --- | --- |\n"
+            "| three | four |\n"
+        )
+        assert _clean_table_code_fences(standalone_between_tables) == standalone_between_tables
+
     def test_extract_web_applies_cleanup_http(self, monkeypatch):
         # Verify that HTTP path runs the clean helper
         def fake_urlopen(req, timeout=0):

From 9fe1632be7b48492772a6736eadf5bb8cb493c69 Mon Sep 17 00:00:00 2001
From: Shlok148Dev <Shlok148Dev@users.noreply.github.com>
Date: Fri, 5 Jun 2026 23:16:03 +0530
Subject: [PATCH 6/8] fix: row-scoped table cell code fence cleanup & correct
 audit doc

---
 .../third-party-integration-audit.md          |  36 ++---
 scholaraio/providers/webtools.py              | 145 ++++++++++++++----
 tests/test_webtools_source.py                 |  22 +--
 3 files changed, 139 insertions(+), 64 deletions(-)

diff --git a/docs/development/third-party-integration-audit.md b/docs/development/third-party-integration-audit.md
index fb6a6fc9..e62eb82e 100644
--- a/docs/development/third-party-integration-audit.md
+++ b/docs/development/third-party-integration-audit.md
@@ -10,19 +10,19 @@ Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoi
 
 | Integration / Surface | Category | Status | Verification Path / Test Evidence | Observed Result / Config & Version Boundaries |
 | :--- | :--- | :--- | :--- | :--- |
-| **qt-web-extractor (HTTP & MCP)** | Web / Agent | **needs-cleanup** | `extract_web` / `tests/test_webtools_source.py` | Sanitized output successfully resolves table-cell code fence corruption. Boundaries: `webextract.transport` (HTTP/MCP), `webextract.base_url`, `webextract.mcp_url`, `webextract.api_key`. |
+| **qt-web-extractor (HTTP & MCP)** | Web / Agent | **good** | `extract_web` / `tests/test_webtools_source.py` | Sanitized output successfully resolves table-cell code fence corruption. Boundaries: `webextract.transport` (HTTP/MCP), `webextract.base_url`, `webextract.mcp_url`, `webextract.api_key`. |
 | **GUILessBingSearch** | Web / Agent | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
 | **MinerU Local API** | Parsing | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
-| **MinerU Cloud CLI** | Parsing | **good** | `test_mineru.py` | Handles `mineru-open-api` subprocess calls; enforces filename constraints safely. Boundaries: `ingest.mineru_api_key` / `MINERU_TOKEN`, `mineru-open-api` CLI package. |
+| **MinerU Cloud CLI** | Parsing | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
 | **Paper2Any MCP Sidecar** | Parsing/MCP | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
 | **Docling Fallback** | Parsing | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
-| **PyMuPDF Fallback** | Parsing | **good** | `test_pdf_fallback.py` | Robust extraction fallback when default parser fails. Boundaries: `pymupdf` / `fitz` dependency presence. |
-| **arXiv Search (Atom API)** | Discovery | **good** | `test_arxiv_source.py` | Atom XML parser is stable; query filters match client expectations. Boundaries: HTTP requests to `export.arxiv.org`. |
-| **arXiv PDF Download** | Discovery | **good** | `test_arxiv_source.py` | Enforces `RATE_LIMIT_DELAY = 3.0` spacing in `batch_download`. Boundaries: `https://arxiv.org/pdf/` endpoint. |
+| **PyMuPDF Fallback** | Parsing | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
+| **arXiv Search (Atom API)** | Discovery | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
+| **arXiv PDF Download** | Discovery | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
 | **OpenAlex Explore** | Discovery | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
 | **Crossref / Semantic Scholar** | Discovery | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
-| **Zotero SQLite Import** | Import/Export | **good** | `test_workspace.py` | Parsed SQLite columns correctly map to `PaperMetadata`. Boundaries: Local `zotero.sqlite` database schema layout. |
-| **Zotero Web API** | Import/Export | **usable-with-caveats** | `fetch_zotero_api` / `import-zotero` | pyzotero retrieves metadata; linked/external attachments are skipped by design. Boundaries: `zotero.api_key`, `zotero.library_id`, `ZOTERO_API_KEY`, `ZOTERO_LIBRARY_ID`. |
+| **Zotero SQLite Import** | Import/Export | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
+| **Zotero Web API** | Import/Export | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
 | **EndNote / RIS** | Import/Export | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
 | **USPTO ODP / PPubs** | Patents | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
 | **OpenAI-compatible Chat API** | LLM Backend | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
@@ -38,7 +38,7 @@ Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoi
 | **Scientific Toolref (Quantum ESPRESSO, etc.)** | Toolref | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
 | **AmberTools / PyMOL** | Scientific | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
 | **rsync / SSH Backup** | System | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
-| **Setup Diagnostics** | System | **good** | `test_setup.py` | Reports dependency presence and credential state in bilingual strings. Boundaries: CLI `scholaraio setup check` / `run_check` path. |
+| **Setup Diagnostics** | System | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
 
 ---
 
@@ -46,15 +46,15 @@ Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoi
 
 ### 2.1 qt-web-extractor (HTTP & MCP)
 * **CLI/Skill Entrypoint**:
-  * CLI: `scholaraio webextract <url>` (implemented in `cmd_webextract` inside [web.py](file:///c:/Users/hp/Desktop/Scholara_oss/scholaraio/interfaces/cli/web.py))
+  * CLI: `scholaraio webextract <url>` (implemented in `cmd_webextract` inside [web.py](../../scholaraio/interfaces/cli/web.py))
   * Skill: `.claude/skills/webextract`
 * **Provider/Service Implementation Path**:
-  * [webtools.py:extract_web](file:///c:/Users/hp/Desktop/Scholara_oss/scholaraio/providers/webtools.py#L613-L673)
+  * [webtools.py:extract_web](../../scholaraio/providers/webtools.py)
 * **Setup Diagnostics**:
-  * Tested via `scholaraio setup check` (calls `_optional_webtool_detail` inside [setup.py](file:///c:/Users/hp/Desktop/Scholara_oss/scholaraio/services/setup.py#L617-L665)), which executes `check_webextract_service` to verify that the HTTP/MCP endpoint responds.
+  * Tested via `scholaraio setup check` (calls `_optional_webtool_detail` inside [setup.py](../../scholaraio/services/setup.py)), which executes `check_webextract_service` to verify that the HTTP/MCP endpoint responds.
 * **Output Quality & Validation**:
   * Outputs parsed GFM Markdown. Output quality is protected by `_clean_table_code_fences` to sanitize malformed block code fences in Wikipedia/infobox table cells, resolving broken table rendering.
-  * Verified via raw/cleaned fixtures: [wikipedia_infobox_bad.md](file:///c:/Users/hp/Desktop/Scholara_oss/tests/fixtures/wikipedia_infobox_bad.md) and [wikipedia_infobox_clean.md](file:///c:/Users/hp/Desktop/Scholara_oss/tests/fixtures/wikipedia_infobox_clean.md).
+  * Verified via raw/cleaned fixtures: [wikipedia_infobox_bad.md](../../tests/fixtures/wikipedia_infobox_bad.md) and [wikipedia_infobox_clean.md](../../tests/fixtures/wikipedia_infobox_clean.md).
 * **Fallback Behavior**:
   * Configured via `webextract.transport` (HTTP or MCP). When configured as HTTP, failure to connect triggers fallback hint to MCP or setup checks.
 * **Failure Handling**:
@@ -66,7 +66,7 @@ Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoi
   * CLI: `scholaraio ingest <pdf>` or `scholaraio/providers/mineru.py` main parser CLI.
   * Skill: `.claude/skills/ingest`
 * **Provider/Service Implementation Path**:
-  * [mineru.py:convert_pdf_cloud](file:///c:/Users/hp/Desktop/Scholara_oss/scholaraio/providers/mineru.py#L702-L810)
+  * [mineru.py:convert_pdf_cloud](../../scholaraio/providers/mineru.py)
 * **Setup Diagnostics**:
   * Checked under `scholaraio setup check` via `_detect_mineru` which verifies presence of `mineru-open-api` in system path (`shutil.which`) and reads credential key values.
 * **Output Quality & Validation**:
@@ -84,7 +84,7 @@ Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoi
 * **CLI/Skill Entrypoint**:
   * CLI: Invoked automatically as part of PDF ingestion when MinerU fails, or manually by setting `pdf_preferred_parser: pymupdf`.
 * **Provider/Service Implementation Path**:
-  * [pdf_fallback.py:run_pymupdf](file:///c:/Users/hp/Desktop/Scholara_oss/scholaraio/providers/pdf_fallback.py#L142-L160)
+  * [pdf_fallback.py:run_pymupdf](../../scholaraio/providers/pdf_fallback.py)
 * **Setup Diagnostics**:
   * Checked in `scholaraio setup check` via `_check_dep_group("fitz")`.
 * **Output Quality & Validation**:
@@ -96,10 +96,10 @@ Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoi
 
 ### 2.4 arXiv Search & PDF Download
 * **CLI/Skill Entrypoint**:
-  * CLI: `scholaraio search --arxiv` (runs `cmd_search` inside [search.py](file:///c:/Users/hp/Desktop/Scholara_oss/scholaraio/interfaces/cli/search.py)) and `scholaraio paper fetch` to retrieve PDFs.
+  * CLI: `scholaraio arxiv search` and `scholaraio arxiv fetch` (defined in [arxiv.py](../../scholaraio/interfaces/cli/arxiv.py)), or `scholaraio fsearch --scope arxiv` for federated search.
   * Skill: `.claude/skills/search`, `.claude/skills/paper-guided-reading`
 * **Provider/Service Implementation Path**:
-  * [arxiv.py](file:///c:/Users/hp/Desktop/Scholara_oss/scholaraio/providers/arxiv.py) (`_query_arxiv_api`, `download_arxiv_pdf`, and `batch_download`).
+  * [arxiv.py](../../scholaraio/providers/arxiv.py) (`_query_arxiv_api`, `download_arxiv_pdf`, and `batch_download`).
 * **Setup Diagnostics**:
   * Setup checks verify internet connection and reachability of arXiv query export endpoints.
 * **Output Quality & Validation**:
@@ -113,10 +113,10 @@ Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoi
 
 ### 2.5 Zotero Integration (Web API & Local SQLite Import)
 * **CLI/Skill Entrypoint**:
-  * CLI: `scholaraio import-zotero` command (`cmd_import_zotero` inside [import_zotero.py](file:///c:/Users/hp/Desktop/Scholara_oss/scholaraio/interfaces/cli/import_zotero.py)).
+  * CLI: `scholaraio import-zotero` command (`cmd_import_zotero` inside [import_zotero.py](../../scholaraio/interfaces/cli/import_zotero.py)).
   * Skill: `.claude/skills/import-zotero`
 * **Provider/Service Implementation Path**:
-  * [zotero.py](file:///c:/Users/hp/Desktop/Scholara_oss/scholaraio/providers/zotero.py) (`fetch_zotero_api` for cloud Web API, `parse_zotero_local` for local SQLite databases).
+  * [zotero.py](../../scholaraio/providers/zotero.py) (`fetch_zotero_api` for cloud Web API, `parse_zotero_local` for local SQLite databases).
 * **Setup Diagnostics**:
   * Checked in `setup.py` by verifying presence of Zotero API credentials.
 * **Output Quality & Validation**:
diff --git a/scholaraio/providers/webtools.py b/scholaraio/providers/webtools.py
index e9431f97..1722a321 100644
--- a/scholaraio/providers/webtools.py
+++ b/scholaraio/providers/webtools.py
@@ -572,6 +572,53 @@ def _extract_web_mcp(url: str, *, cfg: Config | None, timeout: float) -> dict:
     }
 
 
+def _clean_single_row(row_text: str) -> str:
+    cells = row_text.split("|")
+    cleaned_cells = []
+
+    for i, cell in enumerate(cells):
+        if i == 0 and not cell.strip():
+            cleaned_cells.append(cell)
+            continue
+        if i == len(cells) - 1 and not cell.strip() and row_text.endswith("|"):
+            cleaned_cells.append(cell)
+            continue
+
+        if "```" in cell:
+            fence_count = cell.count("```")
+            cell_to_clean = cell + "\n```" if fence_count % 2 != 0 else cell
+            parts = cell_to_clean.split("```")
+            cleaned_parts = []
+            for j, part in enumerate(parts):
+                if j % 2 == 0:
+                    cleaned_parts.append(part.replace("\n", " "))
+                else:
+                    block = part
+                    if block.startswith("\n"):
+                        block = block[1:]
+                    else:
+                        block_lines = block.split("\n", 1)
+                        if len(block_lines) > 1:
+                            first_line = block_lines[0].strip()
+                            if re.match(r"^[a-zA-Z0-9_-]+$", first_line):
+                                block = block_lines[1]
+                    block_clean = block.replace("\n", " ").strip()
+                    if block_clean:
+                        cleaned_parts.append(f"`{block_clean}`")
+                    else:
+                        cleaned_parts.append("")
+            cleaned_cell = "".join(cleaned_parts)
+            cleaned_cell = " " + cleaned_cell.strip() + " "
+            cleaned_cells.append(cleaned_cell)
+        else:
+            cleaned_cells.append(cell.replace("\n", " "))
+
+    res = "|".join(cleaned_cells)
+    if not res.endswith("|"):
+        res += "|"
+    return res
+
+
 def _clean_table_code_fences(text: str) -> str:
     """Sanitize Markdown table cells that contain block-level code blocks/fences.
 
@@ -583,35 +630,77 @@ def _clean_table_code_fences(text: str) -> str:
     if not text:
         return ""
 
-    # Pattern to match a code block inside a table cell (bounded by pipes)
-    pattern = re.compile(
-        r"\|([^|]*?)```(?:[a-zA-Z0-9_-]*)\n(.*?)\n\s*```([^|]*?)\|",
-        re.DOTALL
-    )
+    lines = text.splitlines()
+    cleaned_lines = []
+    current_row_lines = []
+    in_multiline_row = False
+    in_code_block = False
+
+    def flush_current_row():
+        nonlocal in_multiline_row, current_row_lines, in_code_block
+        if current_row_lines:
+            row_text = "\n".join(current_row_lines)
+            cleaned_row = _clean_single_row(row_text)
+            cleaned_lines.append(cleaned_row)
+            current_row_lines = []
+        in_multiline_row = False
+        in_code_block = False
+
+    for line in lines:
+        stripped = line.strip()
+
+        if in_multiline_row:
+            num_fences = stripped.count("```")
+            if stripped.startswith("|") and (stripped.count("|") >= 2 or "```" in stripped):
+                flush_current_row()
+                # fall through to process as a new row start below
+            else:
+                if num_fences % 2 != 0:
+                    in_code_block = not in_code_block
+
+                if not in_code_block:
+                    if stripped.endswith("|"):
+                        current_row_lines.append(line)
+                        flush_current_row()
+                        continue
+                    elif not stripped:
+                        flush_current_row()
+                        cleaned_lines.append(line)
+                        continue
+                    elif stripped.startswith("```"):
+                        flush_current_row()
+                        # fall through to process as normal
+
+                if in_multiline_row:
+                    current_row_lines.append(line)
+                    continue
+
+        if stripped.startswith("|") and (stripped.count("|") >= 2 or "```" in stripped):
+            if "```" in stripped:
+                num_fences = stripped.count("```")
+                in_code = num_fences % 2 != 0
+                if not in_code and stripped.endswith("|"):
+                    cleaned_lines.append(_clean_single_row(line))
+                else:
+                    in_multiline_row = True
+                    in_code_block = in_code
+                    current_row_lines = [line]
+            else:
+                if stripped.endswith("|"):
+                    cleaned_lines.append(line)
+                else:
+                    in_multiline_row = True
+                    in_code_block = False
+                    current_row_lines = [line]
+        else:
+            cleaned_lines.append(line)
 
-    def replace_match(match):
-        full_match = match.group(0)
-        if re.search(r"\n\s*\n", full_match):
-            return full_match
-
-        before = match.group(1).replace("\n", " ").strip()
-        code_content = match.group(2).replace("\n", " ").strip()
-        after = match.group(3).replace("\n", " ").strip()
-        
-        # Format the code content as inline code
-        inline_code = f"`{code_content}`" if code_content else ""
-        
-        # Assemble the cleaned cell components
-        parts = [p for p in (before, inline_code, after) if p]
-        cleaned_cell = " " + " ".join(parts) + " "
-        return f"|{cleaned_cell}|"
-
-    cleaned = text
-    prev = ""
-    while cleaned != prev:
-        prev = cleaned
-        cleaned = pattern.sub(replace_match, cleaned)
-    return cleaned
+    flush_current_row()
+
+    result = "\n".join(cleaned_lines)
+    if text.endswith("\n") and not result.endswith("\n"):
+        result += "\n"
+    return result
 
 
 def extract_web(
diff --git a/tests/test_webtools_source.py b/tests/test_webtools_source.py
index 6c8447bd..7da2d0e8 100644
--- a/tests/test_webtools_source.py
+++ b/tests/test_webtools_source.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import json
+import pathlib
 
 import pytest
 
@@ -783,7 +784,6 @@ def fake_urlopen(req, timeout=0):
         assert "markdown body" in captured.out
 
     def test_clean_table_code_fences_with_fixtures(self):
-        import pathlib
         from scholaraio.providers.webtools import _clean_table_code_fences
 
         fixtures_dir = pathlib.Path(__file__).parent / "fixtures"
@@ -803,22 +803,11 @@ def test_clean_table_code_fences_ignores_normal_structures(self):
         from scholaraio.providers.webtools import _clean_table_code_fences
 
         # Test normal code block outside table should not be changed
-        normal_code = (
-            "Here is a code snippet:\n"
-            "```python\n"
-            "def test():\n"
-            "    return True\n"
-            "```\n"
-            "And here is normal text."
-        )
+        normal_code = "Here is a code snippet:\n```python\ndef test():\n    return True\n```\nAnd here is normal text."
         assert _clean_table_code_fences(normal_code) == normal_code
 
         # Test normal table with inline code should not be changed
-        normal_table = (
-            "| Column 1 | Column 2 |\n"
-            "| --- | --- |\n"
-            "| `inline code` | value |\n"
-        )
+        normal_table = "| Column 1 | Column 2 |\n| --- | --- |\n| `inline code` | value |\n"
         assert _clean_table_code_fences(normal_table) == normal_table
 
         # Test standalone code block between tables should not be changed
@@ -838,10 +827,7 @@ def test_clean_table_code_fences_ignores_normal_structures(self):
     def test_extract_web_applies_cleanup_http(self, monkeypatch):
         # Verify that HTTP path runs the clean helper
         def fake_urlopen(req, timeout=0):
-            return _FakeResponse({
-                "title": "Page",
-                "text": "| 性别 |\n| 出生 | ```\n1902\n``` |"
-            })
+            return _FakeResponse({"title": "Page", "text": "| 性别 |\n| 出生 | ```\n1902\n``` |"})
 
         def fake_check_service(cfg, timeout=3.0):
             return True

From 6d6bf2c819b230ff691263917806360f821e22db Mon Sep 17 00:00:00 2001
From: lzmo <zimoliao@mail.ustc.edu.cn>
Date: Sun, 7 Jun 2026 00:09:58 +0800
Subject: [PATCH 7/8] Tighten integration audit PR validation (#110)

- Keep qt-web-extractor audit claims within available fixture evidence

- Add regression coverage for adjacent standalone fenced code blocks

- Fix mypy inference for row cleanup state
---
 .../third-party-integration-audit.md          | 88 ++++---------------
 scholaraio/providers/webtools.py              |  4 +-
 tests/test_webtools_source.py                 |  5 ++
 3 files changed, 25 insertions(+), 72 deletions(-)

diff --git a/docs/development/third-party-integration-audit.md b/docs/development/third-party-integration-audit.md
index e62eb82e..ed60d708 100644
--- a/docs/development/third-party-integration-audit.md
+++ b/docs/development/third-party-integration-audit.md
@@ -1,16 +1,22 @@
 # ScholarAIO Third-Party Integration Quality Audit
 
-This document records the quality, reachability, and output validation status of the third-party integrations, APIs, CLIs, and optional toolchains supported by ScholarAIO. 
+This document records the quality, reachability, and output validation status of the third-party integrations, APIs, CLIs, and optional toolchains supported by ScholarAIO.
 
 Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoints, provider implementations, setup diagnostics, output formatting, fallback behaviors, and failure handling. A config test or a broad unit-test filename is not enough evidence to mark an integration surface as Good.
 
+Status is intentionally conservative:
+
+- **good**: workflow-boundary evidence exists, including commands, representative output, and failure handling.
+- **partially-reviewed**: code-level or fixture evidence exists, but live workflow evidence is still missing.
+- **not-yet-reviewed**: inventory only; no quality claim is made.
+
 ---
 
 ## 1. Quality Matrix
 
 | Integration / Surface | Category | Status | Verification Path / Test Evidence | Observed Result / Config & Version Boundaries |
 | :--- | :--- | :--- | :--- | :--- |
-| **qt-web-extractor (HTTP & MCP)** | Web / Agent | **good** | `extract_web` / `tests/test_webtools_source.py` | Sanitized output successfully resolves table-cell code fence corruption. Boundaries: `webextract.transport` (HTTP/MCP), `webextract.base_url`, `webextract.mcp_url`, `webextract.api_key`. |
+| **qt-web-extractor (HTTP & MCP)** | Web / Agent | **partially-reviewed** | `extract_web`, `_clean_table_code_fences`, `tests/test_webtools_source.py`, fixture pair under `tests/fixtures/` | Sanitizer regression is covered for malformed table-cell code fences and adjacent standalone code blocks. Live daemon canary evidence is still required before this surface is promoted to `good`. Boundaries: `webextract.transport` (HTTP/MCP), `webextract.base_url`, `webextract.mcp_url`, `webextract.api_key`. |
 | **GUILessBingSearch** | Web / Agent | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
 | **MinerU Local API** | Parsing | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
 | **MinerU Cloud CLI** | Parsing | **not-yet-reviewed** | N/A | Excluded from current triage phase. |
@@ -42,7 +48,7 @@ Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoi
 
 ---
 
-## 2. Detailed Integration Audits (Workflow Boundary Analysis)
+## 2. Current Reviewed Surface
 
 ### 2.1 qt-web-extractor (HTTP & MCP)
 * **CLI/Skill Entrypoint**:
@@ -51,80 +57,22 @@ Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoi
 * **Provider/Service Implementation Path**:
   * [webtools.py:extract_web](../../scholaraio/providers/webtools.py)
 * **Setup Diagnostics**:
-  * Tested via `scholaraio setup check` (calls `_optional_webtool_detail` inside [setup.py](../../scholaraio/services/setup.py)), which executes `check_webextract_service` to verify that the HTTP/MCP endpoint responds.
+  * A diagnostic path exists through `scholaraio setup check` (which calls `_optional_webtool_detail` inside [setup.py](../../scholaraio/services/setup.py)), which executes `check_webextract_service` to verify that the HTTP/MCP endpoint responds. This PR does not include live daemon evidence from that path.
 * **Output Quality & Validation**:
   * Outputs parsed GFM Markdown. Output quality is protected by `_clean_table_code_fences` to sanitize malformed block code fences in Wikipedia/infobox table cells, resolving broken table rendering.
-  * Verified via raw/cleaned fixtures: [wikipedia_infobox_bad.md](../../tests/fixtures/wikipedia_infobox_bad.md) and [wikipedia_infobox_clean.md](../../tests/fixtures/wikipedia_infobox_clean.md).
+  * Verified via unit and fixture coverage: [wikipedia_infobox_bad.md](../../tests/fixtures/wikipedia_infobox_bad.md), [wikipedia_infobox_clean.md](../../tests/fixtures/wikipedia_infobox_clean.md), and regression tests for standalone fenced code blocks near table or pipe-prefixed lines.
 * **Fallback Behavior**:
   * Configured via `webextract.transport` (HTTP or MCP). When configured as HTTP, failure to connect triggers fallback hint to MCP or setup checks.
 * **Failure Handling**:
   * Unreachable HTTP endpoints raise `WebExtractServiceUnavailableError`, returning a clean user-facing hint with exit code `1`.
   * API/Server errors raise `WebExtractError`, showing warnings/errors instead of generic crashes.
 
-### 2.2 MinerU Cloud CLI (`mineru-open-api`)
-* **CLI/Skill Entrypoint**:
-  * CLI: `scholaraio ingest <pdf>` or `scholaraio/providers/mineru.py` main parser CLI.
-  * Skill: `.claude/skills/ingest`
-* **Provider/Service Implementation Path**:
-  * [mineru.py:convert_pdf_cloud](../../scholaraio/providers/mineru.py)
-* **Setup Diagnostics**:
-  * Checked under `scholaraio setup check` via `_detect_mineru` which verifies presence of `mineru-open-api` in system path (`shutil.which`) and reads credential key values.
-* **Output Quality & Validation**:
-  * Translates PDF structures to Markdown with images/formulas.
-  * Sanitizes cloud upload filenames via `_cloud_safe_pdf_name` to prevent platform-specific characters from crashing the extraction.
-  * Handles chunk merging for multi-part large PDF parsing.
-* **Fallback Behavior**:
-  * When MinerU is missing or fails, it falls back to the list of alternatives defined in the configuration option `pdf_fallback_order` (e.g. `["docling", "pymupdf"]`).
-* **Failure Handling**:
-  * Subprocess timeouts (`subprocess.TimeoutExpired`) are caught.
-  * Non-zero return codes from `mineru-open-api` raise descriptive errors containing stderr output.
-  * Retries are handled with exponential backoff (`attempts` based on `mineru_upload_retries`).
+## 3. Not-Yet-Reviewed Inventory
 
-### 2.3 PyMuPDF Fallback (`fitz`)
-* **CLI/Skill Entrypoint**:
-  * CLI: Invoked automatically as part of PDF ingestion when MinerU fails, or manually by setting `pdf_preferred_parser: pymupdf`.
-* **Provider/Service Implementation Path**:
-  * [pdf_fallback.py:run_pymupdf](../../scholaraio/providers/pdf_fallback.py)
-* **Setup Diagnostics**:
-  * Checked in `scholaraio setup check` via `_check_dep_group("fitz")`.
-* **Output Quality & Validation**:
-  * Extracts page-by-page flat plaintext with page headers (`## Page N\n\n`). Lacks complex block structure formatting but acts as a highly reliable baseline.
-* **Fallback Behavior**:
-  * Represents the last-resort fallback in the fallback parser chain (since it has no model/server dependencies).
-* **Failure Handling**:
-  * Catches general exception and formats error messages, skipping page crashes or file read errors gracefully without aborting the ingest execution pipeline.
+Rows marked `not-yet-reviewed` in the matrix are intentionally inventory-only. Promoting any of them to `partially-reviewed` or `good` should happen in a focused follow-up that includes:
 
-### 2.4 arXiv Search & PDF Download
-* **CLI/Skill Entrypoint**:
-  * CLI: `scholaraio arxiv search` and `scholaraio arxiv fetch` (defined in [arxiv.py](../../scholaraio/interfaces/cli/arxiv.py)), or `scholaraio fsearch --scope arxiv` for federated search.
-  * Skill: `.claude/skills/search`, `.claude/skills/paper-guided-reading`
-* **Provider/Service Implementation Path**:
-  * [arxiv.py](../../scholaraio/providers/arxiv.py) (`_query_arxiv_api`, `download_arxiv_pdf`, and `batch_download`).
-* **Setup Diagnostics**:
-  * Setup checks verify internet connection and reachability of arXiv query export endpoints.
-* **Output Quality & Validation**:
-  * Parses response XML via `defusedxml.ElementTree` to prevent XML External Entity (XXE) vulnerabilities, mapping properties directly to `ArxivPaper` dataclasses.
-  * Performs client-side field filtration (`_filter_search_results`) on author, title, and abstract fields to tighten results returned by arXiv's loose matching API.
-* **Fallback Behavior**:
-  * Gracefully fails with standard warning logs if the arXiv endpoint is offline, returning empty results rather than hard crashes.
-* **Failure Handling**:
-  * A requests session is mounted with a custom `urllib3` retry adapter to handle transient `429`, `502`, `503`, and `504` status codes automatically.
-  * Enforces a polite rate limit delay `RATE_LIMIT_DELAY = 3.0` between successive paper downloads in batch modes.
-
-### 2.5 Zotero Integration (Web API & Local SQLite Import)
-* **CLI/Skill Entrypoint**:
-  * CLI: `scholaraio import-zotero` command (`cmd_import_zotero` inside [import_zotero.py](../../scholaraio/interfaces/cli/import_zotero.py)).
-  * Skill: `.claude/skills/import-zotero`
-* **Provider/Service Implementation Path**:
-  * [zotero.py](../../scholaraio/providers/zotero.py) (`fetch_zotero_api` for cloud Web API, `parse_zotero_local` for local SQLite databases).
-* **Setup Diagnostics**:
-  * Checked in `setup.py` by verifying presence of Zotero API credentials.
-* **Output Quality & Validation**:
-  * Maps Zotero types (e.g. `journalArticle`, `preprint`) to standard `PaperMetadata` types.
-  * Locates corresponding PDF attachments and copies them into the import directory.
-* **Fallback Behavior**:
-  * Supports local SQLite database import via `--local <path/to/sqlite>` if API keys are missing or the API is unreachable.
-  * Skips unresolvable attachments/links instead of failing the import.
-* **Failure Handling**:
-  * Catches `ImportError` on `pyzotero` to prompt users to install optional dependencies.
-  * Attachment download failures are caught per-item, logging warnings while continuing to parse the rest of the collection.
+- exact CLI command or skill workflow exercised;
+- relevant config/version boundaries;
+- representative success output;
+- failure-mode behavior;
+- targeted tests or reproducible smoke evidence.
diff --git a/scholaraio/providers/webtools.py b/scholaraio/providers/webtools.py
index 1722a321..b366335f 100644
--- a/scholaraio/providers/webtools.py
+++ b/scholaraio/providers/webtools.py
@@ -631,8 +631,8 @@ def _clean_table_code_fences(text: str) -> str:
         return ""
 
     lines = text.splitlines()
-    cleaned_lines = []
-    current_row_lines = []
+    cleaned_lines: list[str] = []
+    current_row_lines: list[str] = []
     in_multiline_row = False
     in_code_block = False
 
diff --git a/tests/test_webtools_source.py b/tests/test_webtools_source.py
index 7da2d0e8..ee212ac4 100644
--- a/tests/test_webtools_source.py
+++ b/tests/test_webtools_source.py
@@ -824,6 +824,11 @@ def test_clean_table_code_fences_ignores_normal_structures(self):
         )
         assert _clean_table_code_fences(standalone_between_tables) == standalone_between_tables
 
+        adjacent_standalone_code = (
+            "| A | B |\n| one | two |\n```python\nprint(1)\n```\n| next paragraph starts with pipe |\n"
+        )
+        assert _clean_table_code_fences(adjacent_standalone_code) == adjacent_standalone_code
+
     def test_extract_web_applies_cleanup_http(self, monkeypatch):
         # Verify that HTTP path runs the clean helper
         def fake_urlopen(req, timeout=0):

From 32c198b4208630fe93ef449a3016b263b59b6075 Mon Sep 17 00:00:00 2001
From: lzmo <zimoliao@mail.ustc.edu.cn>
Date: Mon, 8 Jun 2026 21:57:49 +0800
Subject: [PATCH 8/8] Tighten webextract table cleanup coverage

---
 CHANGELOG.md                                  |  4 +++
 .../third-party-integration-audit.md          |  2 ++
 scholaraio/providers/webtools.py              | 26 +++++++++++++++++--
 tests/test_webtools_source.py                 |  3 +++
 4 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 18603744..7706f5c4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/).
 
 - **Nature workflow bridge skill** ([#107](https://github.com/ZimoLiao/scholaraio/issues/107)): Added a ScholarAIO `nature-workflow` bridge skill that routes Nature Portfolio writing and figure workflows to the upstream `nature-skills` repository when installed, keeps ScholarAIO-native fallbacks explicit, documents the install and quick-start path, and includes deterministic plus product-demo fixtures that generate reviewable manuscript, figure, slide, and QA artifacts.
 
+### Fixed
+
+- **Webextract Markdown table-cell cleanup** ([#110](https://github.com/ZimoLiao/scholaraio/pull/110)): Sanitized the malformed block-level code fences that `qt-web-extractor` emits inside Markdown table cells, so HTTP/MCP extraction results are cleaned before they reach `webextract` and ingest consumers; standalone fenced code blocks and pipe characters inside code-cell content are preserved.
+
 ## [1.5.0] — 2026-05-24
 
 ### Added
diff --git a/docs/development/third-party-integration-audit.md b/docs/development/third-party-integration-audit.md
index ed60d708..9ddf38ea 100644
--- a/docs/development/third-party-integration-audit.md
+++ b/docs/development/third-party-integration-audit.md
@@ -4,6 +4,8 @@ This document records the quality, reachability, and output validation status of
 
 Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoints, provider implementations, setup diagnostics, output formatting, fallback behaviors, and failure handling. A config test or a broad unit-test filename is not enough evidence to mark an integration surface as Good.
 
+This audit is not a declaration that the full third-party toolchain is adapted or verified. Each row claims only the evidence listed in that row; everything else remains unverified inventory until a focused live or workflow-boundary pass verifies it.
+
 Status is intentionally conservative:
 
 - **good**: workflow-boundary evidence exists, including commands, representative output, and failure handling.
diff --git a/scholaraio/providers/webtools.py b/scholaraio/providers/webtools.py
index b366335f..22fcaf04 100644
--- a/scholaraio/providers/webtools.py
+++ b/scholaraio/providers/webtools.py
@@ -572,9 +572,31 @@ def _extract_web_mcp(url: str, *, cfg: Config | None, timeout: float) -> dict:
     }
 
 
+def _split_table_row_cells(row_text: str) -> list[str]:
+    """Split a row on ``|``; a pipe between triple-backtick markers stays in its cell."""
+    pieces: list[str] = []
+    buffer: list[str] = []
+    inside_fence = False
+    pos, end = 0, len(row_text)
+    while pos < end:
+        if row_text.startswith("```", pos):
+            inside_fence = not inside_fence
+            buffer.append("```")
+            pos += 3
+        elif row_text[pos] == "|" and not inside_fence:
+            pieces.append("".join(buffer))
+            buffer = []
+            pos += 1
+        else:
+            buffer.append(row_text[pos])
+            pos += 1
+    pieces.append("".join(buffer))
+    return pieces
+
+
 def _clean_single_row(row_text: str) -> str:
-    cells = row_text.split("|")
-    cleaned_cells = []
+    cells = _split_table_row_cells(row_text)
+    cleaned_cells: list[str] = []
 
     for i, cell in enumerate(cells):
         if i == 0 and not cell.strip():
diff --git a/tests/test_webtools_source.py b/tests/test_webtools_source.py
index ee212ac4..c0088e77 100644
--- a/tests/test_webtools_source.py
+++ b/tests/test_webtools_source.py
@@ -829,6 +829,9 @@ def test_clean_table_code_fences_ignores_normal_structures(self):
         )
         assert _clean_table_code_fences(adjacent_standalone_code) == adjacent_standalone_code
 
+        table_cell_code_with_pipe = "| A | B |\n| code | ```\na | b\n``` |\n"
+        assert _clean_table_code_fences(table_cell_code_with_pipe) == "| A | B |\n| code | `a | b` |\n"
+
     def test_extract_web_applies_cleanup_http(self, monkeypatch):
         # Verify that HTTP path runs the clean helper
         def fake_urlopen(req, timeout=0):