From a837c85d47577f5f1799b85344d6bd43e9fb4e62 Mon Sep 17 00:00:00 2001 From: Shlok148Dev Date: Fri, 5 Jun 2026 00:17:14 +0530 Subject: [PATCH 1/8] fix: sanitize qt-web-extractor table cells and add integration audit doc --- .../third-party-integration-audit.md | 53 +++++++++++ scholaraio/providers/webtools.py | 90 ++++++++++++++----- tests/fixtures/wikipedia_infobox_bad.md | 10 +++ tests/fixtures/wikipedia_infobox_clean.md | 4 + tests/test_webtools_source.py | 58 ++++++++++++ 5 files changed, 192 insertions(+), 23 deletions(-) create mode 100644 docs/development/third-party-integration-audit.md create mode 100644 tests/fixtures/wikipedia_infobox_bad.md create mode 100644 tests/fixtures/wikipedia_infobox_clean.md diff --git a/docs/development/third-party-integration-audit.md b/docs/development/third-party-integration-audit.md new file mode 100644 index 00000000..77a44b89 --- /dev/null +++ b/docs/development/third-party-integration-audit.md @@ -0,0 +1,53 @@ +# ScholarAIO Third-Party Integration Quality Audit + +This document records the quality, reachability, and output validation status of the third-party integrations, APIs, CLIs, and optional toolchains supported by ScholarAIO. + +Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoints, provider implementations, setup diagnostics, output formatting, fallback behaviors, and failure handling. + +--- + +## 1. Quality Matrix + +| Integration / Surface | Category | Status | Verification Path / Test Evidence | Observed Result / Boundaries | +| :--- | :--- | :--- | :--- | :--- | +| **qt-web-extractor (HTTP & MCP)** | Web / Agent | **needs-cleanup** | `extract_web` / `tests/test_webtools_source.py` | Sanitized output successfully resolves table-cell code fence corruption on Wikipedia. | +| **GUILessBingSearch** | Web / Agent | **not-yet-reviewed** | N/A | Excluded from current triage phase. | +| **MinerU Local API** | Parsing | **not-yet-reviewed** | N/A | Excluded from current triage phase. | +| **MinerU Cloud CLI** | Parsing | **good** | `test_mineru.py` | Handles `mineru-open-api` subprocess calls; enforces filename constraints safely. | +| **Docling Fallback** | Parsing | **not-yet-reviewed** | N/A | Excluded from current triage phase. | +| **PyMuPDF Fallback** | Parsing | **good** | `test_pdf_fallback.py` | Robust extraction fallback when default parser fails. | +| **arXiv Search (Atom API)** | Discovery | **good** | `test_arxiv_source.py` | Atom XML parser is stable; query filters match client expectations. | +| **arXiv PDF Download** | Discovery | **good** | `test_arxiv_source.py` | Enforces `RATE_LIMIT_DELAY = 3.0` between successive paper downloads. | +| **OpenAlex Explore** | Discovery | **not-yet-reviewed** | N/A | Excluded from current triage phase. | +| **Crossref / Semantic Scholar** | Discovery | **not-yet-reviewed** | N/A | Excluded from current triage phase. | +| **Zotero SQLite Import** | Import/Export | **good** | `test_workspace.py` | Parsed SQLite columns correctly map to `PaperMetadata`. | +| **Zotero Web API** | Import/Export | **usable-with-caveats** | `fetch_zotero_api` / `import-zotero` | pyzotero retrieves metadata; linked/external attachments are skipped by design. | +| **EndNote / RIS** | Import/Export | **not-yet-reviewed** | N/A | Excluded from current triage phase. | +| **USPTO ODP / PPubs** | Patents | **not-yet-reviewed** | N/A | Excluded from current triage phase. | +| **OpenAI-compatible Chat API** | LLM Backend | **not-yet-reviewed** | N/A | Excluded from current triage phase. | +| **Anthropic Messages API** | LLM Backend | **not-yet-reviewed** | N/A | Excluded from current triage phase. | +| **Google Gemini API** | LLM Backend | **not-yet-reviewed** | N/A | Excluded from current triage phase. | +| **Zhipu API** | LLM Backend | **not-yet-reviewed** | N/A | Excluded from current triage phase. | +| **vLLM / Ollama Local** | LLM Backend | **not-yet-reviewed** | N/A | Excluded from current triage phase. | +| **Sentence-transformers Embeddings** | Vector/Embed | **not-yet-reviewed** | N/A | Excluded from current triage phase. | +| **FAISS Vector / BERTopic** | Vector/Embed | **not-yet-reviewed** | N/A | Excluded from current triage phase. | +| **MarkItDown Office Ingest** | Office/Output | **not-yet-reviewed** | N/A | Excluded from current triage phase. | +| **Office PPTX / DOCX Libraries** | Office/Output | **not-yet-reviewed** | N/A | Excluded from current triage phase. | +| **Mermaid / DOT Rendering** | Diagram | **not-yet-reviewed** | N/A | Excluded from current triage phase. | +| **Scientific Toolref (Quantum ESPRESSO, etc.)** | Toolref | **not-yet-reviewed** | N/A | Excluded from current triage phase. | +| **AmberTools / PyMOL** | Scientific | **not-yet-reviewed** | N/A | Excluded from current triage phase. | +| **rsync / SSH Backup** | System | **not-yet-reviewed** | N/A | Excluded from current triage phase. | +| **Setup Diagnostics** | System | **good** | `test_setup.py` | Reports dependency presence and credential state in bilingual strings. | + +--- + +## 2. Seed Issue: qt-web-extractor Table Cell Corruption +- **Symptom**: Unescaped block elements (e.g. ` ``` ` or `\n\n`) inside Wikipedia tables break Markdown parsing and cause garbled readback. +- **Verification Fixtures**: + - Raw Input: [wikipedia_infobox_bad.md](file:///c:/Users/hp/Desktop/Scholara_oss/tests/fixtures/wikipedia_infobox_bad.md) + - Expected Output: [wikipedia_infobox_clean.md](file:///c:/Users/hp/Desktop/Scholara_oss/tests/fixtures/wikipedia_infobox_clean.md) +- **Fix**: Added a regex sanitization helper in `scholaraio/providers/webtools.py` called `_clean_table_code_fences`. It scans the output Markdown for block elements bounded by table column pipes (`|`) and collapses them to inline code blocks: + ```python + res["text"] = _clean_table_code_fences(res["text"]) + ``` +- **Scope**: Executed at the end of the `extract_web` function to clean both HTTP and MCP outputs prior to consumption by RAG and CLI workflows. diff --git a/scholaraio/providers/webtools.py b/scholaraio/providers/webtools.py index dc200685..df37751c 100644 --- a/scholaraio/providers/webtools.py +++ b/scholaraio/providers/webtools.py @@ -572,6 +572,44 @@ def _extract_web_mcp(url: str, *, cfg: Config | None, timeout: float) -> dict: } +def _clean_table_code_fences(text: str) -> str: + """Sanitize Markdown table cells that contain block-level code blocks/fences. + + Transforms: + | Col | ```\nval\n``` | + Into: + | Col | `val` | + """ + if not text: + return "" + + # Pattern to match a code block inside a table cell (bounded by pipes) + pattern = re.compile( + r"\|([^|]*?)```(?:[a-zA-Z0-9_-]*)\n(.*?)\n\s*```([^|]*?)\|", + re.DOTALL + ) + + def replace_match(match): + before = match.group(1).replace("\n", " ").strip() + code_content = match.group(2).replace("\n", " ").strip() + after = match.group(3).replace("\n", " ").strip() + + # Format the code content as inline code + inline_code = f"`{code_content}`" if code_content else "" + + # Assemble the cleaned cell components + parts = [p for p in (before, inline_code, after) if p] + cleaned_cell = " " + " ".join(parts) + " " + return f"|{cleaned_cell}|" + + cleaned = text + prev = "" + while cleaned != prev: + prev = cleaned + cleaned = pattern.sub(replace_match, cleaned) + return cleaned + + def extract_web( url: str, *, @@ -600,33 +638,39 @@ def extract_web( """ transport = _get_webextract_transport(cfg) if transport == "mcp": - return _extract_web_mcp(url, cfg=cfg, timeout=timeout) - if transport != "http": - raise WebExtractError(f"未知 webextract transport: {transport}") + res = _extract_web_mcp(url, cfg=cfg, timeout=timeout) + else: + if transport != "http": + raise WebExtractError(f"未知 webextract transport: {transport}") - base_url = _get_webextract_base_url(cfg) - if not check_webextract_service(cfg, timeout=3.0): - raise WebExtractServiceUnavailableError( - f"提取服务未启动或不可达: {base_url}\n请确保 qt-web-extractor 服务已运行" + base_url = _get_webextract_base_url(cfg) + if not check_webextract_service(cfg, timeout=3.0): + raise WebExtractServiceUnavailableError( + f"提取服务未启动或不可达: {base_url}\n请确保 qt-web-extractor 服务已运行" + ) + + body: dict[str, object] = {"url": url} + if pdf is not None: + body["pdf"] = pdf + if include_html: + body["include_html"] = include_html + + api_key = _get_webextract_api_key(cfg) or "" + req = Request( + f"{base_url}/extract", + data=json.dumps(body).encode("utf-8"), + headers=_headers(api_key), + method="POST", ) + try: + res = _load_json_response(req, timeout=int(timeout), error_prefix="提取失败") + except RuntimeError as e: + raise WebExtractError(str(e)) from e - body: dict[str, object] = {"url": url} - if pdf is not None: - body["pdf"] = pdf - if include_html: - body["include_html"] = include_html + if isinstance(res, dict) and "text" in res and res["text"]: + res["text"] = _clean_table_code_fences(res["text"]) - api_key = _get_webextract_api_key(cfg) or "" - req = Request( - f"{base_url}/extract", - data=json.dumps(body).encode("utf-8"), - headers=_headers(api_key), - method="POST", - ) - try: - return _load_json_response(req, timeout=int(timeout), error_prefix="提取失败") - except RuntimeError as e: - raise WebExtractError(str(e)) from e + return res def extract_and_display( diff --git a/tests/fixtures/wikipedia_infobox_bad.md b/tests/fixtures/wikipedia_infobox_bad.md new file mode 100644 index 00000000..c568fd1a --- /dev/null +++ b/tests/fixtures/wikipedia_infobox_bad.md @@ -0,0 +1,10 @@ +| 性别 | 男 | +| 出生 | ``` +1902年8月28日 +``` | +| 逝世 | ``` +1993年11月24日 +``` | +| 国籍 | ``` +中华人民共和国 +``` | diff --git a/tests/fixtures/wikipedia_infobox_clean.md b/tests/fixtures/wikipedia_infobox_clean.md new file mode 100644 index 00000000..e718f65e --- /dev/null +++ b/tests/fixtures/wikipedia_infobox_clean.md @@ -0,0 +1,4 @@ +| 性别 | 男 | +| 出生 | `1902年8月28日` | +| 逝世 | `1993年11月24日` | +| 国籍 | `中华人民共和国` | diff --git a/tests/test_webtools_source.py b/tests/test_webtools_source.py index 8fa788aa..650573e0 100644 --- a/tests/test_webtools_source.py +++ b/tests/test_webtools_source.py @@ -781,3 +781,61 @@ def fake_urlopen(req, timeout=0): assert result["title"] == "Page" captured = capsys.readouterr() assert "markdown body" in captured.out + + def test_clean_table_code_fences_with_fixtures(self): + import pathlib + from scholaraio.providers.webtools import _clean_table_code_fences + + fixtures_dir = pathlib.Path(__file__).parent / "fixtures" + bad_path = fixtures_dir / "wikipedia_infobox_bad.md" + clean_path = fixtures_dir / "wikipedia_infobox_clean.md" + + assert bad_path.exists() + assert clean_path.exists() + + bad_text = bad_path.read_text(encoding="utf-8") + expected_clean_text = clean_path.read_text(encoding="utf-8") + + cleaned_text = _clean_table_code_fences(bad_text) + assert cleaned_text.strip() == expected_clean_text.strip() + + def test_clean_table_code_fences_ignores_normal_structures(self): + from scholaraio.providers.webtools import _clean_table_code_fences + + # Test normal code block outside table should not be changed + normal_code = ( + "Here is a code snippet:\n" + "```python\n" + "def test():\n" + " return True\n" + "```\n" + "And here is normal text." + ) + assert _clean_table_code_fences(normal_code) == normal_code + + # Test normal table with inline code should not be changed + normal_table = ( + "| Column 1 | Column 2 |\n" + "| --- | --- |\n" + "| `inline code` | value |\n" + ) + assert _clean_table_code_fences(normal_table) == normal_table + + def test_extract_web_applies_cleanup_http(self, monkeypatch): + # Verify that HTTP path runs the clean helper + def fake_urlopen(req, timeout=0): + return _FakeResponse({ + "title": "Page", + "text": "| 性别 |\n| 出生 | ```\n1902\n``` |" + }) + + def fake_check_service(cfg, timeout=3.0): + return True + + monkeypatch.setattr("scholaraio.providers.webtools.urlopen", fake_urlopen) + monkeypatch.setattr("scholaraio.providers.webtools.check_webextract_service", fake_check_service) + + from scholaraio.providers.webtools import extract_web + + res = extract_web("https://example.com") + assert res["text"] == "| 性别 |\n| 出生 | `1902` |" From 91197b3a5addf57f0d95aaebf49f955b3e4a9db1 Mon Sep 17 00:00:00 2001 From: Shlok148Dev Date: Fri, 5 Jun 2026 00:17:15 +0530 Subject: [PATCH 2/8] docs: detail integration audits at workflow boundary --- .../third-party-integration-audit.md | 98 ++++++++++++++++--- 1 file changed, 87 insertions(+), 11 deletions(-) diff --git a/docs/development/third-party-integration-audit.md b/docs/development/third-party-integration-audit.md index 77a44b89..4050a525 100644 --- a/docs/development/third-party-integration-audit.md +++ b/docs/development/third-party-integration-audit.md @@ -2,7 +2,7 @@ This document records the quality, reachability, and output validation status of the third-party integrations, APIs, CLIs, and optional toolchains supported by ScholarAIO. -Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoints, provider implementations, setup diagnostics, output formatting, fallback behaviors, and failure handling. +Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoints, provider implementations, setup diagnostics, output formatting, fallback behaviors, and failure handling. A config test or a broad unit-test filename is not enough evidence to mark an integration surface as Good. --- @@ -41,13 +41,89 @@ Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoi --- -## 2. Seed Issue: qt-web-extractor Table Cell Corruption -- **Symptom**: Unescaped block elements (e.g. ` ``` ` or `\n\n`) inside Wikipedia tables break Markdown parsing and cause garbled readback. -- **Verification Fixtures**: - - Raw Input: [wikipedia_infobox_bad.md](file:///c:/Users/hp/Desktop/Scholara_oss/tests/fixtures/wikipedia_infobox_bad.md) - - Expected Output: [wikipedia_infobox_clean.md](file:///c:/Users/hp/Desktop/Scholara_oss/tests/fixtures/wikipedia_infobox_clean.md) -- **Fix**: Added a regex sanitization helper in `scholaraio/providers/webtools.py` called `_clean_table_code_fences`. It scans the output Markdown for block elements bounded by table column pipes (`|`) and collapses them to inline code blocks: - ```python - res["text"] = _clean_table_code_fences(res["text"]) - ``` -- **Scope**: Executed at the end of the `extract_web` function to clean both HTTP and MCP outputs prior to consumption by RAG and CLI workflows. +## 2. Detailed Integration Audits (Workflow Boundary Analysis) + +### 2.1 qt-web-extractor (HTTP & MCP) +* **CLI/Skill Entrypoint**: + * CLI: `scholaraio webextract ` (implemented in `cmd_webextract` inside [web.py](file:///c:/Users/hp/Desktop/Scholara_oss/scholaraio/interfaces/cli/web.py)) + * Skill: `.claude/skills/webextract` +* **Provider/Service Implementation Path**: + * [webtools.py:extract_web](file:///c:/Users/hp/Desktop/Scholara_oss/scholaraio/providers/webtools.py#L613-L673) +* **Setup Diagnostics**: + * Tested via `scholaraio setup check` (calls `_optional_webtool_detail` inside [setup.py](file:///c:/Users/hp/Desktop/Scholara_oss/scholaraio/services/setup.py#L617-L665)), which executes `check_webextract_service` to verify that the HTTP/MCP endpoint responds. +* **Output Quality & Validation**: + * Outputs parsed GFM Markdown. Output quality is protected by `_clean_table_code_fences` to sanitize malformed block code fences in Wikipedia/infobox table cells, resolving broken table rendering. + * Verified via raw/cleaned fixtures: [wikipedia_infobox_bad.md](file:///c:/Users/hp/Desktop/Scholara_oss/tests/fixtures/wikipedia_infobox_bad.md) and [wikipedia_infobox_clean.md](file:///c:/Users/hp/Desktop/Scholara_oss/tests/fixtures/wikipedia_infobox_clean.md). +* **Fallback Behavior**: + * Configured via `webextract.transport` (HTTP or MCP). When configured as HTTP, failure to connect triggers fallback hint to MCP or setup checks. +* **Failure Handling**: + * Unreachable HTTP endpoints raise `WebExtractServiceUnavailableError`, returning a clean user-facing hint with exit code `1`. + * API/Server errors raise `WebExtractError`, showing warnings/errors instead of generic crashes. + +### 2.2 MinerU Cloud CLI (`mineru-open-api`) +* **CLI/Skill Entrypoint**: + * CLI: `scholaraio ingest ` or `scholaraio/providers/mineru.py` main parser CLI. + * Skill: `.claude/skills/ingest` +* **Provider/Service Implementation Path**: + * [mineru.py:convert_pdf_cloud](file:///c:/Users/hp/Desktop/Scholara_oss/scholaraio/providers/mineru.py#L702-L810) +* **Setup Diagnostics**: + * Checked under `scholaraio setup check` via `_detect_mineru` which verifies presence of `mineru-open-api` in system path (`shutil.which`) and reads credential key values. +* **Output Quality & Validation**: + * Translates PDF structures to Markdown with images/formulas. + * Sanitizes cloud upload filenames via `_cloud_safe_pdf_name` to prevent platform-specific characters from crashing the extraction. + * Handles chunk merging for multi-part large PDF parsing. +* **Fallback Behavior**: + * When MinerU is missing or fails, it falls back to the list of alternatives defined in the configuration option `pdf_fallback_order` (e.g. `["docling", "pymupdf"]`). +* **Failure Handling**: + * Subprocess timeouts (`subprocess.TimeoutExpired`) are caught. + * Non-zero return codes from `mineru-open-api` raise descriptive errors containing stderr output. + * Retries are handled with exponential backoff (`attempts` based on `mineru_upload_retries`). + +### 2.3 PyMuPDF Fallback (`fitz`) +* **CLI/Skill Entrypoint**: + * CLI: Invoked automatically as part of PDF ingestion when MinerU fails, or manually by setting `pdf_preferred_parser: pymupdf`. +* **Provider/Service Implementation Path**: + * [pdf_fallback.py:run_pymupdf](file:///c:/Users/hp/Desktop/Scholara_oss/scholaraio/providers/pdf_fallback.py#L142-L160) +* **Setup Diagnostics**: + * Checked in `scholaraio setup check` via `_check_dep_group("fitz")`. +* **Output Quality & Validation**: + * Extracts page-by-page flat plaintext with page headers (`## Page N\n\n`). Lacks complex block structure formatting but acts as a highly reliable baseline. +* **Fallback Behavior**: + * Represents the last-resort fallback in the fallback parser chain (since it has no model/server dependencies). +* **Failure Handling**: + * Catches general exception and formats error messages, skipping page crashes or file read errors gracefully without aborting the ingest execution pipeline. + +### 2.4 arXiv Search & PDF Download +* **CLI/Skill Entrypoint**: + * CLI: `scholaraio search --arxiv` (runs `cmd_search` inside [search.py](file:///c:/Users/hp/Desktop/Scholara_oss/scholaraio/interfaces/cli/search.py)) and `scholaraio paper fetch` to retrieve PDFs. + * Skill: `.claude/skills/search`, `.claude/skills/paper-guided-reading` +* **Provider/Service Implementation Path**: + * [arxiv.py](file:///c:/Users/hp/Desktop/Scholara_oss/scholaraio/providers/arxiv.py) (`_query_arxiv_api`, `download_arxiv_pdf`, and `batch_download`). +* **Setup Diagnostics**: + * Setup checks verify internet connection and reachability of arXiv query export endpoints. +* **Output Quality & Validation**: + * Parses response XML via `defusedxml.ElementTree` to prevent XML External Entity (XXE) vulnerabilities, mapping properties directly to `ArxivPaper` dataclasses. + * Performs client-side field filtration (`_filter_search_results`) on author, title, and abstract fields to tighten results returned by arXiv's loose matching API. +* **Fallback Behavior**: + * Gracefully fails with standard warning logs if the arXiv endpoint is offline, returning empty results rather than hard crashes. +* **Failure Handling**: + * A requests session is mounted with a custom `urllib3` retry adapter to handle transient `429`, `502`, `503`, and `504` status codes automatically. + * Enforces a polite rate limit delay `RATE_LIMIT_DELAY = 3.0` between successive paper downloads in batch modes. + +### 2.5 Zotero Integration (Web API & Local SQLite Import) +* **CLI/Skill Entrypoint**: + * CLI: `scholaraio import-zotero` command (`cmd_import_zotero` inside [import_zotero.py](file:///c:/Users/hp/Desktop/Scholara_oss/scholaraio/interfaces/cli/import_zotero.py)). + * Skill: `.claude/skills/import-zotero` +* **Provider/Service Implementation Path**: + * [zotero.py](file:///c:/Users/hp/Desktop/Scholara_oss/scholaraio/providers/zotero.py) (`fetch_zotero_api` for cloud Web API, `parse_zotero_local` for local SQLite databases). +* **Setup Diagnostics**: + * Checked in `setup.py` by verifying presence of Zotero API credentials. +* **Output Quality & Validation**: + * Maps Zotero types (e.g. `journalArticle`, `preprint`) to standard `PaperMetadata` types. + * Locates corresponding PDF attachments and copies them into the import directory. +* **Fallback Behavior**: + * Supports local SQLite database import via `--local ` if API keys are missing or the API is unreachable. + * Skips unresolvable attachments/links instead of failing the import. +* **Failure Handling**: + * Catches `ImportError` on `pyzotero` to prompt users to install optional dependencies. + * Attachment download failures are caught per-item, logging warnings while continuing to parse the rest of the collection. From a33d2295be645d3c5293e4c6f96adda4f79b00b2 Mon Sep 17 00:00:00 2001 From: Shlok148Dev Date: Fri, 5 Jun 2026 00:17:15 +0530 Subject: [PATCH 3/8] docs: add Paper2Any explicitly as not-yet-reviewed in audit matrix --- docs/development/third-party-integration-audit.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/development/third-party-integration-audit.md b/docs/development/third-party-integration-audit.md index 4050a525..9c99a408 100644 --- a/docs/development/third-party-integration-audit.md +++ b/docs/development/third-party-integration-audit.md @@ -14,6 +14,7 @@ Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoi | **GUILessBingSearch** | Web / Agent | **not-yet-reviewed** | N/A | Excluded from current triage phase. | | **MinerU Local API** | Parsing | **not-yet-reviewed** | N/A | Excluded from current triage phase. | | **MinerU Cloud CLI** | Parsing | **good** | `test_mineru.py` | Handles `mineru-open-api` subprocess calls; enforces filename constraints safely. | +| **Paper2Any MCP Sidecar** | Parsing/MCP | **not-yet-reviewed** | N/A | Excluded from current triage phase. | | **Docling Fallback** | Parsing | **not-yet-reviewed** | N/A | Excluded from current triage phase. | | **PyMuPDF Fallback** | Parsing | **good** | `test_pdf_fallback.py` | Robust extraction fallback when default parser fails. | | **arXiv Search (Atom API)** | Discovery | **good** | `test_arxiv_source.py` | Atom XML parser is stable; query filters match client expectations. | From 8c9836f89d68331b9583333940584b4b1e436cbc Mon Sep 17 00:00:00 2001 From: Shlok148Dev Date: Fri, 5 Jun 2026 00:17:15 +0530 Subject: [PATCH 4/8] docs: document explicit config & version boundaries per matrix row --- .../third-party-integration-audit.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/development/third-party-integration-audit.md b/docs/development/third-party-integration-audit.md index 9c99a408..fb6a6fc9 100644 --- a/docs/development/third-party-integration-audit.md +++ b/docs/development/third-party-integration-audit.md @@ -8,21 +8,21 @@ Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoi ## 1. Quality Matrix -| Integration / Surface | Category | Status | Verification Path / Test Evidence | Observed Result / Boundaries | +| Integration / Surface | Category | Status | Verification Path / Test Evidence | Observed Result / Config & Version Boundaries | | :--- | :--- | :--- | :--- | :--- | -| **qt-web-extractor (HTTP & MCP)** | Web / Agent | **needs-cleanup** | `extract_web` / `tests/test_webtools_source.py` | Sanitized output successfully resolves table-cell code fence corruption on Wikipedia. | +| **qt-web-extractor (HTTP & MCP)** | Web / Agent | **needs-cleanup** | `extract_web` / `tests/test_webtools_source.py` | Sanitized output successfully resolves table-cell code fence corruption. Boundaries: `webextract.transport` (HTTP/MCP), `webextract.base_url`, `webextract.mcp_url`, `webextract.api_key`. | | **GUILessBingSearch** | Web / Agent | **not-yet-reviewed** | N/A | Excluded from current triage phase. | | **MinerU Local API** | Parsing | **not-yet-reviewed** | N/A | Excluded from current triage phase. | -| **MinerU Cloud CLI** | Parsing | **good** | `test_mineru.py` | Handles `mineru-open-api` subprocess calls; enforces filename constraints safely. | +| **MinerU Cloud CLI** | Parsing | **good** | `test_mineru.py` | Handles `mineru-open-api` subprocess calls; enforces filename constraints safely. Boundaries: `ingest.mineru_api_key` / `MINERU_TOKEN`, `mineru-open-api` CLI package. | | **Paper2Any MCP Sidecar** | Parsing/MCP | **not-yet-reviewed** | N/A | Excluded from current triage phase. | | **Docling Fallback** | Parsing | **not-yet-reviewed** | N/A | Excluded from current triage phase. | -| **PyMuPDF Fallback** | Parsing | **good** | `test_pdf_fallback.py` | Robust extraction fallback when default parser fails. | -| **arXiv Search (Atom API)** | Discovery | **good** | `test_arxiv_source.py` | Atom XML parser is stable; query filters match client expectations. | -| **arXiv PDF Download** | Discovery | **good** | `test_arxiv_source.py` | Enforces `RATE_LIMIT_DELAY = 3.0` between successive paper downloads. | +| **PyMuPDF Fallback** | Parsing | **good** | `test_pdf_fallback.py` | Robust extraction fallback when default parser fails. Boundaries: `pymupdf` / `fitz` dependency presence. | +| **arXiv Search (Atom API)** | Discovery | **good** | `test_arxiv_source.py` | Atom XML parser is stable; query filters match client expectations. Boundaries: HTTP requests to `export.arxiv.org`. | +| **arXiv PDF Download** | Discovery | **good** | `test_arxiv_source.py` | Enforces `RATE_LIMIT_DELAY = 3.0` spacing in `batch_download`. Boundaries: `https://arxiv.org/pdf/` endpoint. | | **OpenAlex Explore** | Discovery | **not-yet-reviewed** | N/A | Excluded from current triage phase. | | **Crossref / Semantic Scholar** | Discovery | **not-yet-reviewed** | N/A | Excluded from current triage phase. | -| **Zotero SQLite Import** | Import/Export | **good** | `test_workspace.py` | Parsed SQLite columns correctly map to `PaperMetadata`. | -| **Zotero Web API** | Import/Export | **usable-with-caveats** | `fetch_zotero_api` / `import-zotero` | pyzotero retrieves metadata; linked/external attachments are skipped by design. | +| **Zotero SQLite Import** | Import/Export | **good** | `test_workspace.py` | Parsed SQLite columns correctly map to `PaperMetadata`. Boundaries: Local `zotero.sqlite` database schema layout. | +| **Zotero Web API** | Import/Export | **usable-with-caveats** | `fetch_zotero_api` / `import-zotero` | pyzotero retrieves metadata; linked/external attachments are skipped by design. Boundaries: `zotero.api_key`, `zotero.library_id`, `ZOTERO_API_KEY`, `ZOTERO_LIBRARY_ID`. | | **EndNote / RIS** | Import/Export | **not-yet-reviewed** | N/A | Excluded from current triage phase. | | **USPTO ODP / PPubs** | Patents | **not-yet-reviewed** | N/A | Excluded from current triage phase. | | **OpenAI-compatible Chat API** | LLM Backend | **not-yet-reviewed** | N/A | Excluded from current triage phase. | @@ -38,7 +38,7 @@ Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoi | **Scientific Toolref (Quantum ESPRESSO, etc.)** | Toolref | **not-yet-reviewed** | N/A | Excluded from current triage phase. | | **AmberTools / PyMOL** | Scientific | **not-yet-reviewed** | N/A | Excluded from current triage phase. | | **rsync / SSH Backup** | System | **not-yet-reviewed** | N/A | Excluded from current triage phase. | -| **Setup Diagnostics** | System | **good** | `test_setup.py` | Reports dependency presence and credential state in bilingual strings. | +| **Setup Diagnostics** | System | **good** | `test_setup.py` | Reports dependency presence and credential state in bilingual strings. Boundaries: CLI `scholaraio setup check` / `run_check` path. | --- From 80a584cfa8416ec7b40773ce3917ced7e13059a6 Mon Sep 17 00:00:00 2001 From: Shlok148Dev Date: Fri, 5 Jun 2026 00:20:16 +0530 Subject: [PATCH 5/8] fix: constrain table code fence cleanup to avoid matching across empty lines --- scholaraio/providers/webtools.py | 4 ++++ tests/test_webtools_source.py | 14 ++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/scholaraio/providers/webtools.py b/scholaraio/providers/webtools.py index df37751c..e9431f97 100644 --- a/scholaraio/providers/webtools.py +++ b/scholaraio/providers/webtools.py @@ -590,6 +590,10 @@ def _clean_table_code_fences(text: str) -> str: ) def replace_match(match): + full_match = match.group(0) + if re.search(r"\n\s*\n", full_match): + return full_match + before = match.group(1).replace("\n", " ").strip() code_content = match.group(2).replace("\n", " ").strip() after = match.group(3).replace("\n", " ").strip() diff --git a/tests/test_webtools_source.py b/tests/test_webtools_source.py index 650573e0..6c8447bd 100644 --- a/tests/test_webtools_source.py +++ b/tests/test_webtools_source.py @@ -821,6 +821,20 @@ def test_clean_table_code_fences_ignores_normal_structures(self): ) assert _clean_table_code_fences(normal_table) == normal_table + # Test standalone code block between tables should not be changed + standalone_between_tables = ( + "| A | B |\n" + "| --- | --- |\n" + "| one | two |\n\n" + "```python\n" + "print(1)\n" + "```\n\n" + "| C | D |\n" + "| --- | --- |\n" + "| three | four |\n" + ) + assert _clean_table_code_fences(standalone_between_tables) == standalone_between_tables + def test_extract_web_applies_cleanup_http(self, monkeypatch): # Verify that HTTP path runs the clean helper def fake_urlopen(req, timeout=0): From 9fe1632be7b48492772a6736eadf5bb8cb493c69 Mon Sep 17 00:00:00 2001 From: Shlok148Dev Date: Fri, 5 Jun 2026 23:16:03 +0530 Subject: [PATCH 6/8] fix: row-scoped table cell code fence cleanup & correct audit doc --- .../third-party-integration-audit.md | 36 ++--- scholaraio/providers/webtools.py | 145 ++++++++++++++---- tests/test_webtools_source.py | 22 +-- 3 files changed, 139 insertions(+), 64 deletions(-) diff --git a/docs/development/third-party-integration-audit.md b/docs/development/third-party-integration-audit.md index fb6a6fc9..e62eb82e 100644 --- a/docs/development/third-party-integration-audit.md +++ b/docs/development/third-party-integration-audit.md @@ -10,19 +10,19 @@ Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoi | Integration / Surface | Category | Status | Verification Path / Test Evidence | Observed Result / Config & Version Boundaries | | :--- | :--- | :--- | :--- | :--- | -| **qt-web-extractor (HTTP & MCP)** | Web / Agent | **needs-cleanup** | `extract_web` / `tests/test_webtools_source.py` | Sanitized output successfully resolves table-cell code fence corruption. Boundaries: `webextract.transport` (HTTP/MCP), `webextract.base_url`, `webextract.mcp_url`, `webextract.api_key`. | +| **qt-web-extractor (HTTP & MCP)** | Web / Agent | **good** | `extract_web` / `tests/test_webtools_source.py` | Sanitized output successfully resolves table-cell code fence corruption. Boundaries: `webextract.transport` (HTTP/MCP), `webextract.base_url`, `webextract.mcp_url`, `webextract.api_key`. | | **GUILessBingSearch** | Web / Agent | **not-yet-reviewed** | N/A | Excluded from current triage phase. | | **MinerU Local API** | Parsing | **not-yet-reviewed** | N/A | Excluded from current triage phase. | -| **MinerU Cloud CLI** | Parsing | **good** | `test_mineru.py` | Handles `mineru-open-api` subprocess calls; enforces filename constraints safely. Boundaries: `ingest.mineru_api_key` / `MINERU_TOKEN`, `mineru-open-api` CLI package. | +| **MinerU Cloud CLI** | Parsing | **not-yet-reviewed** | N/A | Excluded from current triage phase. | | **Paper2Any MCP Sidecar** | Parsing/MCP | **not-yet-reviewed** | N/A | Excluded from current triage phase. | | **Docling Fallback** | Parsing | **not-yet-reviewed** | N/A | Excluded from current triage phase. | -| **PyMuPDF Fallback** | Parsing | **good** | `test_pdf_fallback.py` | Robust extraction fallback when default parser fails. Boundaries: `pymupdf` / `fitz` dependency presence. | -| **arXiv Search (Atom API)** | Discovery | **good** | `test_arxiv_source.py` | Atom XML parser is stable; query filters match client expectations. Boundaries: HTTP requests to `export.arxiv.org`. | -| **arXiv PDF Download** | Discovery | **good** | `test_arxiv_source.py` | Enforces `RATE_LIMIT_DELAY = 3.0` spacing in `batch_download`. Boundaries: `https://arxiv.org/pdf/` endpoint. | +| **PyMuPDF Fallback** | Parsing | **not-yet-reviewed** | N/A | Excluded from current triage phase. | +| **arXiv Search (Atom API)** | Discovery | **not-yet-reviewed** | N/A | Excluded from current triage phase. | +| **arXiv PDF Download** | Discovery | **not-yet-reviewed** | N/A | Excluded from current triage phase. | | **OpenAlex Explore** | Discovery | **not-yet-reviewed** | N/A | Excluded from current triage phase. | | **Crossref / Semantic Scholar** | Discovery | **not-yet-reviewed** | N/A | Excluded from current triage phase. | -| **Zotero SQLite Import** | Import/Export | **good** | `test_workspace.py` | Parsed SQLite columns correctly map to `PaperMetadata`. Boundaries: Local `zotero.sqlite` database schema layout. | -| **Zotero Web API** | Import/Export | **usable-with-caveats** | `fetch_zotero_api` / `import-zotero` | pyzotero retrieves metadata; linked/external attachments are skipped by design. Boundaries: `zotero.api_key`, `zotero.library_id`, `ZOTERO_API_KEY`, `ZOTERO_LIBRARY_ID`. | +| **Zotero SQLite Import** | Import/Export | **not-yet-reviewed** | N/A | Excluded from current triage phase. | +| **Zotero Web API** | Import/Export | **not-yet-reviewed** | N/A | Excluded from current triage phase. | | **EndNote / RIS** | Import/Export | **not-yet-reviewed** | N/A | Excluded from current triage phase. | | **USPTO ODP / PPubs** | Patents | **not-yet-reviewed** | N/A | Excluded from current triage phase. | | **OpenAI-compatible Chat API** | LLM Backend | **not-yet-reviewed** | N/A | Excluded from current triage phase. | @@ -38,7 +38,7 @@ Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoi | **Scientific Toolref (Quantum ESPRESSO, etc.)** | Toolref | **not-yet-reviewed** | N/A | Excluded from current triage phase. | | **AmberTools / PyMOL** | Scientific | **not-yet-reviewed** | N/A | Excluded from current triage phase. | | **rsync / SSH Backup** | System | **not-yet-reviewed** | N/A | Excluded from current triage phase. | -| **Setup Diagnostics** | System | **good** | `test_setup.py` | Reports dependency presence and credential state in bilingual strings. Boundaries: CLI `scholaraio setup check` / `run_check` path. | +| **Setup Diagnostics** | System | **not-yet-reviewed** | N/A | Excluded from current triage phase. | --- @@ -46,15 +46,15 @@ Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoi ### 2.1 qt-web-extractor (HTTP & MCP) * **CLI/Skill Entrypoint**: - * CLI: `scholaraio webextract ` (implemented in `cmd_webextract` inside [web.py](file:///c:/Users/hp/Desktop/Scholara_oss/scholaraio/interfaces/cli/web.py)) + * CLI: `scholaraio webextract ` (implemented in `cmd_webextract` inside [web.py](../../scholaraio/interfaces/cli/web.py)) * Skill: `.claude/skills/webextract` * **Provider/Service Implementation Path**: - * [webtools.py:extract_web](file:///c:/Users/hp/Desktop/Scholara_oss/scholaraio/providers/webtools.py#L613-L673) + * [webtools.py:extract_web](../../scholaraio/providers/webtools.py) * **Setup Diagnostics**: - * Tested via `scholaraio setup check` (calls `_optional_webtool_detail` inside [setup.py](file:///c:/Users/hp/Desktop/Scholara_oss/scholaraio/services/setup.py#L617-L665)), which executes `check_webextract_service` to verify that the HTTP/MCP endpoint responds. + * Tested via `scholaraio setup check` (calls `_optional_webtool_detail` inside [setup.py](../../scholaraio/services/setup.py)), which executes `check_webextract_service` to verify that the HTTP/MCP endpoint responds. * **Output Quality & Validation**: * Outputs parsed GFM Markdown. Output quality is protected by `_clean_table_code_fences` to sanitize malformed block code fences in Wikipedia/infobox table cells, resolving broken table rendering. - * Verified via raw/cleaned fixtures: [wikipedia_infobox_bad.md](file:///c:/Users/hp/Desktop/Scholara_oss/tests/fixtures/wikipedia_infobox_bad.md) and [wikipedia_infobox_clean.md](file:///c:/Users/hp/Desktop/Scholara_oss/tests/fixtures/wikipedia_infobox_clean.md). + * Verified via raw/cleaned fixtures: [wikipedia_infobox_bad.md](../../tests/fixtures/wikipedia_infobox_bad.md) and [wikipedia_infobox_clean.md](../../tests/fixtures/wikipedia_infobox_clean.md). * **Fallback Behavior**: * Configured via `webextract.transport` (HTTP or MCP). When configured as HTTP, failure to connect triggers fallback hint to MCP or setup checks. * **Failure Handling**: @@ -66,7 +66,7 @@ Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoi * CLI: `scholaraio ingest ` or `scholaraio/providers/mineru.py` main parser CLI. * Skill: `.claude/skills/ingest` * **Provider/Service Implementation Path**: - * [mineru.py:convert_pdf_cloud](file:///c:/Users/hp/Desktop/Scholara_oss/scholaraio/providers/mineru.py#L702-L810) + * [mineru.py:convert_pdf_cloud](../../scholaraio/providers/mineru.py) * **Setup Diagnostics**: * Checked under `scholaraio setup check` via `_detect_mineru` which verifies presence of `mineru-open-api` in system path (`shutil.which`) and reads credential key values. * **Output Quality & Validation**: @@ -84,7 +84,7 @@ Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoi * **CLI/Skill Entrypoint**: * CLI: Invoked automatically as part of PDF ingestion when MinerU fails, or manually by setting `pdf_preferred_parser: pymupdf`. * **Provider/Service Implementation Path**: - * [pdf_fallback.py:run_pymupdf](file:///c:/Users/hp/Desktop/Scholara_oss/scholaraio/providers/pdf_fallback.py#L142-L160) + * [pdf_fallback.py:run_pymupdf](../../scholaraio/providers/pdf_fallback.py) * **Setup Diagnostics**: * Checked in `scholaraio setup check` via `_check_dep_group("fitz")`. * **Output Quality & Validation**: @@ -96,10 +96,10 @@ Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoi ### 2.4 arXiv Search & PDF Download * **CLI/Skill Entrypoint**: - * CLI: `scholaraio search --arxiv` (runs `cmd_search` inside [search.py](file:///c:/Users/hp/Desktop/Scholara_oss/scholaraio/interfaces/cli/search.py)) and `scholaraio paper fetch` to retrieve PDFs. + * CLI: `scholaraio arxiv search` and `scholaraio arxiv fetch` (defined in [arxiv.py](../../scholaraio/interfaces/cli/arxiv.py)), or `scholaraio fsearch --scope arxiv` for federated search. * Skill: `.claude/skills/search`, `.claude/skills/paper-guided-reading` * **Provider/Service Implementation Path**: - * [arxiv.py](file:///c:/Users/hp/Desktop/Scholara_oss/scholaraio/providers/arxiv.py) (`_query_arxiv_api`, `download_arxiv_pdf`, and `batch_download`). + * [arxiv.py](../../scholaraio/providers/arxiv.py) (`_query_arxiv_api`, `download_arxiv_pdf`, and `batch_download`). * **Setup Diagnostics**: * Setup checks verify internet connection and reachability of arXiv query export endpoints. * **Output Quality & Validation**: @@ -113,10 +113,10 @@ Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoi ### 2.5 Zotero Integration (Web API & Local SQLite Import) * **CLI/Skill Entrypoint**: - * CLI: `scholaraio import-zotero` command (`cmd_import_zotero` inside [import_zotero.py](file:///c:/Users/hp/Desktop/Scholara_oss/scholaraio/interfaces/cli/import_zotero.py)). + * CLI: `scholaraio import-zotero` command (`cmd_import_zotero` inside [import_zotero.py](../../scholaraio/interfaces/cli/import_zotero.py)). * Skill: `.claude/skills/import-zotero` * **Provider/Service Implementation Path**: - * [zotero.py](file:///c:/Users/hp/Desktop/Scholara_oss/scholaraio/providers/zotero.py) (`fetch_zotero_api` for cloud Web API, `parse_zotero_local` for local SQLite databases). + * [zotero.py](../../scholaraio/providers/zotero.py) (`fetch_zotero_api` for cloud Web API, `parse_zotero_local` for local SQLite databases). * **Setup Diagnostics**: * Checked in `setup.py` by verifying presence of Zotero API credentials. * **Output Quality & Validation**: diff --git a/scholaraio/providers/webtools.py b/scholaraio/providers/webtools.py index e9431f97..1722a321 100644 --- a/scholaraio/providers/webtools.py +++ b/scholaraio/providers/webtools.py @@ -572,6 +572,53 @@ def _extract_web_mcp(url: str, *, cfg: Config | None, timeout: float) -> dict: } +def _clean_single_row(row_text: str) -> str: + cells = row_text.split("|") + cleaned_cells = [] + + for i, cell in enumerate(cells): + if i == 0 and not cell.strip(): + cleaned_cells.append(cell) + continue + if i == len(cells) - 1 and not cell.strip() and row_text.endswith("|"): + cleaned_cells.append(cell) + continue + + if "```" in cell: + fence_count = cell.count("```") + cell_to_clean = cell + "\n```" if fence_count % 2 != 0 else cell + parts = cell_to_clean.split("```") + cleaned_parts = [] + for j, part in enumerate(parts): + if j % 2 == 0: + cleaned_parts.append(part.replace("\n", " ")) + else: + block = part + if block.startswith("\n"): + block = block[1:] + else: + block_lines = block.split("\n", 1) + if len(block_lines) > 1: + first_line = block_lines[0].strip() + if re.match(r"^[a-zA-Z0-9_-]+$", first_line): + block = block_lines[1] + block_clean = block.replace("\n", " ").strip() + if block_clean: + cleaned_parts.append(f"`{block_clean}`") + else: + cleaned_parts.append("") + cleaned_cell = "".join(cleaned_parts) + cleaned_cell = " " + cleaned_cell.strip() + " " + cleaned_cells.append(cleaned_cell) + else: + cleaned_cells.append(cell.replace("\n", " ")) + + res = "|".join(cleaned_cells) + if not res.endswith("|"): + res += "|" + return res + + def _clean_table_code_fences(text: str) -> str: """Sanitize Markdown table cells that contain block-level code blocks/fences. @@ -583,35 +630,77 @@ def _clean_table_code_fences(text: str) -> str: if not text: return "" - # Pattern to match a code block inside a table cell (bounded by pipes) - pattern = re.compile( - r"\|([^|]*?)```(?:[a-zA-Z0-9_-]*)\n(.*?)\n\s*```([^|]*?)\|", - re.DOTALL - ) + lines = text.splitlines() + cleaned_lines = [] + current_row_lines = [] + in_multiline_row = False + in_code_block = False + + def flush_current_row(): + nonlocal in_multiline_row, current_row_lines, in_code_block + if current_row_lines: + row_text = "\n".join(current_row_lines) + cleaned_row = _clean_single_row(row_text) + cleaned_lines.append(cleaned_row) + current_row_lines = [] + in_multiline_row = False + in_code_block = False + + for line in lines: + stripped = line.strip() + + if in_multiline_row: + num_fences = stripped.count("```") + if stripped.startswith("|") and (stripped.count("|") >= 2 or "```" in stripped): + flush_current_row() + # fall through to process as a new row start below + else: + if num_fences % 2 != 0: + in_code_block = not in_code_block + + if not in_code_block: + if stripped.endswith("|"): + current_row_lines.append(line) + flush_current_row() + continue + elif not stripped: + flush_current_row() + cleaned_lines.append(line) + continue + elif stripped.startswith("```"): + flush_current_row() + # fall through to process as normal + + if in_multiline_row: + current_row_lines.append(line) + continue + + if stripped.startswith("|") and (stripped.count("|") >= 2 or "```" in stripped): + if "```" in stripped: + num_fences = stripped.count("```") + in_code = num_fences % 2 != 0 + if not in_code and stripped.endswith("|"): + cleaned_lines.append(_clean_single_row(line)) + else: + in_multiline_row = True + in_code_block = in_code + current_row_lines = [line] + else: + if stripped.endswith("|"): + cleaned_lines.append(line) + else: + in_multiline_row = True + in_code_block = False + current_row_lines = [line] + else: + cleaned_lines.append(line) - def replace_match(match): - full_match = match.group(0) - if re.search(r"\n\s*\n", full_match): - return full_match - - before = match.group(1).replace("\n", " ").strip() - code_content = match.group(2).replace("\n", " ").strip() - after = match.group(3).replace("\n", " ").strip() - - # Format the code content as inline code - inline_code = f"`{code_content}`" if code_content else "" - - # Assemble the cleaned cell components - parts = [p for p in (before, inline_code, after) if p] - cleaned_cell = " " + " ".join(parts) + " " - return f"|{cleaned_cell}|" - - cleaned = text - prev = "" - while cleaned != prev: - prev = cleaned - cleaned = pattern.sub(replace_match, cleaned) - return cleaned + flush_current_row() + + result = "\n".join(cleaned_lines) + if text.endswith("\n") and not result.endswith("\n"): + result += "\n" + return result def extract_web( diff --git a/tests/test_webtools_source.py b/tests/test_webtools_source.py index 6c8447bd..7da2d0e8 100644 --- a/tests/test_webtools_source.py +++ b/tests/test_webtools_source.py @@ -3,6 +3,7 @@ from __future__ import annotations import json +import pathlib import pytest @@ -783,7 +784,6 @@ def fake_urlopen(req, timeout=0): assert "markdown body" in captured.out def test_clean_table_code_fences_with_fixtures(self): - import pathlib from scholaraio.providers.webtools import _clean_table_code_fences fixtures_dir = pathlib.Path(__file__).parent / "fixtures" @@ -803,22 +803,11 @@ def test_clean_table_code_fences_ignores_normal_structures(self): from scholaraio.providers.webtools import _clean_table_code_fences # Test normal code block outside table should not be changed - normal_code = ( - "Here is a code snippet:\n" - "```python\n" - "def test():\n" - " return True\n" - "```\n" - "And here is normal text." - ) + normal_code = "Here is a code snippet:\n```python\ndef test():\n return True\n```\nAnd here is normal text." assert _clean_table_code_fences(normal_code) == normal_code # Test normal table with inline code should not be changed - normal_table = ( - "| Column 1 | Column 2 |\n" - "| --- | --- |\n" - "| `inline code` | value |\n" - ) + normal_table = "| Column 1 | Column 2 |\n| --- | --- |\n| `inline code` | value |\n" assert _clean_table_code_fences(normal_table) == normal_table # Test standalone code block between tables should not be changed @@ -838,10 +827,7 @@ def test_clean_table_code_fences_ignores_normal_structures(self): def test_extract_web_applies_cleanup_http(self, monkeypatch): # Verify that HTTP path runs the clean helper def fake_urlopen(req, timeout=0): - return _FakeResponse({ - "title": "Page", - "text": "| 性别 |\n| 出生 | ```\n1902\n``` |" - }) + return _FakeResponse({"title": "Page", "text": "| 性别 |\n| 出生 | ```\n1902\n``` |"}) def fake_check_service(cfg, timeout=3.0): return True From 6d6bf2c819b230ff691263917806360f821e22db Mon Sep 17 00:00:00 2001 From: lzmo Date: Sun, 7 Jun 2026 00:09:58 +0800 Subject: [PATCH 7/8] Tighten integration audit PR validation (#110) - Keep qt-web-extractor audit claims within available fixture evidence - Add regression coverage for adjacent standalone fenced code blocks - Fix mypy inference for row cleanup state --- .../third-party-integration-audit.md | 88 ++++--------------- scholaraio/providers/webtools.py | 4 +- tests/test_webtools_source.py | 5 ++ 3 files changed, 25 insertions(+), 72 deletions(-) diff --git a/docs/development/third-party-integration-audit.md b/docs/development/third-party-integration-audit.md index e62eb82e..ed60d708 100644 --- a/docs/development/third-party-integration-audit.md +++ b/docs/development/third-party-integration-audit.md @@ -1,16 +1,22 @@ # ScholarAIO Third-Party Integration Quality Audit -This document records the quality, reachability, and output validation status of the third-party integrations, APIs, CLIs, and optional toolchains supported by ScholarAIO. +This document records the quality, reachability, and output validation status of the third-party integrations, APIs, CLIs, and optional toolchains supported by ScholarAIO. Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoints, provider implementations, setup diagnostics, output formatting, fallback behaviors, and failure handling. A config test or a broad unit-test filename is not enough evidence to mark an integration surface as Good. +Status is intentionally conservative: + +- **good**: workflow-boundary evidence exists, including commands, representative output, and failure handling. +- **partially-reviewed**: code-level or fixture evidence exists, but live workflow evidence is still missing. +- **not-yet-reviewed**: inventory only; no quality claim is made. + --- ## 1. Quality Matrix | Integration / Surface | Category | Status | Verification Path / Test Evidence | Observed Result / Config & Version Boundaries | | :--- | :--- | :--- | :--- | :--- | -| **qt-web-extractor (HTTP & MCP)** | Web / Agent | **good** | `extract_web` / `tests/test_webtools_source.py` | Sanitized output successfully resolves table-cell code fence corruption. Boundaries: `webextract.transport` (HTTP/MCP), `webextract.base_url`, `webextract.mcp_url`, `webextract.api_key`. | +| **qt-web-extractor (HTTP & MCP)** | Web / Agent | **partially-reviewed** | `extract_web`, `_clean_table_code_fences`, `tests/test_webtools_source.py`, fixture pair under `tests/fixtures/` | Sanitizer regression is covered for malformed table-cell code fences and adjacent standalone code blocks. Live daemon canary evidence is still required before this surface is promoted to `good`. Boundaries: `webextract.transport` (HTTP/MCP), `webextract.base_url`, `webextract.mcp_url`, `webextract.api_key`. | | **GUILessBingSearch** | Web / Agent | **not-yet-reviewed** | N/A | Excluded from current triage phase. | | **MinerU Local API** | Parsing | **not-yet-reviewed** | N/A | Excluded from current triage phase. | | **MinerU Cloud CLI** | Parsing | **not-yet-reviewed** | N/A | Excluded from current triage phase. | @@ -42,7 +48,7 @@ Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoi --- -## 2. Detailed Integration Audits (Workflow Boundary Analysis) +## 2. Current Reviewed Surface ### 2.1 qt-web-extractor (HTTP & MCP) * **CLI/Skill Entrypoint**: @@ -51,80 +57,22 @@ Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoi * **Provider/Service Implementation Path**: * [webtools.py:extract_web](../../scholaraio/providers/webtools.py) * **Setup Diagnostics**: - * Tested via `scholaraio setup check` (calls `_optional_webtool_detail` inside [setup.py](../../scholaraio/services/setup.py)), which executes `check_webextract_service` to verify that the HTTP/MCP endpoint responds. + * Diagnostic path exists through `scholaraio setup check` (calls `_optional_webtool_detail` inside [setup.py](../../scholaraio/services/setup.py)), which executes `check_webextract_service` to verify that the HTTP/MCP endpoint responds. This PR does not include live daemon evidence from that path. * **Output Quality & Validation**: * Outputs parsed GFM Markdown. Output quality is protected by `_clean_table_code_fences` to sanitize malformed block code fences in Wikipedia/infobox table cells, resolving broken table rendering. - * Verified via raw/cleaned fixtures: [wikipedia_infobox_bad.md](../../tests/fixtures/wikipedia_infobox_bad.md) and [wikipedia_infobox_clean.md](../../tests/fixtures/wikipedia_infobox_clean.md). + * Verified via unit and fixture coverage: [wikipedia_infobox_bad.md](../../tests/fixtures/wikipedia_infobox_bad.md), [wikipedia_infobox_clean.md](../../tests/fixtures/wikipedia_infobox_clean.md), and regression tests for standalone fenced code blocks near table or pipe-prefixed lines. * **Fallback Behavior**: * Configured via `webextract.transport` (HTTP or MCP). When configured as HTTP, failure to connect triggers fallback hint to MCP or setup checks. * **Failure Handling**: * Unreachable HTTP endpoints raise `WebExtractServiceUnavailableError`, returning a clean user-facing hint with exit code `1`. * API/Server errors raise `WebExtractError`, showing warnings/errors instead of generic crashes. -### 2.2 MinerU Cloud CLI (`mineru-open-api`) -* **CLI/Skill Entrypoint**: - * CLI: `scholaraio ingest ` or `scholaraio/providers/mineru.py` main parser CLI. - * Skill: `.claude/skills/ingest` -* **Provider/Service Implementation Path**: - * [mineru.py:convert_pdf_cloud](../../scholaraio/providers/mineru.py) -* **Setup Diagnostics**: - * Checked under `scholaraio setup check` via `_detect_mineru` which verifies presence of `mineru-open-api` in system path (`shutil.which`) and reads credential key values. -* **Output Quality & Validation**: - * Translates PDF structures to Markdown with images/formulas. - * Sanitizes cloud upload filenames via `_cloud_safe_pdf_name` to prevent platform-specific characters from crashing the extraction. - * Handles chunk merging for multi-part large PDF parsing. -* **Fallback Behavior**: - * When MinerU is missing or fails, it falls back to the list of alternatives defined in the configuration option `pdf_fallback_order` (e.g. `["docling", "pymupdf"]`). -* **Failure Handling**: - * Subprocess timeouts (`subprocess.TimeoutExpired`) are caught. - * Non-zero return codes from `mineru-open-api` raise descriptive errors containing stderr output. - * Retries are handled with exponential backoff (`attempts` based on `mineru_upload_retries`). +## 3. Not-Yet-Reviewed Inventory -### 2.3 PyMuPDF Fallback (`fitz`) -* **CLI/Skill Entrypoint**: - * CLI: Invoked automatically as part of PDF ingestion when MinerU fails, or manually by setting `pdf_preferred_parser: pymupdf`. -* **Provider/Service Implementation Path**: - * [pdf_fallback.py:run_pymupdf](../../scholaraio/providers/pdf_fallback.py) -* **Setup Diagnostics**: - * Checked in `scholaraio setup check` via `_check_dep_group("fitz")`. -* **Output Quality & Validation**: - * Extracts page-by-page flat plaintext with page headers (`## Page N\n\n`). Lacks complex block structure formatting but acts as a highly reliable baseline. -* **Fallback Behavior**: - * Represents the last-resort fallback in the fallback parser chain (since it has no model/server dependencies). -* **Failure Handling**: - * Catches general exception and formats error messages, skipping page crashes or file read errors gracefully without aborting the ingest execution pipeline. +Rows marked `not-yet-reviewed` in the matrix are intentionally inventory-only. Promoting any of them to `partially-reviewed` or `good` should happen in a focused follow-up that includes: -### 2.4 arXiv Search & PDF Download -* **CLI/Skill Entrypoint**: - * CLI: `scholaraio arxiv search` and `scholaraio arxiv fetch` (defined in [arxiv.py](../../scholaraio/interfaces/cli/arxiv.py)), or `scholaraio fsearch --scope arxiv` for federated search. - * Skill: `.claude/skills/search`, `.claude/skills/paper-guided-reading` -* **Provider/Service Implementation Path**: - * [arxiv.py](../../scholaraio/providers/arxiv.py) (`_query_arxiv_api`, `download_arxiv_pdf`, and `batch_download`). -* **Setup Diagnostics**: - * Setup checks verify internet connection and reachability of arXiv query export endpoints. -* **Output Quality & Validation**: - * Parses response XML via `defusedxml.ElementTree` to prevent XML External Entity (XXE) vulnerabilities, mapping properties directly to `ArxivPaper` dataclasses. - * Performs client-side field filtration (`_filter_search_results`) on author, title, and abstract fields to tighten results returned by arXiv's loose matching API. -* **Fallback Behavior**: - * Gracefully fails with standard warning logs if the arXiv endpoint is offline, returning empty results rather than hard crashes. -* **Failure Handling**: - * A requests session is mounted with a custom `urllib3` retry adapter to handle transient `429`, `502`, `503`, and `504` status codes automatically. - * Enforces a polite rate limit delay `RATE_LIMIT_DELAY = 3.0` between successive paper downloads in batch modes. - -### 2.5 Zotero Integration (Web API & Local SQLite Import) -* **CLI/Skill Entrypoint**: - * CLI: `scholaraio import-zotero` command (`cmd_import_zotero` inside [import_zotero.py](../../scholaraio/interfaces/cli/import_zotero.py)). - * Skill: `.claude/skills/import-zotero` -* **Provider/Service Implementation Path**: - * [zotero.py](../../scholaraio/providers/zotero.py) (`fetch_zotero_api` for cloud Web API, `parse_zotero_local` for local SQLite databases). -* **Setup Diagnostics**: - * Checked in `setup.py` by verifying presence of Zotero API credentials. -* **Output Quality & Validation**: - * Maps Zotero types (e.g. `journalArticle`, `preprint`) to standard `PaperMetadata` types. - * Locates corresponding PDF attachments and copies them into the import directory. -* **Fallback Behavior**: - * Supports local SQLite database import via `--local ` if API keys are missing or the API is unreachable. - * Skips unresolvable attachments/links instead of failing the import. -* **Failure Handling**: - * Catches `ImportError` on `pyzotero` to prompt users to install optional dependencies. - * Attachment download failures are caught per-item, logging warnings while continuing to parse the rest of the collection. +- exact CLI command or skill workflow exercised; +- relevant config/version boundaries; +- representative success output; +- failure-mode behavior; +- targeted tests or reproducible smoke evidence. diff --git a/scholaraio/providers/webtools.py b/scholaraio/providers/webtools.py index 1722a321..b366335f 100644 --- a/scholaraio/providers/webtools.py +++ b/scholaraio/providers/webtools.py @@ -631,8 +631,8 @@ def _clean_table_code_fences(text: str) -> str: return "" lines = text.splitlines() - cleaned_lines = [] - current_row_lines = [] + cleaned_lines: list[str] = [] + current_row_lines: list[str] = [] in_multiline_row = False in_code_block = False diff --git a/tests/test_webtools_source.py b/tests/test_webtools_source.py index 7da2d0e8..ee212ac4 100644 --- a/tests/test_webtools_source.py +++ b/tests/test_webtools_source.py @@ -824,6 +824,11 @@ def test_clean_table_code_fences_ignores_normal_structures(self): ) assert _clean_table_code_fences(standalone_between_tables) == standalone_between_tables + adjacent_standalone_code = ( + "| A | B |\n| one | two |\n```python\nprint(1)\n```\n| next paragraph starts with pipe |\n" + ) + assert _clean_table_code_fences(adjacent_standalone_code) == adjacent_standalone_code + def test_extract_web_applies_cleanup_http(self, monkeypatch): # Verify that HTTP path runs the clean helper def fake_urlopen(req, timeout=0): From 32c198b4208630fe93ef449a3016b263b59b6075 Mon Sep 17 00:00:00 2001 From: lzmo Date: Mon, 8 Jun 2026 21:57:49 +0800 Subject: [PATCH 8/8] Tighten webextract table cleanup coverage --- CHANGELOG.md | 4 +++ .../third-party-integration-audit.md | 2 ++ scholaraio/providers/webtools.py | 26 +++++++++++++++++-- tests/test_webtools_source.py | 3 +++ 4 files changed, 33 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 18603744..7706f5c4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/). - **Nature workflow bridge skill** ([#107](https://github.com/ZimoLiao/scholaraio/issues/107)): Added a ScholarAIO `nature-workflow` bridge skill that routes Nature Portfolio writing and figure workflows to the upstream `nature-skills` repository when installed, keeps ScholarAIO-native fallbacks explicit, documents the install and quick-start path, and includes deterministic plus product-demo fixtures that generate reviewable manuscript, figure, slide, and QA artifacts. +### Fixed + +- **Webextract Markdown table-cell cleanup** ([#110](https://github.com/ZimoLiao/scholaraio/pull/110)): Sanitized malformed block-level code fences emitted inside `qt-web-extractor` table cells before HTTP/MCP extraction results reach `webextract` and ingest consumers, while preserving standalone fenced code blocks and pipe characters inside code-cell content. + ## [1.5.0] — 2026-05-24 ### Added diff --git a/docs/development/third-party-integration-audit.md b/docs/development/third-party-integration-audit.md index ed60d708..9ddf38ea 100644 --- a/docs/development/third-party-integration-audit.md +++ b/docs/development/third-party-integration-audit.md @@ -4,6 +4,8 @@ This document records the quality, reachability, and output validation status of Integrations are evaluated at the workflow boundary, checking CLI/skill entrypoints, provider implementations, setup diagnostics, output formatting, fallback behaviors, and failure handling. A config test or a broad unit-test filename is not enough evidence to mark an integration surface as Good. +This audit is not a declaration that the full third-party toolchain is adapted or verified. Each row claims only the evidence listed in that row; everything else remains inventory until a focused live or workflow-boundary pass verifies it. + Status is intentionally conservative: - **good**: workflow-boundary evidence exists, including commands, representative output, and failure handling. diff --git a/scholaraio/providers/webtools.py b/scholaraio/providers/webtools.py index b366335f..22fcaf04 100644 --- a/scholaraio/providers/webtools.py +++ b/scholaraio/providers/webtools.py @@ -572,9 +572,31 @@ def _extract_web_mcp(url: str, *, cfg: Config | None, timeout: float) -> dict: } +def _split_table_row_cells(row_text: str) -> list[str]: + cells: list[str] = [] + current: list[str] = [] + in_code_fence = False + i = 0 + while i < len(row_text): + if row_text.startswith("```", i): + in_code_fence = not in_code_fence + current.append("```") + i += 3 + continue + char = row_text[i] + if char == "|" and not in_code_fence: + cells.append("".join(current)) + current = [] + else: + current.append(char) + i += 1 + cells.append("".join(current)) + return cells + + def _clean_single_row(row_text: str) -> str: - cells = row_text.split("|") - cleaned_cells = [] + cells = _split_table_row_cells(row_text) + cleaned_cells: list[str] = [] for i, cell in enumerate(cells): if i == 0 and not cell.strip(): diff --git a/tests/test_webtools_source.py b/tests/test_webtools_source.py index ee212ac4..c0088e77 100644 --- a/tests/test_webtools_source.py +++ b/tests/test_webtools_source.py @@ -829,6 +829,9 @@ def test_clean_table_code_fences_ignores_normal_structures(self): ) assert _clean_table_code_fences(adjacent_standalone_code) == adjacent_standalone_code + table_cell_code_with_pipe = "| A | B |\n| code | ```\na | b\n``` |\n" + assert _clean_table_code_fences(table_cell_code_with_pipe) == "| A | B |\n| code | `a | b` |\n" + def test_extract_web_applies_cleanup_http(self, monkeypatch): # Verify that HTTP path runs the clean helper def fake_urlopen(req, timeout=0):