def _pre_process_xml(content: bytes) -> bytes:
    """
    Rewrite one XML part of a DOCX archive (e.g. document.xml, footnotes.xml, endnotes.xml).

    Two clean-ups are applied, in this order:

    1. OMML (Office Math Markup Language) elements are converted to LaTeX by passing
       every ``oMathPara`` element, and then every ``oMath`` element still in the tree,
       to ``_replace_equations``.
    2. Internal-only hyperlinks (e.g. TOC entries and cross-references) are unwrapped:
       a hyperlink element that carries an anchor attribute but no id attribute is
       dropped while its children stay in place.

    Args:
        content (bytes): The XML content of the DOCX part as bytes.

    Returns:
        bytes: The processed content, encoded as bytes.
    """

    def _has_attribute(node, local_name: str) -> bool:
        # True when the node has the attribute either bare or under any prefix.
        prefixed_suffix = ":" + local_name
        return any(
            key == local_name or key.endswith(prefixed_suffix) for key in node.attrs
        )

    document = BeautifulSoup(content.decode(), features="xml")

    # Paragraph-level math goes first; the second lookup only runs once the
    # first pass has finished, so it sees the tree as that pass left it.
    for math_tag_name in ("oMathPara", "oMath"):
        for math_node in document.find_all(math_tag_name):
            _replace_equations(math_node)

    # A hyperlink with an id attribute is left untouched; one with only an
    # anchor points inside the document and is reduced to its contents.
    for link in document.find_all(["w:hyperlink", "hyperlink"]):
        if _has_attribute(link, "id"):
            continue
        if _has_attribute(link, "anchor"):
            link.unwrap()

    return str(document).encode()
def test_docx_internal_hyperlinks() -> None:
    """Check that an internal-only DOCX hyperlink is rendered as plain text.

    Builds a document whose single paragraph holds a ``w:hyperlink`` element with
    a ``w:anchor`` attribute and no relationship id (the shape of a TOC entry),
    converts it, and asserts that the link text is present while neither the
    markdown link brackets nor the anchor target appear in the output.
    """
    import tempfile
    from docx import Document
    from docx.oxml import OxmlElement
    from docx.oxml.ns import qn

    doc = Document()
    paragraph = doc.add_paragraph()

    # Add internal anchor (a TOC entry):
    # <w:hyperlink w:anchor="_Toc12345"><w:r><w:t>Executive Summary</w:t></w:r></w:hyperlink>
    hyperlink = OxmlElement("w:hyperlink")
    hyperlink.set(qn("w:anchor"), "_Toc12345")
    run = OxmlElement("w:r")
    text = OxmlElement("w:t")
    text.text = "Executive Summary"
    run.append(text)
    hyperlink.append(run)
    paragraph._p.append(hyperlink)

    # A private, uniquely named directory keeps concurrent test runs from
    # clobbering each other's file and is removed automatically on exit,
    # whether or not an assertion fails.
    with tempfile.TemporaryDirectory() as tmp_dir:
        path = os.path.join(tmp_dir, "test_internal_hyperlinks.docx")
        doc.save(path)
        markitdown = MarkItDown()
        result = markitdown.convert(path)

        # Should be converted to plain text, not a markdown hyperlink
        assert "Executive Summary" in result.text_content
        assert "[Executive Summary]" not in result.text_content
        assert "#_Toc12345" not in result.text_content