Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -96,22 +96,30 @@ def _replace_equations(tag: Tag):
raise ValueError(f"Not supported tag: {tag.name}")


def _pre_process_xml(content: bytes) -> bytes:
    """
    Pre-process one XML part of a DOCX file (e.g. document.xml, footnotes.xml, endnotes.xml).

    Two rewrites are applied to the parsed tree:

    1. Every OMML (Office Math Markup Language) element is handed to
       `_replace_equations`, which swaps it for its LaTeX equivalent.
    2. Every hyperlink element that has an ``anchor`` attribute but no ``id``
       attribute is treated as internal-only (e.g. TOC entries and
       cross-references) and unwrapped, so its children stay in the document
       while the link element itself disappears.

    Args:
        content (bytes): The XML content of the DOCX part as bytes.

    Returns:
        bytes: The rewritten XML, serialized and encoded as bytes.
    """
    document = BeautifulSoup(content.decode(), features="xml")

    # Paragraph-level equations go first; the "oMath" lookup only runs once
    # that pass has finished, so it sees the already-rewritten tree.
    for math_tag_name in ("oMathPara", "oMath"):
        for math_tag in document.find_all(math_tag_name):
            _replace_equations(math_tag)

    # Attribute names may or may not carry a namespace prefix ("w:anchor" vs
    # "anchor"), so compare on the part after the last colon.
    for link in document.find_all(["w:hyperlink", "hyperlink"]):
        local_names = {attr_name.rpartition(":")[2] for attr_name in link.attrs}
        if "anchor" in local_names and "id" not in local_names:
            link.unwrap()

    return str(document).encode()


Expand Down Expand Up @@ -144,7 +152,7 @@ def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:
if name in pre_process_enable_files:
try:
# Pre-process the content
updated_content = _pre_process_math(content)
updated_content = _pre_process_xml(content)
# In the future, if there are more pre-processing steps, they can be added here
zip_output.writestr(name, updated_content)
except Exception:
Expand Down
35 changes: 35 additions & 0 deletions packages/markitdown/tests/test_module_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,40 @@ def test_docx_equations() -> None:
assert block_equations, "No block equations found in the document."


def test_docx_internal_hyperlinks() -> None:
    """Check that an internal-only DOCX hyperlink is converted to plain text.

    Builds a document whose single paragraph holds a ``w:hyperlink`` with only
    a ``w:anchor`` attribute (the shape of a TOC entry), converts it, and
    asserts the link text survives without Markdown link syntax or the anchor
    target.
    """
    import tempfile
    from docx import Document
    from docx.oxml.ns import qn
    from docx.oxml import OxmlElement

    doc = Document()
    p = doc.add_paragraph()

    # Add internal anchor (a TOC entry)
    hl = OxmlElement("w:hyperlink")
    hl.set(qn("w:anchor"), "_Toc12345")
    r = OxmlElement("w:r")
    t = OxmlElement("w:t")
    t.text = "Executive Summary"
    r.append(t)
    hl.append(r)
    p._p.append(hl)

    # A private temporary directory gives the file a unique location, so
    # concurrent test runs (or a stale file from an earlier run) cannot
    # collide on a shared name, and cleanup happens even when the test fails.
    with tempfile.TemporaryDirectory() as tmp_dir:
        path = os.path.join(tmp_dir, "test_internal_hyperlinks.docx")
        doc.save(path)
        markitdown = MarkItDown()
        result = markitdown.convert(path)
        text_content = result.text_content

    # Should be converted to plain text, not a markdown hyperlink
    assert "Executive Summary" in text_content
    assert "[Executive Summary]" not in text_content
    assert "#_Toc12345" not in text_content



def test_input_as_strings() -> None:
markitdown = MarkItDown()

Expand Down Expand Up @@ -539,6 +573,7 @@ def test_markitdown_llm() -> None:
test_data_uris,
test_file_uris,
test_docx_comments,
test_docx_internal_hyperlinks,
test_input_as_strings,
test_markitdown_remote,
test_speech_transcription,
Expand Down