From 5159999e9dc22670b018a62e6b164822ad7c569f Mon Sep 17 00:00:00 2001
From: Sai Teja Bandaru
Date: Thu, 25 Jun 2026 02:28:36 +0200
Subject: [PATCH] fix(docx): properly escape special markdown characters at
 start of lines

---
Review notes (ignored by `git am`):
- process_text forwards extra arguments to the base class, because newer
  markdownify releases call it with a `parent_tags` keyword.
- Text inside pre/code/kbd/samp is left unescaped so code blocks are not
  corrupted by literal backslashes.
- The post-image blob id on the `index` line of _markdownify.py predates the
  revised hunk; regenerate the patch if blob ids must match.

 .../src/markitdown/converters/_markdownify.py   | 16 ++++++++++++++++
 packages/markitdown/tests/test_docx_escaping.py | 12 ++++++++++++
 2 files changed, 28 insertions(+)
 create mode 100644 packages/markitdown/tests/test_docx_escaping.py

diff --git a/packages/markitdown/src/markitdown/converters/_markdownify.py b/packages/markitdown/src/markitdown/converters/_markdownify.py
index 19e8a2984..22a8ca09a 100644
--- a/packages/markitdown/src/markitdown/converters/_markdownify.py
+++ b/packages/markitdown/src/markitdown/converters/_markdownify.py
@@ -122,5 +122,21 @@ def convert_input(
             return "[x] " if el.has_attr("checked") else "[ ] "
         return ""
 
+    def process_text(self, text: str, *args: Any, **kwargs: Any) -> str:
+        """Escape Markdown block markers that start a line of a text node.
+
+        Extra arguments (for example ``parent_tags`` in newer markdownify
+        releases) are forwarded unchanged to the base implementation.
+        """
+        processed = super().process_text(text, *args, **kwargs)  # type: ignore
+        # Text inside preformatted/code elements is emitted verbatim, so a
+        # backslash added there would show up literally in the output.
+        find_parent = getattr(text, "find_parent", None)
+        if find_parent is not None and find_parent(["pre", "code", "kbd", "samp"]):
+            return processed
+        # Escape markdown block characters at the beginning of a line to
+        # prevent them from being parsed as headings, lists, blockquotes, etc.
+        return re.sub(r"(?m)^([#>+\-=|])", r"\\\1", processed)
+
     def convert_soup(self, soup: Any) -> str:
         return super().convert_soup(soup)  # type: ignore
diff --git a/packages/markitdown/tests/test_docx_escaping.py b/packages/markitdown/tests/test_docx_escaping.py
new file mode 100644
index 000000000..7d5d2a1b1
--- /dev/null
+++ b/packages/markitdown/tests/test_docx_escaping.py
@@ -0,0 +1,12 @@
+import pytest
+from markitdown.converters._html_converter import HtmlConverter
+
+def test_markdown_special_character_escaping():
+    html_content = "<p># Hello World</p><p>> Quote</p><p>- List</p>"
+    converter = HtmlConverter()
+    result = converter.convert_string(html_content)
+
+    # Verify that the special characters at the start of paragraphs are escaped
+    assert r"\# Hello World" in result.markdown
+    assert r"\> Quote" in result.markdown
+    assert r"\- List" in result.markdown