diff --git a/packages/markitdown/src/markitdown/converters/_markdownify.py b/packages/markitdown/src/markitdown/converters/_markdownify.py
index 19e8a2984..22a8ca09a 100644
--- a/packages/markitdown/src/markitdown/converters/_markdownify.py
+++ b/packages/markitdown/src/markitdown/converters/_markdownify.py
@@ -122,5 +122,36 @@ def convert_input(
             return "[x] " if el.has_attr("checked") else "[ ] "
         return ""
 
+    def process_text(self, text: str, *args: Any, **kwargs: Any) -> str:
+        """Escape Markdown block starters at the start of each text line.
+
+        markdownify handles some inline escaping, but literal text such as
+        "# Title" or "- item" would otherwise be parsed as a heading, list
+        item, blockquote, etc. when the Markdown is rendered.
+
+        Args:
+            text: The text node handed over by markdownify.
+            *args: Extra positional arguments, forwarded unchanged.
+            **kwargs: Extra keyword arguments, forwarded unchanged (newer
+                markdownify releases pass ``parent_tags``).
+
+        Returns:
+            The converted text with a backslash inserted before a leading
+            ``#``, ``>``, ``+``, ``-``, ``=`` or ``|`` on every line. Text
+            nested in pre/code/kbd/samp gets no additional escaping.
+        """
+        # Forward every argument: the base signature differs between
+        # markdownify releases, and a one-argument override raises TypeError
+        # when the library calls process_text(el, parent_tags=...).
+        processed = super().process_text(text, *args, **kwargs)  # type: ignore
+
+        # Text inside code-like elements is rendered verbatim, so an added
+        # backslash would show up literally in the output.
+        code_like = ["pre", "code", "kbd", "samp"]
+        find_parent = getattr(text, "find_parent", None)
+        if callable(find_parent) and find_parent(code_like) is not None:
+            return processed
+
+        return re.sub(r"(?m)^([#>+\-=|])", r"\\\1", processed)
+
     def convert_soup(self, soup: Any) -> str:
         return super().convert_soup(soup)  # type: ignore
diff --git a/packages/markitdown/tests/test_docx_escaping.py b/packages/markitdown/tests/test_docx_escaping.py
new file mode 100644
index 000000000..7d5d2a1b1
--- /dev/null
+++ b/packages/markitdown/tests/test_docx_escaping.py
@@ -0,0 +1,23 @@
+import pytest
+from markitdown.converters._html_converter import HtmlConverter
+
+
+def test_markdown_special_character_escaping():
+    html_content = "<p># Hello World</p><p>> Quote</p><p>- List</p>"
+    converter = HtmlConverter()
+    result = converter.convert_string(html_content)
+
+    # Verify that the special characters at the start of paragraphs are escaped
+    assert r"\# Hello World" in result.markdown
+    assert r"\> Quote" in result.markdown
+    assert r"\- List" in result.markdown
+
+
+def test_code_blocks_are_not_escaped():
+    html_content = "<pre><code># not a heading\n- not a list</code></pre>"
+    result = HtmlConverter().convert_string(html_content)
+
+    # Code is rendered verbatim, so no backslashes may be introduced
+    assert "# not a heading" in result.markdown
+    assert "\\#" not in result.markdown
+    assert "\\-" not in result.markdown