Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions packages/markitdown/src/markitdown/converters/_markdownify.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,5 +122,13 @@ def convert_input(
return "[x] " if el.has_attr("checked") else "[ ] "
return ""

def process_text(self, text: str, *args: Any, **kwargs: Any) -> str:
    """Convert a text node and escape characters that would start a Markdown block.

    The parent converter performs the regular text processing (including its
    inline escaping).  Afterwards, any ``#``, ``>``, ``+``, ``-``, ``=`` or
    ``|`` found at the start of the text node, or at the start of a line
    within it, is prefixed with a backslash so it is not parsed as a
    heading, blockquote, list item, setext underline or table row.

    Text located inside ``<pre>``, ``<code>``, ``<kbd>`` or ``<samp>`` is
    returned without this extra escaping: backslashes are literal in
    Markdown code, so escaping there would corrupt the content
    (``# comment`` would become ``\\# comment``).

    Args:
        text: The text node to convert.  Presumably a BeautifulSoup
            ``NavigableString`` (a ``str`` subclass); a plain ``str`` is
            also accepted and is treated as not being inside code.
        *args: Extra positional arguments, forwarded to the parent unchanged.
        **kwargs: Extra keyword arguments, forwarded to the parent unchanged
            (e.g. ``parent_tags`` in markdownify releases that pass it).

    Returns:
        The processed text with line-leading block characters escaped.
    """
    # Plain strings have no ``find_parent``; only real soup nodes can tell
    # us whether they live inside a verbatim/code element.
    find_parent = getattr(text, "find_parent", None)
    in_verbatim = (
        find_parent is not None
        and find_parent(["pre", "code", "kbd", "samp"]) is not None
    )

    processed = super().process_text(text, *args, **kwargs)  # type: ignore
    if in_verbatim:
        return processed

    # markdownify handles some inline escaping, but block starters at the
    # beginning of a line still need an explicit backslash.
    return re.sub(r"(?m)^([#>+\-=|])", r"\\\1", processed)

def convert_soup(self, soup: Any) -> str:
    """Render *soup* to Markdown by delegating to the parent converter.

    No extra processing happens here; the override exists only to give the
    method an explicit ``str`` return annotation.
    """
    rendered = super().convert_soup(soup)  # type: ignore
    return rendered
12 changes: 12 additions & 0 deletions packages/markitdown/tests/test_docx_escaping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import pytest
from markitdown.converters._html_converter import HtmlConverter

def test_markdown_special_character_escaping():
    """Paragraphs that begin with a Markdown block character are backslash-escaped."""
    source_html = "<p># Hello World</p><p>> Quote</p><p>- List</p>"
    markdown = HtmlConverter().convert_string(source_html).markdown

    # Each paragraph starts with a character that would otherwise open a
    # heading, blockquote or list item; all of them must come out escaped.
    expected_fragments = (r"\# Hello World", r"\> Quote", r"\- List")
    for fragment in expected_fragments:
        assert fragment in markdown