diff --git a/packages/markitdown/src/markitdown/converters/_markdownify.py b/packages/markitdown/src/markitdown/converters/_markdownify.py index 19e8a2984..a07614b0f 100644 --- a/packages/markitdown/src/markitdown/converters/_markdownify.py +++ b/packages/markitdown/src/markitdown/converters/_markdownify.py @@ -104,7 +104,7 @@ def convert_img( return alt # Remove dataURIs - if src.startswith("data:") and not self.options["keep_data_uris"]: + if src.lower().startswith("data:") and not self.options["keep_data_uris"]: src = src.split(",")[0] + "..." return "" % (alt, src, title_part) diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py index 4d62e4919..2568ac106 100644 --- a/packages/markitdown/tests/test_module_misc.py +++ b/packages/markitdown/tests/test_module_misc.py @@ -220,6 +220,21 @@ def test_data_uris() -> None: assert data == b"Hello, World!" +def test_uppercase_data_image_uri_is_truncated_by_default() -> None: + markitdown = MarkItDown() + html = b'