Match the "data:" URI scheme case-insensitively in convert_img.

convert_img shortens a data-URI image source to everything before the first
comma plus "..." unless the keep_data_uris option is set. The prefix check
was case-sensitive, so a source written as "DATA:..." kept its full payload.
The check now lower-cases the source first; the emitted source keeps its
original casing. A regression test covers both values of keep_data_uris.

diff --git a/packages/markitdown/src/markitdown/converters/_markdownify.py b/packages/markitdown/src/markitdown/converters/_markdownify.py
index 19e8a2984..a07614b0f 100644
--- a/packages/markitdown/src/markitdown/converters/_markdownify.py
+++ b/packages/markitdown/src/markitdown/converters/_markdownify.py
@@ -104,7 +104,7 @@ def convert_img(
             return alt
 
         # Remove dataURIs
-        if src.startswith("data:") and not self.options["keep_data_uris"]:
+        if src.lower().startswith("data:") and not self.options["keep_data_uris"]:
             src = src.split(",")[0] + "..."
 
         return "![%s](%s%s)" % (alt, src, title_part)
diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py
index 4d62e4919..2568ac106 100644
--- a/packages/markitdown/tests/test_module_misc.py
+++ b/packages/markitdown/tests/test_module_misc.py
@@ -220,6 +220,21 @@ def test_data_uris() -> None:
     assert data == b"Hello, World!"
 
 
+def test_uppercase_data_image_uri_is_truncated_by_default() -> None:
+    markitdown = MarkItDown()
+    html = b'<img src="DATA:image/png;base64,AAAA" alt="dot">'
+    stream_info = StreamInfo(mimetype="text/html", extension=".html")
+
+    result = markitdown.convert_stream(io.BytesIO(html), stream_info=stream_info)
+    assert result.markdown == "![dot](DATA:image/png;base64...)"
+    assert "AAAA" not in result.markdown
+
+    result = markitdown.convert_stream(
+        io.BytesIO(html), stream_info=stream_info, keep_data_uris=True
+    )
+    assert result.markdown == "![dot](DATA:image/png;base64,AAAA)"
+
+
 def test_file_uris() -> None:
     # Test file URI with an empty host
     file_uri = "file:///path/to/file.txt"