From 415173f9b18d149a3f8723b7d0055e226fa38234 Mon Sep 17 00:00:00 2001
From: Lucas Ma <7184042+pony-maggie@users.noreply.github.com>
Date: Mon, 15 Jun 2026 08:30:20 +0800
Subject: [PATCH] fix: truncate uppercase data image URIs

URI schemes are case-insensitive, so an image whose src starts with
"DATA:" (or any other casing) must be truncated exactly like a
lowercase "data:" source unless keep_data_uris is set.

Only the five-character scheme prefix is lowercased for the check, so a
large base64 payload is not copied just to inspect its prefix.

The regression test covers both the default (truncated) output and the
keep_data_uris=True (verbatim) output for an uppercase scheme.
---
 .../src/markitdown/converters/_markdownify.py |  2 +-
 packages/markitdown/tests/test_module_misc.py | 15 +++++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/packages/markitdown/src/markitdown/converters/_markdownify.py b/packages/markitdown/src/markitdown/converters/_markdownify.py
index 19e8a2984..a07614b0f 100644
--- a/packages/markitdown/src/markitdown/converters/_markdownify.py
+++ b/packages/markitdown/src/markitdown/converters/_markdownify.py
@@ -104,7 +104,7 @@ def convert_img(
             return alt
 
         # Remove dataURIs
-        if src.startswith("data:") and not self.options["keep_data_uris"]:
+        if src[:5].lower() == "data:" and not self.options["keep_data_uris"]:
             src = src.split(",")[0] + "..."
 
         return "![%s](%s%s)" % (alt, src, title_part)
diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py
index 4d62e4919..2568ac106 100644
--- a/packages/markitdown/tests/test_module_misc.py
+++ b/packages/markitdown/tests/test_module_misc.py
@@ -220,6 +220,21 @@ def test_data_uris() -> None:
     assert data == b"Hello, World!"
 
 
+def test_uppercase_data_image_uri_is_truncated_by_default() -> None:
+    markitdown = MarkItDown()
+    html = b'<img src="DATA:image/png;base64,AAAA" alt="dot">'
+    stream_info = StreamInfo(mimetype="text/html", extension=".html")
+
+    result = markitdown.convert_stream(io.BytesIO(html), stream_info=stream_info)
+    assert result.markdown == "![dot](DATA:image/png;base64...)"
+    assert "AAAA" not in result.markdown
+
+    result = markitdown.convert_stream(
+        io.BytesIO(html), stream_info=stream_info, keep_data_uris=True
+    )
+    assert result.markdown == "![dot](DATA:image/png;base64,AAAA)"
+
+
 def test_file_uris() -> None:
     # Test file URI with an empty host
     file_uri = "file:///path/to/file.txt"