From 039e59155ecc44b0b1c3e963d7c6f1d0c3159504 Mon Sep 17 00:00:00 2001
From: Son Nguyen
Date: Mon, 22 Jun 2026 06:22:52 +0700
Subject: [PATCH] feat: add RTF (Rich Text Format) converter

---
Review notes: line breaks restored. _rtf_converter.py now reads the stream
once and no longer converts the literal text "None" when charset detection
finds no match; the stream test also checks for leaked control words. The
blob ids on the "index" lines of the two edited new files predate these edits.

 packages/markitdown/pyproject.toml            |   2 +
 .../markitdown/src/markitdown/_markitdown.py  |   2 +
 .../src/markitdown/converters/__init__.py     |   2 +
 .../markitdown/converters/_rtf_converter.py   | 100 ++++++++++++++++++
 packages/markitdown/tests/test_files/test.rtf |   8 ++
 .../markitdown/tests/test_rtf_converter.py    |  44 ++++++++
 6 files changed, 158 insertions(+)
 create mode 100644 packages/markitdown/src/markitdown/converters/_rtf_converter.py
 create mode 100644 packages/markitdown/tests/test_files/test.rtf
 create mode 100644 packages/markitdown/tests/test_rtf_converter.py

diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml
index d4c20a402..24f1ab675 100644
--- a/packages/markitdown/pyproject.toml
+++ b/packages/markitdown/pyproject.toml
@@ -49,12 +49,14 @@ all = [
   "azure-ai-documentintelligence",
   "azure-ai-contentunderstanding>=1.2.0b1",
   "azure-identity",
+  "striprtf",
 ]
 pptx = ["python-pptx"]
 docx = ["mammoth~=1.11.0", "lxml"]
 xlsx = ["pandas", "openpyxl"]
 xls = ["pandas", "xlrd"]
 pdf = ["pdfminer.six>=20251230", "pdfplumber>=0.11.9"]
+rtf = ["striprtf"]
 outlook = ["olefile"]
 audio-transcription = ["pydub", "SpeechRecognition"]
 youtube-transcription = ["youtube-transcript-api"]
diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
index f6aa4df0e..38c3412da 100644
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@@ -40,6 +40,7 @@
     DocumentIntelligenceConverter,
     ContentUnderstandingConverter,
     CsvConverter,
+    RtfConverter,
 )
 
 from ._base_converter import DocumentConverter, DocumentConverterResult
@@ -203,6 +204,7 @@ def enable_builtins(self, **kwargs) -> None:
             self.register_converter(OutlookMsgConverter())
             self.register_converter(EpubConverter())
             self.register_converter(CsvConverter())
+            self.register_converter(RtfConverter())
 
             # Register Document Intelligence converter at the top of the stack if endpoint is provided
             docintel_endpoint = kwargs.get("docintel_endpoint")
diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py
index 77f8b1acd..0b49b4be8 100644
--- a/packages/markitdown/src/markitdown/converters/__init__.py
+++ b/packages/markitdown/src/markitdown/converters/__init__.py
@@ -27,6 +27,7 @@
 )
 from ._epub_converter import EpubConverter
 from ._csv_converter import CsvConverter
+from ._rtf_converter import RtfConverter
 
 __all__ = [
     "PlainTextConverter",
@@ -51,4 +52,5 @@
     "ContentUnderstandingFileType",
     "EpubConverter",
     "CsvConverter",
+    "RtfConverter",
 ]
diff --git a/packages/markitdown/src/markitdown/converters/_rtf_converter.py b/packages/markitdown/src/markitdown/converters/_rtf_converter.py
new file mode 100644
index 000000000..baa06d18a
--- /dev/null
+++ b/packages/markitdown/src/markitdown/converters/_rtf_converter.py
@@ -0,0 +1,100 @@
+import sys
+
+from typing import BinaryIO, Any
+
+from charset_normalizer import from_bytes
+
+from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._stream_info import StreamInfo
+from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
+
+# Try loading optional (but in this case, required) dependencies
+# Save reporting of any exceptions for later
+_dependency_exc_info = None
+try:
+    from striprtf.striprtf import rtf_to_text
+except ImportError:
+    # Preserve the error and stack trace for later
+    _dependency_exc_info = sys.exc_info()
+
+
+# NOTE(review): "text/richtext" is historically the MIME richtext type
+# (RFC 1341) rather than Microsoft RTF -- confirm it should be accepted here.
+ACCEPTED_MIME_TYPE_PREFIXES = [
+    "application/rtf",
+    "application/x-rtf",
+    "text/rtf",
+    "text/richtext",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [".rtf"]
+
+
+class RtfConverter(DocumentConverter):
+    """
+    Converts RTF (Rich Text Format) files to Markdown. RTF formatting control
+    words are stripped and the underlying text content is preserved.
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        """
+        Return True when the stream's extension is in ACCEPTED_FILE_EXTENSIONS
+        or its MIME type starts with one of ACCEPTED_MIME_TYPE_PREFIXES.
+        Matching is case-insensitive; the stream itself is not read.
+        """
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        # str.startswith accepts a tuple, so one call covers every prefix.
+        return mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES))
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
+        """
+        Read the whole stream, decode it to text and strip the RTF markup.
+
+        Raises MissingDependencyException when striprtf could not be imported.
+        """
+        # Check: the dependencies
+        if _dependency_exc_info is not None:
+            raise MissingDependencyException(
+                MISSING_DEPENDENCY_MESSAGE.format(
+                    converter=type(self).__name__,
+                    extension=".rtf",
+                    feature="rtf",
+                )
+            ) from _dependency_exc_info[1].with_traceback(  # type: ignore[union-attr]
+                _dependency_exc_info[2]
+            )
+
+        # RTF is an ASCII-based format, but may declare a code page for any
+        # non-ASCII bytes. Decode defensively so we always hand a str to
+        # striprtf, which performs the actual control-word stripping.
+        raw = file_stream.read()
+        if stream_info.charset:
+            rtf_content = raw.decode(stream_info.charset)
+        else:
+            # best() returns None when no encoding can be detected; str(None)
+            # would then be converted as the literal text "None". Latin-1 maps
+            # every byte to a character, so that fallback decode cannot fail.
+            match = from_bytes(raw).best()
+            rtf_content = str(match) if match is not None else raw.decode("latin-1")
+
+        text = rtf_to_text(rtf_content)
+
+        return DocumentConverterResult(markdown=text.strip())
diff --git a/packages/markitdown/tests/test_files/test.rtf b/packages/markitdown/tests/test_files/test.rtf
new file mode 100644
index 000000000..48875f18b
--- /dev/null
+++ b/packages/markitdown/tests/test_files/test.rtf
@@ -0,0 +1,8 @@
+{\rtf1\ansi\ansicpg1252\deff0{\fonttbl{\f0 Helvetica;}}
+{\b\fs36 RTF Test Document 8f14e45f}\par
+\par
+This is a plain paragraph c4ca4238 with some {\b bold a87ff679} text.\par
+\par
+{\i Italic line e4da3b7f} appears here.\par
+A second paragraph 1679091c with more content.\par
+}
diff --git a/packages/markitdown/tests/test_rtf_converter.py b/packages/markitdown/tests/test_rtf_converter.py
new file mode 100644
index 000000000..5b41c569b
--- /dev/null
+++ b/packages/markitdown/tests/test_rtf_converter.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python3 -m pytest
+import os
+
+from markitdown import MarkItDown, StreamInfo
+
+TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
+
+RTF_TEST_STRINGS = [
+    "RTF Test Document 8f14e45f",
+    "This is a plain paragraph c4ca4238 with some bold a87ff679 text.",
+    "Italic line e4da3b7f appears here.",
+    "A second paragraph 1679091c with more content.",
+]
+
+
+def test_rtf_converter_local() -> None:
+    """RTF files convert to Markdown with control words stripped."""
+    markitdown = MarkItDown()
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.rtf"))
+    for s in RTF_TEST_STRINGS:
+        assert s in result.markdown
+    # Control words must not leak into the output.
+    assert "\\rtf1" not in result.markdown
+    assert "\\par" not in result.markdown
+
+
+def test_rtf_converter_stream() -> None:
+    """RTF conversion works from a binary stream with explicit StreamInfo."""
+    markitdown = MarkItDown()
+    with open(os.path.join(TEST_FILES_DIR, "test.rtf"), "rb") as stream:
+        result = markitdown.convert(
+            stream, stream_info=StreamInfo(extension=".rtf", mimetype="application/rtf")
+        )
+    for s in RTF_TEST_STRINGS:
+        assert s in result.markdown
+    # Control words must not leak into the output on the stream path either.
+    assert "\\rtf1" not in result.markdown
+    assert "\\par" not in result.markdown
+
+
+if __name__ == "__main__":
+    test_rtf_converter_local()
+    test_rtf_converter_stream()
+    print("All RTF converter tests passed.")