From 039e59155ecc44b0b1c3e963d7c6f1d0c3159504 Mon Sep 17 00:00:00 2001
From: Son Nguyen <hoangson091104@gmail.com>
Date: Mon, 22 Jun 2026 06:22:52 +0700
Subject: [PATCH] feat: add RTF (Rich Text Format) converter

---
 packages/markitdown/pyproject.toml            |  2 +
 .../markitdown/src/markitdown/_markitdown.py  |  2 +
 .../src/markitdown/converters/__init__.py     |  2 +
 .../markitdown/converters/_rtf_converter.py   | 86 +++++++++++++++++++
 packages/markitdown/tests/test_files/test.rtf |  8 ++
 .../markitdown/tests/test_rtf_converter.py    | 41 +++++++++
 6 files changed, 141 insertions(+)
 create mode 100644 packages/markitdown/src/markitdown/converters/_rtf_converter.py
 create mode 100644 packages/markitdown/tests/test_files/test.rtf
 create mode 100644 packages/markitdown/tests/test_rtf_converter.py

diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml
index d4c20a402..24f1ab675 100644
--- a/packages/markitdown/pyproject.toml
+++ b/packages/markitdown/pyproject.toml
@@ -49,12 +49,14 @@ all = [
   "azure-ai-documentintelligence",
   "azure-ai-contentunderstanding>=1.2.0b1",
   "azure-identity",
+  "striprtf",
 ]
 pptx = ["python-pptx"]
 docx = ["mammoth~=1.11.0", "lxml"]
 xlsx = ["pandas", "openpyxl"]
 xls = ["pandas", "xlrd"]
 pdf = ["pdfminer.six>=20251230", "pdfplumber>=0.11.9"]
+rtf = ["striprtf"]
 outlook = ["olefile"]
 audio-transcription = ["pydub", "SpeechRecognition"]
 youtube-transcription = ["youtube-transcript-api"]
diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
index f6aa4df0e..38c3412da 100644
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@@ -40,6 +40,7 @@
     DocumentIntelligenceConverter,
     ContentUnderstandingConverter,
     CsvConverter,
+    RtfConverter,
 )
 
 from ._base_converter import DocumentConverter, DocumentConverterResult
@@ -203,6 +204,7 @@ def enable_builtins(self, **kwargs) -> None:
             self.register_converter(OutlookMsgConverter())
             self.register_converter(EpubConverter())
             self.register_converter(CsvConverter())
+            self.register_converter(RtfConverter())
 
             # Register Document Intelligence converter at the top of the stack if endpoint is provided
             docintel_endpoint = kwargs.get("docintel_endpoint")
diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py
index 77f8b1acd..0b49b4be8 100644
--- a/packages/markitdown/src/markitdown/converters/__init__.py
+++ b/packages/markitdown/src/markitdown/converters/__init__.py
@@ -27,6 +27,7 @@
 )
 from ._epub_converter import EpubConverter
 from ._csv_converter import CsvConverter
+from ._rtf_converter import RtfConverter
 
 __all__ = [
     "PlainTextConverter",
@@ -51,4 +52,5 @@
     "ContentUnderstandingFileType",
     "EpubConverter",
     "CsvConverter",
+    "RtfConverter",
 ]
diff --git a/packages/markitdown/src/markitdown/converters/_rtf_converter.py b/packages/markitdown/src/markitdown/converters/_rtf_converter.py
new file mode 100644
index 000000000..baa06d18a
--- /dev/null
+++ b/packages/markitdown/src/markitdown/converters/_rtf_converter.py
@@ -0,0 +1,86 @@
+import sys
+
+from typing import BinaryIO, Any
+
+from charset_normalizer import from_bytes
+
+from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._stream_info import StreamInfo
+from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
+
+# striprtf is an optional install, but this converter cannot work without it.
+# A failed import is recorded here and only reported when convert() is called.
+_dependency_exc_info = None
+try:
+    from striprtf.striprtf import rtf_to_text
+except ImportError:
+    # Keep the (type, value, traceback) triple so the error can be chained later
+    _dependency_exc_info = sys.exc_info()
+
+
+ACCEPTED_MIME_TYPE_PREFIXES = [  # lower-case prefixes tested via startswith()
+    "application/rtf",
+    "application/x-rtf",
+    "text/rtf",
+    "text/richtext",  # NOTE(review): may also denote non-RTF .rtx files — confirm
+]
+
+ACCEPTED_FILE_EXTENSIONS = [".rtf"]  # compared against the lower-cased extension
+
+
+class RtfConverter(DocumentConverter):
+    """
+    Converts RTF (Rich Text Format) files to Markdown. RTF formatting control
+    words are stripped by striprtf and only the text content is preserved.
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        """Return True for a .rtf extension or a known RTF MIME type prefix."""
+        extension = (stream_info.extension or "").lower()
+        mimetype = (stream_info.mimetype or "").lower()
+        return extension in ACCEPTED_FILE_EXTENSIONS or mimetype.startswith(
+            tuple(ACCEPTED_MIME_TYPE_PREFIXES)
+        )
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
+        """
+        Read the whole stream, strip its RTF markup and return the stripped text.
+        Raises MissingDependencyException when striprtf could not be imported.
+        """
+        if _dependency_exc_info is not None:
+            raise MissingDependencyException(
+                MISSING_DEPENDENCY_MESSAGE.format(
+                    converter=type(self).__name__,
+                    extension=".rtf",
+                    feature="rtf",
+                )
+            ) from _dependency_exc_info[1].with_traceback(  # type: ignore[union-attr]
+                _dependency_exc_info[2]
+            )
+
+        # RTF is ASCII-based but may carry code-page bytes; striprtf needs a str.
+        raw = file_stream.read()
+        if stream_info.charset:
+            rtf_content = raw.decode(stream_info.charset)
+        else:
+            # best() is None when no encoding fits; str(None) would emit the
+            # literal text "None", so fall back to a lossy cp1252 decode instead.
+            match = from_bytes(raw).best()
+            rtf_content = (
+                str(match) if match is not None else raw.decode("cp1252", "replace")
+            )
+
+        return DocumentConverterResult(markdown=rtf_to_text(rtf_content).strip())
diff --git a/packages/markitdown/tests/test_files/test.rtf b/packages/markitdown/tests/test_files/test.rtf
new file mode 100644
index 000000000..48875f18b
--- /dev/null
+++ b/packages/markitdown/tests/test_files/test.rtf
@@ -0,0 +1,8 @@
+{\rtf1\ansi\ansicpg1252\deff0{\fonttbl{\f0 Helvetica;}}
+{\b\fs36 RTF Test Document 8f14e45f}\par
+\par
+This is a plain paragraph c4ca4238 with some {\b bold a87ff679} text.\par
+\par
+{\i Italic line e4da3b7f} appears here.\par
+A second paragraph 1679091c with more content.\par
+}
diff --git a/packages/markitdown/tests/test_rtf_converter.py b/packages/markitdown/tests/test_rtf_converter.py
new file mode 100644
index 000000000..5b41c569b
--- /dev/null
+++ b/packages/markitdown/tests/test_rtf_converter.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python3 -m pytest
+import os
+
+from markitdown import MarkItDown, StreamInfo
+
+TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
+
+RTF_TEST_STRINGS = [  # plain text that must appear in the converted test.rtf
+    "RTF Test Document 8f14e45f",
+    "This is a plain paragraph c4ca4238 with some bold a87ff679 text.",
+    "Italic line e4da3b7f appears here.",
+    "A second paragraph 1679091c with more content.",
+]
+
+
+def test_rtf_converter_local() -> None:
+    """Converting test.rtf by path keeps its text and drops RTF control words."""
+    rtf_path = os.path.join(TEST_FILES_DIR, "test.rtf")
+    markdown = MarkItDown().convert(rtf_path).markdown
+    for expected in RTF_TEST_STRINGS:
+        assert expected in markdown
+    # No raw control word may survive the conversion.
+    for control_word in ("\\rtf1", "\\par"):
+        assert control_word not in markdown
+
+
+def test_rtf_converter_stream() -> None:
+    """A binary stream with an explicit StreamInfo yields the expected text."""
+    converter = MarkItDown()
+    hint = StreamInfo(extension=".rtf", mimetype="application/rtf")
+    rtf_path = os.path.join(TEST_FILES_DIR, "test.rtf")
+    with open(rtf_path, "rb") as stream:
+        markdown = converter.convert(stream, stream_info=hint).markdown
+    for expected in RTF_TEST_STRINGS:
+        assert expected in markdown
+
+
+if __name__ == "__main__":
+    for _run_test in (test_rtf_converter_local, test_rtf_converter_stream):
+        _run_test()
+    print("All RTF converter tests passed.")