generative-computing · AngeloDanducci · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026
@@ -31,18 +31,43 @@
 
 
 class RichDocument(Component[str]):
-    """A `RichDocument` is a block of content backed by a `DoclingDocument`.
+    """A document wrapper that exposes content to a language model as Markdown.
 
-    Provides helper functions for working with the document and extracting parts
-    such as tables. Use `from_document_file` to convert PDFs or other formats,
-    and `save`/`load` for persistence.
+    The two canonical ways to create a ``RichDocument``:
+
+    * **From a file** — use :meth:`from_document_file` to convert a PDF,
+      Markdown, DOCX, or other `Docling-supported format`_ into a
+      ``RichDocument``.  Set ``do_ocr=False`` for text-based PDFs to skip
+      downloading OCR model weights.
+    * **From a saved JSON** — use :meth:`load` to restore a document previously
+      saved with :meth:`save`.
+
+    Passing a ``DoclingDocument`` directly to the constructor is intended for
+    advanced use (e.g. when you already hold a ``DoclingDocument`` produced by
+    your own Docling pipeline).  Passing any other type raises a ``TypeError``
+    immediately.
+
+    .. _Docling-supported format: https://docling-project.github.io/docling/
 
     Args:
         doc (DoclingDocument): The underlying Docling document to wrap.
+
+    Raises:
+        TypeError: If *doc* is not a ``DoclingDocument`` instance.
     """
 
     def __init__(self, doc: DoclingDocument):
-        """Initialize RichDocument by wrapping the provided DoclingDocument."""
+        """Initialize RichDocument by wrapping the provided DoclingDocument.
+
+        Raises:
+            TypeError: If *doc* is not a ``DoclingDocument`` instance.
+        """
+        if not isinstance(doc, DoclingDocument):  # reject paths/other types early
+            raise TypeError(
+                f"RichDocument expects a DoclingDocument, got {type(doc).__name__!r}. "
+                "To create a RichDocument from a file, use RichDocument.from_document_file(). "
+                "To restore a saved document, use RichDocument.load()."
+            )
         self._doc = doc
 
     def parts(self) -> list[Component | CBlock]:
@@ -125,11 +150,28 @@ def from_document_file(
     ) -> RichDocument:
         """Convert a document file to a `RichDocument` using Docling.
 
+        Supported formats include PDF, Markdown, DOCX, PPTX, HTML, and any
+        other format that the installed version of Docling supports.
+
+        **Limitations to be aware of:**
+
+        * PDF pipeline options (image scaling, picture extraction, OCR) are
+          always configured internally. For non-PDF formats Docling ignores
+          these options, so they have no effect on Markdown or DOCX conversion.
+        * ``do_ocr=True`` (the default) causes Docling to download OCR model
+          weights (potentially several hundred MB) on the *first* call. Pass
+          ``do_ocr=False`` for text-based PDFs or any format that does not
+          require OCR to avoid this download.
+        * Remote URLs (e.g. ``"https://arxiv.org/pdf/…"``) are accepted by
+          Docling but require network access and may be slow or fail if the
+          remote is unavailable.
+
         Args:
-            source (str | Path | DocumentStream): Path or stream for the
-                source document (e.g. a PDF or Markdown file).
-            do_ocr (bool): Whether to run OCR on the document. Disable for
-                text-based PDFs to avoid downloading OCR model weights.
+            source (str | Path | DocumentStream): Path, URL, or stream for the
+                source document.
+            do_ocr (bool): Whether to run OCR on the document. Defaults to
+                ``True``. Set to ``False`` for text-based PDFs or when you want
+                to avoid downloading OCR model weights.
 
         Returns:
             RichDocument: A new `RichDocument` wrapping the converted document.

@@ -0,0 +1,24 @@
+import pytest
+
+pytest.importorskip(
+    "docling_core", reason="docling_core not installed — install mellea[docling]"
+)
+
+from mellea.stdlib.components.docs.richdocument import RichDocument
+
+
+@pytest.mark.parametrize(
+    "bad_value",
+    ["path/to/file.pdf", "/absolute/path.pdf", 42, None],
+    ids=["str-path", "absolute-str-path", "int", "none"],
+)
+def test_richdocument_init_rejects_non_docling_document(bad_value):
+    with pytest.raises(TypeError, match="DoclingDocument"):  # names expected type
+        RichDocument(bad_value)
+
+
+def test_richdocument_init_rejects_path_object():
+    from pathlib import Path
+
+    with pytest.raises(TypeError, match="from_document_file"):  # hints at the fix
+        RichDocument(Path("some/file.pdf"))