From 53ffee829c2eee15c03d8b3e6e2b087e30a54cbd Mon Sep 17 00:00:00 2001
From: AngeloDanducci <angelo.danducci.ii@ibm.com>
Date: Mon, 22 Jun 2026 09:46:17 -0400
Subject: [PATCH 1/2] fix: type assertion and richdocument docstrings

Signed-off-by: AngeloDanducci <angelo.danducci.ii@ibm.com>
---
 mellea/stdlib/components/docs/richdocument.py | 60 ++++++++++++++++---
 .../components/docs/test_richdocument.py      | 17 ++++++
 2 files changed, 68 insertions(+), 9 deletions(-)

diff --git a/mellea/stdlib/components/docs/richdocument.py b/mellea/stdlib/components/docs/richdocument.py
index 62d9c18a3..d640e8d2c 100644
--- a/mellea/stdlib/components/docs/richdocument.py
+++ b/mellea/stdlib/components/docs/richdocument.py
@@ -31,18 +31,43 @@
 
 
 class RichDocument(Component[str]):
-    """A `RichDocument` is a block of content backed by a `DoclingDocument`.
+    """A document wrapper that exposes content to a language model as Markdown.
 
-    Provides helper functions for working with the document and extracting parts
-    such as tables. Use `from_document_file` to convert PDFs or other formats,
-    and `save`/`load` for persistence.
+    The two canonical ways to create a ``RichDocument``:
+
+    * **From a file** — use :meth:`from_document_file` to convert a PDF,
+      Markdown, DOCX, or other `Docling-supported format`_ into a
+      ``RichDocument``.  Set ``do_ocr=False`` for text-based PDFs to skip
+      downloading OCR model weights.
+    * **From a saved JSON** — use :meth:`load` to restore a document previously
+      saved with :meth:`save`.
+
+    Passing a ``DoclingDocument`` directly to the constructor is intended for
+    advanced use (e.g. when you already hold a ``DoclingDocument`` produced by
+    your own Docling pipeline).  Passing any other type raises a ``TypeError``
+    immediately.
+
+    .. _Docling-supported format: https://docling-project.github.io/docling/
 
     Args:
         doc (DoclingDocument): The underlying Docling document to wrap.
+
+    Raises:
+        TypeError: If *doc* is not a ``DoclingDocument`` instance.
     """
 
     def __init__(self, doc: DoclingDocument):
-        """Initialize RichDocument by wrapping the provided DoclingDocument."""
+        """Initialize RichDocument by wrapping the provided DoclingDocument.
+
+        Raises:
+            TypeError: If *doc* is not a ``DoclingDocument`` instance.
+        """
+        if not isinstance(doc, DoclingDocument):
+            raise TypeError(
+                f"RichDocument expects a DoclingDocument, got {type(doc).__name__!r}. "
+                "To create a RichDocument from a file, use RichDocument.from_document_file(). "
+                "To restore a saved document, use RichDocument.load()."
+            )
         self._doc = doc
 
     def parts(self) -> list[Component | CBlock]:
@@ -125,11 +150,28 @@ def from_document_file(
     ) -> RichDocument:
         """Convert a document file to a `RichDocument` using Docling.
 
+        Supported formats include PDF, Markdown, DOCX, PPTX, HTML, and any
+        other format that the installed version of Docling supports.
+
+        **Limitations to be aware of:**
+
+        * PDF pipeline options (image scaling, picture extraction, OCR) are
+          always configured internally. For non-PDF formats Docling ignores
+          these options, so they have no effect on Markdown or DOCX conversion.
+        * ``do_ocr=True`` (the default) causes Docling to download OCR model
+          weights (possibly several hundred MB) on the *first* call. Pass
+          ``do_ocr=False`` for text-based PDFs or any format that does not
+          require OCR to avoid this download.
+        * Remote URLs (e.g. ``"https://arxiv.org/pdf/…"``) are accepted by
+          Docling but require network access and may be slow or fail if the
+          remote host is unavailable.
+
         Args:
-            source (str | Path | DocumentStream): Path or stream for the
-                source document (e.g. a PDF or Markdown file).
-            do_ocr (bool): Whether to run OCR on the document. Disable for
-                text-based PDFs to avoid downloading OCR model weights.
+            source (str | Path | DocumentStream): Path, URL, or stream for the
+                source document.
+            do_ocr (bool): Whether to run OCR on the document. Defaults to
+                ``True``. Set to ``False`` for text-based PDFs or when you want
+                to avoid downloading OCR model weights.
 
         Returns:
             RichDocument: A new `RichDocument` wrapping the converted document.
diff --git a/test/stdlib/components/docs/test_richdocument.py b/test/stdlib/components/docs/test_richdocument.py
index 06e861a4d..02d11a90c 100644
--- a/test/stdlib/components/docs/test_richdocument.py
+++ b/test/stdlib/components/docs/test_richdocument.py
@@ -108,6 +108,23 @@ def test_empty_table():
     assert table is None, "table should be empty when supplied string is empty"
 
 
+@pytest.mark.parametrize(
+    "bad_value",
+    ["path/to/file.pdf", "/absolute/path.pdf", 42, None],
+    ids=["str-path", "absolute-str-path", "int", "none"],
+)
+def test_richdocument_init_rejects_non_docling_document(bad_value):
+    with pytest.raises(TypeError, match="DoclingDocument"):
+        RichDocument(bad_value)
+
+
+def test_richdocument_init_rejects_path_object():
+    from pathlib import Path
+
+    with pytest.raises(TypeError, match="from_document_file"):
+        RichDocument(Path("some/file.pdf"))
+
+
 @pytest.mark.skip  # Test requires too much memory for smaller machines.
 @pytest.mark.e2e
 @pytest.mark.huggingface

From 7eacabd859f4cb7fa27285f2cd26da5213f446f3 Mon Sep 17 00:00:00 2001
From: AngeloDanducci <angelo.danducci.ii@ibm.com>
Date: Mon, 22 Jun 2026 12:13:30 -0400
Subject: [PATCH 2/2] reclassify richdocument typeerror tests as unit

Signed-off-by: AngeloDanducci <angelo.danducci.ii@ibm.com>
---
 .../components/docs/test_richdocument.py      | 17 -------------
 .../docs/test_richdocument_typing.py          | 24 +++++++++++++++++++
 2 files changed, 24 insertions(+), 17 deletions(-)
 create mode 100644 test/stdlib/components/docs/test_richdocument_typing.py

diff --git a/test/stdlib/components/docs/test_richdocument.py b/test/stdlib/components/docs/test_richdocument.py
index 02d11a90c..06e861a4d 100644
--- a/test/stdlib/components/docs/test_richdocument.py
+++ b/test/stdlib/components/docs/test_richdocument.py
@@ -108,23 +108,6 @@ def test_empty_table():
     assert table is None, "table should be empty when supplied string is empty"
 
 
-@pytest.mark.parametrize(
-    "bad_value",
-    ["path/to/file.pdf", "/absolute/path.pdf", 42, None],
-    ids=["str-path", "absolute-str-path", "int", "none"],
-)
-def test_richdocument_init_rejects_non_docling_document(bad_value):
-    with pytest.raises(TypeError, match="DoclingDocument"):
-        RichDocument(bad_value)
-
-
-def test_richdocument_init_rejects_path_object():
-    from pathlib import Path
-
-    with pytest.raises(TypeError, match="from_document_file"):
-        RichDocument(Path("some/file.pdf"))
-
-
 @pytest.mark.skip  # Test requires too much memory for smaller machines.
 @pytest.mark.e2e
 @pytest.mark.huggingface
diff --git a/test/stdlib/components/docs/test_richdocument_typing.py b/test/stdlib/components/docs/test_richdocument_typing.py
new file mode 100644
index 000000000..1e8e6d3e2
--- /dev/null
+++ b/test/stdlib/components/docs/test_richdocument_typing.py
@@ -0,0 +1,24 @@
+import pytest
+
+pytest.importorskip(
+    "docling_core", reason="docling_core not installed — install mellea[docling]"
+)
+
+from mellea.stdlib.components.docs.richdocument import RichDocument
+
+
+@pytest.mark.parametrize(
+    "bad_value",
+    ["path/to/file.pdf", "/absolute/path.pdf", 42, None],
+    ids=["str-path", "absolute-str-path", "int", "none"],
+)
+def test_richdocument_init_rejects_non_docling_document(bad_value):
+    with pytest.raises(TypeError, match="DoclingDocument"):
+        RichDocument(bad_value)
+
+
+def test_richdocument_init_rejects_path_object():
+    from pathlib import Path
+
+    with pytest.raises(TypeError, match="from_document_file"):
+        RichDocument(Path("some/file.pdf"))