From 53ffee829c2eee15c03d8b3e6e2b087e30a54cbd Mon Sep 17 00:00:00 2001 From: AngeloDanducci Date: Mon, 22 Jun 2026 09:46:17 -0400 Subject: [PATCH 1/2] fix: type assertion and richdocument docstrings Signed-off-by: AngeloDanducci --- mellea/stdlib/components/docs/richdocument.py | 60 ++++++++++++++++--- .../components/docs/test_richdocument.py | 17 ++++++ 2 files changed, 68 insertions(+), 9 deletions(-) diff --git a/mellea/stdlib/components/docs/richdocument.py b/mellea/stdlib/components/docs/richdocument.py index 62d9c18a3..d640e8d2c 100644 --- a/mellea/stdlib/components/docs/richdocument.py +++ b/mellea/stdlib/components/docs/richdocument.py @@ -31,18 +31,43 @@ class RichDocument(Component[str]): - """A `RichDocument` is a block of content backed by a `DoclingDocument`. + """A document wrapper that exposes content to a language model as Markdown. - Provides helper functions for working with the document and extracting parts - such as tables. Use `from_document_file` to convert PDFs or other formats, - and `save`/`load` for persistence. + There are two canonical ways to create a ``RichDocument``: + + * **From a file** — use :meth:`from_document_file` to convert a PDF, + Markdown, DOCX, or other `Docling-supported format`_ into a + ``RichDocument``. Set ``do_ocr=False`` for text-based PDFs to skip + downloading OCR model weights. + * **From a saved JSON** — use :meth:`load` to restore a document previously + saved with :meth:`save`. + + Passing a ``DoclingDocument`` directly to the constructor is intended for + advanced use (e.g. when you already hold a ``DoclingDocument`` produced by + your own Docling pipeline). If you pass any other type, a ``TypeError`` is + raised immediately. + + .. _Docling-supported format: https://ds4sd.github.io/docling/ Args: doc (DoclingDocument): The underlying Docling document to wrap. + + Raises: + TypeError: If *doc* is not a ``DoclingDocument`` instance. 
""" def __init__(self, doc: DoclingDocument): - """Initialize RichDocument by wrapping the provided DoclingDocument.""" + """Initialize RichDocument by wrapping the provided DoclingDocument. + + Raises: + TypeError: If *doc* is not a ``DoclingDocument`` instance. + """ + if not isinstance(doc, DoclingDocument): + raise TypeError( + f"RichDocument expects a DoclingDocument, got {type(doc).__name__!r}. " + "To create a RichDocument from a file, use RichDocument.from_document_file(). " + "To restore a saved document, use RichDocument.load()." + ) self._doc = doc def parts(self) -> list[Component | CBlock]: @@ -125,11 +150,28 @@ def from_document_file( ) -> RichDocument: """Convert a document file to a `RichDocument` using Docling. + Supported formats include PDF, Markdown, DOCX, PPTX, HTML, and any + other format that the installed version of Docling supports. + + **Limitations to be aware of:** + + * PDF pipeline options (image scaling, picture extraction, OCR) are + always configured internally. For non-PDF formats Docling ignores + these options, so they have no effect on Markdown or DOCX conversion. + * ``do_ocr=True`` (the default) causes Docling to download OCR model + weights on the *first* call, which can be several hundred MB. Pass + ``do_ocr=False`` for text-based PDFs or any format that does not + require OCR to avoid this download. + * Remote URLs (e.g. ``"https://arxiv.org/pdf/…"``) are accepted by + Docling but require network access and may be slow or fail if the + remote is unavailable. + Args: - source (str | Path | DocumentStream): Path or stream for the - source document (e.g. a PDF or Markdown file). - do_ocr (bool): Whether to run OCR on the document. Disable for - text-based PDFs to avoid downloading OCR model weights. + source (str | Path | DocumentStream): Path, URL, or stream for the + source document. + do_ocr (bool): Whether to run OCR on the document. Defaults to + ``True``. 
Set to ``False`` for text-based PDFs or when you want + to avoid downloading OCR model weights. Returns: RichDocument: A new `RichDocument` wrapping the converted document. diff --git a/test/stdlib/components/docs/test_richdocument.py b/test/stdlib/components/docs/test_richdocument.py index 06e861a4d..02d11a90c 100644 --- a/test/stdlib/components/docs/test_richdocument.py +++ b/test/stdlib/components/docs/test_richdocument.py @@ -108,6 +108,23 @@ def test_empty_table(): assert table is None, "table should be empty when supplied string is empty" +@pytest.mark.parametrize( + "bad_value", + ["path/to/file.pdf", "/absolute/path.pdf", 42, None], + ids=["str-path", "absolute-str-path", "int", "none"], +) +def test_richdocument_init_rejects_non_docling_document(bad_value): + with pytest.raises(TypeError, match="DoclingDocument"): + RichDocument(bad_value) + + +def test_richdocument_init_rejects_path_object(): + from pathlib import Path + + with pytest.raises(TypeError, match="from_document_file"): + RichDocument(Path("some/file.pdf")) + + @pytest.mark.skip # Test requires too much memory for smaller machines. 
@pytest.mark.e2e @pytest.mark.huggingface From 7eacabd859f4cb7fa27285f2cd26da5213f446f3 Mon Sep 17 00:00:00 2001 From: AngeloDanducci Date: Mon, 22 Jun 2026 12:13:30 -0400 Subject: [PATCH 2/2] reclassify richdocument typeerror tests as unit Signed-off-by: AngeloDanducci --- .../components/docs/test_richdocument.py | 17 ------------- .../docs/test_richdocument_typing.py | 24 +++++++++++++++++++ 2 files changed, 24 insertions(+), 17 deletions(-) create mode 100644 test/stdlib/components/docs/test_richdocument_typing.py diff --git a/test/stdlib/components/docs/test_richdocument.py b/test/stdlib/components/docs/test_richdocument.py index 02d11a90c..06e861a4d 100644 --- a/test/stdlib/components/docs/test_richdocument.py +++ b/test/stdlib/components/docs/test_richdocument.py @@ -108,23 +108,6 @@ def test_empty_table(): assert table is None, "table should be empty when supplied string is empty" -@pytest.mark.parametrize( - "bad_value", - ["path/to/file.pdf", "/absolute/path.pdf", 42, None], - ids=["str-path", "absolute-str-path", "int", "none"], -) -def test_richdocument_init_rejects_non_docling_document(bad_value): - with pytest.raises(TypeError, match="DoclingDocument"): - RichDocument(bad_value) - - -def test_richdocument_init_rejects_path_object(): - from pathlib import Path - - with pytest.raises(TypeError, match="from_document_file"): - RichDocument(Path("some/file.pdf")) - - @pytest.mark.skip # Test requires too much memory for smaller machines. 
@pytest.mark.e2e @pytest.mark.huggingface diff --git a/test/stdlib/components/docs/test_richdocument_typing.py b/test/stdlib/components/docs/test_richdocument_typing.py new file mode 100644 index 000000000..1e8e6d3e2 --- /dev/null +++ b/test/stdlib/components/docs/test_richdocument_typing.py @@ -0,0 +1,24 @@ +import pytest + +pytest.importorskip( + "docling_core", reason="docling_core not installed — install mellea[docling]" +) + +from mellea.stdlib.components.docs.richdocument import RichDocument + + +@pytest.mark.parametrize( + "bad_value", + ["path/to/file.pdf", "/absolute/path.pdf", 42, None], + ids=["str-path", "absolute-str-path", "int", "none"], +) +def test_richdocument_init_rejects_non_docling_document(bad_value): + with pytest.raises(TypeError, match="DoclingDocument"): + RichDocument(bad_value) + + +def test_richdocument_init_rejects_path_object(): + from pathlib import Path + + with pytest.raises(TypeError, match="from_document_file"): + RichDocument(Path("some/file.pdf"))