diff --git a/mellea/stdlib/components/docs/richdocument.py b/mellea/stdlib/components/docs/richdocument.py index 62d9c18a3..d640e8d2c 100644 --- a/mellea/stdlib/components/docs/richdocument.py +++ b/mellea/stdlib/components/docs/richdocument.py @@ -31,18 +31,43 @@ class RichDocument(Component[str]): - """A `RichDocument` is a block of content backed by a `DoclingDocument`. + """A document wrapper that exposes content to a language model as Markdown. - Provides helper functions for working with the document and extracting parts - such as tables. Use `from_document_file` to convert PDFs or other formats, - and `save`/`load` for persistence. + The two canonical ways to create a ``RichDocument``: + + * **From a file** — use :meth:`from_document_file` to convert a PDF, + Markdown, DOCX, or other `Docling-supported format`_ into a + ``RichDocument``. Set ``do_ocr=False`` for text-based PDFs to skip + downloading OCR model weights. + * **From a saved JSON** — use :meth:`load` to restore a document previously + saved with :meth:`save`. + + Passing a ``DoclingDocument`` directly to the constructor is intended for + advanced use (e.g. when you already hold a ``DoclingDocument`` produced by + your own Docling pipeline). If you pass any other type a ``TypeError`` is + raised immediately. + + .. _Docling-supported format: https://ds4sd.github.io/docling/ Args: doc (DoclingDocument): The underlying Docling document to wrap. + + Raises: + TypeError: If *doc* is not a ``DoclingDocument`` instance. """ def __init__(self, doc: DoclingDocument): - """Initialize RichDocument by wrapping the provided DoclingDocument.""" + """Initialize RichDocument by wrapping the provided DoclingDocument. + + Raises: + TypeError: If *doc* is not a ``DoclingDocument`` instance. + """ + if not isinstance(doc, DoclingDocument): + raise TypeError( + f"RichDocument expects a DoclingDocument, got {type(doc).__name__!r}. " + "To create a RichDocument from a file, use RichDocument.from_document_file(). 
" + "To restore a saved document, use RichDocument.load()." + ) self._doc = doc def parts(self) -> list[Component | CBlock]: @@ -125,11 +150,28 @@ def from_document_file( ) -> RichDocument: """Convert a document file to a `RichDocument` using Docling. + Supported formats include PDF, Markdown, DOCX, PPTX, HTML, and any + other format that the installed version of Docling supports. + + **Limitations to be aware of:** + + * PDF pipeline options (image scaling, picture extraction, OCR) are + always configured internally. For non-PDF formats Docling ignores + these options, so they have no effect on Markdown or DOCX conversion. + * ``do_ocr=True`` (the default) causes Docling to download OCR model + weights on the *first* call, which can be several hundred MB. Pass + ``do_ocr=False`` for text-based PDFs or any format that does not + require OCR to avoid this download. + * Remote URLs (e.g. ``"https://arxiv.org/pdf/…"``) are accepted by + Docling but require network access and may be slow or fail if the + remote is unavailable. + Args: - source (str | Path | DocumentStream): Path or stream for the - source document (e.g. a PDF or Markdown file). - do_ocr (bool): Whether to run OCR on the document. Disable for - text-based PDFs to avoid downloading OCR model weights. + source (str | Path | DocumentStream): Path, URL, or stream for the + source document. + do_ocr (bool): Whether to run OCR on the document. Defaults to + ``True``. Set to ``False`` for text-based PDFs or when you want + to avoid downloading OCR model weights. Returns: RichDocument: A new `RichDocument` wrapping the converted document. 
"""Constructor type-guard tests for ``RichDocument``.

Patch target: test/stdlib/components/docs/test_richdocument_typing.py
(new file). Each test hands ``RichDocument.__init__`` a value that is not a
``DoclingDocument`` and checks that a ``TypeError`` with a helpful message
is raised.
"""

from pathlib import Path

import pytest

# Skip the entire module when the optional dependency cannot be imported.
# Kept ahead of the mellea import below, which presumably needs Docling.
pytest.importorskip(
    "docling_core", reason="docling_core not installed — install mellea[docling]"
)

from mellea.stdlib.components.docs.richdocument import RichDocument


@pytest.mark.parametrize(
    "bad_value",
    [
        pytest.param("path/to/file.pdf", id="str-path"),
        pytest.param("/absolute/path.pdf", id="absolute-str-path"),
        pytest.param(42, id="int"),
        pytest.param(None, id="none"),
    ],
)
def test_richdocument_init_rejects_non_docling_document(bad_value):
    """Strings, ints and ``None`` are rejected with a message naming the expected type."""
    expected_error = pytest.raises(TypeError, match="DoclingDocument")
    with expected_error:
        RichDocument(bad_value)


def test_richdocument_init_rejects_path_object():
    """A ``Path`` is rejected and the message points at ``from_document_file``."""
    candidate = Path("some/file.pdf")
    expected_error = pytest.raises(TypeError, match="from_document_file")
    with expected_error:
        RichDocument(candidate)