diff --git a/pdf_craft/functions.py b/pdf_craft/functions.py index ecc9a07..6c1ff44 100644 --- a/pdf_craft/functions.py +++ b/pdf_craft/functions.py @@ -43,6 +43,7 @@ def transform_markdown( max_ocr_tokens: int | None = None, max_ocr_output_tokens: int | None = None, on_ocr_event: Callable[[OCREvent], None] = lambda _: None, + page_indexes: range | None = None, ) -> OCRTokensMetering: return Transform( @@ -67,6 +68,7 @@ def transform_markdown( max_ocr_tokens=max_ocr_tokens, max_ocr_output_tokens=max_ocr_output_tokens, on_ocr_event=on_ocr_event, + page_indexes=page_indexes, ) @@ -95,6 +97,7 @@ def transform_epub( max_ocr_tokens: int | None = None, max_ocr_output_tokens: int | None = None, on_ocr_event: Callable[[OCREvent], None] = lambda _: None, + page_indexes: range | None = None, ) -> OCRTokensMetering: return Transform( @@ -123,4 +126,5 @@ def transform_epub( max_ocr_tokens=max_ocr_tokens, max_ocr_output_tokens=max_ocr_output_tokens, on_ocr_event=on_ocr_event, + page_indexes=page_indexes, ) diff --git a/pdf_craft/transform.py b/pdf_craft/transform.py index d77cd1e..2561f47 100644 --- a/pdf_craft/transform.py +++ b/pdf_craft/transform.py @@ -55,6 +55,7 @@ def transform_markdown( max_ocr_tokens: int | None = None, max_ocr_output_tokens: int | None = None, on_ocr_event: Callable[[OCREvent], None] = lambda _: None, + page_indexes: range | None = None, ) -> OCRTokensMetering: # pyright: ignore[reportReturnType] if markdown_assets_path is None: @@ -81,6 +82,7 @@ def transform_markdown( max_tokens=max_ocr_tokens, max_output_tokens=max_ocr_output_tokens, on_ocr_event=on_ocr_event, + page_indexes=page_indexes, ) render_markdown_file( chapters_path=chapters_path, @@ -124,6 +126,7 @@ def transform_epub( max_ocr_tokens: int | None = None, max_ocr_output_tokens: int | None = None, on_ocr_event: Callable[[OCREvent], None] = lambda _: None, + page_indexes: range | None = None, ) -> OCRTokensMetering: # pyright: ignore[reportReturnType] try: with EnsureFolder( @@ -146,6 +149,7 @@ def transform_epub( max_tokens=max_ocr_tokens, max_output_tokens=max_ocr_output_tokens, on_ocr_event=on_ocr_event, + page_indexes=page_indexes, ) book_meta = book_meta or self._extract_book_meta(pdf_path) @@ -190,6 +194,7 @@ def _extract_from_pdf( max_tokens: int | None, max_output_tokens: int | None, on_ocr_event: Callable[[OCREvent], None], + page_indexes: range | None = None, ): asserts_path = analysing_path / "assets" @@ -208,22 +213,26 @@ def _extract_from_pdf( input_tokens=0, output_tokens=0, ) - for event in self._ocr.recognize( - pdf_path=pdf_path, - asset_path=asserts_path, - ocr_path=pages_path, - ocr_size=ocr_size, - dpi=dpi, - max_page_image_file_size=max_page_image_file_size, - includes_footnotes=includes_footnotes, - ignore_pdf_errors=ignore_pdf_errors, - ignore_ocr_errors=ignore_ocr_errors, - plot_path=plot_path, - cover_path=cover_path, - aborted=aborted, - max_tokens=max_tokens, - max_output_tokens=max_output_tokens, - ): + recognize_kwargs = { + "pdf_path": pdf_path, + "asset_path": asserts_path, + "ocr_path": pages_path, + "ocr_size": ocr_size, + "dpi": dpi, + "max_page_image_file_size": max_page_image_file_size, + "includes_footnotes": includes_footnotes, + "ignore_pdf_errors": ignore_pdf_errors, + "ignore_ocr_errors": ignore_ocr_errors, + "plot_path": plot_path, + "cover_path": cover_path, + "aborted": aborted, + "max_tokens": max_tokens, + "max_output_tokens": max_output_tokens, + } + if page_indexes is not None: + recognize_kwargs["page_indexes"] = page_indexes + + for event in self._ocr.recognize(**recognize_kwargs): on_ocr_event(event) metering.input_tokens += event.input_tokens metering.output_tokens += event.output_tokens