From 09c7a86b0632544556ee2003a58331585e449677 Mon Sep 17 00:00:00 2001 From: Eliska Suchardova Date: Tue, 27 May 2025 10:50:47 +0200 Subject: [PATCH 1/7] Added mathpix option for equations --- marker/config/parser.py | 9 ++ marker/converters/__init__.py | 7 +- marker/converters/ocr.py | 13 ++- marker/converters/pdf.py | 4 +- marker/processors/equation.py | 153 +++++++++++++++++++++++++++------- marker/providers/mathpix.py | 58 +++++++++++++ marker/settings.py | 11 ++- 7 files changed, 216 insertions(+), 39 deletions(-) create mode 100644 marker/providers/mathpix.py diff --git a/marker/config/parser.py b/marker/config/parser.py index 8e6bd8de0..e3543beef 100644 --- a/marker/config/parser.py +++ b/marker/config/parser.py @@ -60,6 +60,12 @@ def common_options(fn): default=False, help="Disable image extraction.", )(fn) + fn = click.option( + "--mathpix", + is_flag=True, + default=False, + help="Use Mathpix for equation processing.", + )(fn) # these are options that need a list transformation, i.e splitting/parsing a string fn = click.option( @@ -106,6 +112,9 @@ def generate_config_dict(self) -> Dict[str, any]: config["pdftext_workers"] = 1 case "disable_image_extraction": config["extract_images"] = False + case "mathpix": + if v: + settings.EQUATION_PROCESSOR = "mathpix" case _: if k in crawler.attr_set: config[k] = v diff --git a/marker/converters/__init__.py b/marker/converters/__init__.py index 8357a4991..fea1bd20b 100644 --- a/marker/converters/__init__.py +++ b/marker/converters/__init__.py @@ -43,7 +43,12 @@ def resolve_dependencies(self, cls): def initialize_processors(self, processor_cls_lst: List[Type[BaseProcessor]]) -> List[BaseProcessor]: processors = [] for processor_cls in processor_cls_lst: - processors.append(self.resolve_dependencies(processor_cls)) + if callable(processor_cls) and processor_cls.__name__ == 'get_equation_processor': + # Special case for equation processor + processor = processor_cls(config=self.config) + else: + processor = self.resolve_dependencies(processor_cls) + processors.append(processor) simple_llm_processors = [p for p in processors if issubclass(type(p), BaseLLMSimpleBlockProcessor)] other_processors = [p for p in processors if not issubclass(type(p), BaseLLMSimpleBlockProcessor)] diff --git a/marker/converters/ocr.py b/marker/converters/ocr.py index f7e562553..2dab5cc5a 100644 --- a/marker/converters/ocr.py +++ b/marker/converters/ocr.py @@ -5,13 +5,15 @@ from marker.builders.ocr import OcrBuilder from marker.converters.pdf import PdfConverter from marker.processors import BaseProcessor -from marker.processors.equation import EquationProcessor +from marker.processors.equation import get_equation_processor from marker.providers.registry import provider_from_filepath from marker.renderers.ocr_json import OCRJSONRenderer +from marker.providers.mathpix import MathpixProvider +from marker.settings import settings class OCRConverter(PdfConverter): - default_processors: Tuple[BaseProcessor, ...] = (EquationProcessor,) + default_processors: Tuple[BaseProcessor, ...] = (get_equation_processor(),) def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -21,6 +23,12 @@ def __init__(self, *args, **kwargs): self.config["format_lines"] = True self.renderer = OCRJSONRenderer + + # Initialize Mathpix provider + self.mathpix_provider = MathpixProvider( + app_id=settings.MATHPIX_APP_ID, + app_key=settings.MATHPIX_APP_KEY + ) def build_document(self, filepath: str): provider_cls = provider_from_filepath(filepath) @@ -32,6 +40,7 @@ def build_document(self, filepath: str): provider = provider_cls(filepath, self.config) document = document_builder(provider, layout_builder, line_builder, ocr_builder) + # Initialize processors for processor in self.processor_list: processor(document) diff --git a/marker/converters/pdf.py b/marker/converters/pdf.py index 21a604431..20b0f20c5 100644 --- a/marker/converters/pdf.py +++ b/marker/converters/pdf.py @@ -24,7 +24,7 @@ from marker.processors.code import CodeProcessor from marker.processors.debug import DebugProcessor from marker.processors.document_toc import DocumentTOCProcessor -from marker.processors.equation import EquationProcessor +from marker.processors.equation import get_equation_processor from marker.processors.footnote import FootnoteProcessor from marker.processors.ignoretext import IgnoreTextProcessor from marker.processors.line_numbers import LineNumbersProcessor @@ -73,7 +73,7 @@ class PdfConverter(BaseConverter): BlockquoteProcessor, CodeProcessor, DocumentTOCProcessor, - EquationProcessor, + get_equation_processor, FootnoteProcessor, IgnoreTextProcessor, LineNumbersProcessor, diff --git a/marker/processors/equation.py b/marker/processors/equation.py index 36124645c..6cc83c98d 100644 --- a/marker/processors/equation.py +++ b/marker/processors/equation.py @@ -4,25 +4,58 @@ from bs4 import BeautifulSoup from ftfy import fix_text, TextFixerConfig -from surya.recognition import RecognitionPredictor, OCRResult - from marker.processors import BaseProcessor from marker.schema import BlockTypes from marker.schema.document import Document from marker.settings import settings +from marker.providers.mathpix import MathpixProvider +from surya.recognition import RecognitionPredictor, OCRResult MATH_TAG_PATTERN = re.compile(r"]*>(.*?)") - -class EquationProcessor(BaseProcessor): +class BaseEquationProcessor(BaseProcessor): """ - A processor for recognizing equations in the document. + Base class for equation processors. """ - block_types: Annotated[ Tuple[BlockTypes], "The block types to process.", ] = (BlockTypes.Equation,) + + def fix_latex(self, math_html: str): + math_html = math_html.strip() + soup = BeautifulSoup(math_html, "html.parser") + opening_math_tag = soup.find("math") + + # No math block found + if not opening_math_tag: + return "" + + # Force block format + opening_math_tag.attrs["display"] = "block" + fixed_math_html = str(soup) + + # Sometimes model outputs newlines at the beginning/end of tags + fixed_math_html = re.sub( + r"^\\n(?![a-zA-Z])", + '', + fixed_math_html, + ) + fixed_math_html = re.sub(r"\\n$", "", fixed_math_html) + fixed_math_html = fix_text( + fixed_math_html, config=TextFixerConfig(unescape_html=True) + ) + return fixed_math_html + + def process_equations(self, document: Document): + """Process equations in the document. To be implemented by subclasses.""" + raise NotImplementedError + + +class OriginalEquationProcessor(BaseEquationProcessor): + """ + Original equation processor using the recognition model. + """ model_max_length: Annotated[ int, "The maximum number of tokens to allow for the Recognition model.", @@ -39,7 +72,6 @@ class EquationProcessor(BaseProcessor): def __init__(self, recognition_model: RecognitionPredictor, config=None): super().__init__(config) - self.recognition_model = recognition_model def get_batch_size(self): @@ -93,31 +125,6 @@ def __call__(self, document: Document): block = document.get_block(block_id) block.html = self.fix_latex(block_prediction) - def fix_latex(self, math_html: str): - math_html = math_html.strip() - soup = BeautifulSoup(math_html, "html.parser") - opening_math_tag = soup.find("math") - - # No math block found - if not opening_math_tag: - return "" - - # Force block format - opening_math_tag.attrs["display"] = "block" - fixed_math_html = str(soup) - - # Sometimes model outputs newlines at the beginning/end of tags - fixed_math_html = re.sub( - r"^\\n(?![a-zA-Z])", - '', - fixed_math_html, - ) - fixed_math_html = re.sub(r"\\n$", "", fixed_math_html) - fixed_math_html = fix_text( - fixed_math_html, config=TextFixerConfig(unescape_html=True) - ) - return fixed_math_html - def get_latex_batched( self, page_images: List[Image.Image], @@ -138,3 +145,85 @@ def get_latex_batched( ] return equation_predictions + + +class MathpixEquationProcessor(BaseEquationProcessor): + """ + Equation processor using Mathpix API. + """ + def __init__(self, mathpix_provider: MathpixProvider, config=None): + super().__init__(config) + self.mathpix_provider = mathpix_provider + + def __call__(self, document: Document): + images = [] + equation_boxes = [] + equation_block_ids = [] + total_equation_blocks = 0 + + for page in document.pages: + page_image = page.get_image(highres=True) + page_size = page.polygon.width, page.polygon.height + image_size = page_image.size + + page_equation_boxes = [] + page_equation_block_ids = [] + equation_blocks = page.contained_blocks(document, self.block_types) + for block in equation_blocks: + page_equation_boxes.append( + block.polygon.rescale(page_size, image_size).bbox + ) + page_equation_block_ids.append(block.id) + total_equation_blocks += 1 + + images.append(page_image) + equation_boxes.append(page_equation_boxes) + equation_block_ids.append(page_equation_block_ids) + + if total_equation_blocks == 0: + return + + # Process each equation with Mathpix + for page_idx, (page_image, page_boxes, page_block_ids) in enumerate( + zip(images, equation_boxes, equation_block_ids) + ): + for box_idx, (box, block_id) in enumerate(zip(page_boxes, page_block_ids)): + # Crop the equation from the page + x1, y1, x2, y2 = [int(coord) for coord in box] + equation_image = page_image.crop((x1, y1, x2, y2)) + + # Process with Mathpix + try: + result = self.mathpix_provider.process_equation(equation_image) + + # Extract LaTeX from the result + latex = result.get('latex_styled', '') + if latex: + # Wrap in math tags + block = document.get_block(block_id) + block.html = self.fix_latex(f'{latex}') + except Exception as e: + print(f"Error processing equation {block_id}: {str(e)}") + continue + + +def get_equation_processor(config=None): + """ + Factory function to get the appropriate equation processor based on settings. + """ + if settings.EQUATION_PROCESSOR == "mathpix": + if not settings.MATHPIX_APP_ID or not settings.MATHPIX_APP_KEY: + raise ValueError("Mathpix API credentials not configured") + return MathpixEquationProcessor( + mathpix_provider=MathpixProvider( + app_id=settings.MATHPIX_APP_ID, + app_key=settings.MATHPIX_APP_KEY + ), + config=config + ) + else: + from surya.recognition import RecognitionPredictor + return OriginalEquationProcessor( + recognition_model=RecognitionPredictor(), + config=config + ) diff --git a/marker/providers/mathpix.py b/marker/providers/mathpix.py new file mode 100644 index 000000000..daa35bcb2 --- /dev/null +++ b/marker/providers/mathpix.py @@ -0,0 +1,58 @@ +from typing import Optional, Dict, Any +import requests +from PIL import Image +import io +import base64 + +class MathpixProvider: + def __init__(self, app_id: str, app_key: str): + self.app_id = app_id + self.app_key = app_key + self.api_url = "https://api.mathpix.com/v3/text" + + def _encode_image(self, image: Image.Image) -> str: + """Convert PIL Image to base64 string""" + buffered = io.BytesIO() + image.save(buffered, format="PNG") + return base64.b64encode(buffered.getvalue()).decode() + + def process_equation(self, image: Image.Image, options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + """ + Process an equation image using Mathpix API + + Args: + image: PIL Image containing the equation + options: Additional options for Mathpix API + + Returns: + Dict containing the processed equation data + """ + if options is None: + options = {} + + # Prepare the request + headers = { + "app_id": self.app_id, + "app_key": self.app_key, + "Content-Type": "application/json" + } + + # Convert image to base64 + image_data = self._encode_image(image) + + # Prepare request body + data = { + "src": f"data:image/png;base64,{image_data}", + "formats": ["text", "latex_styled"], + "data_options": { + "include_asciimath": True, + "include_latex": True + }, + **options + } + + # Make API request + response = requests.post(self.api_url, headers=headers, json=data) + response.raise_for_status() + + return response.json() \ No newline at end of file diff --git a/marker/settings.py b/marker/settings.py index 5660ada90..d31a51e61 100644 --- a/marker/settings.py +++ b/marker/settings.py @@ -1,11 +1,13 @@ -from typing import Optional +from typing import Optional, Literal -from dotenv import find_dotenv +from dotenv import find_dotenv, load_dotenv from pydantic import computed_field from pydantic_settings import BaseSettings import torch import os +# Load environment variables from .env file +load_dotenv(find_dotenv(".env")) class Settings(BaseSettings): # Paths @@ -30,6 +32,11 @@ class Settings(BaseSettings): None # Note: MPS device does not work for text detection, and will default to CPU ) + # Equation processing settings + EQUATION_PROCESSOR: Literal["mathpix", "original"] = "original" # Default to original implementation + MATHPIX_APP_ID: str = os.getenv("MATHPIX_APP_ID", "") + MATHPIX_APP_KEY: str = os.getenv("MATHPIX_APP_KEY", "") + @computed_field @property def TORCH_DEVICE_MODEL(self) -> str: From 4c6c7c2f4155b03b176a9848d66c573116bfb85b Mon Sep 17 00:00:00 2001 From: Eliska Suchardova Date: Thu, 29 May 2025 12:27:17 +0200 Subject: [PATCH 2/7] Refactoring mathpix implementation --- marker/converters/ocr.py | 4 +- marker/converters/pdf.py | 4 +- marker/processors/equation.py | 187 +++++++++++++--------------------- 3 files changed, 74 insertions(+), 121 deletions(-) diff --git a/marker/converters/ocr.py b/marker/converters/ocr.py index 2dab5cc5a..38ee71dd1 100644 --- a/marker/converters/ocr.py +++ b/marker/converters/ocr.py @@ -5,7 +5,7 @@ from marker.builders.ocr import OcrBuilder from marker.converters.pdf import PdfConverter from marker.processors import BaseProcessor -from marker.processors.equation import get_equation_processor +from marker.processors.equation import EquationProcessor from marker.providers.registry import provider_from_filepath from marker.renderers.ocr_json import OCRJSONRenderer from marker.providers.mathpix import MathpixProvider @@ -13,7 +13,7 @@ class OCRConverter(PdfConverter): - default_processors: Tuple[BaseProcessor, ...] = (get_equation_processor(),) + default_processors: Tuple[BaseProcessor, ...] = (EquationProcessor,) def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) diff --git a/marker/converters/pdf.py b/marker/converters/pdf.py index 20b0f20c5..21a604431 100644 --- a/marker/converters/pdf.py +++ b/marker/converters/pdf.py @@ -24,7 +24,7 @@ from marker.processors.code import CodeProcessor from marker.processors.debug import DebugProcessor from marker.processors.document_toc import DocumentTOCProcessor -from marker.processors.equation import get_equation_processor +from marker.processors.equation import EquationProcessor from marker.processors.footnote import FootnoteProcessor from marker.processors.ignoretext import IgnoreTextProcessor from marker.processors.line_numbers import LineNumbersProcessor @@ -73,7 +73,7 @@ class PdfConverter(BaseConverter): BlockquoteProcessor, CodeProcessor, DocumentTOCProcessor, - get_equation_processor, + EquationProcessor, FootnoteProcessor, IgnoreTextProcessor, LineNumbersProcessor, diff --git a/marker/processors/equation.py b/marker/processors/equation.py index 6cc83c98d..836d286e6 100644 --- a/marker/processors/equation.py +++ b/marker/processors/equation.py @@ -13,49 +13,14 @@ MATH_TAG_PATTERN = re.compile(r"]*>(.*?)") -class BaseEquationProcessor(BaseProcessor): +class EquationProcessor(BaseProcessor): """ - Base class for equation processors. + A processor for recognizing equations in the document. """ block_types: Annotated[ Tuple[BlockTypes], "The block types to process.", ] = (BlockTypes.Equation,) - - def fix_latex(self, math_html: str): - math_html = math_html.strip() - soup = BeautifulSoup(math_html, "html.parser") - opening_math_tag = soup.find("math") - - # No math block found - if not opening_math_tag: - return "" - - # Force block format - opening_math_tag.attrs["display"] = "block" - fixed_math_html = str(soup) - - # Sometimes model outputs newlines at the beginning/end of tags - fixed_math_html = re.sub( - r"^\\n(?![a-zA-Z])", - '', - fixed_math_html, - ) - fixed_math_html = re.sub(r"\\n$", "", fixed_math_html) - fixed_math_html = fix_text( - fixed_math_html, config=TextFixerConfig(unescape_html=True) - ) - return fixed_math_html - - def process_equations(self, document: Document): - """Process equations in the document. To be implemented by subclasses.""" - raise NotImplementedError - - -class OriginalEquationProcessor(BaseEquationProcessor): - """ - Original equation processor using the recognition model. - """ model_max_length: Annotated[ int, "The maximum number of tokens to allow for the Recognition model.", @@ -69,10 +34,24 @@ class OriginalEquationProcessor(BaseEquationProcessor): bool, "Whether to disable the tqdm progress bar.", ] = False + use_mathpix: Annotated[ + bool, + "Whether to use Mathpix for equation processing.", + ] = False def __init__(self, recognition_model: RecognitionPredictor, config=None): super().__init__(config) self.recognition_model = recognition_model + + if config and config.get("use_mathpix", False): + self.use_mathpix = True + + if not settings.MATHPIX_APP_ID or not settings.MATHPIX_APP_KEY: + raise ValueError("Mathpix API credentials not configured") + self.mathpix_provider = MathpixProvider( + app_id=settings.MATHPIX_APP_ID, + app_key=settings.MATHPIX_APP_KEY + ) def get_batch_size(self): # Set to 1/4th of OCR batch size due to sequence length with tiling @@ -112,6 +91,35 @@ def __call__(self, document: Document): if total_equation_blocks == 0: return + if self.use_mathpix: + self._process_with_mathpix(images, equation_boxes, equation_block_ids, document) + else: + self._process_with_recognition(images, equation_boxes, equation_block_ids, document) + + def _process_with_mathpix(self, images, equation_boxes, equation_block_ids, document): + for page_idx, (page_image, page_boxes, page_block_ids) in enumerate( + zip(images, equation_boxes, equation_block_ids) + ): + for box_idx, (box, block_id) in enumerate(zip(page_boxes, page_block_ids)): + # Crop the equation from the page + x1, y1, x2, y2 = [int(coord) for coord in box] + equation_image = page_image.crop((x1, y1, x2, y2)) + + # Process with Mathpix + try: + result = self.mathpix_provider.process_equation(equation_image) + + # Extract LaTeX from the result + latex = result.get('latex_styled', '') + if latex: + # Wrap in math tags + block = document.get_block(block_id) + block.html = self.fix_latex(f'{latex}') + except Exception as e: + print(f"Error processing equation {block_id}: {str(e)}") + continue + + def _process_with_recognition(self, images, equation_boxes, equation_block_ids, document): predictions = self.get_latex_batched(images, equation_boxes) for page_predictions, page_equation_block_ids in zip( predictions, equation_block_ids @@ -125,6 +133,31 @@ def __call__(self, document: Document): block = document.get_block(block_id) block.html = self.fix_latex(block_prediction) + def fix_latex(self, math_html: str): + math_html = math_html.strip() + soup = BeautifulSoup(math_html, "html.parser") + opening_math_tag = soup.find("math") + + # No math block found + if not opening_math_tag: + return "" + + # Force block format + opening_math_tag.attrs["display"] = "block" + fixed_math_html = str(soup) + + # Sometimes model outputs newlines at the beginning/end of tags + fixed_math_html = re.sub( + r"^\\n(?![a-zA-Z])", + '', + fixed_math_html, + ) + fixed_math_html = re.sub(r"\\n$", "", fixed_math_html) + fixed_math_html = fix_text( + fixed_math_html, config=TextFixerConfig(unescape_html=True) + ) + return fixed_math_html + def get_latex_batched( self, page_images: List[Image.Image], @@ -147,83 +180,3 @@ def get_latex_batched( return equation_predictions -class MathpixEquationProcessor(BaseEquationProcessor): - """ - Equation processor using Mathpix API. - """ - def __init__(self, mathpix_provider: MathpixProvider, config=None): - super().__init__(config) - self.mathpix_provider = mathpix_provider - - def __call__(self, document: Document): - images = [] - equation_boxes = [] - equation_block_ids = [] - total_equation_blocks = 0 - - for page in document.pages: - page_image = page.get_image(highres=True) - page_size = page.polygon.width, page.polygon.height - image_size = page_image.size - - page_equation_boxes = [] - page_equation_block_ids = [] - equation_blocks = page.contained_blocks(document, self.block_types) - for block in equation_blocks: - page_equation_boxes.append( - block.polygon.rescale(page_size, image_size).bbox - ) - page_equation_block_ids.append(block.id) - total_equation_blocks += 1 - - images.append(page_image) - equation_boxes.append(page_equation_boxes) - equation_block_ids.append(page_equation_block_ids) - - if total_equation_blocks == 0: - return - - # Process each equation with Mathpix - for page_idx, (page_image, page_boxes, page_block_ids) in enumerate( - zip(images, equation_boxes, equation_block_ids) - ): - for box_idx, (box, block_id) in enumerate(zip(page_boxes, page_block_ids)): - # Crop the equation from the page - x1, y1, x2, y2 = [int(coord) for coord in box] - equation_image = page_image.crop((x1, y1, x2, y2)) - - # Process with Mathpix - try: - result = self.mathpix_provider.process_equation(equation_image) - - # Extract LaTeX from the result - latex = result.get('latex_styled', '') - if latex: - # Wrap in math tags - block = document.get_block(block_id) - block.html = self.fix_latex(f'{latex}') - except Exception as e: - print(f"Error processing equation {block_id}: {str(e)}") - continue - - -def get_equation_processor(config=None): - """ - Factory function to get the appropriate equation processor based on settings. - """ - if settings.EQUATION_PROCESSOR == "mathpix": - if not settings.MATHPIX_APP_ID or not settings.MATHPIX_APP_KEY: - raise ValueError("Mathpix API credentials not configured") - return MathpixEquationProcessor( - mathpix_provider=MathpixProvider( - app_id=settings.MATHPIX_APP_ID, - app_key=settings.MATHPIX_APP_KEY - ), - config=config - ) - else: - from surya.recognition import RecognitionPredictor - return OriginalEquationProcessor( - recognition_model=RecognitionPredictor(), - config=config - ) From 69a15325f206d41daf9ed663dee99b946c33caff Mon Sep 17 00:00:00 2001 From: Eliska Suchardova Date: Thu, 29 May 2025 12:32:20 +0200 Subject: [PATCH 3/7] Adjusting config --- marker/config/parser.py | 3 +-- marker/processors/equation.py | 6 ++---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/marker/config/parser.py b/marker/config/parser.py index e3543beef..1af9c2dc2 100644 --- a/marker/config/parser.py +++ b/marker/config/parser.py @@ -113,8 +113,7 @@ def generate_config_dict(self) -> Dict[str, any]: case "disable_image_extraction": config["extract_images"] = False case "mathpix": - if v: - settings.EQUATION_PROCESSOR = "mathpix" + config["use_mathpix"] = v case _: if k in crawler.attr_set: config[k] = v diff --git a/marker/processors/equation.py b/marker/processors/equation.py index 836d286e6..fb747536b 100644 --- a/marker/processors/equation.py +++ b/marker/processors/equation.py @@ -42,10 +42,8 @@ class EquationProcessor(BaseProcessor): def __init__(self, recognition_model: RecognitionPredictor, config=None): super().__init__(config) self.recognition_model = recognition_model - - if config and config.get("use_mathpix", False): - self.use_mathpix = True - + + if self.use_mathpix == True: if not settings.MATHPIX_APP_ID or not settings.MATHPIX_APP_KEY: raise ValueError("Mathpix API credentials not configured") self.mathpix_provider = MathpixProvider( From a6a5d45f64b8026193a0c29571a6f8c65339d86b Mon Sep 17 00:00:00 2001 From: Eliska Suchardova Date: Thu, 29 May 2025 12:44:21 +0200 Subject: [PATCH 4/7] Removing unneccessary code --- marker/converters/__init__.py | 9 ++------- marker/settings.py | 1 - 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/marker/converters/__init__.py b/marker/converters/__init__.py index fea1bd20b..be56fbfe2 100644 --- a/marker/converters/__init__.py +++ b/marker/converters/__init__.py @@ -43,13 +43,8 @@ def resolve_dependencies(self, cls): def initialize_processors(self, processor_cls_lst: List[Type[BaseProcessor]]) -> List[BaseProcessor]: processors = [] for processor_cls in processor_cls_lst: - if callable(processor_cls) and processor_cls.__name__ == 'get_equation_processor': - # Special case for equation processor - processor = processor_cls(config=self.config) - else: - processor = self.resolve_dependencies(processor_cls) - processors.append(processor) - + processors.append(self.resolve_dependencies(processor_cls)) + simple_llm_processors = [p for p in processors if issubclass(type(p), BaseLLMSimpleBlockProcessor)] other_processors = [p for p in processors if not issubclass(type(p), BaseLLMSimpleBlockProcessor)] diff --git a/marker/settings.py b/marker/settings.py index d31a51e61..d5d606150 100644 --- a/marker/settings.py +++ b/marker/settings.py @@ -33,7 +33,6 @@ class Settings(BaseSettings): ) # Equation processing settings - EQUATION_PROCESSOR: Literal["mathpix", "original"] = "original" # Default to original implementation MATHPIX_APP_ID: str = os.getenv("MATHPIX_APP_ID", "") MATHPIX_APP_KEY: str = os.getenv("MATHPIX_APP_KEY", "") From 4332294d6203b1c714e48e348a738b6f9fcb0357 Mon Sep 17 00:00:00 2001 From: Eliska Suchardova Date: Thu, 29 May 2025 14:16:00 +0200 Subject: [PATCH 5/7] MAthipix processes equations and InlineMath --- marker/processors/equation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/marker/processors/equation.py b/marker/processors/equation.py index fb747536b..83f71fb86 100644 --- a/marker/processors/equation.py +++ b/marker/processors/equation.py @@ -50,6 +50,8 @@ def __init__(self, recognition_model: RecognitionPredictor, config=None): app_id=settings.MATHPIX_APP_ID, app_key=settings.MATHPIX_APP_KEY ) + # Add TextInlineMath to block types when Mathpix is enabled + self.block_types = (BlockTypes.Equation, BlockTypes.TextInlineMath) def get_batch_size(self): # Set to 1/4th of OCR batch size due to sequence length with tiling From b3cc3e416465a32270046e6c4db3dd710fdb2c8d Mon Sep 17 00:00:00 2001 From: Eliska Suchardova Date: Thu, 29 May 2025 16:43:12 +0200 Subject: [PATCH 6/7] Testing script for box filtering --- test_clipping.py | 110 ++++++++++++++++++++++++++++++++++++ visualize_matching_boxes.py | 104 ++++++++++++++++++++++++++++++++++ 2 files changed, 214 insertions(+) create mode 100644 test_clipping.py create mode 100644 visualize_matching_boxes.py diff --git a/test_clipping.py b/test_clipping.py new file mode 100644 index 000000000..7fc9d65a4 --- /dev/null +++ b/test_clipping.py @@ -0,0 +1,110 @@ +import os +import argparse +import pypdfium2 as pdfium +from PIL import Image, ImageDraw +import pypdfium2.raw as pdfium_c +import ctypes + +def get_text_from_raw_text_obj(raw_text_obj, page): + textpage = pdfium_c.FPDFText_LoadPage(page.raw) + if not textpage: + return "" + try: + buflen = pdfium_c.FPDFTextObj_GetText(raw_text_obj, textpage, None, 0) + if buflen <= 0: + return "" + buf = (ctypes.c_ushort * buflen)() + pdfium_c.FPDFTextObj_GetText(raw_text_obj, textpage, buf, buflen) + # Convert UTF-16LE buffer to Python string, strip trailing nulls + return bytearray(buf).decode('utf-16-le').rstrip('\x00') + finally: + pdfium_c.FPDFText_ClosePage(textpage) + +def draw_text_objects(page, clip_region, output_path): + objects = list(page.get_objects()) + scale = 2.0 + bitmap = page.render(scale=scale) + pil_image = bitmap.to_pil() + page_width, page_height = pil_image.size + boxes_image = Image.new('RGB', (page_width, page_height), 'white') + boxes_draw = ImageDraw.Draw(boxes_image) + + found = False + for i, obj in enumerate(objects): + if obj.type == 1: # 1 = text object + text = get_text_from_raw_text_obj(obj.raw, page) + if text and text.strip() == "30075": + found = True + print(f"Found text '30075' in object {i+1}") + # Get and draw object bounding box (red) + left = ctypes.c_float() + bottom = ctypes.c_float() + right = ctypes.c_float() + top = ctypes.c_float() + success = pdfium_c.FPDFPageObj_GetBounds(obj.raw, ctypes.byref(left), ctypes.byref(bottom), ctypes.byref(right), ctypes.byref(top)) + if success: + print(f"Object bounding box: left={left.value}, bottom={bottom.value}, right={right.value}, top={top.value}") + pil_top = page_height - (top.value * scale) + pil_bottom = page_height - (bottom.value * scale) + pil_left = left.value * scale + pil_right = right.value * scale + boxes_draw.rectangle([pil_left, pil_top, pil_right, pil_bottom], outline='red', width=3) + else: + print("Could not get object bounding box.") + # Try to get and draw actual clipping path (blue) + try: + clip_path = pdfium_c.FPDFPageObj_GetClipPath(obj.raw) + if clip_path: + if all(hasattr(pdfium_c, fn) for fn in [ + "FPDFClipPath_CountPaths", "FPDFClipPath_CountPathSegments", "FPDFClipPath_GetPathSegment", "FPDFPathSegment_GetPoint", "FPDFPathSegment_GetType", "FPDFPathSegment_GetClose"]): + num_paths = pdfium_c.FPDFClipPath_CountPaths(clip_path) + for path_idx in range(num_paths): + num_segs = pdfium_c.FPDFClipPath_CountPathSegments(clip_path, path_idx) + points = [] + for seg_idx in range(num_segs): + seg = pdfium_c.FPDFClipPath_GetPathSegment(clip_path, path_idx, seg_idx) + x = ctypes.c_float() + y = ctypes.c_float() + pdfium_c.FPDFPathSegment_GetPoint(seg, ctypes.byref(x), ctypes.byref(y)) + pil_x = x.value * scale + pil_y = page_height - (y.value * scale) + points.append((pil_x, pil_y)) + # Log the raw PDF coordinates and the PIL coordinates + print(f"Object {i+1}, path {path_idx}, seg {seg_idx}: PDF ({x.value}, {y.value}) -> PIL ({pil_x}, {pil_y})") + # Check if path is closed + closed = False + if num_segs > 0: + last_seg = pdfium_c.FPDFClipPath_GetPathSegment(clip_path, path_idx, num_segs-1) + closed = bool(pdfium_c.FPDFPathSegment_GetClose(last_seg)) + if len(points) > 1: + if closed: + boxes_draw.polygon(points, outline='blue') + else: + boxes_draw.line(points, fill='blue', width=3) + else: + print("Clipping path exists, but path segment functions are not available in this pypdfium2 version.") + else: + print("No clipping path for this object.") + except Exception as e: + print(f"No clipping path or error: {e}") + if not found: + print("No text object with text '30075' found.") + + boxes_output_path = output_path.replace('.png', '_boxes.png') + boxes_image.save(boxes_output_path) + print(f"Boxes-only visualization saved to {boxes_output_path}") + +def main(): + parser = argparse.ArgumentParser(description='Visualize PDF text objects with clipping') + parser.add_argument('pdf_path', help='Path to the PDF file') + parser.add_argument('--page', type=int, default=0, help='Page number (0-based)') + args = parser.parse_args() + output_dir = "test_output" + os.makedirs(output_dir, exist_ok=True) + doc = pdfium.PdfDocument(args.pdf_path) + page = doc[args.page] + output_path = os.path.join(output_dir, f"page_{args.page}.png") + draw_text_objects(page, None, output_path) + +if __name__ == '__main__': + main() diff --git a/visualize_matching_boxes.py b/visualize_matching_boxes.py new file mode 100644 index 000000000..f70939851 --- /dev/null +++ b/visualize_matching_boxes.py @@ -0,0 +1,104 @@ +import argparse +import pypdfium2 as pdfium +import pypdfium2.raw as pdfium_c +import ctypes +from PIL import Image, ImageDraw + +def get_text_from_raw_text_obj(raw_text_obj, page): + textpage = pdfium_c.FPDFText_LoadPage(page.raw) + if not textpage: + return "" + try: + buflen = pdfium_c.FPDFTextObj_GetText(raw_text_obj, textpage, None, 0) + if buflen <= 0: + return "" + buf = (ctypes.c_ushort * buflen)() + pdfium_c.FPDFTextObj_GetText(raw_text_obj, textpage, buf, buflen) + return bytearray(buf).decode('utf-16-le').rstrip('\x00') + finally: + pdfium_c.FPDFText_ClosePage(textpage) + +def boxes_are_equal(box1, box2, tol=1e-2): + return all(abs(a - b) < tol for a, b in zip(box1, box2)) + +def draw_matching_boxes(page, output_path): + objects = list(page.get_objects()) + scale = 2.0 + bitmap = page.render(scale=scale) + pil_image = bitmap.to_pil() + page_width, page_height = pil_image.size + boxes_draw = ImageDraw.Draw(pil_image) + + for i, obj in enumerate(objects): + # Get object bounding box + left = ctypes.c_float() + bottom = ctypes.c_float() + right = ctypes.c_float() + top = ctypes.c_float() + success = pdfium_c.FPDFPageObj_GetBounds(obj.raw, ctypes.byref(left), ctypes.byref(bottom), ctypes.byref(right), ctypes.byref(top)) + if not success: + print(f"Object {i+1}: Could not get object bounding box.") + continue + obj_box = (left.value, bottom.value, right.value, top.value) + print(f"Object {i+1}: Bounding box: {obj_box}") + # Try to get clipping path + show_box = True + try: + clip_path = pdfium_c.FPDFPageObj_GetClipPath(obj.raw) + if clip_path and all(hasattr(pdfium_c, fn) for fn in [ + "FPDFClipPath_CountPaths", "FPDFClipPath_CountPathSegments", "FPDFClipPath_GetPathSegment", "FPDFPathSegment_GetPoint", "FPDFPathSegment_GetType", "FPDFPathSegment_GetClose"]): + num_paths = pdfium_c.FPDFClipPath_CountPaths(clip_path) + for path_idx in range(num_paths): + num_segs = pdfium_c.FPDFClipPath_CountPathSegments(clip_path, path_idx) + points = [] + for seg_idx in range(num_segs): + seg = pdfium_c.FPDFClipPath_GetPathSegment(clip_path, path_idx, seg_idx) + x = ctypes.c_float() + y = ctypes.c_float() + pdfium_c.FPDFPathSegment_GetPoint(seg, ctypes.byref(x), ctypes.byref(y)) + points.append((x.value, y.value)) + if len(points) >= 2: + # Get the bounding box of the clip path + xs = [pt[0] for pt in points] + ys = [pt[1] for pt in points] + clip_box = (min(xs), min(ys), max(xs), max(ys)) + print(f"Object {i+1}: Clip path box: {clip_box}") + # Draw the clip path in blue + pil_points = [(x * scale, page_height - (y * scale)) for x, y in points] + closed = False + if num_segs > 0: + last_seg = pdfium_c.FPDFClipPath_GetPathSegment(clip_path, path_idx, num_segs-1) + closed = bool(pdfium_c.FPDFPathSegment_GetClose(last_seg)) + if closed: + boxes_draw.polygon(pil_points, outline='blue') + else: + boxes_draw.line(pil_points, fill='blue', width=3) + # Only show the bounding box if it matches the clip box + if not boxes_are_equal(obj_box, clip_box): + show_box = False + else: + print(f"Object {i+1}: No usable clipping path.") + except Exception as e: + print(f"Object {i+1}: Error getting clip path: {e}") + if show_box: + pil_top = page_height - (obj_box[3] * scale) + pil_bottom = page_height - (obj_box[1] * scale) + pil_left = obj_box[0] * scale + pil_right = obj_box[2] * scale + boxes_draw.rectangle([pil_left, pil_top, pil_right, pil_bottom], outline='red', width=3) + + pil_image.save(output_path) + print(f"Matching boxes visualization saved to {output_path}") + +def main(): + parser = argparse.ArgumentParser(description='Visualize matching bounding boxes and clip paths for all objects') + parser.add_argument('pdf_path', help='Path to the PDF file') + parser.add_argument('--page', type=int, default=0, help='Page number (0-based)') + parser.add_argument('--output', type=str, default='matching_boxes.png', help='Output image path') + args = parser.parse_args() + doc = pdfium.PdfDocument(args.pdf_path) + page = doc[args.page] + draw_matching_boxes(page, args.output) + +if __name__ == '__main__': + main() \ No newline at end of file From 166cd0ce45963874103a5f70ac3eed3a369be8c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Va=CC=81cha?= Date: Thu, 5 Jun 2025 21:44:44 +0200 Subject: [PATCH 7/7] improve clip visualization --- poetry.lock | 102 +++++++++++----------- pyproject.toml | 6 +- visualize_matching_boxes.py | 167 +++++++++++++++++++++++++++++------- 3 files changed, 191 insertions(+), 84 deletions(-) diff --git a/poetry.lock b/poetry.lock index 90b66ece0..29216101b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -3366,10 +3366,10 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\""}, - {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\""}, - {version = ">=1.23.5", markers = "python_version >= \"3.11\""}, {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, + {version = ">=1.23.5", markers = "python_version == \"3.11\""}, + {version = ">=1.21.4", markers = "python_version == \"3.10\" and platform_system == \"Darwin\""}, + {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version == \"3.10\""}, ] [[package]] @@ -3466,9 +3466,9 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.22.4", markers = "python_version < \"3.11\""}, - {version = ">=1.23.2", markers = "python_version == \"3.11\""}, {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, + {version = ">=1.23.2", markers = "python_version == \"3.11\""}, + {version = ">=1.22.4", markers = "python_version < \"3.11\""}, ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" @@ -3532,18 +3532,20 @@ name = "pdftext" version = "0.6.2" description = "Extract structured text from pdfs quickly" optional = false -python-versions = "<4.0,>=3.10" +python-versions = "^3.10" groups = ["main"] -files = [ - {file = "pdftext-0.6.2-py3-none-any.whl", hash = "sha256:905d11e62d548e307933c25865a69c8e993947bb5b40b1535b0a2aa8f07a71d4"}, - {file = "pdftext-0.6.2.tar.gz", hash = "sha256:ff5b92462ac03ae63a23429384ae123d45c162dcda30e7bf2c5c92a6b208c9de"}, -] +files = [] +develop = true [package.dependencies] -click = ">=8.1.8,<9.0.0" -pydantic = ">=2.7.1,<3.0.0" -pydantic-settings = ">=2.2.1,<3.0.0" -pypdfium2 = "4.30.0" +click = "^8.1.8" +pydantic = "^2.7.1" +pydantic-settings = "^2.2.1" +pypdfium2 = "=4.30.1" + +[package.source] +type = "directory" +url = "../pdftext" [[package]] name = "pexpect" @@ -4264,25 +4266,25 @@ windows-terminal = ["colorama (>=0.4.6)"] [[package]] name = "pypdfium2" -version = "4.30.0" +version = "4.30.1" description = "Python bindings to PDFium" optional = false python-versions = ">=3.6" groups = ["main"] files = [ - {file = "pypdfium2-4.30.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:b33ceded0b6ff5b2b93bc1fe0ad4b71aa6b7e7bd5875f1ca0cdfb6ba6ac01aab"}, - {file = "pypdfium2-4.30.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:4e55689f4b06e2d2406203e771f78789bd4f190731b5d57383d05cf611d829de"}, - {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e6e50f5ce7f65a40a33d7c9edc39f23140c57e37144c2d6d9e9262a2a854854"}, - {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3d0dd3ecaffd0b6dbda3da663220e705cb563918249bda26058c6036752ba3a2"}, - {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cc3bf29b0db8c76cdfaac1ec1cde8edf211a7de7390fbf8934ad2aa9b4d6dfad"}, - {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1f78d2189e0ddf9ac2b7a9b9bd4f0c66f54d1389ff6c17e9fd9dc034d06eb3f"}, - {file = "pypdfium2-4.30.0-py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:5eda3641a2da7a7a0b2f4dbd71d706401a656fea521b6b6faa0675b15d31a163"}, - {file = "pypdfium2-4.30.0-py3-none-musllinux_1_1_i686.whl", hash = "sha256:0dfa61421b5eb68e1188b0b2231e7ba35735aef2d867d86e48ee6cab6975195e"}, - {file = "pypdfium2-4.30.0-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:f33bd79e7a09d5f7acca3b0b69ff6c8a488869a7fab48fdf400fec6e20b9c8be"}, - {file = "pypdfium2-4.30.0-py3-none-win32.whl", hash = "sha256:ee2410f15d576d976c2ab2558c93d392a25fb9f6635e8dd0a8a3a5241b275e0e"}, - {file = "pypdfium2-4.30.0-py3-none-win_amd64.whl", hash = "sha256:90dbb2ac07be53219f56be09961eb95cf2473f834d01a42d901d13ccfad64b4c"}, - {file = "pypdfium2-4.30.0-py3-none-win_arm64.whl", hash = "sha256:119b2969a6d6b1e8d55e99caaf05290294f2d0fe49c12a3f17102d01c441bd29"}, - {file = "pypdfium2-4.30.0.tar.gz", hash = "sha256:48b5b7e5566665bc1015b9d69c1ebabe21f6aee468b509531c3c8318eeee2e16"}, + {file = "pypdfium2-4.30.1-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:e07c47633732cc18d890bb7e965ad28a9c5a932e548acb928596f86be2e5ae37"}, + {file = "pypdfium2-4.30.1-py3-none-macosx_11_0_arm64.whl", hash = "sha256:5ea2d44e96d361123b67b00f527017aa9c847c871b5714e013c01c3eb36a79fe"}, + {file = "pypdfium2-4.30.1-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1de7a3a36803171b3f66911131046d65a732f9e7834438191cb58235e6163c4e"}, + {file = "pypdfium2-4.30.1-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b8a4231efb13170354f568c722d6540b8d5b476b08825586d48ef70c40d16e03"}, + {file = "pypdfium2-4.30.1-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6f434a4934e8244aa95343ffcf24e9ad9f120dbb4785f631bb40a88c39292493"}, + {file = "pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f454032a0bc7681900170f67d8711b3942824531e765f91c2f5ce7937f999794"}, + {file = "pypdfium2-4.30.1-py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:bbf9130a72370ee9d602e39949b902db669a2a1c24746a91e5586eb829055d9f"}, + {file = "pypdfium2-4.30.1-py3-none-musllinux_1_1_i686.whl", hash = "sha256:5cb52884b1583b96e94fd78542c63bb42e06df5e8f9e52f8f31f5ad5a1e53367"}, + {file = "pypdfium2-4.30.1-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:1a9e372bd4867ff223cc8c338e33fe11055dad12f22885950fc27646cc8d9122"}, + {file = "pypdfium2-4.30.1-py3-none-win32.whl", hash = "sha256:421f1cf205e213e07c1f2934905779547f4f4a2ff2f59dde29da3d511d3fc806"}, + {file = "pypdfium2-4.30.1-py3-none-win_amd64.whl", hash = "sha256:598a7f20264ab5113853cba6d86c4566e4356cad037d7d1f849c8c9021007e05"}, + {file = "pypdfium2-4.30.1-py3-none-win_arm64.whl", hash = "sha256:c2b6d63f6d425d9416c08d2511822b54b8e3ac38e639fc41164b1d75584b3a8c"}, + {file = "pypdfium2-4.30.1.tar.gz", hash = "sha256:5f5c7c6d03598e107d974f66b220a49436aceb191da34cda5f692be098a814ce"}, ] [[package]] @@ -5390,30 +5392,32 @@ snowflake = ["snowflake-connector-python (>=3.3.0) ; python_version < \"3.12\"", [[package]] name = "surya-ocr" -version = "0.14.2" +version = "0.14.5" description = "OCR, layout, reading order, and table recognition in 90+ languages" optional = false -python-versions = "<4.0,>=3.10" +python-versions = "^3.10" groups = ["main"] -files = [ - {file = "surya_ocr-0.14.2-py3-none-any.whl", hash = "sha256:0c402705c860f8bf98fc2bf2a3b49d7f0e16fba587aed6d3f01bb53bb776d283"}, - {file = "surya_ocr-0.14.2.tar.gz", hash = "sha256:852af681073167beba9a638658c70b81318f1a8f3d558db68dead1b2c391e862"}, -] +files = [] +develop = true [package.dependencies] -click = ">=8.1.8,<9.0.0" -einops = ">=0.8.1,<0.9.0" -filetype = ">=1.2.0,<2.0.0" -opencv-python-headless = ">=4.11.0.86,<5.0.0.0" -pillow = ">=10.2.0,<11.0.0" -platformdirs = ">=4.3.6,<5.0.0" -pre-commit = ">=4.2.0,<5.0.0" -pydantic = ">=2.5.3,<3.0.0" -pydantic-settings = ">=2.1.0,<3.0.0" -pypdfium2 = "4.30.0" -python-dotenv = ">=1.0.0,<2.0.0" -torch = ">=2.7.0,<3.0.0" -transformers = ">=4.51.2,<5.0.0" +click = "^8.1.8" +einops = "^0.8.1" +filetype = "^1.2.0" +opencv-python-headless = "^4.11.0.86" +pillow = "^10.2.0" +platformdirs = "^4.3.6" +pre-commit = "^4.2.0" +pydantic = "^2.5.3" +pydantic-settings = "^2.1.0" +pypdfium2 = "=4.30.1" +python-dotenv = "^1.0.0" +torch = "^2.7.0" +transformers = "^4.51.2" + +[package.source] +type = "directory" +url = "../surya" [[package]] name = "sympy" @@ -6505,4 +6509,4 @@ full = ["ebooklib", "mammoth", "openpyxl", "python-pptx", "weasyprint"] [metadata] lock-version = "2.1" python-versions = "^3.10" -content-hash = "c18debb8d18aec4081c31ff32f9dc2bde6f4c0189f1d7647bb6061f685e0e319" +content-hash = "484459202f1148269601972c07e461c46f51f424919485090b707b350ce7fd74" diff --git a/pyproject.toml b/pyproject.toml index 2056da009..a5f82aea4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,9 +26,9 @@ torch = "^2.7.0" tqdm = "^4.66.1" ftfy = "^6.1.1" rapidfuzz = "^3.8.1" -surya-ocr = "^0.14.2" +surya-ocr = {path = "../surya", develop = true} regex = "^2024.4.28" -pdftext = "~0.6.2" +pdftext = {path = "../pdftext", develop = true} markdownify = "^0.13.1" click = "^8.2.0" markdown2 = "^2.5.2" @@ -75,4 +75,4 @@ marker_server = "marker.scripts.server:server_cli" [build-system] requires = ["poetry-core"] -build-backend = "poetry.core.masonry.api" \ No newline at end of file +build-backend = "poetry.core.masonry.api" diff --git a/visualize_matching_boxes.py b/visualize_matching_boxes.py index f70939851..2062fb96d 100644 --- a/visualize_matching_boxes.py +++ b/visualize_matching_boxes.py @@ -18,8 +18,39 @@ def get_text_from_raw_text_obj(raw_text_obj, page): finally: pdfium_c.FPDFText_ClosePage(textpage) -def boxes_are_equal(box1, box2, tol=1e-2): - return all(abs(a - b) < tol for a, b in zip(box1, box2)) +def boxes_intersect(box1, box2): + """Check if two boxes intersect using the algorithm from fz_glyph_entirely_outside_box. + + Args: + box1, box2: Tuples of (x0, y0, x1, y1) where (x0,y0) is bottom-left, (x1,y1) is top-right + + Returns: + True if boxes intersect, False if they are entirely separate + """ + # If box1 is entirely outside box2, they don't intersect + if (box1[2] <= box2[0] or # box1.x1 <= box2.x0 (box1 right edge <= box2 left edge) + box1[3] <= box2[1] or # box1.y1 <= box2.y0 (box1 top edge <= box2 bottom edge) + box1[0] >= box2[2] or # box1.x0 >= box2.x1 (box1 left edge >= box2 right edge) + box1[1] >= box2[3]): # box1.y0 >= box2.y1 (box1 bottom edge >= box2 top edge) + return False + return True + +def draw_box(box, boxes_draw, scale, page_height, color='gray', width=1): + """Draw a bounding box on the image. + + Args: + box: Tuple of (x0, y0, x1, y1) coordinates in PDF space + boxes_draw: ImageDraw object to draw on + scale: Scale factor for coordinate conversion + page_height: Height of the page in pixels + color: Color of the outline + width: Width of the outline + """ + pil_top = page_height - (box[3] * scale) + pil_bottom = page_height - (box[1] * scale) + pil_left = box[0] * scale + pil_right = box[2] * scale + boxes_draw.rectangle([pil_left, pil_top, pil_right, pil_bottom], outline=color, width=width) def draw_matching_boxes(page, output_path): objects = list(page.get_objects()) @@ -29,7 +60,57 @@ def draw_matching_boxes(page, output_path): page_width, page_height = pil_image.size boxes_draw = ImageDraw.Draw(pil_image) + # Check for required clip path functions once + required_clip_fns = [ + "FPDFClipPath_CountPaths", "FPDFClipPath_CountPathSegments", + "FPDFClipPath_GetPathSegment", "FPDFPathSegment_GetPoint", + "FPDFPathSegment_GetType", "FPDFPathSegment_GetClose" + ] + has_clip_path_api = all(hasattr(pdfium_c, fn) for fn in required_clip_fns) + if not has_clip_path_api: + raise RuntimeError("Required PDFium clip path API functions are missing in pdfium_c. Please check your PDFium installation.") + + # Draw the page crop box in green + try: + left = ctypes.c_float() + bottom = ctypes.c_float() + right = ctypes.c_float() + top = ctypes.c_float() + success = pdfium_c.FPDFPage_GetCropBox(page.raw, ctypes.byref(left), ctypes.byref(bottom), ctypes.byref(right), ctypes.byref(top)) + if success: + crop_box = (left.value, bottom.value, right.value, top.value) + print(f"Page crop box: ({crop_box[0]:.2f}, {crop_box[1]:.2f}, {crop_box[2]:.2f}, {crop_box[3]:.2f})") + draw_box(crop_box, boxes_draw, scale, page_height, color='green', width=2) + else: + print("Could not get page crop box") + except Exception as e: + print(f"Error getting page crop box: {e}") + + # Initialize counters for statistics + total_objects = len(objects) + text_objects = 0 + visible_objects = 0 + clipped_objects = 0 + + # Load textpage once for efficiency + textpage = pdfium_c.FPDFText_LoadPage(page.raw) + if not textpage: + print("Warning: Could not load textpage for text checking") + for i, obj in enumerate(objects): + # Check if object is a text object and has text content + obj_type = pdfium_c.FPDFPageObj_GetType(obj.raw) + if obj_type != 1: # FPDF_PAGEOBJ_TEXT = 1 + continue + + # Check if text object has any text content (without extracting it) + if textpage: + buflen = pdfium_c.FPDFTextObj_GetText(obj.raw, textpage, None, 0) + if buflen <= 0: # No text content + continue + + text_objects += 1 + # Get object bounding box left = ctypes.c_float() bottom = ctypes.c_float() @@ -40,55 +121,77 @@ def draw_matching_boxes(page, output_path): print(f"Object {i+1}: Could not get object bounding box.") continue obj_box = (left.value, bottom.value, right.value, top.value) - print(f"Object {i+1}: Bounding box: {obj_box}") + #print(f"Object {i+1}: Bounding box: ({obj_box[0]:.2f}, {obj_box[1]:.2f}, {obj_box[2]:.2f}, {obj_box[3]:.2f})") # Try to get clipping path show_box = True try: clip_path = pdfium_c.FPDFPageObj_GetClipPath(obj.raw) - if clip_path and all(hasattr(pdfium_c, fn) for fn in [ - "FPDFClipPath_CountPaths", "FPDFClipPath_CountPathSegments", "FPDFClipPath_GetPathSegment", "FPDFPathSegment_GetPoint", "FPDFPathSegment_GetType", "FPDFPathSegment_GetClose"]): + if clip_path: + # Collect all points from all paths to calculate a bounding rectangle + all_points = [] num_paths = pdfium_c.FPDFClipPath_CountPaths(clip_path) for path_idx in range(num_paths): num_segs = pdfium_c.FPDFClipPath_CountPathSegments(clip_path, path_idx) - points = [] for seg_idx in range(num_segs): seg = pdfium_c.FPDFClipPath_GetPathSegment(clip_path, path_idx, seg_idx) x = ctypes.c_float() y = ctypes.c_float() pdfium_c.FPDFPathSegment_GetPoint(seg, ctypes.byref(x), ctypes.byref(y)) - points.append((x.value, y.value)) - if len(points) >= 2: - # Get the bounding box of the clip path - xs = [pt[0] for pt in points] - ys = [pt[1] for pt in points] - clip_box = (min(xs), min(ys), max(xs), max(ys)) - print(f"Object {i+1}: Clip path box: {clip_box}") - # Draw the clip path in blue - pil_points = [(x * scale, page_height - (y * scale)) for x, y in points] - closed = False - if num_segs > 0: - last_seg = pdfium_c.FPDFClipPath_GetPathSegment(clip_path, path_idx, num_segs-1) - closed = bool(pdfium_c.FPDFPathSegment_GetClose(last_seg)) - if closed: - boxes_draw.polygon(pil_points, outline='blue') - else: - boxes_draw.line(pil_points, fill='blue', width=3) - # Only show the bounding box if it matches the clip box - if not boxes_are_equal(obj_box, clip_box): - show_box = False + all_points.append((x.value, y.value)) + + if all_points: + # Calculate the minimal bounding rectangle that fits the clip path + xs = [pt[0] for pt in all_points] + ys = [pt[1] for pt in all_points] + clip_box = (min(xs), min(ys), max(xs), max(ys)) + #print(f"Object {i+1}: Clip path bounding box: ({clip_box[0]:.2f}, {clip_box[1]:.2f}, {clip_box[2]:.2f}, {clip_box[3]:.2f}), # of paths: {num_paths}, # of segments: {num_segs}") + + + # Only show the object's bounding box if it doesn't match the clip box + if not boxes_intersect(obj_box, clip_box): + show_box = False + clipped_objects += 1 + else: + # Draw the clip path bounding box in blue + draw_box(clip_box, boxes_draw, scale, page_height, color='blue') + + # If boxes intersect, draw in red and extract text + draw_box(obj_box, boxes_draw, scale, page_height, color='red', width=3) + show_box = True + visible_objects += 1 + + # Extract and print text for red boxes + if textpage: + buflen = pdfium_c.FPDFTextObj_GetText(obj.raw, textpage, None, 0) + if buflen > 0: + buf = (ctypes.c_ushort * buflen)() + pdfium_c.FPDFTextObj_GetText(obj.raw, textpage, buf, buflen) + byte_buf = bytearray(buf) + text_content = byte_buf.decode('utf-16-le').rstrip('\x00') + utf16_bytes = text_content.encode('utf-16-le') + hex_bytes = ' '.join(f'{b:02x}' for b in utf16_bytes) + print(f"Object {i+1} text (red box): '{text_content}', utf16 bytes: {hex_bytes}") else: - print(f"Object {i+1}: No usable clipping path.") + print(f"Object {i+1}: No clipping path.") except Exception as e: print(f"Object {i+1}: Error getting clip path: {e}") if show_box: - pil_top = page_height - (obj_box[3] * scale) - pil_bottom = page_height - (obj_box[1] * scale) - pil_left = obj_box[0] * scale - pil_right = obj_box[2] * scale - boxes_draw.rectangle([pil_left, pil_top, pil_right, pil_bottom], outline='red', width=3) + draw_box(obj_box, boxes_draw, scale, page_height) + visible_objects += 1 pil_image.save(output_path) print(f"Matching boxes visualization saved to {output_path}") + + # Close textpage to avoid memory leaks + if textpage: + pdfium_c.FPDFText_ClosePage(textpage) + + # Print statistics + print(f"\nStatistics:") + print(f"Total objects: {total_objects}") + print(f"Text objects: {text_objects}") + print(f"Visible objects: {visible_objects}") + print(f"Clipped objects: {clipped_objects}") def main(): parser = argparse.ArgumentParser(description='Visualize matching bounding boxes and clip paths for all objects')