From 09c7a86b0632544556ee2003a58331585e449677 Mon Sep 17 00:00:00 2001
From: Eliska Suchardova <eliska.suchardova@gmail.com>
Date: Tue, 27 May 2025 10:50:47 +0200
Subject: [PATCH 1/7] Added mathpix option for equations

---
 marker/config/parser.py       |   9 ++
 marker/converters/__init__.py |   7 +-
 marker/converters/ocr.py      |  13 ++-
 marker/converters/pdf.py      |   4 +-
 marker/processors/equation.py | 153 +++++++++++++++++++++++++++-------
 marker/providers/mathpix.py   |  58 +++++++++++++
 marker/settings.py            |  11 ++-
 7 files changed, 216 insertions(+), 39 deletions(-)
 create mode 100644 marker/providers/mathpix.py

diff --git a/marker/config/parser.py b/marker/config/parser.py
index 8e6bd8de0..e3543beef 100644
--- a/marker/config/parser.py
+++ b/marker/config/parser.py
@@ -60,6 +60,12 @@ def common_options(fn):
             default=False,
             help="Disable image extraction.",
         )(fn)
+        fn = click.option(
+            "--mathpix",
+            is_flag=True,
+            default=False,
+            help="Use Mathpix for equation processing.",
+        )(fn)
 
         # these are options that need a list transformation, i.e splitting/parsing a string
         fn = click.option(
@@ -106,6 +112,9 @@ def generate_config_dict(self) -> Dict[str, any]:
                     config["pdftext_workers"] = 1
                 case "disable_image_extraction":
                     config["extract_images"] = False
+                case "mathpix":
+                    if v:
+                        settings.EQUATION_PROCESSOR = "mathpix"
                 case _:
                     if k in crawler.attr_set:
                         config[k] = v
diff --git a/marker/converters/__init__.py b/marker/converters/__init__.py
index 8357a4991..fea1bd20b 100644
--- a/marker/converters/__init__.py
+++ b/marker/converters/__init__.py
@@ -43,7 +43,12 @@ def resolve_dependencies(self, cls):
     def initialize_processors(self, processor_cls_lst: List[Type[BaseProcessor]]) -> List[BaseProcessor]:
         processors = []
         for processor_cls in processor_cls_lst:
-            processors.append(self.resolve_dependencies(processor_cls))
+            if callable(processor_cls) and processor_cls.__name__ == 'get_equation_processor':
+                # Special case for equation processor
+                processor = processor_cls(config=self.config)
+            else:
+                processor = self.resolve_dependencies(processor_cls)
+            processors.append(processor)
 
         simple_llm_processors = [p for p in processors if issubclass(type(p), BaseLLMSimpleBlockProcessor)]
         other_processors = [p for p in processors if not issubclass(type(p), BaseLLMSimpleBlockProcessor)]
diff --git a/marker/converters/ocr.py b/marker/converters/ocr.py
index f7e562553..2dab5cc5a 100644
--- a/marker/converters/ocr.py
+++ b/marker/converters/ocr.py
@@ -5,13 +5,15 @@
 from marker.builders.ocr import OcrBuilder
 from marker.converters.pdf import PdfConverter
 from marker.processors import BaseProcessor
-from marker.processors.equation import EquationProcessor
+from marker.processors.equation import get_equation_processor
 from marker.providers.registry import provider_from_filepath
 from marker.renderers.ocr_json import OCRJSONRenderer
+from marker.providers.mathpix import MathpixProvider
+from marker.settings import settings
 
 
 class OCRConverter(PdfConverter):
-    default_processors: Tuple[BaseProcessor, ...] = (EquationProcessor,)
+    default_processors: Tuple[BaseProcessor, ...] = (get_equation_processor(),)
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -21,6 +23,12 @@ def __init__(self, *args, **kwargs):
 
         self.config["format_lines"] = True
         self.renderer = OCRJSONRenderer
+        
+        # Initialize Mathpix provider
+        self.mathpix_provider = MathpixProvider(
+            app_id=settings.MATHPIX_APP_ID,
+            app_key=settings.MATHPIX_APP_KEY
+        )
 
     def build_document(self, filepath: str):
         provider_cls = provider_from_filepath(filepath)
@@ -32,6 +40,7 @@ def build_document(self, filepath: str):
         provider = provider_cls(filepath, self.config)
         document = document_builder(provider, layout_builder, line_builder, ocr_builder)
 
+        # Initialize processors
         for processor in self.processor_list:
             processor(document)
 
diff --git a/marker/converters/pdf.py b/marker/converters/pdf.py
index 21a604431..20b0f20c5 100644
--- a/marker/converters/pdf.py
+++ b/marker/converters/pdf.py
@@ -24,7 +24,7 @@
 from marker.processors.code import CodeProcessor
 from marker.processors.debug import DebugProcessor
 from marker.processors.document_toc import DocumentTOCProcessor
-from marker.processors.equation import EquationProcessor
+from marker.processors.equation import get_equation_processor
 from marker.processors.footnote import FootnoteProcessor
 from marker.processors.ignoretext import IgnoreTextProcessor
 from marker.processors.line_numbers import LineNumbersProcessor
@@ -73,7 +73,7 @@ class PdfConverter(BaseConverter):
         BlockquoteProcessor,
         CodeProcessor,
         DocumentTOCProcessor,
-        EquationProcessor,
+        get_equation_processor,
         FootnoteProcessor,
         IgnoreTextProcessor,
         LineNumbersProcessor,
diff --git a/marker/processors/equation.py b/marker/processors/equation.py
index 36124645c..6cc83c98d 100644
--- a/marker/processors/equation.py
+++ b/marker/processors/equation.py
@@ -4,25 +4,58 @@
 from bs4 import BeautifulSoup
 
 from ftfy import fix_text, TextFixerConfig
-from surya.recognition import RecognitionPredictor, OCRResult
-
 from marker.processors import BaseProcessor
 from marker.schema import BlockTypes
 from marker.schema.document import Document
 from marker.settings import settings
+from marker.providers.mathpix import MathpixProvider
+from surya.recognition import RecognitionPredictor, OCRResult
 
 MATH_TAG_PATTERN = re.compile(r"<math[^>]*>(.*?)</math>")
 
-
-class EquationProcessor(BaseProcessor):
+class BaseEquationProcessor(BaseProcessor):
     """
-    A processor for recognizing equations in the document.
+    Base class for equation processors.
     """
-
     block_types: Annotated[
         Tuple[BlockTypes],
         "The block types to process.",
     ] = (BlockTypes.Equation,)
+
+    def fix_latex(self, math_html: str):
+        math_html = math_html.strip()
+        soup = BeautifulSoup(math_html, "html.parser")
+        opening_math_tag = soup.find("math")
+
+        # No math block found
+        if not opening_math_tag:
+            return ""
+
+        # Force block format
+        opening_math_tag.attrs["display"] = "block"
+        fixed_math_html = str(soup)
+
+        # Sometimes model outputs newlines at the beginning/end of tags
+        fixed_math_html = re.sub(
+            r"^<math display=\"block\">\\n(?![a-zA-Z])",
+            '<math display="block">',
+            fixed_math_html,
+        )
+        fixed_math_html = re.sub(r"\\n</math>$", "</math>", fixed_math_html)
+        fixed_math_html = fix_text(
+            fixed_math_html, config=TextFixerConfig(unescape_html=True)
+        )
+        return fixed_math_html
+
+    def process_equations(self, document: Document):
+        """Process equations in the document. To be implemented by subclasses."""
+        raise NotImplementedError
+
+
+class OriginalEquationProcessor(BaseEquationProcessor):
+    """
+    Original equation processor using the recognition model.
+    """
     model_max_length: Annotated[
         int,
         "The maximum number of tokens to allow for the Recognition model.",
@@ -39,7 +72,6 @@ class EquationProcessor(BaseProcessor):
 
     def __init__(self, recognition_model: RecognitionPredictor, config=None):
         super().__init__(config)
-
         self.recognition_model = recognition_model
 
     def get_batch_size(self):
@@ -93,31 +125,6 @@ def __call__(self, document: Document):
                 block = document.get_block(block_id)
                 block.html = self.fix_latex(block_prediction)
 
-    def fix_latex(self, math_html: str):
-        math_html = math_html.strip()
-        soup = BeautifulSoup(math_html, "html.parser")
-        opening_math_tag = soup.find("math")
-
-        # No math block found
-        if not opening_math_tag:
-            return ""
-
-        # Force block format
-        opening_math_tag.attrs["display"] = "block"
-        fixed_math_html = str(soup)
-
-        # Sometimes model outputs newlines at the beginning/end of tags
-        fixed_math_html = re.sub(
-            r"^<math display=\"block\">\\n(?![a-zA-Z])",
-            '<math display="block">',
-            fixed_math_html,
-        )
-        fixed_math_html = re.sub(r"\\n</math>$", "</math>", fixed_math_html)
-        fixed_math_html = fix_text(
-            fixed_math_html, config=TextFixerConfig(unescape_html=True)
-        )
-        return fixed_math_html
-
     def get_latex_batched(
         self,
         page_images: List[Image.Image],
@@ -138,3 +145,85 @@ def get_latex_batched(
         ]
 
         return equation_predictions
+
+
+class MathpixEquationProcessor(BaseEquationProcessor):
+    """
+    Equation processor using Mathpix API.
+    """
+    def __init__(self, mathpix_provider: MathpixProvider, config=None):
+        super().__init__(config)
+        self.mathpix_provider = mathpix_provider
+
+    def __call__(self, document: Document):
+        images = []
+        equation_boxes = []
+        equation_block_ids = []
+        total_equation_blocks = 0
+
+        for page in document.pages:
+            page_image = page.get_image(highres=True)
+            page_size = page.polygon.width, page.polygon.height
+            image_size = page_image.size
+
+            page_equation_boxes = []
+            page_equation_block_ids = []
+            equation_blocks = page.contained_blocks(document, self.block_types)
+            for block in equation_blocks:
+                page_equation_boxes.append(
+                    block.polygon.rescale(page_size, image_size).bbox
+                )
+                page_equation_block_ids.append(block.id)
+                total_equation_blocks += 1
+
+            images.append(page_image)
+            equation_boxes.append(page_equation_boxes)
+            equation_block_ids.append(page_equation_block_ids)
+
+        if total_equation_blocks == 0:
+            return
+
+        # Process each equation with Mathpix
+        for page_idx, (page_image, page_boxes, page_block_ids) in enumerate(
+            zip(images, equation_boxes, equation_block_ids)
+        ):
+            for box_idx, (box, block_id) in enumerate(zip(page_boxes, page_block_ids)):
+                # Crop the equation from the page
+                x1, y1, x2, y2 = [int(coord) for coord in box]
+                equation_image = page_image.crop((x1, y1, x2, y2))
+                
+                # Process with Mathpix
+                try:
+                    result = self.mathpix_provider.process_equation(equation_image)
+                    
+                    # Extract LaTeX from the result
+                    latex = result.get('latex_styled', '')
+                    if latex:
+                        # Wrap in math tags
+                        block = document.get_block(block_id)
+                        block.html = self.fix_latex(f'<math display="block">{latex}</math>')
+                except Exception as e:
+                    print(f"Error processing equation {block_id}: {str(e)}")
+                    continue
+
+
+def get_equation_processor(config=None):
+    """
+    Factory function to get the appropriate equation processor based on settings.
+    """
+    if settings.EQUATION_PROCESSOR == "mathpix":
+        if not settings.MATHPIX_APP_ID or not settings.MATHPIX_APP_KEY:
+            raise ValueError("Mathpix API credentials not configured")
+        return MathpixEquationProcessor(
+            mathpix_provider=MathpixProvider(
+                app_id=settings.MATHPIX_APP_ID,
+                app_key=settings.MATHPIX_APP_KEY
+            ),
+            config=config
+        )
+    else:
+        from surya.recognition import RecognitionPredictor
+        return OriginalEquationProcessor(
+            recognition_model=RecognitionPredictor(),
+            config=config
+        )
diff --git a/marker/providers/mathpix.py b/marker/providers/mathpix.py
new file mode 100644
index 000000000..daa35bcb2
--- /dev/null
+++ b/marker/providers/mathpix.py
@@ -0,0 +1,58 @@
+from typing import Optional, Dict, Any
+import requests
+from PIL import Image
+import io
+import base64
+
+class MathpixProvider:
+    def __init__(self, app_id: str, app_key: str):
+        self.app_id = app_id
+        self.app_key = app_key
+        self.api_url = "https://api.mathpix.com/v3/text"
+        
+    def _encode_image(self, image: Image.Image) -> str:
+        """Convert PIL Image to base64 string"""
+        buffered = io.BytesIO()
+        image.save(buffered, format="PNG")
+        return base64.b64encode(buffered.getvalue()).decode()
+
+    def process_equation(self, image: Image.Image, options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+        """
+        Process an equation image using Mathpix API
+        
+        Args:
+            image: PIL Image containing the equation
+            options: Additional options for Mathpix API
+            
+        Returns:
+            Dict containing the processed equation data
+        """
+        if options is None:
+            options = {}
+            
+        # Prepare the request
+        headers = {
+            "app_id": self.app_id,
+            "app_key": self.app_key,
+            "Content-Type": "application/json"
+        }
+        
+        # Convert image to base64
+        image_data = self._encode_image(image)
+        
+        # Prepare request body
+        data = {
+            "src": f"data:image/png;base64,{image_data}",
+            "formats": ["text", "latex_styled"],
+            "data_options": {
+                "include_asciimath": True,
+                "include_latex": True
+            },
+            **options
+        }
+        
+        # Make API request
+        response = requests.post(self.api_url, headers=headers, json=data)
+        response.raise_for_status()
+        
+        return response.json() 
\ No newline at end of file
diff --git a/marker/settings.py b/marker/settings.py
index 5660ada90..d31a51e61 100644
--- a/marker/settings.py
+++ b/marker/settings.py
@@ -1,11 +1,13 @@
-from typing import Optional
+from typing import Optional, Literal
 
-from dotenv import find_dotenv
+from dotenv import find_dotenv, load_dotenv
 from pydantic import computed_field
 from pydantic_settings import BaseSettings
 import torch
 import os
 
+# Load environment variables from .env file
+load_dotenv(find_dotenv(".env"))
 
 class Settings(BaseSettings):
     # Paths
@@ -30,6 +32,11 @@ class Settings(BaseSettings):
         None  # Note: MPS device does not work for text detection, and will default to CPU
     )
 
+    # Equation processing settings
+    EQUATION_PROCESSOR: Literal["mathpix", "original"] = "original"  # Default to original implementation
+    MATHPIX_APP_ID: str = os.getenv("MATHPIX_APP_ID", "")
+    MATHPIX_APP_KEY: str = os.getenv("MATHPIX_APP_KEY", "")
+
     @computed_field
     @property
     def TORCH_DEVICE_MODEL(self) -> str:

From 4c6c7c2f4155b03b176a9848d66c573116bfb85b Mon Sep 17 00:00:00 2001
From: Eliska Suchardova <eliska.suchardova@gmail.com>
Date: Thu, 29 May 2025 12:27:17 +0200
Subject: [PATCH 2/7] Refactoring mathpix implementation

---
 marker/converters/ocr.py      |   4 +-
 marker/converters/pdf.py      |   4 +-
 marker/processors/equation.py | 187 +++++++++++++---------------------
 3 files changed, 74 insertions(+), 121 deletions(-)

diff --git a/marker/converters/ocr.py b/marker/converters/ocr.py
index 2dab5cc5a..38ee71dd1 100644
--- a/marker/converters/ocr.py
+++ b/marker/converters/ocr.py
@@ -5,7 +5,7 @@
 from marker.builders.ocr import OcrBuilder
 from marker.converters.pdf import PdfConverter
 from marker.processors import BaseProcessor
-from marker.processors.equation import get_equation_processor
+from marker.processors.equation import EquationProcessor
 from marker.providers.registry import provider_from_filepath
 from marker.renderers.ocr_json import OCRJSONRenderer
 from marker.providers.mathpix import MathpixProvider
@@ -13,7 +13,7 @@
 
 
 class OCRConverter(PdfConverter):
-    default_processors: Tuple[BaseProcessor, ...] = (get_equation_processor(),)
+    default_processors: Tuple[BaseProcessor, ...] = (EquationProcessor,)
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
diff --git a/marker/converters/pdf.py b/marker/converters/pdf.py
index 20b0f20c5..21a604431 100644
--- a/marker/converters/pdf.py
+++ b/marker/converters/pdf.py
@@ -24,7 +24,7 @@
 from marker.processors.code import CodeProcessor
 from marker.processors.debug import DebugProcessor
 from marker.processors.document_toc import DocumentTOCProcessor
-from marker.processors.equation import get_equation_processor
+from marker.processors.equation import EquationProcessor
 from marker.processors.footnote import FootnoteProcessor
 from marker.processors.ignoretext import IgnoreTextProcessor
 from marker.processors.line_numbers import LineNumbersProcessor
@@ -73,7 +73,7 @@ class PdfConverter(BaseConverter):
         BlockquoteProcessor,
         CodeProcessor,
         DocumentTOCProcessor,
-        get_equation_processor,
+        EquationProcessor,
         FootnoteProcessor,
         IgnoreTextProcessor,
         LineNumbersProcessor,
diff --git a/marker/processors/equation.py b/marker/processors/equation.py
index 6cc83c98d..836d286e6 100644
--- a/marker/processors/equation.py
+++ b/marker/processors/equation.py
@@ -13,49 +13,14 @@
 
 MATH_TAG_PATTERN = re.compile(r"<math[^>]*>(.*?)</math>")
 
-class BaseEquationProcessor(BaseProcessor):
+class EquationProcessor(BaseProcessor):
     """
-    Base class for equation processors.
+    A processor for recognizing equations in the document.
     """
     block_types: Annotated[
         Tuple[BlockTypes],
         "The block types to process.",
     ] = (BlockTypes.Equation,)
-
-    def fix_latex(self, math_html: str):
-        math_html = math_html.strip()
-        soup = BeautifulSoup(math_html, "html.parser")
-        opening_math_tag = soup.find("math")
-
-        # No math block found
-        if not opening_math_tag:
-            return ""
-
-        # Force block format
-        opening_math_tag.attrs["display"] = "block"
-        fixed_math_html = str(soup)
-
-        # Sometimes model outputs newlines at the beginning/end of tags
-        fixed_math_html = re.sub(
-            r"^<math display=\"block\">\\n(?![a-zA-Z])",
-            '<math display="block">',
-            fixed_math_html,
-        )
-        fixed_math_html = re.sub(r"\\n</math>$", "</math>", fixed_math_html)
-        fixed_math_html = fix_text(
-            fixed_math_html, config=TextFixerConfig(unescape_html=True)
-        )
-        return fixed_math_html
-
-    def process_equations(self, document: Document):
-        """Process equations in the document. To be implemented by subclasses."""
-        raise NotImplementedError
-
-
-class OriginalEquationProcessor(BaseEquationProcessor):
-    """
-    Original equation processor using the recognition model.
-    """
     model_max_length: Annotated[
         int,
         "The maximum number of tokens to allow for the Recognition model.",
@@ -69,10 +34,24 @@ class OriginalEquationProcessor(BaseEquationProcessor):
         bool,
         "Whether to disable the tqdm progress bar.",
     ] = False
+    use_mathpix: Annotated[
+        bool,
+        "Whether to use Mathpix for equation processing.",
+    ] = False
 
     def __init__(self, recognition_model: RecognitionPredictor, config=None):
         super().__init__(config)
         self.recognition_model = recognition_model
+        
+        if config and config.get("use_mathpix", False):
+            self.use_mathpix = True
+            
+            if not settings.MATHPIX_APP_ID or not settings.MATHPIX_APP_KEY:
+                raise ValueError("Mathpix API credentials not configured")
+            self.mathpix_provider = MathpixProvider(
+                app_id=settings.MATHPIX_APP_ID,
+                app_key=settings.MATHPIX_APP_KEY
+            )
 
     def get_batch_size(self):
         # Set to 1/4th of OCR batch size due to sequence length with tiling
@@ -112,6 +91,35 @@ def __call__(self, document: Document):
         if total_equation_blocks == 0:
             return
 
+        if self.use_mathpix:
+            self._process_with_mathpix(images, equation_boxes, equation_block_ids, document)
+        else:
+            self._process_with_recognition(images, equation_boxes, equation_block_ids, document)
+
+    def _process_with_mathpix(self, images, equation_boxes, equation_block_ids, document):
+        for page_idx, (page_image, page_boxes, page_block_ids) in enumerate(
+            zip(images, equation_boxes, equation_block_ids)
+        ):
+            for box_idx, (box, block_id) in enumerate(zip(page_boxes, page_block_ids)):
+                # Crop the equation from the page
+                x1, y1, x2, y2 = [int(coord) for coord in box]
+                equation_image = page_image.crop((x1, y1, x2, y2))
+                
+                # Process with Mathpix
+                try:
+                    result = self.mathpix_provider.process_equation(equation_image)
+                    
+                    # Extract LaTeX from the result
+                    latex = result.get('latex_styled', '')
+                    if latex:
+                        # Wrap in math tags
+                        block = document.get_block(block_id)
+                        block.html = self.fix_latex(f'<math display="block">{latex}</math>')
+                except Exception as e:
+                    print(f"Error processing equation {block_id}: {str(e)}")
+                    continue
+
+    def _process_with_recognition(self, images, equation_boxes, equation_block_ids, document):
         predictions = self.get_latex_batched(images, equation_boxes)
         for page_predictions, page_equation_block_ids in zip(
             predictions, equation_block_ids
@@ -125,6 +133,31 @@ def __call__(self, document: Document):
                 block = document.get_block(block_id)
                 block.html = self.fix_latex(block_prediction)
 
+    def fix_latex(self, math_html: str):
+        math_html = math_html.strip()
+        soup = BeautifulSoup(math_html, "html.parser")
+        opening_math_tag = soup.find("math")
+
+        # No math block found
+        if not opening_math_tag:
+            return ""
+
+        # Force block format
+        opening_math_tag.attrs["display"] = "block"
+        fixed_math_html = str(soup)
+
+        # Sometimes model outputs newlines at the beginning/end of tags
+        fixed_math_html = re.sub(
+            r"^<math display=\"block\">\\n(?![a-zA-Z])",
+            '<math display="block">',
+            fixed_math_html,
+        )
+        fixed_math_html = re.sub(r"\\n</math>$", "</math>", fixed_math_html)
+        fixed_math_html = fix_text(
+            fixed_math_html, config=TextFixerConfig(unescape_html=True)
+        )
+        return fixed_math_html
+
     def get_latex_batched(
         self,
         page_images: List[Image.Image],
@@ -147,83 +180,3 @@ def get_latex_batched(
         return equation_predictions
 
 
-class MathpixEquationProcessor(BaseEquationProcessor):
-    """
-    Equation processor using Mathpix API.
-    """
-    def __init__(self, mathpix_provider: MathpixProvider, config=None):
-        super().__init__(config)
-        self.mathpix_provider = mathpix_provider
-
-    def __call__(self, document: Document):
-        images = []
-        equation_boxes = []
-        equation_block_ids = []
-        total_equation_blocks = 0
-
-        for page in document.pages:
-            page_image = page.get_image(highres=True)
-            page_size = page.polygon.width, page.polygon.height
-            image_size = page_image.size
-
-            page_equation_boxes = []
-            page_equation_block_ids = []
-            equation_blocks = page.contained_blocks(document, self.block_types)
-            for block in equation_blocks:
-                page_equation_boxes.append(
-                    block.polygon.rescale(page_size, image_size).bbox
-                )
-                page_equation_block_ids.append(block.id)
-                total_equation_blocks += 1
-
-            images.append(page_image)
-            equation_boxes.append(page_equation_boxes)
-            equation_block_ids.append(page_equation_block_ids)
-
-        if total_equation_blocks == 0:
-            return
-
-        # Process each equation with Mathpix
-        for page_idx, (page_image, page_boxes, page_block_ids) in enumerate(
-            zip(images, equation_boxes, equation_block_ids)
-        ):
-            for box_idx, (box, block_id) in enumerate(zip(page_boxes, page_block_ids)):
-                # Crop the equation from the page
-                x1, y1, x2, y2 = [int(coord) for coord in box]
-                equation_image = page_image.crop((x1, y1, x2, y2))
-                
-                # Process with Mathpix
-                try:
-                    result = self.mathpix_provider.process_equation(equation_image)
-                    
-                    # Extract LaTeX from the result
-                    latex = result.get('latex_styled', '')
-                    if latex:
-                        # Wrap in math tags
-                        block = document.get_block(block_id)
-                        block.html = self.fix_latex(f'<math display="block">{latex}</math>')
-                except Exception as e:
-                    print(f"Error processing equation {block_id}: {str(e)}")
-                    continue
-
-
-def get_equation_processor(config=None):
-    """
-    Factory function to get the appropriate equation processor based on settings.
-    """
-    if settings.EQUATION_PROCESSOR == "mathpix":
-        if not settings.MATHPIX_APP_ID or not settings.MATHPIX_APP_KEY:
-            raise ValueError("Mathpix API credentials not configured")
-        return MathpixEquationProcessor(
-            mathpix_provider=MathpixProvider(
-                app_id=settings.MATHPIX_APP_ID,
-                app_key=settings.MATHPIX_APP_KEY
-            ),
-            config=config
-        )
-    else:
-        from surya.recognition import RecognitionPredictor
-        return OriginalEquationProcessor(
-            recognition_model=RecognitionPredictor(),
-            config=config
-        )

From 69a15325f206d41daf9ed663dee99b946c33caff Mon Sep 17 00:00:00 2001
From: Eliska Suchardova <eliska.suchardova@gmail.com>
Date: Thu, 29 May 2025 12:32:20 +0200
Subject: [PATCH 3/7] Adjusting config

---
 marker/config/parser.py       | 3 +--
 marker/processors/equation.py | 6 ++----
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/marker/config/parser.py b/marker/config/parser.py
index e3543beef..1af9c2dc2 100644
--- a/marker/config/parser.py
+++ b/marker/config/parser.py
@@ -113,8 +113,7 @@ def generate_config_dict(self) -> Dict[str, any]:
                 case "disable_image_extraction":
                     config["extract_images"] = False
                 case "mathpix":
-                    if v:
-                        settings.EQUATION_PROCESSOR = "mathpix"
+                    config["use_mathpix"] = v
                 case _:
                     if k in crawler.attr_set:
                         config[k] = v
diff --git a/marker/processors/equation.py b/marker/processors/equation.py
index 836d286e6..fb747536b 100644
--- a/marker/processors/equation.py
+++ b/marker/processors/equation.py
@@ -42,10 +42,8 @@ class EquationProcessor(BaseProcessor):
     def __init__(self, recognition_model: RecognitionPredictor, config=None):
         super().__init__(config)
         self.recognition_model = recognition_model
-        
-        if config and config.get("use_mathpix", False):
-            self.use_mathpix = True
-            
+
+        if self.use_mathpix == True:
             if not settings.MATHPIX_APP_ID or not settings.MATHPIX_APP_KEY:
                 raise ValueError("Mathpix API credentials not configured")
             self.mathpix_provider = MathpixProvider(

From a6a5d45f64b8026193a0c29571a6f8c65339d86b Mon Sep 17 00:00:00 2001
From: Eliska Suchardova <eliska.suchardova@gmail.com>
Date: Thu, 29 May 2025 12:44:21 +0200
Subject: [PATCH 4/7] Removing unnecessary code

---
 marker/converters/__init__.py | 9 ++-------
 marker/settings.py            | 1 -
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/marker/converters/__init__.py b/marker/converters/__init__.py
index fea1bd20b..be56fbfe2 100644
--- a/marker/converters/__init__.py
+++ b/marker/converters/__init__.py
@@ -43,13 +43,8 @@ def resolve_dependencies(self, cls):
     def initialize_processors(self, processor_cls_lst: List[Type[BaseProcessor]]) -> List[BaseProcessor]:
         processors = []
         for processor_cls in processor_cls_lst:
-            if callable(processor_cls) and processor_cls.__name__ == 'get_equation_processor':
-                # Special case for equation processor
-                processor = processor_cls(config=self.config)
-            else:
-                processor = self.resolve_dependencies(processor_cls)
-            processors.append(processor)
-
+            processors.append(self.resolve_dependencies(processor_cls))
+            
         simple_llm_processors = [p for p in processors if issubclass(type(p), BaseLLMSimpleBlockProcessor)]
         other_processors = [p for p in processors if not issubclass(type(p), BaseLLMSimpleBlockProcessor)]
 
diff --git a/marker/settings.py b/marker/settings.py
index d31a51e61..d5d606150 100644
--- a/marker/settings.py
+++ b/marker/settings.py
@@ -33,7 +33,6 @@ class Settings(BaseSettings):
     )
 
     # Equation processing settings
-    EQUATION_PROCESSOR: Literal["mathpix", "original"] = "original"  # Default to original implementation
     MATHPIX_APP_ID: str = os.getenv("MATHPIX_APP_ID", "")
     MATHPIX_APP_KEY: str = os.getenv("MATHPIX_APP_KEY", "")
 

From 4332294d6203b1c714e48e348a738b6f9fcb0357 Mon Sep 17 00:00:00 2001
From: Eliska Suchardova <eliska.suchardova@gmail.com>
Date: Thu, 29 May 2025 14:16:00 +0200
Subject: [PATCH 5/7] Mathpix processes equations and InlineMath

---
 marker/processors/equation.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/marker/processors/equation.py b/marker/processors/equation.py
index fb747536b..83f71fb86 100644
--- a/marker/processors/equation.py
+++ b/marker/processors/equation.py
@@ -50,6 +50,8 @@ def __init__(self, recognition_model: RecognitionPredictor, config=None):
                 app_id=settings.MATHPIX_APP_ID,
                 app_key=settings.MATHPIX_APP_KEY
             )
+            # Add TextInlineMath to block types when Mathpix is enabled
+            self.block_types = (BlockTypes.Equation, BlockTypes.TextInlineMath)
 
     def get_batch_size(self):
         # Set to 1/4th of OCR batch size due to sequence length with tiling

From b3cc3e416465a32270046e6c4db3dd710fdb2c8d Mon Sep 17 00:00:00 2001
From: Eliska Suchardova <eliska.suchardova@gmail.com>
Date: Thu, 29 May 2025 16:43:12 +0200
Subject: [PATCH 6/7] Testing script for box filtering

---
 test_clipping.py            | 110 ++++++++++++++++++++++++++++++++++++
 visualize_matching_boxes.py | 104 ++++++++++++++++++++++++++++++++++
 2 files changed, 214 insertions(+)
 create mode 100644 test_clipping.py
 create mode 100644 visualize_matching_boxes.py

diff --git a/test_clipping.py b/test_clipping.py
new file mode 100644
index 000000000..7fc9d65a4
--- /dev/null
+++ b/test_clipping.py
@@ -0,0 +1,110 @@
+import os
+import argparse
+import pypdfium2 as pdfium
+from PIL import Image, ImageDraw
+import pypdfium2.raw as pdfium_c
+import ctypes
+
+def get_text_from_raw_text_obj(raw_text_obj, page):
+    """Read one raw text object's text; "" is returned when nothing is read."""
+    text_handle = pdfium_c.FPDFText_LoadPage(page.raw)
+    if not text_handle:
+        return ""
+    try:
+        size = pdfium_c.FPDFTextObj_GetText(raw_text_obj, text_handle, None, 0)
+        if size > 0:
+            storage = (ctypes.c_ushort * size)()
+            pdfium_c.FPDFTextObj_GetText(raw_text_obj, text_handle, storage, size)
+            return bytearray(storage).decode('utf-16-le').rstrip('\x00')
+        return ""
+    finally:
+        pdfium_c.FPDFText_ClosePage(text_handle)
+
+def draw_text_objects(page, clip_region, output_path):
+    objects = list(page.get_objects())
+    scale = 2.0
+    bitmap = page.render(scale=scale)
+    pil_image = bitmap.to_pil()
+    page_width, page_height = pil_image.size
+    boxes_image = Image.new('RGB', (page_width, page_height), 'white')
+    boxes_draw = ImageDraw.Draw(boxes_image)
+    # Only text objects whose stripped text equals "30075" are drawn; the render above just supplies the canvas size.
+    found = False
+    for i, obj in enumerate(objects):
+        if obj.type == 1:  # 1 = text object
+            text = get_text_from_raw_text_obj(obj.raw, page)
+            if text and text.strip() == "30075":
+                found = True
+                print(f"Found text '30075' in object {i+1}")
+                # Outline the object's bounds in red; y is subtracted from the image height to flip the axis
+                left = ctypes.c_float()
+                bottom = ctypes.c_float()
+                right = ctypes.c_float()
+                top = ctypes.c_float()
+                success = pdfium_c.FPDFPageObj_GetBounds(obj.raw, ctypes.byref(left), ctypes.byref(bottom), ctypes.byref(right), ctypes.byref(top))
+                if success:
+                    print(f"Object bounding box: left={left.value}, bottom={bottom.value}, right={right.value}, top={top.value}")
+                    pil_top = page_height - (top.value * scale)
+                    pil_bottom = page_height - (bottom.value * scale)
+                    pil_left = left.value * scale
+                    pil_right = right.value * scale
+                    boxes_draw.rectangle([pil_left, pil_top, pil_right, pil_bottom], outline='red', width=3)
+                else:
+                    print("Could not get object bounding box.")
+                # Trace the object's clip path in blue, provided the raw bindings expose the path-segment functions
+                try:
+                    clip_path = pdfium_c.FPDFPageObj_GetClipPath(obj.raw)
+                    if clip_path:
+                        if all(hasattr(pdfium_c, fn) for fn in [
+                            "FPDFClipPath_CountPaths", "FPDFClipPath_CountPathSegments", "FPDFClipPath_GetPathSegment", "FPDFPathSegment_GetPoint", "FPDFPathSegment_GetType", "FPDFPathSegment_GetClose"]):
+                            num_paths = pdfium_c.FPDFClipPath_CountPaths(clip_path)
+                            for path_idx in range(num_paths):
+                                num_segs = pdfium_c.FPDFClipPath_CountPathSegments(clip_path, path_idx)
+                                points = []
+                                for seg_idx in range(num_segs):
+                                    seg = pdfium_c.FPDFClipPath_GetPathSegment(clip_path, path_idx, seg_idx)
+                                    x = ctypes.c_float()
+                                    y = ctypes.c_float()
+                                    pdfium_c.FPDFPathSegment_GetPoint(seg, ctypes.byref(x), ctypes.byref(y))
+                                    pil_x = x.value * scale
+                                    pil_y = page_height - (y.value * scale)
+                                    points.append((pil_x, pil_y))
+                                    # Print each segment point in both PDF space and scaled image space
+                                    print(f"Object {i+1}, path {path_idx}, seg {seg_idx}: PDF ({x.value}, {y.value}) -> PIL ({pil_x}, {pil_y})")
+                                # The close flag of the last segment selects a polygon or an open polyline
+                                closed = False
+                                if num_segs > 0:
+                                    last_seg = pdfium_c.FPDFClipPath_GetPathSegment(clip_path, path_idx, num_segs-1)
+                                    closed = bool(pdfium_c.FPDFPathSegment_GetClose(last_seg))
+                                if len(points) > 1:
+                                    if closed:
+                                        boxes_draw.polygon(points, outline='blue')
+                                    else:
+                                        boxes_draw.line(points, fill='blue', width=3)
+                        else:
+                            print("Clipping path exists, but path segment functions are not available in this pypdfium2 version.")
+                    else:
+                        print("No clipping path for this object.")
+                except Exception as e:
+                    print(f"No clipping path or error: {e}")
+    if not found:
+        print("No text object with text '30075' found.")
+    # Every '.png' in output_path becomes '_boxes.png'; the clip_region argument is never read.
+    boxes_output_path = output_path.replace('.png', '_boxes.png')
+    boxes_image.save(boxes_output_path)
+    print(f"Boxes-only visualization saved to {boxes_output_path}")
+
+def main():
+    """Parse the command line and draw the chosen page into ./test_output."""
+    cli = argparse.ArgumentParser(description='Visualize PDF text objects with clipping')
+    cli.add_argument('pdf_path', help='Path to the PDF file')
+    cli.add_argument('--page', type=int, default=0, help='Page number (0-based)')
+    opts = cli.parse_args()
+    os.makedirs("test_output", exist_ok=True)
+    # draw_text_objects derives its final file name from this path.
+    target = os.path.join("test_output", f"page_{opts.page}.png")
+    doc = pdfium.PdfDocument(opts.pdf_path)
+    draw_text_objects(doc[opts.page], None, target)
+
+if __name__ == '__main__':
+    main()
diff --git a/visualize_matching_boxes.py b/visualize_matching_boxes.py
new file mode 100644
index 000000000..f70939851
--- /dev/null
+++ b/visualize_matching_boxes.py
@@ -0,0 +1,104 @@
+import argparse
+import pypdfium2 as pdfium
+import pypdfium2.raw as pdfium_c
+import ctypes
+from PIL import Image, ImageDraw
+
+def get_text_from_raw_text_obj(raw_text_obj, page):
+    textpage = pdfium_c.FPDFText_LoadPage(page.raw)
+    if not textpage:
+        return ""
+    try:
+        buflen = pdfium_c.FPDFTextObj_GetText(raw_text_obj, textpage, None, 0)
+        if buflen <= 0:
+            return ""
+        buf = (ctypes.c_ushort * buflen)()
+        pdfium_c.FPDFTextObj_GetText(raw_text_obj, textpage, buf, buflen)
+        return bytearray(buf).decode('utf-16-le').rstrip('\x00')
+    finally:
+        pdfium_c.FPDFText_ClosePage(textpage)
+
+def boxes_are_equal(box1, box2, tol=1e-2):
+    return all(abs(a - b) < tol for a, b in zip(box1, box2))
+
+def draw_matching_boxes(page, output_path):
+    objects = list(page.get_objects())
+    scale = 2.0
+    bitmap = page.render(scale=scale)
+    pil_image = bitmap.to_pil()
+    page_width, page_height = pil_image.size
+    boxes_draw = ImageDraw.Draw(pil_image)
+
+    for i, obj in enumerate(objects):
+        # Get object bounding box
+        left = ctypes.c_float()
+        bottom = ctypes.c_float()
+        right = ctypes.c_float()
+        top = ctypes.c_float()
+        success = pdfium_c.FPDFPageObj_GetBounds(obj.raw, ctypes.byref(left), ctypes.byref(bottom), ctypes.byref(right), ctypes.byref(top))
+        if not success:
+            print(f"Object {i+1}: Could not get object bounding box.")
+            continue
+        obj_box = (left.value, bottom.value, right.value, top.value)
+        print(f"Object {i+1}: Bounding box: {obj_box}")
+        # Try to get clipping path
+        show_box = True
+        try:
+            clip_path = pdfium_c.FPDFPageObj_GetClipPath(obj.raw)
+            if clip_path and all(hasattr(pdfium_c, fn) for fn in [
+                "FPDFClipPath_CountPaths", "FPDFClipPath_CountPathSegments", "FPDFClipPath_GetPathSegment", "FPDFPathSegment_GetPoint", "FPDFPathSegment_GetType", "FPDFPathSegment_GetClose"]):
+                num_paths = pdfium_c.FPDFClipPath_CountPaths(clip_path)
+                for path_idx in range(num_paths):
+                    num_segs = pdfium_c.FPDFClipPath_CountPathSegments(clip_path, path_idx)
+                    points = []
+                    for seg_idx in range(num_segs):
+                        seg = pdfium_c.FPDFClipPath_GetPathSegment(clip_path, path_idx, seg_idx)
+                        x = ctypes.c_float()
+                        y = ctypes.c_float()
+                        pdfium_c.FPDFPathSegment_GetPoint(seg, ctypes.byref(x), ctypes.byref(y))
+                        points.append((x.value, y.value))
+                    if len(points) >= 2:
+                        # Get the bounding box of the clip path
+                        xs = [pt[0] for pt in points]
+                        ys = [pt[1] for pt in points]
+                        clip_box = (min(xs), min(ys), max(xs), max(ys))
+                        print(f"Object {i+1}: Clip path box: {clip_box}")
+                        # Draw the clip path in blue
+                        pil_points = [(x * scale, page_height - (y * scale)) for x, y in points]
+                        closed = False
+                        if num_segs > 0:
+                            last_seg = pdfium_c.FPDFClipPath_GetPathSegment(clip_path, path_idx, num_segs-1)
+                            closed = bool(pdfium_c.FPDFPathSegment_GetClose(last_seg))
+                        if closed:
+                            boxes_draw.polygon(pil_points, outline='blue')
+                        else:
+                            boxes_draw.line(pil_points, fill='blue', width=3)
+                        # Only show the bounding box if it matches the clip box
+                        if not boxes_are_equal(obj_box, clip_box):
+                            show_box = False
+            else:
+                print(f"Object {i+1}: No usable clipping path.")
+        except Exception as e:
+            print(f"Object {i+1}: Error getting clip path: {e}")
+        if show_box:
+            pil_top = page_height - (obj_box[3] * scale)
+            pil_bottom = page_height - (obj_box[1] * scale)
+            pil_left = obj_box[0] * scale
+            pil_right = obj_box[2] * scale
+            boxes_draw.rectangle([pil_left, pil_top, pil_right, pil_bottom], outline='red', width=3)
+
+    pil_image.save(output_path)
+    print(f"Matching boxes visualization saved to {output_path}")
+
+def main():
+    parser = argparse.ArgumentParser(description='Visualize matching bounding boxes and clip paths for all objects')
+    parser.add_argument('pdf_path', help='Path to the PDF file')
+    parser.add_argument('--page', type=int, default=0, help='Page number (0-based)')
+    parser.add_argument('--output', type=str, default='matching_boxes.png', help='Output image path')
+    args = parser.parse_args()
+    doc = pdfium.PdfDocument(args.pdf_path)
+    page = doc[args.page]
+    draw_matching_boxes(page, args.output)
+
+if __name__ == '__main__':
+    main() 
\ No newline at end of file

From 166cd0ce45963874103a5f70ac3eed3a369be8c0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michal=20Va=CC=81cha?= <michal.vacha@live.com>
Date: Thu, 5 Jun 2025 21:44:44 +0200
Subject: [PATCH 7/7] improve clip visualization

---
 poetry.lock                 | 102 +++++++++++-----------
 pyproject.toml              |   6 +-
 visualize_matching_boxes.py | 167 +++++++++++++++++++++++++++++-------
 3 files changed, 191 insertions(+), 84 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 90b66ece0..29216101b 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand.
 
 [[package]]
 name = "aiohappyeyeballs"
@@ -3366,10 +3366,10 @@ files = [
 
 [package.dependencies]
 numpy = [
-    {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\""},
-    {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\""},
-    {version = ">=1.23.5", markers = "python_version >= \"3.11\""},
     {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
+    {version = ">=1.23.5", markers = "python_version == \"3.11\""},
+    {version = ">=1.21.4", markers = "python_version == \"3.10\" and platform_system == \"Darwin\""},
+    {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version == \"3.10\""},
 ]
 
 [[package]]
@@ -3466,9 +3466,9 @@ files = [
 
 [package.dependencies]
 numpy = [
-    {version = ">=1.22.4", markers = "python_version < \"3.11\""},
-    {version = ">=1.23.2", markers = "python_version == \"3.11\""},
     {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
+    {version = ">=1.23.2", markers = "python_version == \"3.11\""},
+    {version = ">=1.22.4", markers = "python_version < \"3.11\""},
 ]
 python-dateutil = ">=2.8.2"
 pytz = ">=2020.1"
@@ -3532,18 +3532,20 @@ name = "pdftext"
 version = "0.6.2"
 description = "Extract structured text from pdfs quickly"
 optional = false
-python-versions = "<4.0,>=3.10"
+python-versions = "^3.10"
 groups = ["main"]
-files = [
-    {file = "pdftext-0.6.2-py3-none-any.whl", hash = "sha256:905d11e62d548e307933c25865a69c8e993947bb5b40b1535b0a2aa8f07a71d4"},
-    {file = "pdftext-0.6.2.tar.gz", hash = "sha256:ff5b92462ac03ae63a23429384ae123d45c162dcda30e7bf2c5c92a6b208c9de"},
-]
+files = []
+develop = true
 
 [package.dependencies]
-click = ">=8.1.8,<9.0.0"
-pydantic = ">=2.7.1,<3.0.0"
-pydantic-settings = ">=2.2.1,<3.0.0"
-pypdfium2 = "4.30.0"
+click = "^8.1.8"
+pydantic = "^2.7.1"
+pydantic-settings = "^2.2.1"
+pypdfium2 = "=4.30.1"
+
+[package.source]
+type = "directory"
+url = "../pdftext"
 
 [[package]]
 name = "pexpect"
@@ -4264,25 +4266,25 @@ windows-terminal = ["colorama (>=0.4.6)"]
 
 [[package]]
 name = "pypdfium2"
-version = "4.30.0"
+version = "4.30.1"
 description = "Python bindings to PDFium"
 optional = false
 python-versions = ">=3.6"
 groups = ["main"]
 files = [
-    {file = "pypdfium2-4.30.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:b33ceded0b6ff5b2b93bc1fe0ad4b71aa6b7e7bd5875f1ca0cdfb6ba6ac01aab"},
-    {file = "pypdfium2-4.30.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:4e55689f4b06e2d2406203e771f78789bd4f190731b5d57383d05cf611d829de"},
-    {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e6e50f5ce7f65a40a33d7c9edc39f23140c57e37144c2d6d9e9262a2a854854"},
-    {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3d0dd3ecaffd0b6dbda3da663220e705cb563918249bda26058c6036752ba3a2"},
-    {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cc3bf29b0db8c76cdfaac1ec1cde8edf211a7de7390fbf8934ad2aa9b4d6dfad"},
-    {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1f78d2189e0ddf9ac2b7a9b9bd4f0c66f54d1389ff6c17e9fd9dc034d06eb3f"},
-    {file = "pypdfium2-4.30.0-py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:5eda3641a2da7a7a0b2f4dbd71d706401a656fea521b6b6faa0675b15d31a163"},
-    {file = "pypdfium2-4.30.0-py3-none-musllinux_1_1_i686.whl", hash = "sha256:0dfa61421b5eb68e1188b0b2231e7ba35735aef2d867d86e48ee6cab6975195e"},
-    {file = "pypdfium2-4.30.0-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:f33bd79e7a09d5f7acca3b0b69ff6c8a488869a7fab48fdf400fec6e20b9c8be"},
-    {file = "pypdfium2-4.30.0-py3-none-win32.whl", hash = "sha256:ee2410f15d576d976c2ab2558c93d392a25fb9f6635e8dd0a8a3a5241b275e0e"},
-    {file = "pypdfium2-4.30.0-py3-none-win_amd64.whl", hash = "sha256:90dbb2ac07be53219f56be09961eb95cf2473f834d01a42d901d13ccfad64b4c"},
-    {file = "pypdfium2-4.30.0-py3-none-win_arm64.whl", hash = "sha256:119b2969a6d6b1e8d55e99caaf05290294f2d0fe49c12a3f17102d01c441bd29"},
-    {file = "pypdfium2-4.30.0.tar.gz", hash = "sha256:48b5b7e5566665bc1015b9d69c1ebabe21f6aee468b509531c3c8318eeee2e16"},
+    {file = "pypdfium2-4.30.1-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:e07c47633732cc18d890bb7e965ad28a9c5a932e548acb928596f86be2e5ae37"},
+    {file = "pypdfium2-4.30.1-py3-none-macosx_11_0_arm64.whl", hash = "sha256:5ea2d44e96d361123b67b00f527017aa9c847c871b5714e013c01c3eb36a79fe"},
+    {file = "pypdfium2-4.30.1-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1de7a3a36803171b3f66911131046d65a732f9e7834438191cb58235e6163c4e"},
+    {file = "pypdfium2-4.30.1-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b8a4231efb13170354f568c722d6540b8d5b476b08825586d48ef70c40d16e03"},
+    {file = "pypdfium2-4.30.1-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6f434a4934e8244aa95343ffcf24e9ad9f120dbb4785f631bb40a88c39292493"},
+    {file = "pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f454032a0bc7681900170f67d8711b3942824531e765f91c2f5ce7937f999794"},
+    {file = "pypdfium2-4.30.1-py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:bbf9130a72370ee9d602e39949b902db669a2a1c24746a91e5586eb829055d9f"},
+    {file = "pypdfium2-4.30.1-py3-none-musllinux_1_1_i686.whl", hash = "sha256:5cb52884b1583b96e94fd78542c63bb42e06df5e8f9e52f8f31f5ad5a1e53367"},
+    {file = "pypdfium2-4.30.1-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:1a9e372bd4867ff223cc8c338e33fe11055dad12f22885950fc27646cc8d9122"},
+    {file = "pypdfium2-4.30.1-py3-none-win32.whl", hash = "sha256:421f1cf205e213e07c1f2934905779547f4f4a2ff2f59dde29da3d511d3fc806"},
+    {file = "pypdfium2-4.30.1-py3-none-win_amd64.whl", hash = "sha256:598a7f20264ab5113853cba6d86c4566e4356cad037d7d1f849c8c9021007e05"},
+    {file = "pypdfium2-4.30.1-py3-none-win_arm64.whl", hash = "sha256:c2b6d63f6d425d9416c08d2511822b54b8e3ac38e639fc41164b1d75584b3a8c"},
+    {file = "pypdfium2-4.30.1.tar.gz", hash = "sha256:5f5c7c6d03598e107d974f66b220a49436aceb191da34cda5f692be098a814ce"},
 ]
 
 [[package]]
@@ -5390,30 +5392,32 @@ snowflake = ["snowflake-connector-python (>=3.3.0) ; python_version < \"3.12\"",
 
 [[package]]
 name = "surya-ocr"
-version = "0.14.2"
+version = "0.14.5"
 description = "OCR, layout, reading order, and table recognition in 90+ languages"
 optional = false
-python-versions = "<4.0,>=3.10"
+python-versions = "^3.10"
 groups = ["main"]
-files = [
-    {file = "surya_ocr-0.14.2-py3-none-any.whl", hash = "sha256:0c402705c860f8bf98fc2bf2a3b49d7f0e16fba587aed6d3f01bb53bb776d283"},
-    {file = "surya_ocr-0.14.2.tar.gz", hash = "sha256:852af681073167beba9a638658c70b81318f1a8f3d558db68dead1b2c391e862"},
-]
+files = []
+develop = true
 
 [package.dependencies]
-click = ">=8.1.8,<9.0.0"
-einops = ">=0.8.1,<0.9.0"
-filetype = ">=1.2.0,<2.0.0"
-opencv-python-headless = ">=4.11.0.86,<5.0.0.0"
-pillow = ">=10.2.0,<11.0.0"
-platformdirs = ">=4.3.6,<5.0.0"
-pre-commit = ">=4.2.0,<5.0.0"
-pydantic = ">=2.5.3,<3.0.0"
-pydantic-settings = ">=2.1.0,<3.0.0"
-pypdfium2 = "4.30.0"
-python-dotenv = ">=1.0.0,<2.0.0"
-torch = ">=2.7.0,<3.0.0"
-transformers = ">=4.51.2,<5.0.0"
+click = "^8.1.8"
+einops = "^0.8.1"
+filetype = "^1.2.0"
+opencv-python-headless = "^4.11.0.86"
+pillow = "^10.2.0"
+platformdirs = "^4.3.6"
+pre-commit = "^4.2.0"
+pydantic = "^2.5.3"
+pydantic-settings = "^2.1.0"
+pypdfium2 = "=4.30.1"
+python-dotenv = "^1.0.0"
+torch = "^2.7.0"
+transformers = "^4.51.2"
+
+[package.source]
+type = "directory"
+url = "../surya"
 
 [[package]]
 name = "sympy"
@@ -6505,4 +6509,4 @@ full = ["ebooklib", "mammoth", "openpyxl", "python-pptx", "weasyprint"]
 [metadata]
 lock-version = "2.1"
 python-versions = "^3.10"
-content-hash = "c18debb8d18aec4081c31ff32f9dc2bde6f4c0189f1d7647bb6061f685e0e319"
+content-hash = "484459202f1148269601972c07e461c46f51f424919485090b707b350ce7fd74"
diff --git a/pyproject.toml b/pyproject.toml
index 2056da009..a5f82aea4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,9 +26,9 @@ torch = "^2.7.0"
 tqdm = "^4.66.1"
 ftfy = "^6.1.1"
 rapidfuzz = "^3.8.1"
-surya-ocr = "^0.14.2"
+surya-ocr = {path = "../surya", develop = true}
 regex = "^2024.4.28"
-pdftext = "~0.6.2"
+pdftext = {path = "../pdftext", develop = true}
 markdownify = "^0.13.1"
 click = "^8.2.0"
 markdown2 = "^2.5.2"
@@ -75,4 +75,4 @@ marker_server = "marker.scripts.server:server_cli"
 
 [build-system]
 requires = ["poetry-core"]
-build-backend = "poetry.core.masonry.api"
\ No newline at end of file
+build-backend = "poetry.core.masonry.api"
diff --git a/visualize_matching_boxes.py b/visualize_matching_boxes.py
index f70939851..2062fb96d 100644
--- a/visualize_matching_boxes.py
+++ b/visualize_matching_boxes.py
@@ -18,8 +18,39 @@ def get_text_from_raw_text_obj(raw_text_obj, page):
     finally:
         pdfium_c.FPDFText_ClosePage(textpage)
 
-def boxes_are_equal(box1, box2, tol=1e-2):
-    return all(abs(a - b) < tol for a, b in zip(box1, box2))
+def boxes_intersect(box1, box2):
+    """Report whether two boxes overlap (fz_glyph_entirely_outside_box test).
+
+    Args:
+        box1, box2: (x0, y0, x1, y1) tuples; (x0, y0) is the bottom-left
+            corner and (x1, y1) the top-right corner.
+
+    Returns:
+        False when the boxes are apart on either axis, which includes boxes
+        that only touch along an edge; True otherwise.
+    """
+    # Being apart on a single axis is already enough to rule out overlap.
+    # The comparisons are inclusive, so a shared edge counts as apart.
+    x_apart = box1[2] <= box2[0] or box1[0] >= box2[2]
+    y_apart = box1[3] <= box2[1] or box1[1] >= box2[3]
+    return not (x_apart or y_apart)
+
+def draw_box(box, boxes_draw, scale, page_height, color='gray', width=1):
+    """Outline one PDF-space box on an image drawing context.
+
+    Args:
+        box: (x0, y0, x1, y1) in PDF coordinates.
+        boxes_draw: Object whose ``rectangle`` method does the drawing.
+        scale: Factor applied to every PDF coordinate.
+        page_height: Image height in pixels, used to flip the y axis.
+        color: Outline colour passed through to ``rectangle``.
+        width: Outline width passed through to ``rectangle``.
+    """
+    x0, y0, x1, y1 = (box[k] * scale for k in range(4))
+    # The top edge (y1) must end up nearer the image's first row than y0.
+    upper = page_height - y1
+    lower = page_height - y0
+    boxes_draw.rectangle([x0, upper, x1, lower], outline=color, width=width)
 
 def draw_matching_boxes(page, output_path):
     objects = list(page.get_objects())
@@ -29,7 +60,57 @@ def draw_matching_boxes(page, output_path):
     page_width, page_height = pil_image.size
     boxes_draw = ImageDraw.Draw(pil_image)
 
+    # Check for required clip path functions once
+    required_clip_fns = [
+        "FPDFClipPath_CountPaths", "FPDFClipPath_CountPathSegments",
+        "FPDFClipPath_GetPathSegment", "FPDFPathSegment_GetPoint",
+        "FPDFPathSegment_GetType", "FPDFPathSegment_GetClose"
+    ]
+    has_clip_path_api = all(hasattr(pdfium_c, fn) for fn in required_clip_fns)
+    if not has_clip_path_api:
+        raise RuntimeError("Required PDFium clip path API functions are missing in pdfium_c. Please check your PDFium installation.")
+
+    # Draw the page crop box in green
+    try:
+        left = ctypes.c_float()
+        bottom = ctypes.c_float()
+        right = ctypes.c_float()
+        top = ctypes.c_float()
+        success = pdfium_c.FPDFPage_GetCropBox(page.raw, ctypes.byref(left), ctypes.byref(bottom), ctypes.byref(right), ctypes.byref(top))
+        if success:
+            crop_box = (left.value, bottom.value, right.value, top.value)
+            print(f"Page crop box: ({crop_box[0]:.2f}, {crop_box[1]:.2f}, {crop_box[2]:.2f}, {crop_box[3]:.2f})")
+            draw_box(crop_box, boxes_draw, scale, page_height, color='green', width=2)
+        else:
+            print("Could not get page crop box")
+    except Exception as e:
+        print(f"Error getting page crop box: {e}")
+
+    # Initialize counters for statistics
+    total_objects = len(objects)
+    text_objects = 0
+    visible_objects = 0
+    clipped_objects = 0
+
+    # Load textpage once for efficiency
+    textpage = pdfium_c.FPDFText_LoadPage(page.raw)
+    if not textpage:
+        print("Warning: Could not load textpage for text checking")
+
     for i, obj in enumerate(objects):
+        # Check if object is a text object and has text content
+        obj_type = pdfium_c.FPDFPageObj_GetType(obj.raw)
+        if obj_type != 1:  # FPDF_PAGEOBJ_TEXT = 1
+            continue
+        
+        # Check if text object has any text content (without extracting it)
+        if textpage:
+            buflen = pdfium_c.FPDFTextObj_GetText(obj.raw, textpage, None, 0)
+            if buflen <= 0:  # No text content
+                continue
+        
+        text_objects += 1
+        
         # Get object bounding box
         left = ctypes.c_float()
         bottom = ctypes.c_float()
@@ -40,55 +121,77 @@ def draw_matching_boxes(page, output_path):
             print(f"Object {i+1}: Could not get object bounding box.")
             continue
         obj_box = (left.value, bottom.value, right.value, top.value)
-        print(f"Object {i+1}: Bounding box: {obj_box}")
+        #print(f"Object {i+1}: Bounding box: ({obj_box[0]:.2f}, {obj_box[1]:.2f}, {obj_box[2]:.2f}, {obj_box[3]:.2f})")
         # Try to get clipping path
         show_box = True
         try:
             clip_path = pdfium_c.FPDFPageObj_GetClipPath(obj.raw)
-            if clip_path and all(hasattr(pdfium_c, fn) for fn in [
-                "FPDFClipPath_CountPaths", "FPDFClipPath_CountPathSegments", "FPDFClipPath_GetPathSegment", "FPDFPathSegment_GetPoint", "FPDFPathSegment_GetType", "FPDFPathSegment_GetClose"]):
+            if clip_path:
+                # Collect all points from all paths to calculate a bounding rectangle
+                all_points = []
                 num_paths = pdfium_c.FPDFClipPath_CountPaths(clip_path)
                 for path_idx in range(num_paths):
                     num_segs = pdfium_c.FPDFClipPath_CountPathSegments(clip_path, path_idx)
-                    points = []
                     for seg_idx in range(num_segs):
                         seg = pdfium_c.FPDFClipPath_GetPathSegment(clip_path, path_idx, seg_idx)
                         x = ctypes.c_float()
                         y = ctypes.c_float()
                         pdfium_c.FPDFPathSegment_GetPoint(seg, ctypes.byref(x), ctypes.byref(y))
-                        points.append((x.value, y.value))
-                    if len(points) >= 2:
-                        # Get the bounding box of the clip path
-                        xs = [pt[0] for pt in points]
-                        ys = [pt[1] for pt in points]
-                        clip_box = (min(xs), min(ys), max(xs), max(ys))
-                        print(f"Object {i+1}: Clip path box: {clip_box}")
-                        # Draw the clip path in blue
-                        pil_points = [(x * scale, page_height - (y * scale)) for x, y in points]
-                        closed = False
-                        if num_segs > 0:
-                            last_seg = pdfium_c.FPDFClipPath_GetPathSegment(clip_path, path_idx, num_segs-1)
-                            closed = bool(pdfium_c.FPDFPathSegment_GetClose(last_seg))
-                        if closed:
-                            boxes_draw.polygon(pil_points, outline='blue')
-                        else:
-                            boxes_draw.line(pil_points, fill='blue', width=3)
-                        # Only show the bounding box if it matches the clip box
-                        if not boxes_are_equal(obj_box, clip_box):
-                            show_box = False
+                        all_points.append((x.value, y.value))
+                
+                if all_points:
+                    # Calculate the minimal bounding rectangle that fits the clip path
+                    xs = [pt[0] for pt in all_points]
+                    ys = [pt[1] for pt in all_points]
+                    clip_box = (min(xs), min(ys), max(xs), max(ys))
+                    #print(f"Object {i+1}: Clip path bounding box: ({clip_box[0]:.2f}, {clip_box[1]:.2f}, {clip_box[2]:.2f}, {clip_box[3]:.2f}), # of paths: {num_paths}, # of segments: {num_segs}")
+                    
+                    
+                    # An object whose bounds miss the clip box entirely is counted as clipped and not drawn
+                    if not boxes_intersect(obj_box, clip_box):
+                        show_box = False
+                        clipped_objects += 1
+                    else:
+                        # Draw the clip path bounding box in blue
+                        draw_box(clip_box, boxes_draw, scale, page_height, color='blue')
+
+                        # If boxes intersect, draw in red and extract text
+                        draw_box(obj_box, boxes_draw, scale, page_height, color='red', width=3)
+                        show_box = False  # drawn and counted here; the gray fallback below would overdraw and double-count
+                        visible_objects += 1
+
+                        # Extract and print text for red boxes
+                        if textpage:
+                            buflen = pdfium_c.FPDFTextObj_GetText(obj.raw, textpage, None, 0)
+                            if buflen > 0:
+                                buf = (ctypes.c_ushort * buflen)()
+                                pdfium_c.FPDFTextObj_GetText(obj.raw, textpage, buf, buflen)
+                                byte_buf = bytearray(buf)
+                                text_content = byte_buf.decode('utf-16-le').rstrip('\x00')
+                                utf16_bytes = text_content.encode('utf-16-le')
+                                hex_bytes = ' '.join(f'{b:02x}' for b in utf16_bytes)
+                                print(f"Object {i+1} text (red box): '{text_content}', utf16 bytes: {hex_bytes}")
             else:
-                print(f"Object {i+1}: No usable clipping path.")
+                print(f"Object {i+1}: No clipping path.")
         except Exception as e:
             print(f"Object {i+1}: Error getting clip path: {e}")
         if show_box:
-            pil_top = page_height - (obj_box[3] * scale)
-            pil_bottom = page_height - (obj_box[1] * scale)
-            pil_left = obj_box[0] * scale
-            pil_right = obj_box[2] * scale
-            boxes_draw.rectangle([pil_left, pil_top, pil_right, pil_bottom], outline='red', width=3)
+            draw_box(obj_box, boxes_draw, scale, page_height)
+            visible_objects += 1
 
     pil_image.save(output_path)
     print(f"Matching boxes visualization saved to {output_path}")
+    
+    # Close textpage to avoid memory leaks
+    if textpage:
+        pdfium_c.FPDFText_ClosePage(textpage)
+    
+    # Print statistics
+    print(f"\nStatistics:")
+    print(f"Total objects: {total_objects}")
+    print(f"Text objects: {text_objects}")
+    print(f"Visible objects: {visible_objects}")
+    print(f"Clipped objects: {clipped_objects}")
 
 def main():
     parser = argparse.ArgumentParser(description='Visualize matching bounding boxes and clip paths for all objects')