Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 48 additions & 33 deletions app/extractors/pdfplumber.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

"""PDFPlumber-based PDF extractor."""

import re
from io import BytesIO
from typing import Any, Dict, List, Optional

Expand All @@ -26,18 +27,20 @@ def extract(
with pdfplumber.open(BytesIO(pdf_bytes)) as pdf:
page_count = len(pdf.pages)

# Resolve page selection
resolved_pages = resolve_pages(pages, page_count)
page_indices = resolved_pages if resolved_pages else range(page_count)

for page_num in page_indices:
page = pdf.pages[page_num]
# Extract text with x_tolerance=2 to better detect word boundaries
# Default x_tolerance=3 merges words like "PhilipBull" that have gaps
page_annots = page.annots or []
# x_tolerance=2: the default (3) merges spaced words like "PhilipBull"
text = page.extract_text(x_tolerance=2) or ""
# ORCIDs live only in link annotations. Splice each one next to its
# author so the pairing survives; a bare list misaligns when some
# authors have no ORCID.
text = self._inline_orcids(page, text, page_annots)
full_text_parts.append(text)

# Extract tables
page_tables = page.extract_tables()
for table in page_tables:
if table:
Expand All @@ -49,34 +52,21 @@ def extract(
}
)

# Extract hyperlinks (annotations with URI)
if page.annots:
for annot in page.annots:
uri = annot.get("uri")
if uri:
link_type = self._classify_link(uri)
hyperlinks.append(
{
"url": uri,
"page": page_num + 1,
"type": link_type,
}
)

# Page text already contains table cell text in reading order; the
# structured `tables` are returned in `extra` rather than flattened into
# `full_text`, which only duplicated content and added empty-cell noise.
full_text = "\n\n".join(full_text_parts)
for annot in page_annots:
uri = annot.get("uri")
if uri:
link_type = self._classify_link(uri)
hyperlinks.append(
{
"url": uri,
"page": page_num + 1,
"type": link_type,
}
)

# Extract ORCID IDs from hyperlinks and add to full_text
# This makes ORCIDs discoverable even when they're only in link URLs
# (not visible text)
orcid_ids = [
self._extract_orcid_id(h["url"]) for h in hyperlinks if h["type"] == "orcid"
]
orcid_ids = [oid for oid in orcid_ids if oid] # filter None
if orcid_ids:
full_text += "\n\nORCID IDs from hyperlinks: " + " ".join(orcid_ids)
# Tables go in `extra`, not `full_text`; the page text already holds each
# cell in reading order.
full_text = "\n\n".join(full_text_parts)

return {
"full_text": full_text,
Expand All @@ -92,9 +82,34 @@ def extract(
},
}

def _inline_orcids(self, page, text: str, annots: list) -> str:
    """Splice each ORCID inline after the author its icon is anchored to.

    The icon sits just right of the author on the same line, so the word
    ending nearest left of it is that author's token (name plus any
    affiliation marker, e.g. "Bull1,2"). The ORCID is inserted after the
    first whole-token occurrence of that word in ``text`` that has not been
    tagged yet, so two authors sharing a token each keep their own ORCID
    instead of both IDs stacking on the first occurrence.

    Args:
        page: Page object; only ``extract_words(x_tolerance=2)`` is called
            on it, and each returned word must expose ``text``, ``top``
            and ``x1``.
        text: Extracted page text to annotate.
        annots: Annotation dicts. Entries whose ``uri`` carries an ORCID
            must also provide ``top`` and ``x0``.

    Returns:
        ``text`` with `` (ORCID: <id>)`` inserted after each anchored author
        token, or ``text`` unchanged when nothing could be anchored.
    """
    words = None
    seen = set()
    for annot in annots:
        orcid = self._extract_orcid_id(annot.get("uri") or "")
        if not orcid:
            continue
        if words is None:
            # Extract words at most once, and only when an ORCID link exists.
            words = page.extract_words(x_tolerance=2)
        # Words on the annotation's line that end at or before its left edge.
        on_line = [
            w
            for w in words
            if abs(w["top"] - annot["top"]) < 6 and w["x1"] <= annot["x0"] + 2
        ]
        if not on_line:
            continue
        anchor = max(on_line, key=lambda w: w["x1"])["text"].strip(" ,;")
        # A repeated annotation for the same author must not tag some later,
        # unrelated occurrence of the token.
        if not anchor or (anchor, orcid) in seen:
            continue
        seen.add((anchor, orcid))
        # Whole-token match only ("Li" must not hit "Lights"), skipping any
        # occurrence that already carries an ORCID tag.
        untagged = re.search(
            rf"(?<!\w){re.escape(anchor)}(?!\w)(?! \(ORCID: )", text
        )
        if untagged:
            end = untagged.end()
            text = f"{text[:end]} (ORCID: {orcid}){text[end:]}"
    return text

def _extract_orcid_id(self, url: str) -> str | None:
"""Extract ORCID ID from an orcid.org URL."""
import re

match = re.search(r"orcid\.org/(\d{4}-\d{4}-\d{4}-\d{3}[\dX])", url)
return match.group(1) if match else None
104 changes: 93 additions & 11 deletions app/schemas/metadata_suggestions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,51 @@

# from __future__ import annotations

import re
from typing import Annotated, Literal

from pydantic import BaseModel, Field, field_validator

# re.ASCII: in a str pattern a bare \d also matches non-ASCII Unicode digits
# (fullwidth, Arabic-Indic, ...), which never appear in a real ORCID iD.
_ORCID_RE = re.compile(r"\d{4}-\d{4}-\d{4}-\d{3}[\dX]", re.ASCII)


def valid_orcid(value: str) -> bool:
    """Check ORCID shape and ISO 7064 MOD 11-2 check digit.

    Rejects fabrications like ``0000-0000-0000-0000`` (wrong check digit) that
    the LLM emits for authors with no ORCID, as well as digit-garbled IDs and
    IDs written with non-ASCII digits.

    Args:
        value: Candidate identifier in bare hyphenated form (no URL prefix).
            Leading and trailing whitespace is ignored.

    Returns:
        True only when ``value`` is four hyphen-separated groups of four ASCII
        characters whose final character equals the check digit computed from
        the first fifteen digits.
    """
    value = value.strip()
    if not _ORCID_RE.fullmatch(value):
        return False
    digits = value.replace("-", "")
    # MOD 11-2: fold the first 15 digits, doubling the running total each step.
    total = 0
    for ch in digits[:15]:
        total = (total + int(ch)) * 2
    check = (12 - total % 11) % 11
    # A computed check value of 10 is written as the letter X.
    return ("X" if check == 10 else str(check)) == digits[15]


class Creator(BaseModel):
"""A structured creator/author."""

name: str = Field(
description="Full name in '<family>, <given>' format",
examples=["Smith, John"],
examples=["Doe, Jane", "van der Berg, A."],
)
affiliation: str | None = Field(
default=None,
description="Institution or organization the creator is affiliated with",
examples=["CERN", "University of Cambridge"],
)
orcid: str | None = Field(
default=None,
description=(
"ORCID identifier as the bare 16-digit ID (four groups of four, "
"final character may be 'X'), without the orcid.org URL prefix"
),
examples=["0000-0001-2345-6789", "0000-0001-0002-000X"],
)
affiliation: str | None = None
orcid: str | None = None

@field_validator("name")
@classmethod
Expand Down Expand Up @@ -101,17 +132,50 @@ class MetadataSuggestions(BaseModel):


class ExtractedMetadata(BaseModel):
"""Flat schema the LLM fills; converted to ``MetadataSuggestions``.
"""Flat schema the LLM fills, converted to ``MetadataSuggestions``.

Smaller models handle a flat object far better than the discriminated union.
Creators are parallel lists, not nested objects. gpt-oss-20b fills flat
top-level lists in a tool call but drops a field nested under each creator,
so a per-creator ``orcid`` gets lost. ``creator_orcids[i]`` and
``creator_affiliations[i]`` belong to ``creators[i]``.
"""

title: str | None = Field(default=None, description="Document title")
description: str | None = Field(default=None, description="Abstract or summary")
creators: list[Creator] = Field(
default_factory=list, description="Authors or creators"
title: str | None = Field(
default=None,
description="Document title",
examples=["A Concise Title Describing the Work"],
)
description: str | None = Field(
default=None,
description="Abstract or summary",
examples=["A short summary of the document's purpose, methods, and findings."],
)
creators: list[str] = Field(
default_factory=list,
description="Creator full names in '<family>, <given>' format, in order",
examples=[["Doe, Jane", "van der Berg, A."]],
)
creator_orcids: list[str] = Field(
default_factory=list,
description=(
"ORCID iD per creator, parallel to `creators` so creator_orcids[i] "
"is the ORCID of creators[i]; empty string when an author has none. "
"Bare 16-digit form (four groups of four, last may be 'X'), no URL."
),
examples=[["0000-0001-2345-6789", ""]],
)
creator_affiliations: list[str] = Field(
default_factory=list,
description=(
"Affiliation per creator, parallel to `creators`; empty string when unknown"
),
examples=[["CERN", "University of Cambridge"]],
)
doi: str | None = Field(
default=None,
description="The Digital Object Identifier, as a bare DOI without a URL prefix",
examples=["10.1234/example.5678"],
)
doi: str | None = Field(default=None, description="The Digital Object Identifier")
publication_date: str | None = Field(
default=None,
description=(
Expand All @@ -122,6 +186,11 @@ class ExtractedMetadata(BaseModel):
examples=["2014-07-17", "2014-07", "2014"],
)

@staticmethod
def _at(values: list[str], i: int) -> str:
    """Fetch ``values[i]``, falling back to '' when ``i`` is past the end."""
    if i >= len(values):
        return ""
    return values[i]

def to_suggestions(self) -> MetadataSuggestions:
"""Build the typed suggestions, dropping null/empty fields."""
suggestions: list[MetadataSuggestion] = []
Expand All @@ -130,7 +199,20 @@ def to_suggestions(self) -> MetadataSuggestions:
if self.description:
suggestions.append(DescriptionSuggestion(value=self.description))
if self.creators:
creators = CreatorsSuggestion(value=self.creators)
value = []
for i, name in enumerate(self.creators):
# Drop ORCIDs that fail the check digit here, in post-processing,
# not on the LLM output schema: a schema error would be fed back
# and the model would invent a checksum-valid fake to satisfy it.
orcid = self._at(self.creator_orcids, i)
value.append(
Creator(
name=name,
orcid=orcid if valid_orcid(orcid) else None,
affiliation=self._at(self.creator_affiliations, i) or None,
)
)
creators = CreatorsSuggestion(value=value)
if creators.value:
suggestions.append(creators)
if self.doi:
Expand Down