From d6e313d87805f24203a8712bc7c2547e397a3a63 Mon Sep 17 00:00:00 2001 From: Alex Ioannidis Date: Tue, 30 Jun 2026 21:14:31 +0200 Subject: [PATCH 1/3] fix(schema): flatten creators into parallel lists gpt-oss-20b drops an ORCID nested under each creator in tool calls; parallel name/orcid/affiliation lists survive. Adds synthetic field examples. --- app/schemas/metadata_suggestions.py | 79 +++++++++++++++++++++++++---- 1 file changed, 68 insertions(+), 11 deletions(-) diff --git a/app/schemas/metadata_suggestions.py b/app/schemas/metadata_suggestions.py index 662eeff..334b5b6 100644 --- a/app/schemas/metadata_suggestions.py +++ b/app/schemas/metadata_suggestions.py @@ -15,10 +15,21 @@ class Creator(BaseModel): name: str = Field( description="Full name in ', ' format", - examples=["Smith, John"], + examples=["Doe, Jane", "van der Berg, A."], + ) + affiliation: str | None = Field( + default=None, + description="Institution or organization the creator is affiliated with", + examples=["CERN", "University of Cambridge"], + ) + orcid: str | None = Field( + default=None, + description=( + "ORCID identifier as the bare 16-digit ID (four groups of four, " + "final character may be 'X'), without the orcid.org URL prefix" + ), + examples=["0000-0001-2345-6789", "0000-0002-1234-567X"], ) - affiliation: str | None = None - orcid: str | None = None @field_validator("name") @classmethod @@ -101,17 +112,50 @@ class MetadataSuggestions(BaseModel): class ExtractedMetadata(BaseModel): - """Flat schema the LLM fills; converted to ``MetadataSuggestions``. + """Flat schema the LLM fills, converted to ``MetadataSuggestions``. - Smaller models handle a flat object far better than the discriminated union. + Creators are parallel lists, not nested objects. gpt-oss-20b fills flat + top-level lists in a tool call but drops a field nested under each creator, + so a per-creator ``orcid`` gets lost. ``creator_orcids[i]`` and + ``creator_affiliations[i]`` belong to ``creators[i]``. """ - title: str | None = Field(default=None, description="Document title") - description: str | None = Field(default=None, description="Abstract or summary") - creators: list[Creator] = Field( - default_factory=list, description="Authors or creators" + title: str | None = Field( + default=None, + description="Document title", + examples=["A Concise Title Describing the Work"], + ) + description: str | None = Field( + default=None, + description="Abstract or summary", + examples=["A short summary of the document's purpose, methods, and findings."], + ) + creators: list[str] = Field( + default_factory=list, + description="Creator full names in ', ' format, in order", + examples=[["Doe, Jane", "van der Berg, A."]], + ) + creator_orcids: list[str] = Field( + default_factory=list, + description=( + "ORCID iD per creator, parallel to `creators` so creator_orcids[i] " + "is the ORCID of creators[i]; empty string when an author has none. " + "Bare 16-digit form (four groups of four, last may be 'X'), no URL." + ), + examples=[["0000-0001-2345-6789", ""]], + ) + creator_affiliations: list[str] = Field( + default_factory=list, + description=( + "Affiliation per creator, parallel to `creators`; empty string when unknown" + ), + examples=[["CERN", "University of Cambridge"]], + ) + doi: str | None = Field( + default=None, + description="The Digital Object Identifier, as a bare DOI without a URL prefix", + examples=["10.1234/example.5678"], ) - doi: str | None = Field(default=None, description="The Digital Object Identifier") publication_date: str | None = Field( default=None, description=( @@ -122,6 +166,11 @@ class ExtractedMetadata(BaseModel): examples=["2014-07-17", "2014-07", "2014"], ) + @staticmethod + def _at(values: list[str], i: int) -> str: + """Return the i-th parallel value, or '' when the list is shorter.""" + return values[i] if i < len(values) else "" + def to_suggestions(self) -> MetadataSuggestions: """Build the typed suggestions, dropping null/empty fields.""" suggestions: list[MetadataSuggestion] = [] @@ -130,7 +179,15 @@ def to_suggestions(self) -> MetadataSuggestions: if self.description: suggestions.append(DescriptionSuggestion(value=self.description)) if self.creators: - creators = CreatorsSuggestion(value=self.creators) + value = [ + Creator( + name=name, + orcid=self._at(self.creator_orcids, i) or None, + affiliation=self._at(self.creator_affiliations, i) or None, + ) + for i, name in enumerate(self.creators) + ] + creators = CreatorsSuggestion(value=value) if creators.value: suggestions.append(creators) if self.doi: From 0044fedc397487ba7b6b9e93038dc2aedc3e1c0a Mon Sep 17 00:00:00 2001 From: Alex Ioannidis Date: Tue, 30 Jun 2026 21:14:31 +0200 Subject: [PATCH 2/3] fix(extractor): inline ORCIDs next to their author A bare appended list misaligned when some authors had no ORCID; inline placement keeps each ID tied to its author. --- app/extractors/pdfplumber.py | 81 +++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 33 deletions(-) diff --git a/app/extractors/pdfplumber.py b/app/extractors/pdfplumber.py index a51fa4b..1ede17e 100644 --- a/app/extractors/pdfplumber.py +++ b/app/extractors/pdfplumber.py @@ -3,6 +3,7 @@ """PDFPlumber-based PDF extractor.""" +import re from io import BytesIO from typing import Any, Dict, List, Optional @@ -26,18 +27,20 @@ def extract( with pdfplumber.open(BytesIO(pdf_bytes)) as pdf: page_count = len(pdf.pages) - # Resolve page selection resolved_pages = resolve_pages(pages, page_count) page_indices = resolved_pages if resolved_pages else range(page_count) for page_num in page_indices: page = pdf.pages[page_num] - # Extract text with x_tolerance=2 to better detect word boundaries - # Default x_tolerance=3 merges words like "PhilipBull" that have gaps + page_annots = page.annots or [] + # x_tolerance=2: the default (3) merges spaced words like "PhilipBull" text = page.extract_text(x_tolerance=2) or "" + # ORCIDs live only in link annotations. Splice each one next to its + # author so the pairing survives; a bare list misaligns when some + # authors have no ORCID. + text = self._inline_orcids(page, text, page_annots) full_text_parts.append(text) - # Extract tables page_tables = page.extract_tables() for table in page_tables: if table: @@ -49,34 +52,21 @@ def extract( } ) - # Extract hyperlinks (annotations with URI) - if page.annots: - for annot in page.annots: - uri = annot.get("uri") - if uri: - link_type = self._classify_link(uri) - hyperlinks.append( - { - "url": uri, - "page": page_num + 1, - "type": link_type, - } - ) - - # Page text already contains table cell text in reading order; the - # structured `tables` are returned in `extra` rather than flattened into - # `full_text`, which only duplicated content and added empty-cell noise. - full_text = "\n\n".join(full_text_parts) + for annot in page_annots: + uri = annot.get("uri") + if uri: + link_type = self._classify_link(uri) + hyperlinks.append( + { + "url": uri, + "page": page_num + 1, + "type": link_type, + } + ) - # Extract ORCID IDs from hyperlinks and add to full_text - # This makes ORCIDs discoverable even when they're only in link URLs - # (not visible text) - orcid_ids = [ - self._extract_orcid_id(h["url"]) for h in hyperlinks if h["type"] == "orcid" - ] - orcid_ids = [oid for oid in orcid_ids if oid] # filter None - if orcid_ids: - full_text += "\n\nORCID IDs from hyperlinks: " + " ".join(orcid_ids) + # Tables go in `extra`, not `full_text`; the page text already holds each + # cell in reading order. + full_text = "\n\n".join(full_text_parts) return { "full_text": full_text, @@ -92,9 +82,34 @@ def extract( }, } + def _inline_orcids(self, page, text: str, annots: list) -> str: + """Splice each ORCID inline after the author its icon is anchored to. + + The icon sits just right of the author on the same line, so the word + ending nearest left of it is that author's token (name plus any + affiliation marker, e.g. "Bull1,2"). The first text match is the author + block near the top of the page. + """ + words = None + for annot in annots: + orcid = self._extract_orcid_id(annot.get("uri") or "") + if not orcid: + continue + if words is None: + words = page.extract_words(x_tolerance=2) + on_line = [ + w + for w in words + if abs(w["top"] - annot["top"]) < 6 and w["x1"] <= annot["x0"] + 2 + ] + if not on_line: + continue + anchor = max(on_line, key=lambda w: w["x1"])["text"].strip(" ,;") + if anchor and anchor in text: + text = text.replace(anchor, f"{anchor} (ORCID: {orcid})", 1) + return text + def _extract_orcid_id(self, url: str) -> str | None: """Extract ORCID ID from an orcid.org URL.""" - import re - match = re.search(r"orcid\.org/(\d{4}-\d{4}-\d{4}-\d{3}[\dX])", url) return match.group(1) if match else None From 9156bd40dde4433b618661461cdfcbf3c37ece26 Mon Sep 17 00:00:00 2001 From: Alex Ioannidis Date: Tue, 30 Jun 2026 23:12:25 +0200 Subject: [PATCH 3/3] fix(schema): drop ORCIDs that fail the check digit gpt-oss fabricates 0000-0000-0000-0000 for authors with no ORCID. Filter by the ISO 7064 check digit in to_suggestions, not on the LLM output schema, since a schema error feeds back and the model invents a checksum-valid fake instead of leaving it null. --- app/schemas/metadata_suggestions.py | 41 +++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/app/schemas/metadata_suggestions.py b/app/schemas/metadata_suggestions.py index 334b5b6..6d583f7 100644 --- a/app/schemas/metadata_suggestions.py +++ b/app/schemas/metadata_suggestions.py @@ -5,10 +5,30 @@ # from __future__ import annotations +import re from typing import Annotated, Literal from pydantic import BaseModel, Field, field_validator +_ORCID_RE = re.compile(r"\d{4}-\d{4}-\d{4}-\d{3}[\dX]") + + +def valid_orcid(value: str) -> bool: + """Check ORCID shape and ISO 7064 MOD 11-2 check digit. + + Rejects fabrications like ``0000-0000-0000-0000`` (wrong check digit) that + the LLM emits for authors with no ORCID, as well as digit-garbled IDs. + """ + value = value.strip() + if not _ORCID_RE.fullmatch(value): + return False + digits = value.replace("-", "") + total = 0 + for ch in digits[:15]: + total = (total + int(ch)) * 2 + check = (12 - total % 11) % 11 + return ("X" if check == 10 else str(check)) == digits[15] + class Creator(BaseModel): """A structured creator/author.""" @@ -28,7 +48,7 @@ class Creator(BaseModel): "ORCID identifier as the bare 16-digit ID (four groups of four, " "final character may be 'X'), without the orcid.org URL prefix" ), - examples=["0000-0001-2345-6789", "0000-0002-1234-567X"], + examples=["0000-0001-2345-6789", "0000-0001-0002-000X"], ) @field_validator("name") @@ -179,14 +199,19 @@ def to_suggestions(self) -> MetadataSuggestions: if self.description: suggestions.append(DescriptionSuggestion(value=self.description)) if self.creators: - value = [ - Creator( - name=name, - orcid=self._at(self.creator_orcids, i) or None, - affiliation=self._at(self.creator_affiliations, i) or None, + value = [] + for i, name in enumerate(self.creators): + # Drop ORCIDs that fail the check digit here, in post-processing, + # not on the LLM output schema: a schema error would be fed back + # and the model would invent a checksum-valid fake to satisfy it. + orcid = self._at(self.creator_orcids, i) + value.append( + Creator( + name=name, + orcid=orcid if valid_orcid(orcid) else None, + affiliation=self._at(self.creator_affiliations, i) or None, + ) ) - for i, name in enumerate(self.creators) - ] creators = CreatorsSuggestion(value=value) if creators.value: suggestions.append(creators)