From d6e313d87805f24203a8712bc7c2547e397a3a63 Mon Sep 17 00:00:00 2001
From: Alex Ioannidis <a.ioannidis@cern.ch>
Date: Tue, 30 Jun 2026 21:14:31 +0200
Subject: [PATCH 1/3] fix(schema): flatten creators into parallel lists

gpt-oss-20b drops an ORCID nested under each creator in tool calls; parallel name/orcid/affiliation lists survive. Adds synthetic field examples.
---
 app/schemas/metadata_suggestions.py | 79 +++++++++++++++++++++++++----
 1 file changed, 68 insertions(+), 11 deletions(-)
diff --git a/app/schemas/metadata_suggestions.py b/app/schemas/metadata_suggestions.py
index 662eeff..334b5b6 100644
--- a/app/schemas/metadata_suggestions.py
+++ b/app/schemas/metadata_suggestions.py
@@ -15,10 +15,21 @@ class Creator(BaseModel):
 
     name: str = Field(
         description="Full name in '<family>, <given>' format",
-        examples=["Smith, John"],
+        examples=["Doe, Jane", "van der Berg, A."],
+    )
+    affiliation: str | None = Field(
+        default=None,
+        description="Institution or organization the creator is affiliated with",
+        examples=["CERN", "University of Cambridge"],
+    )
+    orcid: str | None = Field(
+        default=None,
+        description=(
+            "ORCID identifier as the bare 16-digit ID (four groups of four, "
+            "final character may be 'X'), without the orcid.org URL prefix"
+        ),
+        examples=["0000-0001-2345-6789", "0000-0002-1234-567X"],
     )
-    affiliation: str | None = None
-    orcid: str | None = None
 
     @field_validator("name")
     @classmethod
@@ -101,17 +112,50 @@ class MetadataSuggestions(BaseModel):
 
 
 class ExtractedMetadata(BaseModel):
-    """Flat schema the LLM fills; converted to ``MetadataSuggestions``.
+    """Flat schema the LLM fills, converted to ``MetadataSuggestions``.
 
-    Smaller models handle a flat object far better than the discriminated union.
+    Creators are parallel lists, not nested objects. gpt-oss-20b fills flat
+    top-level lists in a tool call but drops a field nested under each creator,
+    so a per-creator ``orcid`` gets lost. ``creator_orcids[i]`` and
+    ``creator_affiliations[i]`` belong to ``creators[i]``.
     """
 
-    title: str | None = Field(default=None, description="Document title")
-    description: str | None = Field(default=None, description="Abstract or summary")
-    creators: list[Creator] = Field(
-        default_factory=list, description="Authors or creators"
+    title: str | None = Field(
+        default=None,
+        description="Document title",
+        examples=["A Concise Title Describing the Work"],
+    )
+    description: str | None = Field(
+        default=None,
+        description="Abstract or summary",
+        examples=["A short summary of the document's purpose, methods, and findings."],
+    )
+    creators: list[str] = Field(
+        default_factory=list,
+        description="Creator full names in '<family>, <given>' format, in order",
+        examples=[["Doe, Jane", "van der Berg, A."]],
+    )
+    creator_orcids: list[str] = Field(
+        default_factory=list,
+        description=(
+            "ORCID iD per creator, parallel to `creators` so creator_orcids[i] "
+            "is the ORCID of creators[i]; empty string when an author has none. "
+            "Bare 16-digit form (four groups of four, last may be 'X'), no URL."
+        ),
+        examples=[["0000-0001-2345-6789", ""]],
+    )
+    creator_affiliations: list[str] = Field(
+        default_factory=list,
+        description=(
+            "Affiliation per creator, parallel to `creators`; empty string when unknown"
+        ),
+        examples=[["CERN", "University of Cambridge"]],
+    )
+    doi: str | None = Field(
+        default=None,
+        description="The Digital Object Identifier, as a bare DOI without a URL prefix",
+        examples=["10.1234/example.5678"],
     )
-    doi: str | None = Field(default=None, description="The Digital Object Identifier")
     publication_date: str | None = Field(
         default=None,
         description=(
@@ -122,6 +166,11 @@ class ExtractedMetadata(BaseModel):
         examples=["2014-07-17", "2014-07", "2014"],
     )
 
+    @staticmethod
+    def _at(values: list[str], i: int) -> str:
+        """Return the i-th parallel value, or '' when the list is shorter."""
+        return values[i] if i < len(values) else ""
+
     def to_suggestions(self) -> MetadataSuggestions:
         """Build the typed suggestions, dropping null/empty fields."""
         suggestions: list[MetadataSuggestion] = []
@@ -130,7 +179,15 @@ def to_suggestions(self) -> MetadataSuggestions:
         if self.description:
             suggestions.append(DescriptionSuggestion(value=self.description))
         if self.creators:
-            creators = CreatorsSuggestion(value=self.creators)
+            value = [
+                Creator(
+                    name=name,
+                    orcid=self._at(self.creator_orcids, i) or None,
+                    affiliation=self._at(self.creator_affiliations, i) or None,
+                )
+                for i, name in enumerate(self.creators)
+            ]
+            creators = CreatorsSuggestion(value=value)
             if creators.value:
                 suggestions.append(creators)
         if self.doi:

From 0044fedc397487ba7b6b9e93038dc2aedc3e1c0a Mon Sep 17 00:00:00 2001
From: Alex Ioannidis <a.ioannidis@cern.ch>
Date: Tue, 30 Jun 2026 21:14:31 +0200
Subject: [PATCH 2/3] fix(extractor): inline ORCIDs next to their author

A bare appended list misaligned when some authors had no ORCID; inline placement keeps each ID tied to its author.
---
 app/extractors/pdfplumber.py | 81 +++++++++++++++++++++---------------
 1 file changed, 48 insertions(+), 33 deletions(-)

diff --git a/app/extractors/pdfplumber.py b/app/extractors/pdfplumber.py
index a51fa4b..1ede17e 100644
--- a/app/extractors/pdfplumber.py
+++ b/app/extractors/pdfplumber.py
@@ -3,6 +3,7 @@
 
 """PDFPlumber-based PDF extractor."""
 
+import re
 from io import BytesIO
 from typing import Any, Dict, List, Optional
 
@@ -26,18 +27,20 @@ def extract(
         with pdfplumber.open(BytesIO(pdf_bytes)) as pdf:
             page_count = len(pdf.pages)
 
-            # Resolve page selection
             resolved_pages = resolve_pages(pages, page_count)
             page_indices = resolved_pages if resolved_pages else range(page_count)
 
             for page_num in page_indices:
                 page = pdf.pages[page_num]
-                # Extract text with x_tolerance=2 to better detect word boundaries
-                # Default x_tolerance=3 merges words like "PhilipBull" that have gaps
+                page_annots = page.annots or []
+                # x_tolerance=2: the default (3) fuses nearby words, e.g. "PhilipBull"
                 text = page.extract_text(x_tolerance=2) or ""
+                # ORCIDs often exist only as link-annotation URIs, not as page text.
+                # Splice each next to its author so the pairing survives; a bare
+                # appended list misaligns when some authors have no ORCID.
+                text = self._inline_orcids(page, text, page_annots)
                 full_text_parts.append(text)
 
-                # Extract tables
                 page_tables = page.extract_tables()
                 for table in page_tables:
                     if table:
@@ -49,34 +52,21 @@ def extract(
                             }
                         )
 
-                # Extract hyperlinks (annotations with URI)
-                if page.annots:
-                    for annot in page.annots:
-                        uri = annot.get("uri")
-                        if uri:
-                            link_type = self._classify_link(uri)
-                            hyperlinks.append(
-                                {
-                                    "url": uri,
-                                    "page": page_num + 1,
-                                    "type": link_type,
-                                }
-                            )
-
-        # Page text already contains table cell text in reading order; the
-        # structured `tables` are returned in `extra` rather than flattened into
-        # `full_text`, which only duplicated content and added empty-cell noise.
-        full_text = "\n\n".join(full_text_parts)
+                for annot in page_annots:
+                    uri = annot.get("uri")
+                    if uri:
+                        link_type = self._classify_link(uri)
+                        hyperlinks.append(
+                            {
+                                "url": uri,
+                                "page": page_num + 1,
+                                "type": link_type,
+                            }
+                        )
 
-        # Extract ORCID IDs from hyperlinks and add to full_text
-        # This makes ORCIDs discoverable even when they're only in link URLs
-        # (not visible text)
-        orcid_ids = [
-            self._extract_orcid_id(h["url"]) for h in hyperlinks if h["type"] == "orcid"
-        ]
-        orcid_ids = [oid for oid in orcid_ids if oid]  # filter None
-        if orcid_ids:
-            full_text += "\n\nORCID IDs from hyperlinks: " + " ".join(orcid_ids)
+        # Tables go in `extra`, not `full_text`; the page text already holds each
+        # cell in reading order.
+        full_text = "\n\n".join(full_text_parts)
 
         return {
             "full_text": full_text,
@@ -92,9 +82,34 @@ def extract(
             },
         }
 
+    def _inline_orcids(self, page, text: str, annots: list) -> str:
+        """Splice each ORCID inline after the author its icon is anchored to.
+
+        Assumes the icon sits just right of the author on the same line, so the
+        word ending nearest left of it is that author's token (name plus any
+        affiliation marker, e.g. "Bull1,2"). Only the token's first occurrence
+        in ``text`` is tagged, assumed to be the author block atop the page.
+        """
+        words = None
+        for annot in annots:
+            orcid = self._extract_orcid_id(annot.get("uri") or "")
+            if not orcid:
+                continue
+            if words is None:
+                words = page.extract_words(x_tolerance=2)
+            on_line = [
+                w
+                for w in words
+                if abs(w["top"] - annot["top"]) < 6 and w["x1"] <= annot["x0"] + 2
+            ]
+            if not on_line:
+                continue
+            anchor = max(on_line, key=lambda w: w["x1"])["text"].strip(" ,;")
+            if anchor and anchor in text:
+                text = text.replace(anchor, f"{anchor} (ORCID: {orcid})", 1)
+        return text
+
     def _extract_orcid_id(self, url: str) -> str | None:
         """Extract ORCID ID from an orcid.org URL."""
-        import re
-
         match = re.search(r"orcid\.org/(\d{4}-\d{4}-\d{4}-\d{3}[\dX])", url)
         return match.group(1) if match else None

From 9156bd40dde4433b618661461cdfcbf3c37ece26 Mon Sep 17 00:00:00 2001
From: Alex Ioannidis <a.ioannidis@cern.ch>
Date: Tue, 30 Jun 2026 23:12:25 +0200
Subject: [PATCH 3/3] fix(schema): drop ORCIDs that fail the check digit

gpt-oss fabricates 0000-0000-0000-0000 for authors with no ORCID. Filter by the ISO 7064 MOD 11-2 check digit in to_suggestions, not on the LLM output schema: a schema error is fed back to the model, which then invents a checksum-valid fake instead of leaving the entry empty.
---
 app/schemas/metadata_suggestions.py | 41 +++++++++++++++++++++++------
 1 file changed, 33 insertions(+), 8 deletions(-)

diff --git a/app/schemas/metadata_suggestions.py b/app/schemas/metadata_suggestions.py
index 334b5b6..6d583f7 100644
--- a/app/schemas/metadata_suggestions.py
+++ b/app/schemas/metadata_suggestions.py
@@ -5,10 +5,30 @@
 
 # from __future__ import annotations
 
+import re
 from typing import Annotated, Literal
 
 from pydantic import BaseModel, Field, field_validator
 
+_ORCID_RE = re.compile(r"\d{4}-\d{4}-\d{4}-\d{3}[\dX]")
+
+
+def valid_orcid(value: str) -> bool:
+    """Check ORCID shape and ISO 7064 MOD 11-2 check digit.
+
+    Rejects fabrications like ``0000-0000-0000-0000`` (wrong check digit) that
+    the LLM emits for authors with no ORCID, as well as most digit-garbled IDs.
+    """
+    value = value.strip()
+    if not _ORCID_RE.fullmatch(value):
+        return False
+    digits = value.replace("-", "")
+    total = 0
+    for ch in digits[:15]:
+        total = (total + int(ch)) * 2
+    check = (12 - total % 11) % 11
+    return ("X" if check == 10 else str(check)) == digits[15]
+
 
 class Creator(BaseModel):
     """A structured creator/author."""
@@ -28,7 +48,7 @@ class Creator(BaseModel):
             "ORCID identifier as the bare 16-digit ID (four groups of four, "
             "final character may be 'X'), without the orcid.org URL prefix"
         ),
-        examples=["0000-0001-2345-6789", "0000-0002-1234-567X"],
+        examples=["0000-0001-2345-6789", "0000-0001-0002-000X"],
     )
 
     @field_validator("name")
@@ -179,14 +199,19 @@ def to_suggestions(self) -> MetadataSuggestions:
         if self.description:
             suggestions.append(DescriptionSuggestion(value=self.description))
         if self.creators:
-            value = [
-                Creator(
-                    name=name,
-                    orcid=self._at(self.creator_orcids, i) or None,
-                    affiliation=self._at(self.creator_affiliations, i) or None,
+            value = []
+            for i, name in enumerate(self.creators):
+                # Drop ORCIDs that fail the check digit here, in post-processing,
+                # not on the LLM output schema: a schema error would be fed back
+                # and the model would invent a checksum-valid fake to satisfy it.
+                orcid = self._at(self.creator_orcids, i)
+                value.append(
+                    Creator(
+                        name=name,
+                        orcid=orcid if valid_orcid(orcid) else None,
+                        affiliation=self._at(self.creator_affiliations, i) or None,
+                    )
                 )
-                for i, name in enumerate(self.creators)
-            ]
             creators = CreatorsSuggestion(value=value)
             if creators.value:
                 suggestions.append(creators)