From 309e91825d91c02ffb72a774bd208da96f18a559 Mon Sep 17 00:00:00 2001 From: Jake Bromberg Date: Fri, 5 Jun 2026 06:03:01 -0700 Subject: [PATCH] fix(prompts,baker): revert iter-1 prompt + strip crossed_out at bake time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The iter-1 prompt (PR #63) recovered double_height by forcing the model into a single-Entry shape on the upper grid row. Pilot re-bake on 1990-04apr1318 (11 pages, isolated to data/pilot-bundles-v2/) exposed two regression classes vs. that approach: (a) truncated raw_text on rows the model failed to classify as double_height (e.g. "BILL MONROE - Travelin This" instead of "... Travelin This Lonesome Road"), because the prompt told it to fold the lower line into the upper but it could not read both as one piece; (b) spurious crossed_out tags on a thematic cluster of religious-themed entries — documented as OCR-context drift, unfixable at the prompt level. The structural answer: stop fighting the model. core/prompts.py now lets Gemini emit its natural split-row shape (continuation as its own Entry on the lower grid row) and trusts the existing scripts.make_verifier_bundle._merge_with_spans to fold the wrap into the prior entry at write time, tagging the merged result double_height — which is what the baker has been designed to do since the beginning. The model reads both printed lines when allowed to split, so the truncation pathology disappears. For the crossed_out class, the baker now strips notes=="crossed_out" -> None during _merge_with_spans (stripped first so a crossed_out predecessor can still absorb a following continuation row). The raw_text is preserved verbatim — only the notes value is reset. Pilot results on the same 11-page sample — crossed_out emits: OLD 17 / iter-1 16 / this PR 0; double_height emits: OLD 18 / iter-1 25 / this PR 29; total bundle rows: 677/677/677 (no row-shape regressions); truncation regressions: iter-1 had 2, this PR has 0. The BILL MONROE case specifically: deployed had it correct; iter-1 truncated; this PR has it correct again, tagged double_height via the baker merge. Updated prompt-contract tests to assert the new split-shape directive ("its own Entry on the lower" + "do not try to inline the wrap") and dropped the iter-1 single-Entry assertions. Two new baker tests pin the crossed_out strip and confirm it composes correctly with the continuation merge. --- core/prompts.py | 78 ++++++++++--------------- scripts/make_verifier_bundle.py | 16 ++++- tests/unit/test_make_verifier_bundle.py | 37 ++++++++++++ tests/unit/test_prompts.py | 52 ++++++----------- 4 files changed, 101 insertions(+), 82 deletions(-) diff --git a/core/prompts.py b/core/prompts.py index da65c20..0bd4f87 100644 --- a/core/prompts.py +++ b/core/prompts.py @@ -70,30 +70,21 @@ ONLY when the visual cue is unambiguous. Read each definition before choosing a tag. - * "double_height" — ONE handwritten entry that visually spans two - printed grid rows. The DJ wrote a single song's text but the - handwriting is tall, or the artist/track text flows from the upper - printed line down into the lower printed line as one continuous - piece of writing. Emit a SINGLE Entry on the UPPER printed grid - row, with raw_text containing the whole song's text, and tag it - "double_height". Do NOT also emit a separate Entry on the lower - printed grid row for the same handwritten content — that produces - a phantom duplicate row. Example: the artist "Buffy Sainte-Marie" - is written on the printed row N and the track "God is Alive, - Magic is Afoot" continues on printed row N+1 as one piece of - handwriting → emit one Entry with row_index=N, raw_text="Buffy - Sainte-Marie - God is Alive, Magic is Afoot", notes="double_height". - - * "continuation" — RARE in this corpus. Use ONLY when the lower - printed grid row contains a SEPARATE handwritten fragment that - completes a song started on the row above (different ink stroke, - an explicit arrow or bracket carrying the text down, or the DJ - clearly ran out of room and re-wrote the tail of the entry below). - Emit it as its own Entry on the lower grid row, with the - continuation fragment as raw_text and notes="continuation". - If you find yourself wanting to use "continuation" because the - handwriting itself is tall and flows visually across two rows, - use "double_height" instead — that is the same-entry case. + * "double_height" — a single handwritten entry whose handwriting + is tall enough to occupy two printed grid rows but is written as + one continuous piece. Emit a SINGLE Entry on the upper printed + grid row with the whole song's text in raw_text and notes set to + "double_height". (If the entry instead reads as two visually + distinct fragments — e.g. the artist on the upper row and a + clearly separate track-name fragment on the lower row — use the + continuation tag below instead.) + + * "continuation" — the entry's text wraps onto the next printed + grid row. Emit it as its own Entry on the lower printed grid + row, with that wrap-fragment as raw_text and notes set to + "continuation". A downstream baker merges these into the prior + row at write time, so transcribe both rows verbatim — do not + try to inline the wrap into the prior entry yourself. * "crossed_out" — a clear horizontal line (or angry scribble) drawn THROUGH the artist/track text, indicating the DJ retracted the @@ -211,30 +202,21 @@ ONLY when the visual cue is unambiguous. Read each definition before choosing a tag. - * "double_height" — ONE handwritten entry that visually spans two - printed grid rows. The DJ wrote a single song's text but the - handwriting is tall, or the artist/track text flows from the upper - printed line down into the lower printed line as one continuous - piece of writing. Emit a SINGLE Entry on the UPPER printed grid - row, with raw_text containing the whole song's text, and tag it - "double_height". Do NOT also emit a separate Entry on the lower - printed grid row for the same handwritten content — that produces - a phantom duplicate row. Example: the artist "Buffy Sainte-Marie" - is written on the printed row N and the track "God is Alive, - Magic is Afoot" continues on printed row N+1 as one piece of - handwriting → emit one Entry with row_index=N, raw_text="Buffy - Sainte-Marie - God is Alive, Magic is Afoot", notes="double_height". - - * "continuation" — RARE in this corpus. Use ONLY when the lower - printed grid row contains a SEPARATE handwritten fragment that - completes a song started on the row above (different ink stroke, - an explicit arrow or bracket carrying the text down, or the DJ - clearly ran out of room and re-wrote the tail of the entry below). - Emit it as its own Entry on the lower grid row, with the - continuation fragment as raw_text and notes="continuation". - If you find yourself wanting to use "continuation" because the - handwriting itself is tall and flows visually across two rows, - use "double_height" instead — that is the same-entry case. + * "double_height" — a single handwritten entry whose handwriting + is tall enough to occupy two printed grid rows but is written as + one continuous piece. Emit a SINGLE Entry on the upper printed + grid row with the whole song's text in raw_text and notes set to + "double_height". (If the entry instead reads as two visually + distinct fragments — e.g. the artist on the upper row and a + clearly separate track-name fragment on the lower row — use the + continuation tag below instead.) + + * "continuation" — the entry's text wraps onto the next printed + grid row. Emit it as its own Entry on the lower printed grid + row, with that wrap-fragment as raw_text and notes set to + "continuation". A downstream baker merges these into the prior + row at write time, so transcribe both rows verbatim — do not + try to inline the wrap into the prior entry yourself. * "crossed_out" — a clear horizontal line (or angry scribble) drawn THROUGH the artist/track text, indicating the DJ retracted the diff --git a/scripts/make_verifier_bundle.py b/scripts/make_verifier_bundle.py index da344b2..1176689 100644 --- a/scripts/make_verifier_bundle.py +++ b/scripts/make_verifier_bundle.py @@ -103,6 +103,12 @@ def _merge_with_spans(entries: list[Entry]) -> list[tuple[Entry, int]]: - notes="continuation": folds into the previous logical entry's raw_text (verbatim with the existing merge rules) and adds 1 to its span. - notes="double_height": stays as a single logical entry but spans 2 rows. + - notes="crossed_out": stripped to None before any further processing. + Empirical precision of Gemini's `crossed_out` is ~22% on the + verified corpus; surfacing the tag generates more false-positive + review work than true-positive value. The raw_text is preserved + verbatim — only the notes value is reset, so Alex can mark genuine + strike-throughs by toggling the dropdown. - All others: span 1. A leading "continuation" with nothing above it is preserved as-is with @@ -112,7 +118,15 @@ def _merge_with_spans(entries: list[Entry]) -> list[tuple[Entry, int]]: a verifier-geometry concern. The on-disk pipeline doesn't need it. """ result: list[tuple[Entry, int]] = [] - for entry in entries: + for raw_entry in entries: + # Strip unreliable `crossed_out` tags before any merge / span logic. + # Done first so a stripped crossed_out predecessor can still absorb + # a following continuation row instead of blocking the merge. + entry = ( + raw_entry.model_copy(update={"notes": None}) + if raw_entry.notes == "crossed_out" + else raw_entry + ) if entry.notes == "continuation" and result: prior, prior_span = result[-1] joined = f"{prior.raw_text.rstrip()} {entry.raw_text.lstrip()}".strip() diff --git a/tests/unit/test_make_verifier_bundle.py b/tests/unit/test_make_verifier_bundle.py index b51c090..ca45694 100644 --- a/tests/unit/test_make_verifier_bundle.py +++ b/tests/unit/test_make_verifier_bundle.py @@ -343,6 +343,43 @@ def test_merge_with_spans_empty_input() -> None: assert _merge_with_spans([]) == [] +def test_merge_with_spans_drops_crossed_out_tag() -> None: + """Empirical precision of Gemini's `crossed_out` is ~22% (8 false + positives per 11 emits, measured n=20 on 1990-04apr0106 and reproduced + on 1990-04apr1318). Stripping the tag at bake time eliminates the + false-positive review action; Alex marks the few true positives + himself by toggling the dropdown. The raw_text is preserved verbatim — + only the notes value is reset.""" + entries = [ + Entry(row_index=0, raw_text="Pixies - Debaser", confidence="high", notes="crossed_out"), + Entry(row_index=1, raw_text="Sonic Youth - Sugar Kane", confidence="high"), + ] + result = _merge_with_spans(entries) + assert len(result) == 2 + merged_first, span_first = result[0] + assert merged_first.notes is None, "expected crossed_out to be stripped from the bundle output" + assert merged_first.raw_text == "Pixies - Debaser" + assert span_first == 1 + + +def test_merge_with_spans_drops_crossed_out_before_continuation_merge() -> None: + """`crossed_out` stripping must happen before the continuation merge, + so a crossed_out predecessor doesn't suppress the merge or carry the + tag onto a logically-multi-row entry.""" + entries = [ + Entry(row_index=0, raw_text="Galaxie 500 -", confidence="high", notes="crossed_out"), + Entry(row_index=1, raw_text="Tugboat", confidence="medium", notes="continuation"), + ] + result = _merge_with_spans(entries) + assert len(result) == 1 + merged, span = result[0] + assert merged.raw_text == "Galaxie 500 - Tugboat" + assert span == 2 + # The merged predecessor's tag should be `double_height` from the merge + # rule, NOT `crossed_out` (which we just stripped). + assert merged.notes == "double_height" + + # -- make_bundle ------------------------------------------------------------ diff --git a/tests/unit/test_prompts.py b/tests/unit/test_prompts.py index cea4cc7..9de6d22 100644 --- a/tests/unit/test_prompts.py +++ b/tests/unit/test_prompts.py @@ -41,30 +41,19 @@ def test_prompt_lists_every_phase1_notes_tag(tag: str) -> None: assert tag in PAGE_EXTRACTION_PROMPT -def test_prompt_double_height_emits_single_entry_on_upper_row() -> None: - """Drift observed 2026-06-04: fresh Gemini was splitting a 2-grid-row - handwritten entry into two Entries and tagging the second one - `continuation`, instead of emitting one Entry tagged `double_height`. - The prompt must explicitly direct against the split shape — a single - `double_height` definition without the negation reproduces the drift.""" +def test_prompt_continuation_describes_split_shape_for_wraps() -> None: + """For multi-line wraps, the model should emit the wrap as a separate + Entry tagged `continuation` (the natural split shape). The bundle baker + merges those into the prior row at write time, so the prompt must NOT + ask the model to inline the wrap itself — that would suppress the + second printed line's text on cases the model can't visually classify + as one-vs-two entries.""" text = PAGE_EXTRACTION_PROMPT - assert "SINGLE Entry" in text - # The negation: do not also emit a row on the lower line. Allow either - # "separate" or "second" wording — both are valid phrasings of the rule. - # Normalise whitespace because the prompt is wrapped. - normalised = " ".join(text.lower().split()) - assert "do not also emit a separate entry on the lower" in normalised or ( - "do not also emit a second entry on the lower" in normalised - ) - - -def test_prompt_continuation_definition_excludes_tall_handwriting() -> None: - """`continuation` is for the SEPARATE-fragment case (visible arrow / - re-write below). The wording must steer the model away from using - `continuation` for tall handwriting that spans two grid rows — that - case belongs to `double_height`.""" - # The directive: tall handwriting routes to double_height, not continuation. - assert 'use "double_height" instead' in PAGE_EXTRACTION_PROMPT + # The wrap-fragment-as-its-own-Entry directive. + assert "its own Entry on the lower" in text + # The "don't inline" negation that prevents the iter-1 truncation regression. + lowered = " ".join(text.lower().split()) + assert "do not try to inline the wrap" in lowered def test_prompt_crossed_out_excludes_margin_marks() -> None: @@ -234,17 +223,14 @@ def test_quadrant_template_lists_every_phase1_notes_tag(tag: str) -> None: assert tag in QUADRANT_EXTRACTION_PROMPT_TEMPLATE -def test_quadrant_template_double_height_emits_single_entry_on_upper_row() -> None: +def test_quadrant_template_continuation_describes_split_shape_for_wraps() -> None: + """Parallel of the PAGE version: wraps emit as their own Entry tagged + `continuation`, and the prompt must NOT ask the model to inline the + wrap into the prior row.""" text = QUADRANT_EXTRACTION_PROMPT_TEMPLATE - assert "SINGLE Entry" in text - normalised = " ".join(text.lower().split()) - assert "do not also emit a separate entry on the lower" in normalised or ( - "do not also emit a second entry on the lower" in normalised - ) - - -def test_quadrant_template_continuation_excludes_tall_handwriting() -> None: - assert 'use "double_height" instead' in QUADRANT_EXTRACTION_PROMPT_TEMPLATE + assert "its own Entry on the lower" in text + lowered = " ".join(text.lower().split()) + assert "do not try to inline the wrap" in lowered def test_quadrant_template_crossed_out_excludes_margin_marks() -> None: