From 828ed4365a8e9dec69587c1ac10421ce915d48dc Mon Sep 17 00:00:00 2001
From: Tristan Farmer <159447266+001TMF@users.noreply.github.com>
Date: Sun, 14 Jun 2026 18:06:32 +0100
Subject: [PATCH] fix(ncbivirus): honor Query.Limit as a walk ceiling; harden
 reconciliation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The ncbi-virus connector ignored NormalizedQuery.Limit entirely, violating the documented
walk-ceiling contract (query.go: "connectors honor plan.Query.Limit as a walk ceiling so a
huge upstream is never fully fetched"). A search therefore always paged through and fetched
EVERY matching record (one GET per accession) — a filtered viral query could take minutes or
hang under NCBI's unauthenticated rate limit. Now `pinakes search --limit 1` returns the
authoritative count in ~1 request.

Honor the limit:
- Bound the list walk and the per-accession fetch by Limit, but ONLY when limit < authority
  (strict): a limited walk is then always incomplete -> BestEffort, never a false Complete.
- Truncate to the first Limit accessions in imposed (sorted) order.
- On a limit-bounded BestEffort, expose the authoritative total via Completeness.AuthoritativeCount
  (it is known from the page-0 x-ncbi-total-count header), so a caller asking only for a count
  gets the real number from a cheap limited fetch.

The cross-model determinism gate then surfaced two PRE-EXISTING false-Complete paths in the
reconciliation (latent, unrelated to the limit change), both fixed here:
- Over-listed + 404: a page enumerating MORE distinct accessions than the total claims, where a
  404 brings the retrieved count back to the total, falsely Completed. The Complete gate now also
  requires observedDistinct == authority (distinct accessions enumerated, captured before
  truncation).
- Covered-but-not-exhausted cursor: the walk stopped at len(ids) >= total even with a next-page
  token still pending, so an under-reporting total Completed early (and the hash depended on
  server page order). The walk now drains the cursor to nextToken=="" (pageBudget still bounds
  runaways); observedDistinct == authority downgrades any over-/under-enumeration.

Six non-vacuous regression tests (each proven by revert). make ci green. Verified by the
cross-model gate across four rounds (rounds 1-3 each caught a real HIGH; round 4 clean).
Connectors-only change (no engine/idl/schema).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 connectors/ncbivirus/ncbivirus.go      |  61 ++++++++-
 connectors/ncbivirus/ncbivirus_test.go | 166 +++++++++++++++++++++++++
 2 files changed, 222 insertions(+), 5 deletions(-)

diff --git a/connectors/ncbivirus/ncbivirus.go b/connectors/ncbivirus/ncbivirus.go
index 89cf6ac..8a64f98 100644
--- a/connectors/ncbivirus/ncbivirus.go
+++ b/connectors/ncbivirus/ncbivirus.go
@@ -192,6 +192,13 @@ func (c *Connector) Retrieve(p contract.Plan, gov *contract.Governor) (contract.
 
 	searchPath := len(st.IDs) == 0
 
+	// limit is the caller's hard ceiling (NormalizedQuery.Limit), honored as a WALK
+	// CEILING per the contract: a huge upstream is never fully fetched for a small
+	// limit. A limit below the authoritative total makes the result necessarily
+	// incomplete (the Complete gate below still requires fetched == authority), so a
+	// limit can never produce a false Complete — only a BestEffort partial. 0 = unlimited.
+	limit := p.Query.Limit
+
 	var (
 		ids                             []string
 		startCount, endCount, authority int64
@@ -294,11 +301,21 @@ func (c *Connector) Retrieve(p contract.Plan, gov *contract.Governor) (contract.
 				seen[id] = true
 				ids = append(ids, id)
 			}
-			// Terminate as soon as the distinct set covers the authoritative total, or a
-			// page returns no rows and no next token.
-			if authority > 0 && int64(len(ids)) >= authority {
+			// Walk ceiling: once the distinct set reaches the caller's limit, no further
+			// pages are needed — but ONLY when the limit is STRICTLY below the authoritative
+			// total. That guard is load-bearing: it guarantees a limited walk is always
+			// incomplete (fetched <= limit < authority => BestEffort), so the limit can
+			// never mask an over-observation into a false Complete. When limit >= authority
+			// the limit is non-constraining and the normal cover/total logic applies.
+			if limit > 0 && authority > 0 && limit < authority && int64(len(ids)) >= limit {
 				break
 			}
+			// Drain the cursor to genuine exhaustion (no next-page token) rather than
+			// stopping as soon as the distinct set covers the authoritative total: a total
+			// that UNDER-reports while the cursor keeps emitting pages would otherwise yield
+			// a false Complete (and a hash that depends on which records the server placed
+			// before the token). The total-derived pageBudget bounds a runaway cursor, and
+			// the observedDistinct == authority gate downgrades any over-/under-enumeration.
 			if nextToken == "" {
 				break
 			}
@@ -313,8 +330,26 @@ func (c *Connector) Retrieve(p contract.Plan, gov *contract.Governor) (contract.
 	// list before fetching. This (with Normalize's slice sorting + the per-accession
 	// re-keying below) discards the server's release-date-desc list order and its
 	// multi-accession reordering before anything is hashed.
+	// observedDistinct is the number of DISTINCT accessions the list walk enumerated,
+	// captured BEFORE any limit truncation. A Complete requires this to equal the
+	// authoritative total: enumerating MORE distinct accessions than the total claims is
+	// an inconsistency (an under-reported total) that must downgrade even when 404-skipped
+	// fetches happen to bring the retrieved count back down to the total.
+	observedDistinct := int64(len(ids))
+
 	sort.Strings(ids)
 
+	// Truncate to the first `limit` accessions in the imposed (sorted) order, but ONLY
+	// when limit is STRICTLY below the authoritative total (the same guard as the walk
+	// ceiling). This bounds the per-accession fetch for a small limit while guaranteeing
+	// the result stays incomplete (BestEffort). Truncating when limit == authority could
+	// drop an OVER-observed id (the page returned more rows than the total claims) and
+	// fake fetched == authority — a false Complete. When limit >= authority we keep every
+	// observed id so an under-reported total still downgrades.
+	if limit > 0 && authority > 0 && limit < authority && int64(len(ids)) > limit {
+		ids = ids[:limit]
+	}
+
 	// Fetch one verbatim per-accession ndjson report line per DISTINCT accession.
 	var entryVersion string // first per-accession version seen (the get/resolve authority + search-path fallback)
 	raws := make([]contract.RawRecord, 0, len(ids))
@@ -370,7 +405,10 @@ func (c *Connector) Retrieve(p contract.Plan, gov *contract.Governor) (contract.
 	}
 
 	fetched := int64(len(raws))
-	if st.Reconcile && totalKnown && !walkTruncated && !sawDuplicate && !totalMutated && !versionDrift && fetched == authority {
+	// limitApplied: the caller's limit constrained the walk below the authoritative
+	// total. Reported as the BestEffort reason (the count is still authoritative).
+	limitApplied := limit > 0 && totalKnown && authority > 0 && limit < authority
+	if st.Reconcile && totalKnown && !walkTruncated && !sawDuplicate && !totalMutated && !versionDrift && observedDistinct == authority && fetched == authority {
 		// Reconciled: the authoritative total was KNOWN and stable across the whole
 		// (bounded, re-list-free) walk, the version was coherent, and every record was
 		// fetched. Only then is the result a true Complete.
@@ -383,6 +421,8 @@ func (c *Connector) Retrieve(p contract.Plan, gov *contract.Governor) (contract.
 		reason = fmt.Sprintf("ncbivirus: reconcile_count disabled; fetched %d (no authoritative total to reconcile)", fetched)
 	case !totalKnown:
 		reason = fmt.Sprintf("ncbivirus: upstream reported no authoritative x-ncbi-total-count; fetched %d — total unknown, cannot claim Complete", fetched)
+	case limitApplied:
+		reason = fmt.Sprintf("ncbivirus: stopped at the requested limit of %d (authoritative total %d) — partial by request, count not reconciled", limit, authority)
 	case walkTruncated:
 		reason = truncReason
 	case sawDuplicate:
@@ -391,10 +431,21 @@ func (c *Connector) Retrieve(p contract.Plan, gov *contract.Governor) (contract.
 		reason = fmt.Sprintf("ncbivirus: x-ncbi-total-count mutated mid-walk (start=%d end=%d); fetched %d — count not reconciled", startCount, endCount, fetched)
 	case versionDrift:
 		reason = fmt.Sprintf("ncbivirus: x-datasets-version drifted mid-walk; fetched %d — version not coherent", fetched)
+	case observedDistinct != authority:
+		reason = fmt.Sprintf("ncbivirus: enumerated %d distinct accessions vs an authoritative total of %d — inconsistent enumeration, count not reconciled", observedDistinct, authority)
 	default:
 		reason = fmt.Sprintf("ncbivirus: fetched %d of %d (some accessions missing/404) — count not reconciled", fetched, authority)
 	}
-	return rs, contract.NewBestEffort(fetched, reason), nil
+	comp := contract.NewBestEffort(fetched, reason)
+	if limitApplied {
+		// The authoritative total IS known (the page-0 x-ncbi-total-count header); the
+		// walk was stopped at the caller's limit by request, not because the total is
+		// untrustworthy. Expose it so a caller asking only for a count gets the real
+		// total from a cheap limited fetch. (Other BestEffort cases leave it zero
+		// because their total is absent or proven unstable.)
+		comp.AuthoritativeCount = authority
+	}
+	return rs, comp, nil
 }
 
 // listPage performs one dataset_report list GET under a Governor permit and returns
diff --git a/connectors/ncbivirus/ncbivirus_test.go b/connectors/ncbivirus/ncbivirus_test.go
index ccad71b..3e74908 100644
--- a/connectors/ncbivirus/ncbivirus_test.go
+++ b/connectors/ncbivirus/ncbivirus_test.go
@@ -330,6 +330,172 @@ func TestRetrieve_CompleteReconcilesFromHeaderTotal(t *testing.T) {
 	}
 }
 
+// TestRetrieve_LimitBoundsWalk: a Query.Limit below the authoritative total is honored
+// as a walk ceiling. The result is BestEffort (NOT a false Complete) truncated to Limit
+// records, but still reports the authoritative total so a caller asking only for a count
+// gets the real number from a cheap limited fetch.
+func TestRetrieve_LimitBoundsWalk(t *testing.T) {
+	f := stdListDoer(t) // single page, 3 accessions, x-ncbi-total-count = 3
+	c := newConn(t, f)
+	q := searchQuery()
+	q.Limit = 1
+	p := retrievePlan(t, c, q)
+
+	rs, comp, err := c.Retrieve(p, noGov())
+	if err != nil {
+		t.Fatalf("Retrieve: %v", err)
+	}
+	if comp.State != contract.BestEffortState {
+		t.Fatalf("a Limit below the total must NOT Complete; want BestEffort, got %s (%s)", comp.State, comp.Reason)
+	}
+	if comp.ReconciledCount != 1 {
+		t.Fatalf("expected 1 record reconciled (the limit), got %d", comp.ReconciledCount)
+	}
+	if comp.AuthoritativeCount != 3 {
+		t.Fatalf("the authoritative total (3) must be reported even on a limited fetch, got %d", comp.AuthoritativeCount)
+	}
+	if len(rs.Records) != 1 {
+		t.Fatalf("records must be truncated to the limit (1), got %d", len(rs.Records))
+	}
+}
+
+// TestRetrieve_LimitAtOrAboveTotalStillCompletes is the over-rejection guard: a Limit
+// that does not constrain the result (>= the authoritative total) must NOT downgrade a
+// legitimately complete walk.
+func TestRetrieve_LimitAtOrAboveTotalStillCompletes(t *testing.T) {
+	f := stdListDoer(t)
+	c := newConn(t, f)
+	q := searchQuery()
+	q.Limit = 5 // > total of 3
+	p := retrievePlan(t, c, q)
+
+	rs, comp, err := c.Retrieve(p, noGov())
+	if err != nil {
+		t.Fatalf("Retrieve: %v", err)
+	}
+	if comp.State != contract.CompleteState || comp.ReconciledCount != 3 || comp.AuthoritativeCount != 3 {
+		t.Fatalf("a non-constraining limit must still Complete(3,3), got %s %d/%d", comp.State, comp.ReconciledCount, comp.AuthoritativeCount)
+	}
+	if len(rs.Records) != 3 {
+		t.Fatalf("expected all 3 records, got %d", len(rs.Records))
+	}
+}
+
+// TestRetrieve_LimitStopsPaginationEarly proves the walk ceiling bounds the LIST walk
+// itself: with a multi-page upstream and Limit=1, the connector stops after page 1 and
+// never requests page 2 nor the skipped per-accession reports.
+func TestRetrieve_LimitStopsPaginationEarly(t *testing.T) {
+	f := newFakeDoer()
+	listURL := listURLForTaxon(t, "2697049", 1000)
+	// page1 lists NC_045512.2 + NC_002549.1 (server order), total 3, a next-page token.
+	f.set("GET", listURL, 200, loadFixture(t, "list_page1.ndjson"), hdr("3", "TOKEN2", "18.30.1"))
+	// page2 is registered but must NEVER be fetched once the limit is reached on page 1.
+	page2URL := appendPageToken(listURL, "TOKEN2")
+	f.set("GET", page2URL, 200, loadFixture(t, "list_page2.ndjson"), hdr("3", "", "18.30.1"))
+	// After sort+truncate to 1, the single fetched accession is NC_002549.1.
+	f.set("GET", reportURL("NC_002549.1"), 200, loadFixture(t, "report_NC_002549.1.ndjson"), hdr("", "", "18.30.1"))
+	c := newConn(t, f)
+	q := searchQuery()
+	q.Limit = 1
+	p := retrievePlan(t, c, q)
+
+	rs, comp, err := c.Retrieve(p, noGov())
+	if err != nil {
+		t.Fatalf("Retrieve: %v", err)
+	}
+	if comp.State != contract.BestEffortState || comp.AuthoritativeCount != 3 || len(rs.Records) != 1 {
+		t.Fatalf("want BestEffort with authoritative 3 and 1 record, got %s auth=%d n=%d", comp.State, comp.AuthoritativeCount, len(rs.Records))
+	}
+	// Only page 1 + the one truncated report were fetched (2 calls): page 2 and the other
+	// reports were never requested — the walk ceiling stopped pagination early.
+	if f.callCount() != 2 {
+		t.Fatalf("walk ceiling must stop after page 1 (want 2 calls: list + 1 report), got %d", f.callCount())
+	}
+}
+
+// TestRetrieve_LimitEqualUnderReportedTotalStillDowngrades is the cross-model gate's
+// catch: the list page returns MORE distinct accessions (3) than the authoritative total
+// claims (2). A Limit equal to that under-reported total must NOT let truncation drop the
+// extra id and fake fetched == authority — the over-observation must still downgrade to
+// BestEffort. (Truncation only applies when limit < authority, so this stays a full,
+// honest fetch that exposes the inconsistency.)
+func TestRetrieve_LimitEqualUnderReportedTotalStillDowngrades(t *testing.T) {
+	f := newFakeDoer()
+	// 3 distinct accessions on the page, but x-ncbi-total-count claims only 2.
+	f.set("GET", listURLForTaxon(t, "2697049", 1000), 200, loadFixture(t, "list_3ids.ndjson"), hdr("2", "", "18.30.1"))
+	f.set("GET", reportURL("NC_045512.2"), 200, loadFixture(t, "report_NC_045512.2.ndjson"), hdr("", "", "18.30.1"))
+	f.set("GET", reportURL("NC_002549.1"), 200, loadFixture(t, "report_NC_002549.1.ndjson"), hdr("", "", "18.30.1"))
+	f.set("GET", reportURL("NC_001498.1"), 200, loadFixture(t, "report_NC_001498.1.ndjson"), hdr("", "", "18.30.1"))
+	c := newConn(t, f)
+	q := searchQuery()
+	q.Limit = 2 // == the under-reported total; truncation must NOT engage
+	p := retrievePlan(t, c, q)
+
+	rs, comp, err := c.Retrieve(p, noGov())
+	if err != nil {
+		t.Fatalf("Retrieve: %v", err)
+	}
+	if comp.State != contract.BestEffortState {
+		t.Fatalf("a limit equal to an under-reported total must not let truncation fake a Complete; want BestEffort, got %s (%s)", comp.State, comp.Reason)
+	}
+	if len(rs.Records) != 3 {
+		t.Fatalf("every observed accession must be kept (no truncation at limit==authority), got %d records", len(rs.Records))
+	}
+}
+
+// TestRetrieve_OverListedWith404StillDowngrades is the round-2 gate's catch — a
+// pre-existing complete-or-fail gap, independent of Limit. The list page enumerates 3
+// distinct accessions but the authoritative total claims only 2, and one over-listed
+// accession 404s on fetch, so the retrieved count (2) coincidentally equals the total.
+// Enumerating more distinct accessions than the total claims is an inconsistency that
+// must downgrade: the Complete gate also requires the observed distinct count to equal
+// the authoritative total.
+func TestRetrieve_OverListedWith404StillDowngrades(t *testing.T) {
+	f := newFakeDoer()
+	f.set("GET", listURLForTaxon(t, "2697049", 1000), 200, loadFixture(t, "list_3ids.ndjson"), hdr("2", "", "18.30.1")) // 3 ids, total claims 2
+	f.set("GET", reportURL("NC_045512.2"), 200, loadFixture(t, "report_NC_045512.2.ndjson"), hdr("", "", "18.30.1"))
+	f.set("GET", reportURL("NC_002549.1"), 200, loadFixture(t, "report_NC_002549.1.ndjson"), hdr("", "", "18.30.1"))
+	f.set("GET", reportURL("NC_001498.1"), 404, nil, hdr("", "", "18.30.1")) // one over-listed accession 404s
+	c := newConn(t, f)
+	p := retrievePlan(t, c, searchQuery()) // no Limit — this guards the core complete-or-fail gate
+
+	_, comp, err := c.Retrieve(p, noGov())
+	if err != nil {
+		t.Fatalf("Retrieve: %v", err)
+	}
+	if comp.State != contract.BestEffortState {
+		t.Fatalf("observing 3 distinct accessions when the total claims 2 must downgrade even if a 404 makes fetched==total; got %s (%s)", comp.State, comp.Reason)
+	}
+}
+
+// TestRetrieve_TotalCoveredButCursorNotExhaustedDowngrades is the round-3 gate's catch:
+// page 1's accessions already "cover" the header total (2), but page 1 ALSO emits a
+// next-page token and page 2 enumerates a 3rd matching accession (the total under-
+// reported). The walk must drain the cursor to exhaustion rather than stopping at
+// len(ids) >= total, observe 3 distinct accessions, and downgrade — never Complete(2,2).
+func TestRetrieve_TotalCoveredButCursorNotExhaustedDowngrades(t *testing.T) {
+	f := newFakeDoer()
+	listURL := listURLForTaxon(t, "2697049", 1000)
+	// page1: 2 accessions, total claims 2, BUT a next-page token is present.
+	f.set("GET", listURL, 200, loadFixture(t, "list_page1.ndjson"), hdr("2", "TOKEN2", "18.30.1"))
+	// page2: a 3rd accession, still total 2, last page.
+	page2URL := appendPageToken(listURL, "TOKEN2")
+	f.set("GET", page2URL, 200, loadFixture(t, "list_page2.ndjson"), hdr("2", "", "18.30.1"))
+	f.set("GET", reportURL("NC_045512.2"), 200, loadFixture(t, "report_NC_045512.2.ndjson"), hdr("", "", "18.30.1"))
+	f.set("GET", reportURL("NC_002549.1"), 200, loadFixture(t, "report_NC_002549.1.ndjson"), hdr("", "", "18.30.1"))
+	f.set("GET", reportURL("NC_001498.1"), 200, loadFixture(t, "report_NC_001498.1.ndjson"), hdr("", "", "18.30.1"))
+	c := newConn(t, f)
+	p := retrievePlan(t, c, searchQuery()) // no Limit
+
+	_, comp, err := c.Retrieve(p, noGov())
+	if err != nil {
+		t.Fatalf("Retrieve: %v", err)
+	}
+	if comp.State != contract.BestEffortState {
+		t.Fatalf("a covered-but-not-exhausted cursor (under-reported total) must downgrade, not Complete; got %s (%s)", comp.State, comp.Reason)
+	}
+}
+
 func TestRetrieve_TokenWalkCollectsAllPages(t *testing.T) {
 	f := newFakeDoer()
 	listURL := listURLForTaxon(t, "2697049", 1000)