Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions case-studies/ebola/manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"manifest_schema_version": "1.0.0",
"engine_version": "pinakes/0.1.0",
"connector_spec_version": "ncbi-virus/1.0.0",
"filter_logic_hash": "sha256:089fe8548c81dc1e871e08175f7a567e7cb9e94c7f6e32f6f1f8e597d198c4ca",
"filter_logic_hash": "sha256:82248fc9f8a959e1e53172618961d757ceb65afd2d8f5f9c98b9bb1f80a91039",
"serializer_codec": {
"serializer_version": "1.0.0",
"digest_algorithm": "sha256",
Expand Down Expand Up @@ -49,7 +49,7 @@
"logical_record_hash": "sha256:01fffd0c228c9f46e28f472a75b2e58b978ef9e565909f2838aa507bd44120e9",
"schema_version": "sequences/1.0.0",
"completeness": {
"state": 0,
"state": "complete",
"reconciled_count": 32,
"authoritative_count": 32
},
Expand Down
249 changes: 249 additions & 0 deletions connectors/ncbivirus/export_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,249 @@
package ncbivirus

import (
"bytes"
"context"
"fmt"
"net/url"
"strings"
"testing"

"pinakes.sh/pinakes/engine/contract"
)

// efetchBatchURL returns the efetch URL the tests register with the fake doer for
// one batch: the given accessions are comma-joined into the "id" parameter of a
// nuccore FASTA/text request. url.Values.Encode emits the parameters sorted by
// key, so the resulting string is stable for a given accession list.
func efetchBatchURL(accs ...string) string {
	const endpoint = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
	query := url.Values{
		"db":      {"nuccore"},
		"id":      {strings.Join(accs, ",")},
		"rettype": {"fasta"},
		"retmode": {"text"},
	}
	return endpoint + query.Encode()
}

// recs turns a list of accessions into normalized records, one per accession, in
// the same order. Only LogicalID ("ncbi-virus:" + accession) and SourceAccession
// are populated; every other field is left at its zero value.
func recs(accs ...string) []contract.NormalizedRecord {
	records := make([]contract.NormalizedRecord, len(accs))
	for i, acc := range accs {
		records[i] = contract.NormalizedRecord{
			LogicalID:       "ncbi-virus:" + acc,
			SourceAccession: acc,
		}
	}
	return records
}

// fastaMulti builds a synthetic multi-record FASTA payload: for each accession, in
// the order given, a ">ACCESSION synthetic" header line followed by one residue
// line. The residue line is "ACGT" repeated 15-18 times, the count depending only
// on the accession's final byte, so reordering the accessions reorders the records
// without changing any record's sequence. Accessions must be non-empty.
func fastaMulti(accs ...string) []byte {
	var payload bytes.Buffer
	for i := range accs {
		acc := accs[i]
		// Keyed on the accession text (its last byte), never on the loop index.
		lastByte := acc[len(acc)-1]
		repeats := 15 + int(lastByte)%4
		payload.WriteString(">")
		payload.WriteString(acc)
		payload.WriteString(" synthetic\n")
		payload.WriteString(strings.Repeat("ACGT", repeats))
		payload.WriteString("\n")
	}
	return payload.Bytes()
}

// headerCount reports how many lines of out begin with '>': a '>' as the very
// first byte counts once, plus one for every '>' that directly follows a newline.
func headerCount(out []byte) int {
	count := 0
	if bytes.HasPrefix(out, []byte(">")) {
		count = 1
	}
	return count + bytes.Count(out, []byte("\n>"))
}

// TestExportFASTA_CanonicalAndOrdered exports three accessions and checks that the
// connector issued exactly one efetch request, emitted three records with bare
// ">ACCESSION" headers (the upstream "synthetic" description removed), and kept the
// records in the order they were requested.
func TestExportFASTA_CanonicalAndOrdered(t *testing.T) {
	accs := []string{"KM1", "KM2", "KM3"}
	doer := newFakeDoer()
	doer.set("GET", efetchBatchURL(accs...), 200, fastaMulti(accs...), nil)
	conn := newConnEfetch(t, doer)

	out, err := conn.ExportFASTA(context.Background(), recs(accs...), noGov())
	if err != nil {
		t.Fatalf("ExportFASTA: %v", err)
	}

	// The three accessions must travel in a single batched efetch request.
	efetchCalls := 0
	for _, req := range doer.requests {
		if strings.Contains(req, "efetch.fcgi") {
			efetchCalls++
		}
	}
	if efetchCalls != 1 {
		t.Fatalf("expected 1 batched efetch call, got %d", efetchCalls)
	}

	if hc := headerCount(out); hc != 3 {
		t.Fatalf("expected 3 FASTA records, got %d", hc)
	}

	// Each header is exactly ">ACCESSION" followed by a newline.
	for _, want := range []string{">KM1\n", ">KM2\n", ">KM3\n"} {
		if !bytes.Contains(out, []byte(want)) {
			t.Fatalf("canonical header %q missing from output:\n%s", want, out)
		}
	}
	if bytes.Contains(out, []byte("synthetic")) {
		t.Fatalf("export must drop upstream header free-text, got:\n%s", out)
	}

	// Header offsets must be strictly increasing in requested order.
	iKM1 := bytes.Index(out, []byte(">KM1"))
	iKM2 := bytes.Index(out, []byte(">KM2"))
	iKM3 := bytes.Index(out, []byte(">KM3"))
	if iKM1 < 0 || iKM1 >= iKM2 || iKM2 >= iKM3 {
		t.Fatalf("records out of canonical order: KM1@%d KM2@%d KM3@%d", iKM1, iKM2, iKM3)
	}
}

// TestExportFASTA_OutputDeterministicRegardlessOfReturnOrder runs the same
// three-accession export twice, once with the fake efetch payload in requested
// order and once with it scrambled, and requires the two outputs to be
// byte-identical.
func TestExportFASTA_OutputDeterministicRegardlessOfReturnOrder(t *testing.T) {
	// export performs one KM1/KM2/KM3 export against a fresh fake whose payload
	// lists the records in returnOrder; failFmt is the message used on error.
	export := func(failFmt string, returnOrder ...string) []byte {
		doer := newFakeDoer()
		doer.set("GET", efetchBatchURL("KM1", "KM2", "KM3"), 200, fastaMulti(returnOrder...), nil)
		out, err := newConnEfetch(t, doer).ExportFASTA(context.Background(), recs("KM1", "KM2", "KM3"), noGov())
		if err != nil {
			t.Fatalf(failFmt, err)
		}
		return out
	}

	inOrder := export("ExportFASTA 1: %v", "KM1", "KM2", "KM3")
	scrambled := export("ExportFASTA 2: %v", "KM3", "KM1", "KM2")

	if !bytes.Equal(inOrder, scrambled) {
		t.Fatalf("export must be canonical regardless of efetch return order:\n%s\n---\n%s", inOrder, scrambled)
	}
}

// TestExportFASTA_Batches exports 250 accessions with fake responses registered
// for a 200-accession request and a 50-accession request, then checks that exactly
// two efetch requests were made and that the output holds all 250 records.
func TestExportFASTA_Batches(t *testing.T) {
	accs := make([]string, 250)
	for i := range accs {
		accs[i] = fmt.Sprintf("A%03d", i)
	}
	// The fake only answers these two exact URLs, i.e. a 200 + 50 split.
	firstBatch, secondBatch := accs[:200], accs[200:]

	doer := newFakeDoer()
	doer.set("GET", efetchBatchURL(firstBatch...), 200, fastaMulti(firstBatch...), nil)
	doer.set("GET", efetchBatchURL(secondBatch...), 200, fastaMulti(secondBatch...), nil)
	conn := newConnEfetch(t, doer)

	out, err := conn.ExportFASTA(context.Background(), recs(accs...), noGov())
	if err != nil {
		t.Fatalf("ExportFASTA: %v", err)
	}

	efetchCalls := 0
	for _, req := range doer.requests {
		if strings.Contains(req, "efetch.fcgi") {
			efetchCalls++
		}
	}
	if efetchCalls != 2 {
		t.Fatalf("expected 2 batched efetch calls for 250 records, got %d", efetchCalls)
	}

	if hc := headerCount(out); hc != 250 {
		t.Fatalf("expected 250 FASTA records across batches, got %d", hc)
	}
}

// TestExportFASTA_IncompleteIsFailure requests three accessions while the fake
// efetch payload contains only two of them. The export must return an error whose
// text contains both "incomplete" and the missing accession "KM3".
func TestExportFASTA_IncompleteIsFailure(t *testing.T) {
	doer := newFakeDoer()
	doer.set("GET", efetchBatchURL("KM1", "KM2", "KM3"), 200, fastaMulti("KM1", "KM2"), nil)
	conn := newConnEfetch(t, doer)

	_, err := conn.ExportFASTA(context.Background(), recs("KM1", "KM2", "KM3"), noGov())
	if err == nil {
		t.Fatal("a short efetch payload (2 of 3 records) must fail the export")
	}

	// The message has to identify both the failure kind and the absent record.
	msg := err.Error()
	if !(strings.Contains(msg, "incomplete") && strings.Contains(msg, "KM3")) {
		t.Fatalf("error must name the incompleteness and the missing accession, got: %v", err)
	}
}

// TestExportFASTA_UnrequestedAccessionIsFailure requests KM1 and KM2 while the fake
// efetch payload carries KM1 and KM9. The export must return an error whose text
// contains "unrequested".
func TestExportFASTA_UnrequestedAccessionIsFailure(t *testing.T) {
	doer := newFakeDoer()
	doer.set("GET", efetchBatchURL("KM1", "KM2"), 200, fastaMulti("KM1", "KM9"), nil)
	conn := newConnEfetch(t, doer)

	_, err := conn.ExportFASTA(context.Background(), recs("KM1", "KM2"), noGov())
	if err != nil && strings.Contains(err.Error(), "unrequested") {
		return
	}
	t.Fatalf("an unrequested returned accession must fail, got: %v", err)
}

// TestExportFASTA_DuplicateReturnedAccessionIsFailure requests KM1 and KM2 while
// the fake efetch payload contains KM1 twice and no KM2, and requires the export
// to return an error.
//
// NOTE(review): because KM2 is also absent from the payload, this test passes
// whether the connector rejects the duplicate or merely the missing record; only
// a non-nil error is asserted. Confirm whether a payload with every requested
// record plus a repeat should be covered as well.
func TestExportFASTA_DuplicateReturnedAccessionIsFailure(t *testing.T) {
	doer := newFakeDoer()
	doer.set("GET", efetchBatchURL("KM1", "KM2"), 200, fastaMulti("KM1", "KM1"), nil)
	conn := newConnEfetch(t, doer)

	if _, err := conn.ExportFASTA(context.Background(), recs("KM1", "KM2"), noGov()); err == nil {
		t.Fatal("a duplicate returned accession must fail")
	}
}

// TestExportFASTA_DuplicateRequestedAccessionIsFailure passes a record set that
// names KM1 twice and requires an error whose text contains "duplicate". The fake
// doer has no responses registered for this test.
func TestExportFASTA_DuplicateRequestedAccessionIsFailure(t *testing.T) {
	conn := newConnEfetch(t, newFakeDoer())

	_, err := conn.ExportFASTA(context.Background(), recs("KM1", "KM1"), noGov())
	if err != nil && strings.Contains(err.Error(), "duplicate") {
		return
	}
	t.Fatalf("a duplicate requested accession must fail, got: %v", err)
}

// TestExportFASTA_VersionInsensitiveMatch requests the bare accession "KM1" while
// the fake efetch payload is headed ">KM1.2 synthetic". The export must succeed
// and its output must start with the requested form, ">KM1" plus a newline.
func TestExportFASTA_VersionInsensitiveMatch(t *testing.T) {
	versionedPayload := []byte(">KM1.2 synthetic\nACGTACGT\n")
	doer := newFakeDoer()
	doer.set("GET", efetchBatchURL("KM1"), 200, versionedPayload, nil)
	conn := newConnEfetch(t, doer)

	out, err := conn.ExportFASTA(context.Background(), recs("KM1"), noGov())
	if err != nil {
		t.Fatalf("version-insensitive match must succeed: %v", err)
	}

	wantPrefix := []byte(">KM1\n")
	if !bytes.HasPrefix(out, wantPrefix) {
		t.Fatalf("export header must be the requested accession, got:\n%s", out)
	}
}

// TestExportFASTA_NoEfetchEndpoint builds the connector with newConn rather than
// newConnEfetch (the base spec, with no efetch endpoint configured) and requires a
// one-record export to return an error.
func TestExportFASTA_NoEfetchEndpoint(t *testing.T) {
	conn := newConn(t, newFakeDoer())

	if _, err := conn.ExportFASTA(context.Background(), recs("KM1"), noGov()); err == nil {
		t.Fatal("ExportFASTA without an efetch endpoint must error")
	}
}

// TestExportFASTA_MissingAccessionIsFailure exports a single record that has a
// LogicalID but an empty SourceAccession and requires an error.
func TestExportFASTA_MissingAccessionIsFailure(t *testing.T) {
	conn := newConnEfetch(t, newFakeDoer())

	// SourceAccession is deliberately left at its zero value.
	noAccession := []contract.NormalizedRecord{
		{LogicalID: "ncbi-virus:x"},
	}

	if _, err := conn.ExportFASTA(context.Background(), noAccession, noGov()); err == nil {
		t.Fatal("a record without a source accession must fail the export")
	}
}

// TestExportFASTA_EmptySetNeedsNoEndpoint exports a nil record set through a
// connector built with newConn (no efetch endpoint configured) and requires
// success with zero output bytes: an empty set must not need upstream access.
func TestExportFASTA_EmptySetNeedsNoEndpoint(t *testing.T) {
	conn := newConn(t, newFakeDoer())

	out, err := conn.ExportFASTA(context.Background(), nil, noGov())
	if err != nil {
		t.Fatalf("empty set: %v", err)
	}
	if n := len(out); n != 0 {
		t.Fatalf("empty set must yield no bytes, got %d", n)
	}
}

// TestExportFASTA_CancelledContext calls ExportFASTA with a context that has
// already been cancelled and requires an error, even though the fake doer holds a
// valid response for the request.
func TestExportFASTA_CancelledContext(t *testing.T) {
	// Cancel up front so the context is done before the export starts.
	ctx, cancel := context.WithCancel(context.Background())
	cancel()

	doer := newFakeDoer()
	doer.set("GET", efetchBatchURL("KM1"), 200, fastaMulti("KM1"), nil)
	conn := newConnEfetch(t, doer)

	if _, err := conn.ExportFASTA(ctx, recs("KM1"), noGov()); err == nil {
		t.Fatal("a cancelled context must abort the export")
	}
}

// Compile-time proof the connector advertises the optional capability: this
// declaration stops compiling if *Connector no longer satisfies
// contract.SequenceExporter. It has no runtime effect.
var _ contract.SequenceExporter = (*Connector)(nil)
4 changes: 2 additions & 2 deletions connectors/ncbivirus/fake_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -144,9 +144,9 @@ const restBase = "https://api.ncbi.nlm.nih.gov"
func listURLForTaxon(t *testing.T, taxon string, pageSize int64) string {
t.Helper()
c := &Connector{rest: restBase, pageSize: pageSize}
u, err := c.buildListURL([]contract.Filter{{Field: "organism_taxon_id", Operator: "eq", Value: taxon}})
u, _, err := c.planSearch([]contract.Filter{{Field: "organism_taxon_id", Operator: "eq", Value: taxon}})
if err != nil {
t.Fatalf("buildListURL: %v", err)
t.Fatalf("planSearch: %v", err)
}
return u
}
Expand Down
Loading
Loading