From aeba9446af5e8c81fd135f9a8e086378b5c31ae0 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Sat, 20 Jun 2026 16:21:11 -0500 Subject: [PATCH 1/5] fix(node): dedupe mirror and canonical repo rows on list surfaces (#6) The non-paged GET /api/v1/repos legacy path and list_federated_repos returned both the short-owner peer mirror row and the canonical did:key row for the same logical repo, so profiles rendered the repo twice. Only the paged path collapsed them, in SQL. Add a dedupe_canonical_repos helper that groups by (normalized owner, name), keeps the canonical non-mirror row (tie broken by earliest created_at), and carries the group's latest updated_at onto the survivor, matching the paged SQL dedup. Apply it at both non-paged surfaces and cover it with unit tests. --- crates/gitlawb-node/src/api/repos.rs | 232 +++++++++++++++++++++++++-- 1 file changed, 222 insertions(+), 10 deletions(-) diff --git a/crates/gitlawb-node/src/api/repos.rs b/crates/gitlawb-node/src/api/repos.rs index 5f7599c..ad9923f 100644 --- a/crates/gitlawb-node/src/api/repos.rs +++ b/crates/gitlawb-node/src/api/repos.rs @@ -6,7 +6,8 @@ use bytes::Bytes; use std::sync::Arc; use crate::auth::{caller_authorized_to_push, AuthenticatedDid}; -use chrono::Utc; +use crate::db::RepoRecord; +use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; use uuid::Uuid; @@ -227,8 +228,8 @@ pub async fn list_repos( } let repos = state.db.list_all_repos_with_stars().await?; - let filtered: Vec<_> = repos - .iter() + let filtered: Vec<(RepoRecord, i64)> = repos + .into_iter() .filter(|(r, _)| { if let Some(owner) = &query.owner { crate::api::did_matches(owner.as_str(), &r.owner_did) @@ -237,10 +238,11 @@ pub async fn list_repos( } }) .collect(); - let total = filtered.len() as i64; - let resp: Vec<_> = filtered + let deduped = dedupe_canonical_repos(filtered); + let total = deduped.len() as i64; + let resp: Vec<_> = deduped .into_iter() - .map(|(r, stars)| to_response(r, &state, *stars)) + .map(|(r, stars)| to_response(&r, &state, stars)) .collect(); let mut response = Json(resp).into_response(); response.headers_mut().insert( @@ -1121,7 +1123,7 @@ pub async fn list_refs( pub async fn list_federated_repos( State(state): State, ) -> Result> { - let local_repos = state.db.list_all_repos_with_stars().await?; + let local_repos = dedupe_canonical_repos(state.db.list_all_repos_with_stars().await?); let local_node_url = state .config .public_url @@ -1379,11 +1381,90 @@ fn to_response(record: &crate::db::RepoRecord, state: &AppState, star_count: i64 } } +/// Description marker that `upsert_mirror_repo` stamps on short-owner peer mirror rows. +const MIRROR_DESCRIPTION: &str = "mirrored from peer"; + +/// Collapse short-owner mirror rows and canonical `did:key:` rows that point at the +/// same logical repo into a single entry, so profile/list surfaces don't render the +/// same repo twice (issue #6). +/// +/// Rows are grouped by `(normalized owner, name)`, where the normalized owner is the +/// key segment after the last `:` (so `did:key:z6Mk…` and the bare `z6Mk…` mirror row +/// collapse together). Within a group the canonical row wins: a non-mirror row is +/// preferred over a `"mirrored from peer"` row, ties broken by earliest `created_at`. +/// The survivor inherits the group's most recent `updated_at` so a gossip push that +/// only touches the mirror row still floats the repo to the top. +/// +/// This mirrors the SQL dedup already applied on the paged path in +/// `Db::list_all_repos_paged`; the two must stay in sync. +fn dedupe_canonical_repos(rows: Vec<(RepoRecord, i64)>) -> Vec<(RepoRecord, i64)> { + use std::collections::HashMap; + + fn is_mirror(r: &RepoRecord) -> bool { + r.description.as_deref() == Some(MIRROR_DESCRIPTION) + } + + // Strictly more canonical: non-mirror beats mirror; on a tie the earlier row wins. + fn outranks(candidate: &RepoRecord, current: &RepoRecord) -> bool { + match (is_mirror(candidate), is_mirror(current)) { + (false, true) => true, + (true, false) => false, + _ => candidate.created_at < current.created_at, + } + } + + // Preserve first-seen group order so output ordering stays deterministic. + let mut order: Vec<(String, String)> = Vec::new(); + let mut winners: HashMap<(String, String), (RepoRecord, i64)> = HashMap::new(); + let mut latest: HashMap<(String, String), DateTime> = HashMap::new(); + + for (rec, stars) in rows { + let short = rec + .owner_did + .rsplit(':') + .next() + .unwrap_or(&rec.owner_did) + .to_string(); + let key = (short, rec.name.clone()); + + latest + .entry(key.clone()) + .and_modify(|u| { + if rec.updated_at > *u { + *u = rec.updated_at; + } + }) + .or_insert(rec.updated_at); + + match winners.get(&key) { + None => { + order.push(key.clone()); + winners.insert(key, (rec, stars)); + } + Some((current, _)) if outranks(&rec, current) => { + winners.insert(key, (rec, stars)); + } + Some(_) => {} + } + } + + order + .into_iter() + .filter_map(|key| { + let max_updated = latest.get(&key).copied(); + winners.remove(&key).map(|(mut rec, stars)| { + if let Some(u) = max_updated { + rec.updated_at = u; + } + (rec, stars) + }) + }) + .collect() +} + #[cfg(test)] mod tests { - use super::owner_push_rejection; - use crate::auth::caller_authorized_to_push; - use crate::error::AppError; + use super::*; const OWNER_DID: &str = "did:key:z6MkpTHR8VNsBxYAAWHut2Geadd9jSwuBV8xRoAnwWsdvktH"; const OWNER_SHORT: &str = "z6MkpTHR8VNsBxYAAWHut2Geadd9jSwuBV8xRoAnwWsdvktH"; @@ -1457,4 +1538,135 @@ mod tests { assert!(caller_authorized_to_push(&repo, OWNER_SHORT)); assert!(!caller_authorized_to_push(&repo, STRANGER_DID)); } + + fn ts(s: &str) -> DateTime { + DateTime::parse_from_rfc3339(s).unwrap().with_timezone(&Utc) + } + + fn record(id: &str, owner_did: &str, name: &str, desc: &str, updated: &str) -> RepoRecord { + RepoRecord { + id: id.to_string(), + name: name.to_string(), + owner_did: owner_did.to_string(), + description: Some(desc.to_string()), + is_public: true, + default_branch: "main".to_string(), + created_at: ts("2026-01-01T00:00:00Z"), + updated_at: ts(updated), + disk_path: format!("/srv/{id}"), + forked_from: None, + machine_id: None, + } + } + + #[test] + fn canonical_row_wins_over_short_owner_mirror() { + // Order deliberately puts the mirror row first to prove ranking, not input order, decides. + let mirror = record( + "z6Mkwbud/nipmod", + "z6Mkwbud", + "nipmod", + "mirrored from peer", + "2026-02-01T00:00:00Z", + ); + let canonical = record( + "9d92186a", + "did:key:z6Mkwbud", + "nipmod", + "Decentralized npm for agents on Gitlawb", + "2026-01-15T00:00:00Z", + ); + + let out = dedupe_canonical_repos(vec![(mirror, 3), (canonical, 7)]); + + assert_eq!(out.len(), 1, "the two rows collapse into one logical repo"); + let (rec, stars) = &out[0]; + assert_eq!( + rec.owner_did, "did:key:z6Mkwbud", + "canonical did:key row wins" + ); + assert_eq!( + rec.description.as_deref(), + Some("Decentralized npm for agents on Gitlawb"), + "canonical description and metadata survive, not the mirror placeholder", + ); + assert_eq!(*stars, 7, "star count follows the canonical row"); + // Survivor inherits the group's most recent updated_at (here the mirror's). + assert_eq!(rec.updated_at, ts("2026-02-01T00:00:00Z")); + } + + #[test] + fn distinct_repos_are_preserved_in_order() { + let a = record( + "id-a", + "did:key:z6Aaa", + "alpha", + "first", + "2026-03-01T00:00:00Z", + ); + let b = record( + "id-b", + "did:key:z6Bbb", + "beta", + "second", + "2026-03-02T00:00:00Z", + ); + + let out = dedupe_canonical_repos(vec![(a, 1), (b, 2)]); + + assert_eq!(out.len(), 2); + assert_eq!(out[0].0.name, "alpha"); + assert_eq!(out[1].0.name, "beta"); + } + + #[test] + fn same_short_owner_different_repo_does_not_collapse() { + let one = record( + "id-1", + "z6Mkwbud", + "nipmod", + "mirrored from peer", + "2026-01-01T00:00:00Z", + ); + let two = record( + "id-2", + "did:key:z6Mkwbud", + "other", + "real", + "2026-01-01T00:00:00Z", + ); + + let out = dedupe_canonical_repos(vec![(one, 0), (two, 0)]); + + assert_eq!( + out.len(), + 2, + "different repo names stay separate under one owner" + ); + } + + #[test] + fn two_mirror_rows_break_tie_by_earliest_created_at() { + let mut older = record( + "id-old", + "z6X", + "r", + "mirrored from peer", + "2026-02-01T00:00:00Z", + ); + older.created_at = ts("2026-01-01T00:00:00Z"); + let mut newer = record( + "id-new", + "z6X", + "r", + "mirrored from peer", + "2026-03-01T00:00:00Z", + ); + newer.created_at = ts("2026-01-10T00:00:00Z"); + + let out = dedupe_canonical_repos(vec![(newer, 0), (older, 0)]); + + assert_eq!(out.len(), 1); + assert_eq!(out[0].0.id, "id-old", "earliest created_at wins the tie"); + } } From 694fab1feec0e398ffc0c9ba6f4f72b6c497ae7c Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 24 Jun 2026 09:27:02 -0500 Subject: [PATCH 2/5] fix(node): dedupe GraphQL + stats repo surfaces and key mirror detection on the repo id Addresses jatmn's review on #73 (dedup not applied on every reader path; mirror detection keyed on a user-settable string). - GraphQL repos and /api/v1/stats now collapse mirror+canonical rows, via new Db::list_all_repos_deduped and count_repos_deduped that share the DISTINCT ON CTE with list_all_repos_paged so the dedup rule cannot drift. - Mirror detection keys on the structural slash-form id (written only by upsert_mirror_repo) instead of the description == 'mirrored from peer' string, in both the SQL paths and dedupe_canonical_repos. - Deterministic survivor on a full created_at tie (id ASC) in both implementations. - Legacy REST list and federated keep their method-scoped did_matches owner filter in Rust; it does not compose with the method-blind SQL group key, so those paths intentionally stay on the Rust helper. - Adds sqlx and unit tests for the new surfaces, the structural marker, and the tiebreak. --- crates/gitlawb-node/src/api/repos.rs | 74 +++++- crates/gitlawb-node/src/db/mod.rs | 310 +++++++++++++++++++++-- crates/gitlawb-node/src/graphql/query.rs | 2 +- crates/gitlawb-node/src/server.rs | 7 +- crates/gitlawb-node/src/test_support.rs | 91 +++++++ 5 files changed, 447 insertions(+), 37 deletions(-) diff --git a/crates/gitlawb-node/src/api/repos.rs b/crates/gitlawb-node/src/api/repos.rs index ad9923f..4f434ef 100644 --- a/crates/gitlawb-node/src/api/repos.rs +++ b/crates/gitlawb-node/src/api/repos.rs @@ -1381,9 +1381,6 @@ fn to_response(record: &crate::db::RepoRecord, state: &AppState, star_count: i64 } } -/// Description marker that `upsert_mirror_repo` stamps on short-owner peer mirror rows. -const MIRROR_DESCRIPTION: &str = "mirrored from peer"; - /// Collapse short-owner mirror rows and canonical `did:key:` rows that point at the /// same logical repo into a single entry, so profile/list surfaces don't render the /// same repo twice (issue #6). @@ -1391,25 +1388,31 @@ const MIRROR_DESCRIPTION: &str = "mirrored from peer"; /// Rows are grouped by `(normalized owner, name)`, where the normalized owner is the /// key segment after the last `:` (so `did:key:z6Mk…` and the bare `z6Mk…` mirror row /// collapse together). Within a group the canonical row wins: a non-mirror row is -/// preferred over a `"mirrored from peer"` row, ties broken by earliest `created_at`. +/// preferred over a mirror, ties broken by earliest `created_at` then `id`. A mirror +/// row is identified structurally by its slash-form `id` (`{owner_short}/{name}`, +/// written only by `Db::upsert_mirror_repo`), not by its user-settable description. /// The survivor inherits the group's most recent `updated_at` so a gossip push that /// only touches the mirror row still floats the repo to the top. /// -/// This mirrors the SQL dedup already applied on the paged path in -/// `Db::list_all_repos_paged`; the two must stay in sync. +/// This mirrors the SQL dedup applied on the paged/unfiltered paths via +/// `Db::DEDUP_CTE`; the marker and the `id` tiebreak must stay in sync with it. fn dedupe_canonical_repos(rows: Vec<(RepoRecord, i64)>) -> Vec<(RepoRecord, i64)> { use std::collections::HashMap; + // Mirror rows carry a slash-form id, written only by Db::upsert_mirror_repo; + // canonical rows use a UUID id (no slash). Structural, not user-settable. fn is_mirror(r: &RepoRecord) -> bool { - r.description.as_deref() == Some(MIRROR_DESCRIPTION) + r.id.contains('/') } - // Strictly more canonical: non-mirror beats mirror; on a tie the earlier row wins. + // Strictly more canonical: non-mirror beats mirror; on equal mirror-status the + // earlier created_at wins, and a full tie falls back to id ASC so the survivor + // matches SQL's DISTINCT ON (… created_at ASC, id ASC). fn outranks(candidate: &RepoRecord, current: &RepoRecord) -> bool { match (is_mirror(candidate), is_mirror(current)) { (false, true) => true, (true, false) => false, - _ => candidate.created_at < current.created_at, + _ => (candidate.created_at, &candidate.id) < (current.created_at, ¤t.id), } } @@ -1621,8 +1624,9 @@ mod tests { #[test] fn same_short_owner_different_repo_does_not_collapse() { + // `one` is a real mirror row: slash-form id is the structural marker. let one = record( - "id-1", + "z6Mkwbud/nipmod", "z6Mkwbud", "nipmod", "mirrored from peer", @@ -1647,8 +1651,9 @@ mod tests { #[test] fn two_mirror_rows_break_tie_by_earliest_created_at() { + // Both are mirror rows (slash-form ids); earliest created_at wins. let mut older = record( - "id-old", + "z6X/r", "z6X", "r", "mirrored from peer", @@ -1656,7 +1661,7 @@ mod tests { ); older.created_at = ts("2026-01-01T00:00:00Z"); let mut newer = record( - "id-new", + "z6X/r-dup", "z6X", "r", "mirrored from peer", @@ -1667,6 +1672,49 @@ mod tests { let out = dedupe_canonical_repos(vec![(newer, 0), (older, 0)]); assert_eq!(out.len(), 1); - assert_eq!(out[0].0.id, "id-old", "earliest created_at wins the tie"); + assert_eq!(out[0].0.id, "z6X/r", "earliest created_at wins the tie"); + } + + #[test] + fn canonical_with_mirror_description_is_treated_as_canonical() { + // Marker robustness: the canonical row carries the literal mirror + // description (user-settable) but a UUID id; the true mirror has the + // slash id and was created earlier. The canonical must still win — dedup + // keys on the structural id, not the description. + let canonical = record( + "9d92186a-uuid", + "did:key:z6Mkwbud", + "nipmod", + "mirrored from peer", + "2026-02-01T00:00:00Z", + ); + let mirror = record( + "z6Mkwbud/nipmod", + "z6Mkwbud", + "nipmod", + "a normal description", + "2026-01-01T00:00:00Z", + ); + + let out = dedupe_canonical_repos(vec![(canonical, 5), (mirror, 1)]); + + assert_eq!(out.len(), 1); + assert_eq!( + out[0].0.id, "9d92186a-uuid", + "canonical wins by structural id marker despite the mirror description" + ); + } + + #[test] + fn full_tie_resolves_by_id_asc() { + // Two canonical rows in one group, identical created_at; only id differs. + // Survivor is id ASC, matching SQL's DISTINCT ON (… created_at ASC, id ASC). + let bbb = record("bbb", "did:key:z6Same", "repo", "real", "2026-01-01T00:00:00Z"); + let aaa = record("aaa", "z6Same", "repo", "real", "2026-01-01T00:00:00Z"); + + let out = dedupe_canonical_repos(vec![(bbb, 0), (aaa, 0)]); + + assert_eq!(out.len(), 1, "same group collapses"); + assert_eq!(out[0].0.id, "aaa", "id ASC breaks a full tie deterministically"); } } diff --git a/crates/gitlawb-node/src/db/mod.rs b/crates/gitlawb-node/src/db/mod.rs index cfb289e..377684d 100644 --- a/crates/gitlawb-node/src/db/mod.rs +++ b/crates/gitlawb-node/src/db/mod.rs @@ -889,14 +889,15 @@ impl Db { .collect()) } - pub async fn list_all_repos_paged( - &self, - owner_filter: Option<&str>, - limit: i64, - offset: i64, - ) -> Result<(Vec<(RepoRecord, i64)>, i64)> { - let rows = sqlx::query( - "WITH deduped AS ( + /// Shared dedup CTE: collapses the mirror row and the canonical row of one + /// logical repo into a single survivor. `$1` is an optional owner filter + /// (NULL = all rows). Grouping is `(split_part(owner_did,':',-1), name)`; the + /// canonical row wins (mirror rows carry a slash-form `id` written only by + /// `upsert_mirror_repo`), ties broken by earliest `created_at` then `id` so a + /// full tie still picks a deterministic survivor. `list_all_repos_paged`, + /// `list_all_repos_deduped`, and the marker logic in + /// `crate::api::repos::dedupe_canonical_repos` must stay in sync. + const DEDUP_CTE: &'static str = "WITH deduped AS ( SELECT DISTINCT ON (split_part(owner_did, ':', -1), name) id, name, owner_did, description, is_public, default_branch, created_at, @@ -910,9 +911,22 @@ impl Db { FROM repos WHERE ($1::text IS NULL OR owner_did = $1 OR owner_did LIKE '%:' || $1) ORDER BY split_part(owner_did, ':', -1), name, - CASE WHEN description = 'mirrored from peer' THEN 1 ELSE 0 END, - created_at ASC - ) + -- mirror rows carry a slash-form id (\"{owner_short}/{name}\"), + -- written only by upsert_mirror_repo; canonical ids are UUIDs. + -- Rank canonical (no slash) ahead of the mirror in each group, + -- keyed on the structural id, not the user-settable description. + CASE WHEN position('/' in id) > 0 THEN 1 ELSE 0 END, + created_at ASC, id ASC + )"; + + pub async fn list_all_repos_paged( + &self, + owner_filter: Option<&str>, + limit: i64, + offset: i64, + ) -> Result<(Vec<(RepoRecord, i64)>, i64)> { + let sql = format!( + "{} SELECT d.id, d.name, d.owner_did, d.description, d.is_public, d.default_branch, d.created_at, d.updated_at, d.disk_path, @@ -925,12 +939,14 @@ impl Db { ) s ON s.repo_id = d.id ORDER BY d.updated_at DESC LIMIT $2 OFFSET $3", - ) - .bind(owner_filter) - .bind(limit) - .bind(offset) - .fetch_all(&self.pool) - .await?; + Self::DEDUP_CTE + ); + let rows = sqlx::query(&sql) + .bind(owner_filter) + .bind(limit) + .bind(offset) + .fetch_all(&self.pool) + .await?; let total = rows .first() @@ -961,6 +977,41 @@ impl Db { Ok((out, total)) } + /// Deduped repo list (no stars, no paging) for the unfiltered read surfaces + /// (GraphQL `repos`). One logical repo per mirror+canonical group, ordered by + /// the group's most recent activity. Shares `DEDUP_CTE` with the paged path so + /// the dedup rule cannot drift; binds a NULL owner filter (all rows). + pub async fn list_all_repos_deduped(&self) -> Result> { + let sql = format!( + "{} + SELECT d.id, d.name, d.owner_did, d.description, d.is_public, + d.default_branch, d.created_at, d.updated_at, d.disk_path, + d.forked_from, d.machine_id + FROM deduped d + ORDER BY d.updated_at DESC", + Self::DEDUP_CTE + ); + let rows = sqlx::query(&sql) + .bind(None::<&str>) + .fetch_all(&self.pool) + .await?; + + Ok(rows.into_iter().map(row_to_repo).collect()) + } + + /// Count of distinct logical repos (mirror + canonical collapsed), for + /// `/api/v1/stats`. Uses the same `(split_part(owner_did,':',-1), name)` + /// grouping as `DEDUP_CTE`; the marker/tiebreak only decide which row would + /// survive, not the group count, so they are not needed here. + pub async fn count_repos_deduped(&self) -> Result { + let row = sqlx::query( + "SELECT COUNT(DISTINCT (split_part(owner_did, ':', -1), name)) AS cnt FROM repos", + ) + .fetch_one(&self.pool) + .await?; + Ok(row.get::("cnt")) + } + pub async fn touch_repo(&self, id: &str) -> Result<()> { sqlx::query("UPDATE repos SET updated_at = $1 WHERE id = $2") .bind(Utc::now().to_rfc3339()) @@ -3001,3 +3052,228 @@ mod agent_discovery_tests { assert!(filter_discoverable(vec![], Some("reputation:score")).is_empty()); } } + +#[cfg(test)] +mod dedup_db_tests { + use super::{Db, RepoRecord}; + use chrono::{DateTime, Utc}; + use sqlx::PgPool; + + async fn db(pool: PgPool) -> Db { + let db = Db::for_testing(pool); + db.run_migrations().await.unwrap(); + db + } + + fn ts(s: &str) -> DateTime { + DateTime::parse_from_rfc3339(s).unwrap().with_timezone(&Utc) + } + + /// Build a repo row with explicit timestamps. A slash in `id` marks a mirror + /// row (the format `upsert_mirror_repo` writes); a UUID-shaped `id` is canonical. + fn rec(id: &str, owner_did: &str, name: &str, desc: &str, created: &str, updated: &str) -> RepoRecord { + RepoRecord { + id: id.to_string(), + name: name.to_string(), + owner_did: owner_did.to_string(), + description: Some(desc.to_string()), + is_public: true, + default_branch: "main".to_string(), + created_at: ts(created), + updated_at: ts(updated), + disk_path: format!("/srv/{id}"), + forked_from: None, + machine_id: None, + } + } + + /// The canonical `did:key:` row and the short-owner mirror row of one logical + /// repo collapse to a single deduped entry: the canonical row wins and inherits + /// the group's most recent `updated_at`. + #[sqlx::test] + async fn deduped_collapses_mirror_and_canonical(pool: PgPool) { + let db = db(pool).await; + let canonical = rec( + "9d92186a-canonical", + "did:key:z6Mkwbud", + "nipmod", + "Decentralized npm for agents", + "2026-01-15T00:00:00Z", + "2026-01-15T00:00:00Z", + ); + // Mirror row in the shape upsert_mirror_repo writes: slash id, bare owner. + let mirror = rec( + "z6Mkwbud/nipmod", + "z6Mkwbud", + "nipmod", + "mirrored from peer", + "2026-02-01T00:00:00Z", + "2026-03-01T00:00:00Z", + ); + db.create_repo(&canonical).await.unwrap(); + db.create_repo(&mirror).await.unwrap(); + + let out = db.list_all_repos_deduped().await.unwrap(); + assert_eq!(out.len(), 1, "the pair collapses to one logical repo"); + assert_eq!(out[0].owner_did, "did:key:z6Mkwbud", "canonical row wins"); + assert_eq!( + out[0].updated_at, + ts("2026-03-01T00:00:00Z"), + "survivor inherits the group's MAX(updated_at)" + ); + } + + /// upsert_mirror_repo's own rows dedupe against a canonical twin (proves the + /// real mirror writer's row shape is classified correctly). + #[sqlx::test] + async fn deduped_collapses_real_upsert_mirror_row(pool: PgPool) { + let db = db(pool).await; + let canonical = rec( + "uuid-canonical", + "did:key:z6Mkwbud", + "nipmod", + "real", + "2026-01-15T00:00:00Z", + "2026-01-15T00:00:00Z", + ); + db.create_repo(&canonical).await.unwrap(); + db.upsert_mirror_repo("z6Mkwbud", "nipmod", "/srv/mirror", None) + .await + .unwrap(); + + let out = db.list_all_repos_deduped().await.unwrap(); + assert_eq!(out.len(), 1, "real mirror row collapses with its canonical twin"); + assert_eq!(out[0].owner_did, "did:key:z6Mkwbud", "canonical row wins"); + } + + /// Distinct repos are preserved and ordered by most recent activity. + #[sqlx::test] + async fn deduped_preserves_distinct_repos_ordered_by_updated(pool: PgPool) { + let db = db(pool).await; + db.create_repo(&rec( + "id-a", + "did:key:z6Aaa", + "alpha", + "first", + "2026-03-01T00:00:00Z", + "2026-03-01T00:00:00Z", + )) + .await + .unwrap(); + db.create_repo(&rec( + "id-b", + "did:key:z6Bbb", + "beta", + "second", + "2026-03-02T00:00:00Z", + "2026-03-02T00:00:00Z", + )) + .await + .unwrap(); + + let out = db.list_all_repos_deduped().await.unwrap(); + assert_eq!(out.len(), 2); + assert_eq!(out[0].name, "beta", "most recently updated first"); + assert_eq!(out[1].name, "alpha"); + } + + /// count_repos_deduped counts logical repos, not raw rows. + #[sqlx::test] + async fn count_repos_deduped_counts_logical_repos(pool: PgPool) { + let db = db(pool).await; + // One logical repo (canonical + mirror) plus one standalone. + db.create_repo(&rec( + "uuid-c", + "did:key:z6Mkwbud", + "nipmod", + "real", + "2026-01-15T00:00:00Z", + "2026-01-15T00:00:00Z", + )) + .await + .unwrap(); + db.upsert_mirror_repo("z6Mkwbud", "nipmod", "/srv/m", None) + .await + .unwrap(); + db.create_repo(&rec( + "uuid-d", + "did:key:z6Other", + "solo", + "real", + "2026-01-16T00:00:00Z", + "2026-01-16T00:00:00Z", + )) + .await + .unwrap(); + + assert_eq!(db.count_repos_deduped().await.unwrap(), 2); + } + + /// Full tie (same mirror-status and created_at within a group) resolves to a + /// deterministic survivor by `id ASC`, matching the Rust helper's tiebreak. + #[sqlx::test] + async fn deduped_full_tie_resolves_by_id_asc(pool: PgPool) { + let db = db(pool).await; + // Two canonical rows in the same (normalized owner, name) group, identical + // created_at; only the id differs. Different owner_did strings avoid any + // (owner, name) collision while still normalizing to the same group key. + db.create_repo(&rec( + "bbb", + "did:key:z6Same", + "repo", + "real", + "2026-01-01T00:00:00Z", + "2026-01-01T00:00:00Z", + )) + .await + .unwrap(); + db.create_repo(&rec( + "aaa", + "z6Same", + "repo", + "real", + "2026-01-01T00:00:00Z", + "2026-01-01T00:00:00Z", + )) + .await + .unwrap(); + + let out = db.list_all_repos_deduped().await.unwrap(); + assert_eq!(out.len(), 1, "same group collapses"); + assert_eq!(out[0].id, "aaa", "id ASC breaks a full tie deterministically"); + } + + /// Marker robustness: a canonical row whose `description` is literally + /// "mirrored from peer" but whose `id` is a UUID is still ranked canonical and + /// wins over a true slash-id mirror in its group — even though the mirror was + /// created earlier. Proves dedup keys on the structural id, not the description. + #[sqlx::test] + async fn deduped_marker_uses_id_not_description(pool: PgPool) { + let db = db(pool).await; + let canonical = rec( + "uuid-canonical", + "did:key:z6Mkwbud", + "nipmod", + "mirrored from peer", // user-settable description = the old marker string + "2026-01-15T00:00:00Z", + "2026-01-15T00:00:00Z", + ); + let mirror = rec( + "z6Mkwbud/nipmod", // slash id = the real structural marker + "z6Mkwbud", + "nipmod", + "a normal description, not the marker", + "2026-01-01T00:00:00Z", // earlier: would win on created_at if marker ignored + "2026-01-01T00:00:00Z", + ); + db.create_repo(&canonical).await.unwrap(); + db.create_repo(&mirror).await.unwrap(); + + let out = db.list_all_repos_deduped().await.unwrap(); + assert_eq!(out.len(), 1); + assert_eq!( + out[0].id, "uuid-canonical", + "canonical wins by structural id marker despite carrying the mirror description" + ); + } +} diff --git a/crates/gitlawb-node/src/graphql/query.rs b/crates/gitlawb-node/src/graphql/query.rs index 5f2bfb1..c7673f6 100644 --- a/crates/gitlawb-node/src/graphql/query.rs +++ b/crates/gitlawb-node/src/graphql/query.rs @@ -12,7 +12,7 @@ impl QueryRoot { async fn repos(&self, ctx: &Context<'_>) -> Result> { let db = ctx.data_unchecked::>(); let repos = db - .list_all_repos() + .list_all_repos_deduped() .await .map_err(|e| async_graphql::Error::new(e.to_string()))?; Ok(repos diff --git a/crates/gitlawb-node/src/server.rs b/crates/gitlawb-node/src/server.rs index c9df49d..122b71f 100644 --- a/crates/gitlawb-node/src/server.rs +++ b/crates/gitlawb-node/src/server.rs @@ -452,12 +452,7 @@ async fn node_info(State(state): State) -> Json { } async fn stats(State(state): State) -> Json { - let repos = state - .db - .list_all_repos() - .await - .map(|r| r.len() as i64) - .unwrap_or(0); + let repos = state.db.count_repos_deduped().await.unwrap_or(0); let agents = state.db.count_agents().await.unwrap_or(0); let pushes = state.db.count_pushes().await.unwrap_or(0); Json(json!({ diff --git a/crates/gitlawb-node/src/test_support.rs b/crates/gitlawb-node/src/test_support.rs index 540c6ce..6722f4d 100644 --- a/crates/gitlawb-node/src/test_support.rs +++ b/crates/gitlawb-node/src/test_support.rs @@ -510,4 +510,95 @@ mod tests { "register must reject a DID other than the signer" ); } + + /// Issue #6 / jatmn finding 1: the GraphQL `repos` query renders one logical + /// repo per mirror+canonical pair. Seeds a canonical `did:key:` repo plus its + /// short-owner mirror row and a distinct standalone repo, then asserts the + /// query returns two entries (not three) and the shared repo appears once as + /// the canonical owner. + #[sqlx::test] + async fn graphql_repos_is_deduped(pool: PgPool) { + let short = "zGRAPHQLDEDUPAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + let state = test_state(pool).await; + state + .db + .create_repo(&seed_repo(&format!("did:key:{short}"), "shared")) + .await + .expect("seed canonical"); + state + .db + .upsert_mirror_repo(short, "shared", "/tmp/mirror", None) + .await + .expect("seed mirror"); + state + .db + .create_repo(&seed_repo("did:key:zGQLOTHERBBBBBBBBBBBBBBBBBBBBBBBBBBBBB", "solo")) + .await + .expect("seed standalone"); + + let resp = state + .graphql_schema + .execute(async_graphql::Request::new("{ repos { name ownerDid } }")) + .await; + assert!(resp.errors.is_empty(), "graphql errors: {:?}", resp.errors); + let data = resp.data.into_json().expect("graphql data to json"); + let repos = data["repos"].as_array().expect("repos array"); + assert_eq!( + repos.len(), + 2, + "mirror+canonical collapse to one logical repo, plus the standalone" + ); + let shared: Vec<_> = repos.iter().filter(|r| r["name"] == "shared").collect(); + assert_eq!(shared.len(), 1, "the shared repo must not be double-listed"); + assert_eq!( + shared[0]["ownerDid"], + serde_json::json!(format!("did:key:{short}")), + "the canonical did:key row is the survivor" + ); + } + + /// Issue #6 / jatmn finding 2: `/api/v1/stats` counts logical repos, not raw + /// rows. With a mirror+canonical pair and a standalone repo present, the + /// `repos` count is 2. + #[sqlx::test] + async fn stats_repo_count_is_deduped(pool: PgPool) { + let short = "zSTATSDEDUPAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + let state = test_state(pool).await; + state + .db + .create_repo(&seed_repo(&format!("did:key:{short}"), "shared")) + .await + .expect("seed canonical"); + state + .db + .upsert_mirror_repo(short, "shared", "/tmp/mirror", None) + .await + .expect("seed mirror"); + state + .db + .create_repo(&seed_repo("did:key:zSTATSOTHERBBBBBBBBBBBBBBBBBBBBBBBBBB", "solo")) + .await + .expect("seed standalone"); + + let router = crate::server::build_router(state); + let resp = router + .oneshot( + Request::builder() + .method(Method::GET) + .uri("/api/v1/stats") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::OK); + let bytes = axum::body::to_bytes(resp.into_body(), usize::MAX) + .await + .unwrap(); + let json: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); + assert_eq!( + json["repos"], 2, + "stats must count logical repos (mirror+canonical collapsed)" + ); + } } From 8e8b74b205582f44da3d6d4b06c8dc8efcfca4f2 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 24 Jun 2026 09:36:16 -0500 Subject: [PATCH 3/5] test(node): cover mirror-only, empty, and count/list-consistency dedup cases Follow-ups from code review on the dedup change: - list_all_repos_deduped/count_repos_deduped: mirror-only group survives, empty table returns empty/0, and count_repos_deduped equals the deduped list length (guards the two independent SQL queries against grouping-key drift). - Document list_all_repos as the raw, non-deduped enumeration path (object lookup only), so it is not mistaken for a listing-surface method. --- crates/gitlawb-node/src/db/mod.rs | 65 +++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/crates/gitlawb-node/src/db/mod.rs b/crates/gitlawb-node/src/db/mod.rs index 377684d..09a3364 100644 --- a/crates/gitlawb-node/src/db/mod.rs +++ b/crates/gitlawb-node/src/db/mod.rs @@ -855,6 +855,11 @@ impl Db { Ok(rows.into_iter().map(row_to_repo).collect()) } + /// Raw list of every repo row — NOT deduped (a mirror row and its canonical + /// row both appear) and without stars. For enumeration callers that must see + /// every physical row (e.g. the IPFS object scan in `api::ipfs`), not for + /// listing surfaces. Listing surfaces dedupe via `list_all_repos_deduped` or + /// `list_all_repos_with_stars` + `dedupe_canonical_repos`. pub async fn list_all_repos(&self) -> Result> { let rows = sqlx::query( "SELECT id, name, owner_did, description, is_public, default_branch, @@ -3276,4 +3281,64 @@ mod dedup_db_tests { "canonical wins by structural id marker despite carrying the mirror description" ); } + + /// A mirror row with no canonical twin survives dedup as the sole entry for its + /// group (it is not dropped just because it is the mirror). + #[sqlx::test] + async fn deduped_mirror_only_group_survives(pool: PgPool) { + let db = db(pool).await; + db.upsert_mirror_repo("z6Lonely", "orphan", "/srv/m", None) + .await + .unwrap(); + + let out = db.list_all_repos_deduped().await.unwrap(); + assert_eq!(out.len(), 1, "a mirror-only group still yields one logical repo"); + assert_eq!(out[0].id, "z6Lonely/orphan"); + assert_eq!(db.count_repos_deduped().await.unwrap(), 1); + } + + /// Degenerate empty table: deduped list is empty and the count is 0, no error. + #[sqlx::test] + async fn deduped_empty_table(pool: PgPool) { + let db = db(pool).await; + assert!(db.list_all_repos_deduped().await.unwrap().is_empty()); + assert_eq!(db.count_repos_deduped().await.unwrap(), 0); + } + + /// count_repos_deduped and list_all_repos_deduped must agree: the count is the + /// number of logical repos the list returns. Guards the two independent SQL + /// queries against drifting on the grouping key. + #[sqlx::test] + async fn deduped_count_matches_list_len(pool: PgPool) { + let db = db(pool).await; + // Two logical repos: one canonical+mirror pair, one standalone canonical. + db.create_repo(&rec( + "uuid-1", + "did:key:z6Pair", + "shared", + "real", + "2026-01-01T00:00:00Z", + "2026-01-01T00:00:00Z", + )) + .await + .unwrap(); + db.upsert_mirror_repo("z6Pair", "shared", "/srv/m", None) + .await + .unwrap(); + db.create_repo(&rec( + "uuid-2", + "did:key:z6Solo", + "solo", + "real", + "2026-01-02T00:00:00Z", + "2026-01-02T00:00:00Z", + )) + .await + .unwrap(); + + let list_len = db.list_all_repos_deduped().await.unwrap().len() as i64; + let count = db.count_repos_deduped().await.unwrap(); + assert_eq!(list_len, 2); + assert_eq!(count, list_len, "count must equal the deduped list length"); + } } From 5721791a24ea54d1802e8ed6ee415579e20dc956 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 24 Jun 2026 13:29:20 -0500 Subject: [PATCH 4/5] style(node): apply cargo fmt to dedup test code --- crates/gitlawb-node/src/api/repos.rs | 13 ++++++++++-- crates/gitlawb-node/src/db/mod.rs | 28 ++++++++++++++++++++----- crates/gitlawb-node/src/test_support.rs | 10 +++++++-- 3 files changed, 42 insertions(+), 9 deletions(-) diff --git a/crates/gitlawb-node/src/api/repos.rs b/crates/gitlawb-node/src/api/repos.rs index 4f434ef..fa110ac 100644 --- a/crates/gitlawb-node/src/api/repos.rs +++ b/crates/gitlawb-node/src/api/repos.rs @@ -1709,12 +1709,21 @@ mod tests { fn full_tie_resolves_by_id_asc() { // Two canonical rows in one group, identical created_at; only id differs. // Survivor is id ASC, matching SQL's DISTINCT ON (… created_at ASC, id ASC). - let bbb = record("bbb", "did:key:z6Same", "repo", "real", "2026-01-01T00:00:00Z"); + let bbb = record( + "bbb", + "did:key:z6Same", + "repo", + "real", + "2026-01-01T00:00:00Z", + ); let aaa = record("aaa", "z6Same", "repo", "real", "2026-01-01T00:00:00Z"); let out = dedupe_canonical_repos(vec![(bbb, 0), (aaa, 0)]); assert_eq!(out.len(), 1, "same group collapses"); - assert_eq!(out[0].0.id, "aaa", "id ASC breaks a full tie deterministically"); + assert_eq!( + out[0].0.id, "aaa", + "id ASC breaks a full tie deterministically" + ); } } diff --git a/crates/gitlawb-node/src/db/mod.rs b/crates/gitlawb-node/src/db/mod.rs index 09a3364..0d151a2 100644 --- a/crates/gitlawb-node/src/db/mod.rs +++ b/crates/gitlawb-node/src/db/mod.rs @@ -3076,7 +3076,14 @@ mod dedup_db_tests { /// Build a repo row with explicit timestamps. A slash in `id` marks a mirror /// row (the format `upsert_mirror_repo` writes); a UUID-shaped `id` is canonical. - fn rec(id: &str, owner_did: &str, name: &str, desc: &str, created: &str, updated: &str) -> RepoRecord { + fn rec( + id: &str, + owner_did: &str, + name: &str, + desc: &str, + created: &str, + updated: &str, + ) -> RepoRecord { RepoRecord { id: id.to_string(), name: name.to_string(), @@ -3147,7 +3154,11 @@ mod dedup_db_tests { .unwrap(); let out = db.list_all_repos_deduped().await.unwrap(); - assert_eq!(out.len(), 1, "real mirror row collapses with its canonical twin"); + assert_eq!( + out.len(), + 1, + "real mirror row collapses with its canonical twin" + ); assert_eq!(out[0].owner_did, "did:key:z6Mkwbud", "canonical row wins"); } @@ -3245,7 +3256,10 @@ mod dedup_db_tests { let out = db.list_all_repos_deduped().await.unwrap(); assert_eq!(out.len(), 1, "same group collapses"); - assert_eq!(out[0].id, "aaa", "id ASC breaks a full tie deterministically"); + assert_eq!( + out[0].id, "aaa", + "id ASC breaks a full tie deterministically" + ); } /// Marker robustness: a canonical row whose `description` is literally @@ -3264,7 +3278,7 @@ mod dedup_db_tests { "2026-01-15T00:00:00Z", ); let mirror = rec( - "z6Mkwbud/nipmod", // slash id = the real structural marker + "z6Mkwbud/nipmod", // slash id = the real structural marker "z6Mkwbud", "nipmod", "a normal description, not the marker", @@ -3292,7 +3306,11 @@ mod dedup_db_tests { .unwrap(); let out = db.list_all_repos_deduped().await.unwrap(); - assert_eq!(out.len(), 1, "a mirror-only group still yields one logical repo"); + assert_eq!( + out.len(), + 1, + "a mirror-only group still yields one logical repo" + ); assert_eq!(out[0].id, "z6Lonely/orphan"); assert_eq!(db.count_repos_deduped().await.unwrap(), 1); } diff --git a/crates/gitlawb-node/src/test_support.rs b/crates/gitlawb-node/src/test_support.rs index 6722f4d..154db14 100644 --- a/crates/gitlawb-node/src/test_support.rs +++ b/crates/gitlawb-node/src/test_support.rs @@ -532,7 +532,10 @@ mod tests { .expect("seed mirror"); state .db - .create_repo(&seed_repo("did:key:zGQLOTHERBBBBBBBBBBBBBBBBBBBBBBBBBBBBB", "solo")) + .create_repo(&seed_repo( + "did:key:zGQLOTHERBBBBBBBBBBBBBBBBBBBBBBBBBBBBB", + "solo", + )) .await .expect("seed standalone"); @@ -576,7 +579,10 @@ mod tests { .expect("seed mirror"); state .db - .create_repo(&seed_repo("did:key:zSTATSOTHERBBBBBBBBBBBBBBBBBBBBBBBBBB", "solo")) + .create_repo(&seed_repo( + "did:key:zSTATSOTHERBBBBBBBBBBBBBBBBBBBBBBBBBB", + "solo", + )) .await .expect("seed standalone"); From 9b9b1202eb67fe6ea8b01566cf26b45a3b68a8a1 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 24 Jun 2026 16:35:06 -0500 Subject: [PATCH 5/5] fix(node): make the repo dedup key did:key-aware so distinct DID methods don't collapse The dedup grouping key took the last ':' segment of owner_did (split_part / rsplit), so two repos owned by did:key:X and did:gitlawb:X with the same name collapsed into one logical repo on the list, paged, count, stats, and GraphQL surfaces. That is the exact cross-method collision the codebase already guards against in did_matches. Replace it with a did:key-aware key that strips a did:key: prefix only when the remainder is a bare id (no ':'), otherwise keeps the full DID, reproducing did_matches/key_id as an equivalence relation: did:key:X and a bare mirror X still collapse, while distinct methods never merge. Applied byte-identically across DEDUP_CTE (DISTINCT ON / PARTITION BY / ORDER BY), count_repos_deduped, the empty-page count fallback, and the in-memory dedupe_canonical_repos, so the SQL and Rust paths agree. The backing index lived in the already-released migration v1, so v1 is left untouched and a new migration v7 drops idx_repos_owner_short_name and builds idx_repos_owner_key_name on the matching expression; the CASE must stay byte-identical to the queries or Postgres stops using it. Tests cover both the in-memory and SQL paths: distinct methods stay separate, bare-id and did:key forms collapse, and the residual-colon guard keeps a malformed did:key:did:gitlawb:X distinct from the bare method DID. 216 pass. --- crates/gitlawb-node/src/api/repos.rs | 146 ++++++++++++++++++++++- crates/gitlawb-node/src/db/mod.rs | 170 +++++++++++++++++++++++++-- 2 files changed, 302 insertions(+), 14 deletions(-) diff --git a/crates/gitlawb-node/src/api/repos.rs b/crates/gitlawb-node/src/api/repos.rs index fa110ac..881201d 100644 --- a/crates/gitlawb-node/src/api/repos.rs +++ b/crates/gitlawb-node/src/api/repos.rs @@ -1422,13 +1422,20 @@ fn dedupe_canonical_repos(rows: Vec<(RepoRecord, i64)>) -> Vec<(RepoRecord, i64) let mut latest: HashMap<(String, String), DateTime> = HashMap::new(); for (rec, stars) in rows { - let short = rec + // did:key-aware owner key: strip a `did:key:` prefix so the bare mirror id + // and its `did:key:` canonical collapse, but leave any other DID method + // whole so `did:key:X` and `did:gitlawb:X` never merge. The `!contains(':')` + // guard mirrors did_matches' `key_id` check: a stripped value that still + // holds a `:` is a non-key full DID (e.g. malformed `did:key:did:gitlawb:X`) + // and must keep its full form, not collapse onto the bare method DID. Stays + // byte-equivalent to the SQL CASE in Db::DEDUP_CTE / count_repos_deduped. + let owner_key = rec .owner_did - .rsplit(':') - .next() + .strip_prefix("did:key:") + .filter(|rest| !rest.contains(':')) .unwrap_or(&rec.owner_did) .to_string(); - let key = (short, rec.name.clone()); + let key = (owner_key, rec.name.clone()); latest .entry(key.clone()) @@ -1649,6 +1656,137 @@ mod tests { ); } + #[test] + fn distinct_did_methods_sharing_a_base58_id_do_not_collapse() { + // `did:key` and `did:gitlawb` share the base58 id space, so a trailing + // segment key would treat these as one repo. The did:key-aware key keeps + // them apart, matching crate::api::did_matches. + let keyed = record( + "id-keyed", + "did:key:z6Mkwbud", + "nipmod", + "owned via did:key", + "2026-01-01T00:00:00Z", + ); + let gitlawb = record( + "id-gitlawb", + "did:gitlawb:z6Mkwbud", + "nipmod", + "owned via did:gitlawb", + "2026-01-01T00:00:00Z", + ); + + let out = dedupe_canonical_repos(vec![(keyed, 1), (gitlawb, 2)]); + + assert_eq!( + out.len(), + 2, + "same name and base58 id under different DID methods are distinct repos" + ); + } + + #[test] + fn bare_id_and_did_key_form_of_same_owner_collapse() { + // A bare mirror id and its did:key canonical are the same owner and must + // collapse, the mirror-vs-canonical case stated in owner-key terms. + let mirror = record( + "z6Mkwbud/nipmod", + "z6Mkwbud", + "nipmod", + "mirrored from peer", + "2026-02-01T00:00:00Z", + ); + let canonical = record( + "canon-id", + "did:key:z6Mkwbud", + "nipmod", + "real", + "2026-01-15T00:00:00Z", + ); + + let out = dedupe_canonical_repos(vec![(mirror, 0), (canonical, 5)]); + + assert_eq!(out.len(), 1, "bare id and its did:key form are one owner"); + assert_eq!(out[0].0.owner_did, "did:key:z6Mkwbud", "canonical row wins"); + } + + #[test] + fn did_key_wrapping_a_full_did_does_not_collapse_onto_the_bare_method_did() { + // Residual-colon guard, mirroring did_matches' `!key_id().contains(':')`: + // a malformed `did:key:did:gitlawb:X` strips to `did:gitlawb:X`, which still + // holds a `:`, so it must keep its full form and NOT collapse with a real + // `did:gitlawb:X` repo of the same name. + let wrapped = record( + "id-wrapped", + "did:key:did:gitlawb:z6Mkwbud", + "nipmod", + "malformed nested DID", + "2026-01-01T00:00:00Z", + ); + let method = record( + "id-method", + "did:gitlawb:z6Mkwbud", + "nipmod", + "real method DID", + "2026-01-02T00:00:00Z", + ); + + let out = dedupe_canonical_repos(vec![(wrapped, 1), (method, 2)]); + + assert_eq!( + out.len(), + 2, + "a did:key-wrapped full DID stays distinct from the bare method DID" + ); + // Assert identity, not just count: each owner survives unmerged, so a + // regression that kept two rows but mis-keyed the survivor is also caught. + let mut owners: Vec<&str> = out.iter().map(|(r, _)| r.owner_did.as_str()).collect(); + owners.sort_unstable(); + assert_eq!( + owners, + vec!["did:gitlawb:z6Mkwbud", "did:key:did:gitlawb:z6Mkwbud"], + "both owner DIDs survive in their full form" + ); + } + + #[test] + fn empty_did_key_residual_keys_to_empty_string_consistently() { + // Degenerate boundary the reviewers flagged: `did:key:` with no id strips to + // an empty residual (no colon), so the key is "". A bare empty owner also + // keys to "", so the two collapse — proving the Rust strip path maps the + // empty residual exactly like the SQL `substr(owner_did, 9)` / `position` + // path (mirrored in the db-level test). A real did:key id keys separately. + let empty_did_key = record( + "id-empty-didkey", + "did:key:", + "nipmod", + "empty residual", + "2026-01-01T00:00:00Z", + ); + let empty_bare = record( + "id-empty-bare", + "", + "nipmod", + "empty owner", + "2026-01-02T00:00:00Z", + ); + let real = record( + "id-real", + "did:key:z6Mkwbud", + "nipmod", + "real id", + "2026-01-03T00:00:00Z", + ); + + let out = dedupe_canonical_repos(vec![(empty_did_key, 0), (empty_bare, 0), (real, 0)]); + + assert_eq!( + out.len(), + 2, + "`did:key:` and the empty owner share the empty key and collapse; the real id stays separate" + ); + } + #[test] fn two_mirror_rows_break_tie_by_earliest_created_at() { // Both are mirror rows (slash-form ids); earliest created_at wins. diff --git a/crates/gitlawb-node/src/db/mod.rs b/crates/gitlawb-node/src/db/mod.rs index 0d151a2..36e44ff 100644 --- a/crates/gitlawb-node/src/db/mod.rs +++ b/crates/gitlawb-node/src/db/mod.rs @@ -767,6 +767,20 @@ const MIGRATIONS: &[Migration] = &[ "ALTER TABLE agents ADD COLUMN IF NOT EXISTS deactivated_at TEXT", ], }, + Migration { + version: 7, + name: "repo_owner_dedup_key_didkey_aware", + stmts: &[ + // The dedup grouping key moved from the last `:` segment to a + // did:key-aware key (strip `did:key:`, leave any other DID method + // whole) so `did:key:X` and `did:gitlawb:X` no longer collapse. Swap + // the index that backs it: drop the last-segment one from v1 and build + // the matching expression index. The CASE must stay byte-for-byte in + // sync with DEDUP_CTE / count_repos_deduped or Postgres won't use it. + "DROP INDEX IF EXISTS idx_repos_owner_short_name", + "CREATE INDEX IF NOT EXISTS idx_repos_owner_key_name ON repos ((CASE WHEN owner_did LIKE 'did:key:%' AND position(':' in substr(owner_did, 9)) = 0 THEN substr(owner_did, 9) ELSE owner_did END), name)", + ], + }, ]; // ── Repos ───────────────────────────────────────────────────────────────────── @@ -896,26 +910,32 @@ impl Db { /// Shared dedup CTE: collapses the mirror row and the canonical row of one /// logical repo into a single survivor. `$1` is an optional owner filter - /// (NULL = all rows). Grouping is `(split_part(owner_did,':',-1), name)`; the - /// canonical row wins (mirror rows carry a slash-form `id` written only by + /// (NULL = all rows). Grouping collapses on a did:key-aware owner key: strip a + /// `did:key:` prefix (8 chars, so `substr(owner_did, 9)`) only when the + /// remainder is a bare id with no `:`, otherwise keep the full DID. That is the + /// exact normalization in `crate::api::did_matches`, so `did:key:X` and a bare + /// `X` collapse while distinct DID methods (`did:gitlawb:X`) never merge. The + /// CASE is repeated verbatim in `count_repos_deduped` and the v7 index and must + /// stay byte-identical or Postgres stops using the index. + /// The canonical row wins (mirror rows carry a slash-form `id` written only by /// `upsert_mirror_repo`), ties broken by earliest `created_at` then `id` so a /// full tie still picks a deterministic survivor. `list_all_repos_paged`, /// `list_all_repos_deduped`, and the marker logic in /// `crate::api::repos::dedupe_canonical_repos` must stay in sync. const DEDUP_CTE: &'static str = "WITH deduped AS ( - SELECT DISTINCT ON (split_part(owner_did, ':', -1), name) + SELECT DISTINCT ON (CASE WHEN owner_did LIKE 'did:key:%' AND position(':' in substr(owner_did, 9)) = 0 THEN substr(owner_did, 9) ELSE owner_did END, name) id, name, owner_did, description, is_public, default_branch, created_at, -- group MAX, not the canonical row's own value: pushes that -- arrive via gossip touch only the mirror row, so the -- canonical updated_at goes stale MAX(updated_at) OVER ( - PARTITION BY split_part(owner_did, ':', -1), name + PARTITION BY CASE WHEN owner_did LIKE 'did:key:%' AND position(':' in substr(owner_did, 9)) = 0 THEN substr(owner_did, 9) ELSE owner_did END, name ) AS updated_at, disk_path, forked_from, machine_id FROM repos WHERE ($1::text IS NULL OR owner_did = $1 OR owner_did LIKE '%:' || $1) - ORDER BY split_part(owner_did, ':', -1), name, + ORDER BY CASE WHEN owner_did LIKE 'did:key:%' AND position(':' in substr(owner_did, 9)) = 0 THEN substr(owner_did, 9) ELSE owner_did END, name, -- mirror rows carry a slash-form id (\"{owner_short}/{name}\"), -- written only by upsert_mirror_repo; canonical ids are UUIDs. -- Rank canonical (no slash) ahead of the mirror in each group, @@ -967,7 +987,7 @@ impl Db { let total = if out.is_empty() { let row = sqlx::query( - "SELECT COUNT(DISTINCT (split_part(owner_did, ':', -1), name)) AS cnt + "SELECT COUNT(DISTINCT (CASE WHEN owner_did LIKE 'did:key:%' AND position(':' in substr(owner_did, 9)) = 0 THEN substr(owner_did, 9) ELSE owner_did END, name)) AS cnt FROM repos WHERE ($1::text IS NULL OR owner_did = $1 OR owner_did LIKE '%:' || $1)", ) @@ -1005,12 +1025,12 @@ impl Db { } /// Count of distinct logical repos (mirror + canonical collapsed), for - /// `/api/v1/stats`. Uses the same `(split_part(owner_did,':',-1), name)` - /// grouping as `DEDUP_CTE`; the marker/tiebreak only decide which row would - /// survive, not the group count, so they are not needed here. + /// `/api/v1/stats`. Uses the same did:key-aware owner-key grouping as + /// `DEDUP_CTE` (the CASE must stay byte-identical); the marker/tiebreak only + /// decide which row would survive, not the group count, so they are not needed here. pub async fn count_repos_deduped(&self) -> Result { let row = sqlx::query( - "SELECT COUNT(DISTINCT (split_part(owner_did, ':', -1), name)) AS cnt FROM repos", + "SELECT COUNT(DISTINCT (CASE WHEN owner_did LIKE 'did:key:%' AND position(':' in substr(owner_did, 9)) = 0 THEN substr(owner_did, 9) ELSE owner_did END, name)) AS cnt FROM repos", ) .fetch_one(&self.pool) .await?; @@ -3162,6 +3182,136 @@ mod dedup_db_tests { assert_eq!(out[0].owner_did, "did:key:z6Mkwbud", "canonical row wins"); } + /// Same name and base58 id but different DID methods (`did:key` vs + /// `did:gitlawb`) must NOT collapse: the grouping key strips only `did:key:` + /// and leaves other methods whole, matching crate::api::did_matches. Both the + /// list (DEDUP_CTE) and count (count_repos_deduped) paths must agree. + #[sqlx::test] + async fn deduped_keeps_distinct_did_methods_apart(pool: PgPool) { + let db = db(pool).await; + db.create_repo(&rec( + "id-keyed", + "did:key:z6Mkwbud", + "nipmod", + "via did:key", + "2026-01-01T00:00:00Z", + "2026-01-01T00:00:00Z", + )) + .await + .unwrap(); + db.create_repo(&rec( + "id-gitlawb", + "did:gitlawb:z6Mkwbud", + "nipmod", + "via did:gitlawb", + "2026-01-02T00:00:00Z", + "2026-01-02T00:00:00Z", + )) + .await + .unwrap(); + + let out = db.list_all_repos_deduped().await.unwrap(); + assert_eq!(out.len(), 2, "distinct DID methods are distinct owners"); + assert_eq!( + db.count_repos_deduped().await.unwrap(), + 2, + "count path agrees with the list path", + ); + } + + /// SQL residual-colon guard: a malformed `did:key:did:gitlawb:X` strips to a + /// value that still holds a `:`, so the CASE keeps it whole and it does NOT + /// collapse with a real `did:gitlawb:X`. Proves the SQL key matches the Rust + /// `strip_prefix(...).filter(|r| !r.contains(':'))` and did_matches. + #[sqlx::test] + async fn deduped_did_key_wrapping_a_full_did_stays_distinct(pool: PgPool) { + let db = db(pool).await; + db.create_repo(&rec( + "id-wrapped", + "did:key:did:gitlawb:z6Mkwbud", + "nipmod", + "malformed nested DID", + "2026-01-01T00:00:00Z", + "2026-01-01T00:00:00Z", + )) + .await + .unwrap(); + db.create_repo(&rec( + "id-method", + "did:gitlawb:z6Mkwbud", + "nipmod", + "real method DID", + "2026-01-02T00:00:00Z", + "2026-01-02T00:00:00Z", + )) + .await + .unwrap(); + + let out = db.list_all_repos_deduped().await.unwrap(); + assert_eq!( + out.len(), + 2, + "wrapped full DID stays distinct from the method DID" + ); + assert_eq!( + db.count_repos_deduped().await.unwrap(), + 2, + "count path agrees with the list path", + ); + } + + /// Empty-residual boundary: `did:key:` matches `LIKE 'did:key:%'`, + /// `substr(owner_did, 9)` is '', and `position(':' in '')` is 0, so the CASE + /// keys it to '' just like a bare empty owner, while a real `did:key:z…` keys + /// separately. Pins that the SQL empty-residual handling matches the Rust + /// `strip_prefix(...).filter(...)` path (mirrored in the api-level test). + #[sqlx::test] + async fn deduped_empty_did_key_residual_keys_to_empty_string(pool: PgPool) { + let db = db(pool).await; + db.create_repo(&rec( + "id-empty-didkey", + "did:key:", + "nipmod", + "empty residual", + "2026-01-01T00:00:00Z", + "2026-01-01T00:00:00Z", + )) + .await + .unwrap(); + db.create_repo(&rec( + "id-empty-bare", + "", + "nipmod", + "empty owner", + "2026-01-02T00:00:00Z", + "2026-01-02T00:00:00Z", + )) + .await + .unwrap(); + db.create_repo(&rec( + "id-real", + "did:key:z6Mkwbud", + "nipmod", + "real id", + "2026-01-03T00:00:00Z", + "2026-01-03T00:00:00Z", + )) + .await + .unwrap(); + + let out = db.list_all_repos_deduped().await.unwrap(); + assert_eq!( + out.len(), + 2, + "`did:key:` and the empty owner collapse on the empty key; the real id is separate" + ); + assert_eq!( + db.count_repos_deduped().await.unwrap(), + 2, + "count path agrees with the list path", + ); + } + /// Distinct repos are preserved and ordered by most recent activity. #[sqlx::test] async fn deduped_preserves_distinct_repos_ordered_by_updated(pool: PgPool) {