diff --git a/site/.gitignore b/site/.gitignore index 2dec903..6fd9461 100644 --- a/site/.gitignore +++ b/site/.gitignore @@ -39,4 +39,5 @@ src/content/data/talks.json src/content/data/primers.json src/content/data/graph.json src/content/data/metrics.json +src/content/data/recent.json src/content/data/taxonomy.json diff --git a/site/scripts/parser/generate-data.ts b/site/scripts/parser/generate-data.ts index 7b1ee67..a60bb1a 100644 --- a/site/scripts/parser/generate-data.ts +++ b/site/scripts/parser/generate-data.ts @@ -25,6 +25,7 @@ import { buildPrimersModel } from './primers.js'; import { buildGraphModel } from './graph.js'; import { CitationCacheSchema, type CitationCache } from './citations.js'; import { buildMetricsModel } from './metrics.js'; +import { buildRecentModel } from './recent.js'; import { writeLlmsFull } from './llms-full.js'; import { PapersDataSchema, @@ -34,6 +35,7 @@ import { PrimersSchema, GraphSchema, MetricsSchema, + RecentSchema, TaxonomyDataSchema, type Counts, } from './types.js'; @@ -98,6 +100,7 @@ export function generateData( primers: number; graphNodes: number; graphEdges: number; + recentEntries: number; taxonomyDefs: number; } { // Build and validate the papers model. @@ -117,6 +120,10 @@ export function generateData( const graph = buildGraphModel(model, loadCitationCache()); const metrics = buildMetricsModel(model); + // Home page "Recently added" list, derived from git history. Empty (not an + // error) when history is unavailable — see buildRecentModel. + const recent = buildRecentModel(); + // Taxonomy.md row/column definitions for the explorer's hover/click popups. 
const taxonomy = buildTaxonomyModel(); @@ -128,6 +135,7 @@ export function generateData( PrimersSchema.parse(primers); GraphSchema.parse(graph); MetricsSchema.parse(metrics); + RecentSchema.parse(recent); TaxonomyDataSchema.parse(taxonomy); // No-drift guard: the homepage counts and the catalog/talks/graph/metrics @@ -217,6 +225,13 @@ export function generateData( 'utf-8', ); + // Write recent.json. + writeFileSync( + join(outDir, 'recent.json'), + JSON.stringify(recent, null, 2) + '\n', + 'utf-8', + ); + // Write taxonomy.json. writeFileSync( join(outDir, 'taxonomy.json'), @@ -232,6 +247,7 @@ export function generateData( primers: primers.primers.length, graphNodes: graph.nodes.length, graphEdges: graph.edges.length, + recentEntries: recent.length, taxonomyDefs: Object.keys(taxonomy.definitions).length, }; } @@ -263,6 +279,7 @@ if (isMain) { primers, graphNodes, graphEdges, + recentEntries, taxonomyDefs, } = generateData(); // Full-text agent index (public/llms-full.txt) — generated alongside the @@ -276,6 +293,7 @@ if (isMain) { `catalog.json (${catalogEntries} entries), talks.json (${talks} talks), ` + `primers.json (${primers} primers), ` + `graph.json (${graphNodes} nodes / ${graphEdges} edges), metrics.json, ` + + `recent.json (${recentEntries} entries), ` + `taxonomy.json (${taxonomyDefs} definitions), ` + `and llms-full.txt (${llmsBytes} bytes)`, ); diff --git a/site/scripts/parser/recent.test.ts b/site/scripts/parser/recent.test.ts new file mode 100644 index 0000000..c998ee9 --- /dev/null +++ b/site/scripts/parser/recent.test.ts @@ -0,0 +1,63 @@ +/** + * recent.test.ts — tests for the home page "Recently added" builder. + * + * The list is git-derived, so it changes every commit — assertions are + * structure-only (shape, bounds, ordering, schema), never an exact tally. 
+ */ + +import { describe, it, expect, beforeAll } from 'vitest'; + +import { buildRecentModel } from './recent.js'; +import { RecentSchema, type Recent } from './types.js'; + +describe('buildRecentModel — real repo', () => { + let recent: Recent; + + beforeAll(() => { + recent = buildRecentModel(); + }); + + it('returns at most `limit` entries (default 5)', () => { + expect(Array.isArray(recent)).toBe(true); + expect(recent.length).toBeLessThanOrEqual(5); + }); + + it('finds at least one addition in-repo', () => { + // The repo has full history here, so the addition filter must hit something. + expect(recent.length).toBeGreaterThan(0); + }); + + it('every entry has a valid date / kind / area / non-empty title', () => { + for (const e of recent) { + expect(e.date).toMatch(/^\d{4}-\d{2}-\d{2}$/); + expect(['Paper', 'Software', 'Dataset', 'Database', 'Resource']).toContain(e.kind); + expect(['media', 'cell', 'bioprocess', 'scaffolding', 'sensory', 'tooling', 'eval']).toContain(e.area); + expect(e.title.length).toBeGreaterThan(0); + } + }); + + it('is ordered newest-first (dates non-increasing)', () => { + for (let i = 1; i < recent.length; i++) { + expect(recent[i - 1].date >= recent[i].date).toBe(true); + } + }); + + it('has no duplicate titles', () => { + const titles = recent.map((e) => e.title); + expect(new Set(titles).size).toBe(titles.length); + }); + + it('honours a smaller limit', () => { + expect(buildRecentModel(undefined, 2).length).toBeLessThanOrEqual(2); + }); + + it('degrades to an empty (valid) list when git history is unavailable', () => { + // A non-repo path makes `git log` fail; the builder must swallow it. 
+ const empty = buildRecentModel('/nonexistent-not-a-git-repo'); + expect(empty).toEqual([]); + }); + + it('passes RecentSchema', () => { + expect(RecentSchema.safeParse(recent).success).toBe(true); + }); +}); diff --git a/site/scripts/parser/recent.ts b/site/scripts/parser/recent.ts new file mode 100644 index 0000000..f44afdd --- /dev/null +++ b/site/scripts/parser/recent.ts @@ -0,0 +1,164 @@ +/** + * recent.ts — builds recent.json for the home page "Recently added" panel. + * + * Derived at build time from `git log` over the canonical content files, so the + * list refreshes on every `pnpm parse`/build/deploy and can never go stale. The + * git call mirrors the momentum snapshot in metrics.ts: a thin execFileSync + * wrapper, the whole thing guarded so a shallow clone / tarball build degrades to + * an empty list (the component then renders just its "View the full changelog" + * link) rather than failing the build. + * + * Selection: `--no-merges` so PR-merged additions surface (this repo merges via + * merge commits, which `--first-parent` would hide); restricted to commits whose + * Conventional-Commit subject starts with an addition verb, so reorg/relabel + * commits stay out of a list titled "Recently *added*". + * + * Reads git history; never mutates the canonical files. + */ + +import { execFileSync } from 'node:child_process'; +import { fileURLToPath } from 'node:url'; + +import { RecentSchema, type Recent, type RecentEntry } from './types.js'; + +/** parser/ → scripts/ → site/ → repo root. */ +const DEFAULT_REPO_ROOT: string = fileURLToPath(new URL('../../../', import.meta.url)); + +/** Canonical content files/dirs whose additions the home page advertises. */ +const CONTENT_PATHS: readonly string[] = [ + 'Papers.md', + 'Software.md', + 'Databases.md', + 'Datasets', + 'OtherResources.md', +]; + +/** First Conventional-Commit scope token → the entry kind shown on the card. 
*/ +const SCOPE_TO_KIND: Readonly<Record<string, RecentEntry['kind']>> = { + papers: 'Paper', + data: 'Dataset', + datasets: 'Dataset', + software: 'Software', + databases: 'Database', + resources: 'Resource', +}; + +/** Subject lead verbs that mark a commit as an *addition* (vs. a reorg/relabel). */ +const ADDITION_VERBS: readonly string[] = [ + 'add', + 'integrate', + 'catalogue', + 'catalog', + 'inventory', +]; + +/** + * Ordered keyword → research-area map. The area dot is the one field git can't + * ground, so it's a best-effort match against the title; first hit wins, and + * anything unmatched falls back to `tooling` (the neutral "general method/tool" + * column). Keep the method/cell-biology cues before the generic tooling cues. + */ +const AREA_KEYWORDS: ReadonlyArray<readonly [RecentEntry['area'], readonly string[]]> = [ + ['sensory', ['burger', 'flavor', 'flavour', 'aroma', 'taste', 'sensory', 'mass spec', 'metabolom', 'volatile']], + ['eval', ['benchmark', 'leaderboard', 'eval']], + ['bioprocess', ['bioreactor', 'bioprocess', 'scale-up', 'scale up', 'perfusion', 'microcarrier']], + ['scaffolding', ['scaffold', 'biomaterial', 'hydrogel']], + ['media', ['media', 'medium', 'growth factor', 'serum']], + ['cell', ['knockout', 'crispr', 'satellite cell', 'differentiation', 'atlas', 'single-cell', 'scrna', 'rna-seq', 'cell line', 'cell-line', 'lineage', 'transcriptom']], + ['tooling', ['agent', 'mcp', 'llm', 'foundation model', 'framework', 'tool', 'docs', 'pipeline']], +]; + +/** + * Run a git command rooted at `repoRoot`, returning trimmed stdout. stderr is + * discarded so the guarded failure path (no repo / shallow clone) stays silent. + */ +function git(repoRoot: string, args: string[]): string { + return execFileSync('git', ['-C', repoRoot, ...args], { + encoding: 'utf-8', + stdio: ['ignore', 'pipe', 'ignore'], + }).trim(); +} + +/** First keyword match wins; unmatched titles default to `tooling`. 
*/ +function classifyArea(title: string): RecentEntry['area'] { + const haystack = title.toLowerCase(); + for (const [area, keywords] of AREA_KEYWORDS) { + if (keywords.some((kw) => haystack.includes(kw))) return area; + } + return 'tooling'; +} + +/** + * Turn a Conventional-Commit subject into a card entry, or null if it isn't an + * addition to a catalogued content file. + * + * `feat(papers): add Tac et al. 2026 burger paper (#236)` + * → { kind: 'Paper', title: 'Tac et al. 2026 burger paper', area: 'sensory' } + */ +function parseSubject(date: string, subject: string): RecentEntry | null { + // type(scope)[!]: rest — scope required (it carries the kind); the optional "!" is the Conventional-Commits breaking-change marker. + const m = /^[a-z]+\(([^)]+)\)!?:\s*(.+)$/i.exec(subject); + if (!m) return null; + const kind = SCOPE_TO_KIND[m[1].split(',')[0].trim().toLowerCase()]; + if (!kind) return null; + + const rest = m[2].trim(); + const verb = ADDITION_VERBS.find((v) => + new RegExp(`^${v}\\b`, 'i').test(rest), + ); + if (!verb) return null; + + // Strip the lead verb and any trailing " (#NN)" issue/PR ref → noun phrase. + let title = rest + .slice(verb.length) + .replace(/\s*\(#\d+\)\s*$/, '') + .trim(); + if (!title) return null; + title = title.charAt(0).toUpperCase() + title.slice(1); + + return { date, kind, title, area: classifyArea(title) }; +} + +/** + * Build the home page "Recently added" list (newest first) from git history. + * + * @param repoRoot Repository root (defaults to the canonical root). + * @param limit Max entries to keep (default 5 — what the panel shows). + * @returns Validated, deduped entries; an empty array if git history is + * unavailable (shallow clone / tarball) — never throws on that. 
+ */ +export function buildRecentModel( + repoRoot: string = DEFAULT_REPO_ROOT, + limit = 5, +): Recent { + let lines: string[]; + try { + const out = git(repoRoot, [ + 'log', + '--no-merges', + '--date=short', + // %cd<US>%s — committer date, so the date agrees with git log's own (commit-time) ordering; the \x1f unit separator can't appear in a subject. 
+ '--format=%cd\x1f%s', + '--', + ...CONTENT_PATHS, + ]); + lines = out ? out.split('\n') : []; + } catch { + return RecentSchema.parse([]); + } + + const entries: RecentEntry[] = []; + const seenTitles = new Set<string>(); + for (const line of lines) { + // git log is already newest-first; don't re-sort (preserves intra-day order). + if (entries.length >= limit) break; + const sep = line.indexOf('\x1f'); + if (sep < 0) continue; + const entry = parseSubject(line.slice(0, sep), line.slice(sep + 1)); + if (!entry || seenTitles.has(entry.title)) continue; + seenTitles.add(entry.title); + entries.push(entry); + } + + return RecentSchema.parse(entries); +} diff --git a/site/scripts/parser/types.ts b/site/scripts/parser/types.ts index e16a646..a22d157 100644 --- a/site/scripts/parser/types.ts +++ b/site/scripts/parser/types.ts @@ -342,6 +342,33 @@ export const MetricsSchema = z.object({ generatedAt: z.string(), }); +// --------------------------------------------------------------------------- +// recent.json — home page "Recently added" list, derived from git history +// --------------------------------------------------------------------------- + +/** One entry in the home page "Recently added" panel (RecentlyAdded.astro). */ +export const RecentEntrySchema = z.object({ + /** commit date, YYYY-MM-DD */ + date: z.string().regex(/^\d{4}-\d{2}-\d{2}$/), + /** entry type, rendered as an uppercase label */ + kind: z.enum(['Paper', 'Software', 'Dataset', 'Database', 'Resource']), + /** short title: the commit subject with prefix, lead verb, and issue ref stripped */ + title: z.string().min(1), + /** research-area key driving the dot colour (RecentlyAdded.astro areaColor) */ + area: z.enum([ + 'media', + 'cell', + 'bioprocess', + 'scaffolding', + 'sensory', + 'tooling', + 'eval', + ]), +}); + +/** recent.json is a flat array of entries, newest first. 
*/ +export const RecentSchema = z.array(RecentEntrySchema); + // --------------------------------------------------------------------------- // taxonomy.json — Taxonomy.md row/column definitions, keyed by matrix label // --------------------------------------------------------------------------- @@ -383,4 +410,6 @@ export type Graph = z.infer; export type MetricsSpecies = z.infer; export type MetricsDatasets = z.infer; export type Metrics = z.infer; +export type RecentEntry = z.infer<typeof RecentEntrySchema>; +export type Recent = z.infer<typeof RecentSchema>; export type TaxonomyData = z.infer; diff --git a/site/src/content/data/recent.json b/site/src/content/data/recent.json deleted file mode 100644 index 71156b1..0000000 --- a/site/src/content/data/recent.json +++ /dev/null @@ -1,7 +0,0 @@ -[ - { "date": "2026-05-27", "kind": "Paper", "title": "FrontierScience benchmark + companion dataset", "area": "eval" }, - { "date": "2026-05-26", "kind": "Software", "title": "MRMPROBS for targeted mass spectrometry", "area": "sensory" }, - { "date": "2026-05-20", "kind": "Dataset", "title": "Bovine skeletal-muscle single-cell atlas", "area": "cell" }, - { "date": "2026-05-18", "kind": "Database", "title": "PRIDE proteomics repository", "area": "media" }, - { "date": "2026-05-14", "kind": "Paper", "title": "Multi-fidelity Bayesian media design", "area": "media" } -]