Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions site/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,5 @@ src/content/data/talks.json
src/content/data/primers.json
src/content/data/graph.json
src/content/data/metrics.json
src/content/data/recent.json
src/content/data/taxonomy.json
18 changes: 18 additions & 0 deletions site/scripts/parser/generate-data.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import { buildPrimersModel } from './primers.js';
import { buildGraphModel } from './graph.js';
import { CitationCacheSchema, type CitationCache } from './citations.js';
import { buildMetricsModel } from './metrics.js';
import { buildRecentModel } from './recent.js';
import { writeLlmsFull } from './llms-full.js';
import {
PapersDataSchema,
Expand All @@ -34,6 +35,7 @@ import {
PrimersSchema,
GraphSchema,
MetricsSchema,
RecentSchema,
TaxonomyDataSchema,
type Counts,
} from './types.js';
Expand Down Expand Up @@ -98,6 +100,7 @@ export function generateData(
primers: number;
graphNodes: number;
graphEdges: number;
recentEntries: number;
taxonomyDefs: number;
} {
// Build and validate the papers model.
Expand All @@ -117,6 +120,10 @@ export function generateData(
const graph = buildGraphModel(model, loadCitationCache());
const metrics = buildMetricsModel(model);

// Home page "Recently added" list, derived from git history. Empty (not an
// error) when history is unavailable — see buildRecentModel.
const recent = buildRecentModel();

// Taxonomy.md row/column definitions for the explorer's hover/click popups.
const taxonomy = buildTaxonomyModel();

Expand All @@ -128,6 +135,7 @@ export function generateData(
PrimersSchema.parse(primers);
GraphSchema.parse(graph);
MetricsSchema.parse(metrics);
RecentSchema.parse(recent);
TaxonomyDataSchema.parse(taxonomy);

// No-drift guard: the homepage counts and the catalog/talks/graph/metrics
Expand Down Expand Up @@ -217,6 +225,13 @@ export function generateData(
'utf-8',
);

// Write recent.json.
writeFileSync(
join(outDir, 'recent.json'),
JSON.stringify(recent, null, 2) + '\n',
'utf-8',
);

// Write taxonomy.json.
writeFileSync(
join(outDir, 'taxonomy.json'),
Expand All @@ -232,6 +247,7 @@ export function generateData(
primers: primers.primers.length,
graphNodes: graph.nodes.length,
graphEdges: graph.edges.length,
recentEntries: recent.length,
taxonomyDefs: Object.keys(taxonomy.definitions).length,
};
}
Expand Down Expand Up @@ -263,6 +279,7 @@ if (isMain) {
primers,
graphNodes,
graphEdges,
recentEntries,
taxonomyDefs,
} = generateData();
// Full-text agent index (public/llms-full.txt) — generated alongside the
Expand All @@ -276,6 +293,7 @@ if (isMain) {
`catalog.json (${catalogEntries} entries), talks.json (${talks} talks), ` +
`primers.json (${primers} primers), ` +
`graph.json (${graphNodes} nodes / ${graphEdges} edges), metrics.json, ` +
`recent.json (${recentEntries} entries), ` +
`taxonomy.json (${taxonomyDefs} definitions), ` +
`and llms-full.txt (${llmsBytes} bytes)`,
);
Expand Down
63 changes: 63 additions & 0 deletions site/scripts/parser/recent.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
/**
* recent.test.ts — tests for the home page "Recently added" builder.
*
* The list is git-derived, so it changes every commit — assertions are
* structure-only (shape, bounds, ordering, schema), never an exact tally.
*/

import { describe, it, expect, beforeAll } from 'vitest';

import { buildRecentModel } from './recent.js';
import { RecentSchema, type Recent } from './types.js';

// Structure-only checks against the live repository history. Nothing here pins
// a specific commit, so the suite stays green as new entries land.
// NOTE(review): these tests shell out to git through buildRecentModel; the
// "at least one addition" case presumably needs a non-shallow checkout — confirm
// the CI checkout depth.
describe('buildRecentModel — real repo', () => {
  const DATE_SHAPE = /^\d{4}-\d{2}-\d{2}$/;
  const KINDS = ['Paper', 'Software', 'Dataset', 'Database', 'Resource'];
  const AREAS = ['media', 'cell', 'bioprocess', 'scaffolding', 'sensory', 'tooling', 'eval'];

  // Built once and shared: every case below only reads it.
  let recent: Recent;

  beforeAll(() => {
    recent = buildRecentModel();
  });

  it('returns at most `limit` entries (default 5)', () => {
    expect(Array.isArray(recent)).toBe(true);
    expect(recent.length).toBeLessThanOrEqual(5);
  });

  it('finds at least one addition in-repo', () => {
    // With full history available, the addition filter has to match something.
    expect(recent.length).toBeGreaterThan(0);
  });

  it('every entry has a valid date / kind / area / non-empty title', () => {
    recent.forEach((entry) => {
      expect(entry.date).toMatch(DATE_SHAPE);
      expect(KINDS).toContain(entry.kind);
      expect(AREAS).toContain(entry.area);
      expect(entry.title.length).toBeGreaterThan(0);
    });
  });

  it('is ordered newest-first (dates non-increasing)', () => {
    // Compare each entry with its predecessor; ISO dates sort lexicographically.
    // NOTE(review): entries carry the author date while `git log` order follows
    // commit order — a rebased or cherry-picked commit could break this; confirm.
    recent.slice(1).forEach((later, idx) => {
      expect(recent[idx].date >= later.date).toBe(true);
    });
  });

  it('has no duplicate titles', () => {
    const unique = new Set(recent.map((entry) => entry.title));
    expect(unique.size).toBe(recent.length);
  });

  it('honours a smaller limit', () => {
    const capped = buildRecentModel(undefined, 2);
    expect(capped.length).toBeLessThanOrEqual(2);
  });

  it('degrades to an empty (valid) list when git history is unavailable', () => {
    // `git log` fails outside a repository; the builder is expected to absorb
    // that failure and hand back an empty list.
    const fallback = buildRecentModel('/nonexistent-not-a-git-repo');
    expect(fallback).toEqual([]);
  });

  it('passes RecentSchema', () => {
    const verdict = RecentSchema.safeParse(recent);
    expect(verdict.success).toBe(true);
  });
});
164 changes: 164 additions & 0 deletions site/scripts/parser/recent.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
/**
* recent.ts — builds recent.json for the home page "Recently added" panel.
*
* Derived at build time from `git log` over the canonical content files, so the
* list refreshes on every `pnpm parse`/build/deploy and can never go stale. The
* git call mirrors the momentum snapshot in metrics.ts: a thin execFileSync
* wrapper, the whole thing guarded so a shallow clone / tarball build degrades to
* an empty list (the component then renders just its "View the full changelog"
* link) rather than failing the build.
*
* Selection: `--no-merges` so PR-merged additions surface (this repo merges via
* merge commits, which `--first-parent` would hide); restricted to commits whose
* Conventional-Commit subject starts with an addition verb, so reorg/relabel
* commits stay out of a list titled "Recently *added*".
*
* Reads git history; never mutates the canonical files.
*/

import { execFileSync } from 'node:child_process';
import { fileURLToPath } from 'node:url';

import { RecentSchema, type Recent, type RecentEntry } from './types.js';

/**
 * Default repository root used by buildRecentModel when the caller passes none.
 * Computed by resolving `../../../` against this module's own URL and converting
 * the result to a filesystem path: parser/ → scripts/ → site/ → repo root.
 */
const DEFAULT_REPO_ROOT: string = fileURLToPath(new URL('../../../', import.meta.url));

/**
 * Canonical content files/dirs whose additions the home page advertises.
 * Passed verbatim as pathspecs after `--` in the `git log` call, so only commits
 * touching at least one of these paths are considered. The git call runs with
 * `-C <repoRoot>`, so the entries are written relative to the repository root.
 */
const CONTENT_PATHS: readonly string[] = [
'Papers.md',
'Software.md',
'Databases.md',
'Datasets',
'OtherResources.md',
];

/**
 * First Conventional-Commit scope token → the entry kind shown on the card.
 * Keys are lowercase: parseSubject lowercases and trims the first
 * comma-separated scope token before looking it up. A commit whose scope has no
 * entry here is dropped from the list. `data` and `datasets` are aliases for the
 * same kind.
 */
const SCOPE_TO_KIND: Readonly<Record<string, RecentEntry['kind']>> = {
papers: 'Paper',
data: 'Dataset',
datasets: 'Dataset',
software: 'Software',
databases: 'Database',
resources: 'Resource',
};

/**
 * Subject lead verbs that mark a commit as an *addition* (vs. a reorg/relabel).
 * parseSubject tests each verb case-insensitively at the very start of the
 * description, followed by a word boundary, so inflected forms such as "adds" or
 * "added" do not match. Entries are interpolated into a regular expression as-is,
 * so keep them free of regex metacharacters.
 */
const ADDITION_VERBS: readonly string[] = [
'add',
'integrate',
'catalogue',
'catalog',
'inventory',
];

/**
 * Ordered keyword → research-area map. The area dot is the one field git can't
 * ground, so it's a best-effort match against the title; first hit wins, and
 * anything unmatched falls back to `tooling` (the neutral "general method/tool"
 * column). Keep the method/cell-biology cues before the generic tooling cues.
 *
 * Matching is a plain substring test on the lowercased title, so cues must be
 * lowercase and partial stems ('metabolom', 'transcriptom') work as prefixes.
 * The same rule means short cues also hit inside longer words: 'eval' matches
 * "retrieval", 'media' matches "intermediate", 'agent' matches "reagent".
 * Row order is therefore part of the behaviour — moving a row changes which area
 * an ambiguous title receives.
 *
 * The final `tooling` row names the same area as the fallback, so while it stays
 * last its cues do not change any result; they only matter if rows are appended
 * after it or reordered.
 */
const AREA_KEYWORDS: ReadonlyArray<readonly [RecentEntry['area'], readonly string[]]> = [
['sensory', ['burger', 'flavor', 'flavour', 'aroma', 'taste', 'sensory', 'mass spec', 'metabolom', 'volatile']],
['eval', ['benchmark', 'leaderboard', 'eval']],
['bioprocess', ['bioreactor', 'bioprocess', 'scale-up', 'scale up', 'perfusion', 'microcarrier']],
['scaffolding', ['scaffold', 'biomaterial', 'hydrogel']],
['media', ['media', 'medium', 'growth factor', 'serum']],
['cell', ['knockout', 'crispr', 'satellite cell', 'differentiation', 'atlas', 'single-cell', 'scrna', 'rna-seq', 'cell line', 'cell-line', 'lineage', 'transcriptom']],
['tooling', ['agent', 'mcp', 'llm', 'foundation model', 'framework', 'tool', 'docs', 'pipeline']],
];

/**
 * Invoke `git -C <repoRoot> <args…>` synchronously and hand back its stdout with
 * surrounding whitespace trimmed.
 *
 * stdin and stderr are both ignored, so a failing command prints nothing; the
 * failure surfaces only as the exception thrown by execFileSync, which callers
 * are expected to catch.
 *
 * @param repoRoot Directory git should operate in.
 * @param args     Arguments placed after `-C <repoRoot>`.
 * @returns Trimmed stdout of the command.
 */
function git(repoRoot: string, args: string[]): string {
  const argv = ['-C', repoRoot].concat(args);
  const stdout = execFileSync('git', argv, {
    encoding: 'utf-8',
    stdio: ['ignore', 'pipe', 'ignore'],
  });
  return stdout.trim();
}

/**
 * Pick the research area for a title: the first AREA_KEYWORDS row with any cue
 * occurring as a substring of the lowercased title wins; when no row matches the
 * result is `tooling`.
 */
function classifyArea(title: string): RecentEntry['area'] {
  const lowered = title.toLowerCase();
  const hit = AREA_KEYWORDS.find(([, cues]) =>
    cues.some((cue) => lowered.includes(cue)),
  );
  return hit ? hit[0] : 'tooling';
}

/**
 * Turn a Conventional-Commit subject into a card entry, or null if it isn't an
 * addition to a catalogued content file.
 *
 * `feat(papers): add Tac et al. 2026 burger paper (#236)`
 * → { kind: 'Paper', title: 'Tac et al. 2026 burger paper', area: 'sensory' }
 *
 * @param date    Commit date string, copied onto the entry unchanged.
 * @param subject Commit subject line (`type(scope): description`).
 * @returns The entry, or null when the subject has no scope, the scope maps to no
 *          kind, the description does not start with an addition verb, or nothing
 *          is left of the title after stripping.
 */
function parseSubject(date: string, subject: string): RecentEntry | null {
  // type(scope)[!]: rest — scope required (it carries the kind). The optional
  // `!` is the Conventional Commits breaking-change marker; without allowing it,
  // `feat(papers)!: add …` would be silently skipped.
  const m = /^[a-z]+\(([^)]+)\)!?:\s*(.+)$/i.exec(subject);
  if (!m) return null;

  // Only the first comma-separated scope token decides the kind.
  const scope = m[1].split(',')[0].trim().toLowerCase();
  // Own-property check: SCOPE_TO_KIND is a plain object, so a scope such as
  // `constructor` or `__proto__` would otherwise resolve to an inherited, truthy
  // value, slip through as a bogus kind, and fail schema validation downstream.
  if (!Object.prototype.hasOwnProperty.call(SCOPE_TO_KIND, scope)) return null;
  const kind = SCOPE_TO_KIND[scope];
  if (!kind) return null;

  const rest = m[2].trim();
  // The verb must be the whole leading word ("add", not "adds" / "address").
  const verb = ADDITION_VERBS.find((v) =>
    new RegExp(`^${v}\\b`, 'i').test(rest),
  );
  if (!verb) return null;

  // Strip the lead verb and any trailing " (#NN)" issue/PR ref → noun phrase.
  let title = rest
    .slice(verb.length)
    .replace(/\s*\(#\d+\)\s*$/, '')
    .trim();
  if (!title) return null;
  // Capitalise only the first character; the rest of the phrase is kept as
  // written in the commit.
  title = title.charAt(0).toUpperCase() + title.slice(1);

  return { date, kind, title, area: classifyArea(title) };
}

/**
 * Assemble the home page "Recently added" entries from `git log` over the
 * canonical content paths, in the order git reports them.
 *
 * Each log line is `<short author date><unit separator><subject>`. Lines are
 * fed through parseSubject; rejected subjects and repeated titles are skipped,
 * and scanning stops once `limit` entries have been collected.
 *
 * NOTE(review): the date is the author date (`%ad`) while the order is whatever
 * `git log` emits — presumably commit order — so dates may not be strictly
 * non-increasing after a rebase or cherry-pick; confirm whether `%cd` was meant.
 *
 * @param repoRoot Repository root (defaults to the canonical root).
 * @param limit    Max entries to keep (default 5 — what the panel shows).
 * @returns Schema-validated, title-deduped entries; an empty array when the git
 *          call throws (no repository, git missing, …) rather than an error.
 */
export function buildRecentModel(
  repoRoot: string = DEFAULT_REPO_ROOT,
  limit = 5,
): Recent {
  let stdout: string;
  try {
    stdout = git(repoRoot, [
      'log',
      '--no-merges',
      '--date=short',
      // Date, then U+001F, then subject: the unit separator cannot occur in a
      // subject, so the first one always marks the split point.
      '--format=%ad\x1f%s',
      '--',
      ...CONTENT_PATHS,
    ]);
  } catch {
    // History unavailable: degrade to an empty (still validated) list.
    return RecentSchema.parse([]);
  }
  const rows = stdout ? stdout.split('\n') : [];

  const picked: RecentEntry[] = [];
  const titlesTaken = new Set<string>();
  // Keep git's own ordering — no re-sort, so same-day commits stay in log order.
  for (const row of rows) {
    if (picked.length >= limit) break;
    const cut = row.indexOf('\x1f');
    if (cut >= 0) {
      const candidate = parseSubject(row.slice(0, cut), row.slice(cut + 1));
      if (candidate && !titlesTaken.has(candidate.title)) {
        titlesTaken.add(candidate.title);
        picked.push(candidate);
      }
    }
  }

  return RecentSchema.parse(picked);
}
29 changes: 29 additions & 0 deletions site/scripts/parser/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,33 @@ export const MetricsSchema = z.object({
generatedAt: z.string(),
});

// ---------------------------------------------------------------------------
// recent.json — home page "Recently added" list, derived from git history
// ---------------------------------------------------------------------------

/**
 * One entry in the home page "Recently added" panel (RecentlyAdded.astro).
 * All four fields are required.
 */
export const RecentEntrySchema = z.object({
/**
 * commit date, YYYY-MM-DD. The regex checks the digit/dash shape only, not
 * that the value is a real calendar date.
 */
date: z.string().regex(/^\d{4}-\d{2}-\d{2}$/),
/** entry type, rendered as an uppercase label */
kind: z.enum(['Paper', 'Software', 'Dataset', 'Database', 'Resource']),
/** short title: the commit subject with prefix, lead verb, and issue ref stripped; must be non-empty */
title: z.string().min(1),
/** research-area key driving the dot colour (RecentlyAdded.astro areaColor) */
area: z.enum([
'media',
'cell',
'bioprocess',
'scaffolding',
'sensory',
'tooling',
'eval',
]),
});

/**
 * recent.json is a flat array of entries, newest first. The schema validates
 * each element's shape only; it does not itself enforce the ordering, a maximum
 * length, or title uniqueness, and an empty array is valid.
 */
export const RecentSchema = z.array(RecentEntrySchema);

// ---------------------------------------------------------------------------
// taxonomy.json — Taxonomy.md row/column definitions, keyed by matrix label
// ---------------------------------------------------------------------------
Expand Down Expand Up @@ -383,4 +410,6 @@ export type Graph = z.infer<typeof GraphSchema>;
export type MetricsSpecies = z.infer<typeof MetricsSpeciesSchema>;
export type MetricsDatasets = z.infer<typeof MetricsDatasetsSchema>;
export type Metrics = z.infer<typeof MetricsSchema>;
export type RecentEntry = z.infer<typeof RecentEntrySchema>;
export type Recent = z.infer<typeof RecentSchema>;
export type TaxonomyData = z.infer<typeof TaxonomyDataSchema>;
7 changes: 0 additions & 7 deletions site/src/content/data/recent.json

This file was deleted.