Skip to content

Commit aeb4736

Browse files
committed
emit: converge to 2 target-parameterized APIs (emitParser reuses emitLexer)
The emit layer had three inconsistent entry points — `emitParser(grammar)` (JS, no target), `emitLexer(grammar, st)` (JS, internal symtab), and `emitPortableParser(grammar, target)` (lexer buried in `target.render`). Collapse them to exactly two, both parameterized by a Target:

    emitLexer(grammar, target)  -> the lexer for that target
    emitParser(grammar, target) -> the parser, REUSING emitLexer(grammar, target)

A Target owns both halves, so a parser reuses the SAME target's lexer — jsTarget's parser embeds jsTarget's SoA-int lexer, goTarget's parser embeds goTarget's Tok-list lexer. No cross-target lexer format is shared, so the optimized JS path keeps its integer-bitmask dispatch and the portable targets keep their clean byte scanner.

- src/emit.ts (new): the Target interface + the two public functions; re-exports jsTarget / tsTarget / goTarget / rustTarget.
- emit-parser.ts: the optimized emitter split into `emitJsLexer` (derive) + `emitJsParser` (embed a handed-in lexer) behind `jsTarget`. The split is a pure refactor — re-deriving the deterministic symtab yields the identical lexer string, so emit-parser-verify stays byte-for-byte.
- emit-lexer.ts: `emitLexer` -> `emitSoaLexer` (frees the public name).
- emit-portable.ts + target-{ts,go,rust}.ts: `render(ir)` split into the target's `emitLexer`/`emitParser`; `emitPortableParser` removed (`portableIR` exported).
- ~19 callers updated to `emitParser(g, jsTarget)` / `emitParser(g, <portable>)`.

emit-parser-verify byte-identical (0 mismatches), portable-targets 16 grammars ×3 ≡ interpreter, emit-tsc-gate clean, full suite 42/42.
1 parent ca2a56b commit aeb4736

23 files changed

Lines changed: 134 additions & 71 deletions

src/emit-lexer.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ const NON_ASCII_WS_FN =
4545
const nonAsciiWsConsume = (v: string, cont: boolean, indent: string): string =>
4646
`${indent}if (${v} > 127 && lxNonAsciiWs(${v})) { LX_WS.lastIndex = pos; const m = LX_WS.exec(source); if (m !== null) { if (m[0].includes('\\n')) pendingNl = true; pos += m[0].length;${cont ? ' continue;' : ''} } }`;
4747

48-
export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null {
48+
export function emitSoaLexer(grammar: CstGrammar, st: LexerSymtab): string | null {
4949
// Out of scope: the markup / indentation / newline state machines.
5050
if (grammar.markup || grammar.indent || grammar.newline) return null;
5151
if (grammar.tokens.some(t => tokenBlockPatternSource(t) || t.blockOnly)) return null;

src/emit-parser.ts

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,8 @@
2727
import type { CstGrammar, RuleExpr, RuleDecl } from './types.ts';
2828
import { isKeywordLiteral, collectLiterals } from './grammar-utils.ts';
2929
import { analyzeGrammar, findEntryRule, type Sec } from './grammar-analysis.ts';
30-
import { emitLexer } from './emit-lexer.ts';
30+
import { emitSoaLexer } from './emit-lexer.ts';
31+
import type { Target } from './emit.ts';
3132
import { withAwaitYield } from './await-yield-fork.ts';
3233

3334
// ── Static analysis ──
@@ -1092,7 +1093,28 @@ class Emitter {
10921093

10931094
// ── Top-level emit ──
10941095

1095-
export function emitParser(grammar: CstGrammar): string {
1096+
// The `js` Target: the optimized SoA-int parser/lexer, wrapped behind the same two-method
1097+
// Target contract as the portable ts/go/rust targets (see emit.ts). `emitJsLexer` derives the
1098+
// standalone lexer; `emitJsParser` embeds whatever lexer source it is handed. Splitting the
1099+
// lexer COMPUTATION from its EMBEDDING leaves the emitted bytes identical (both re-derive the
1100+
// same deterministic symtab), so `emit-parser-verify` stays byte-for-byte.
1101+
export const jsTarget: Target = {
1102+
name: 'javascript',
1103+
ext: 'js',
1104+
emitLexer: emitJsLexer,
1105+
emitParser: emitJsParser,
1106+
};
1107+
1108+
export function emitJsLexer(grammar: CstGrammar): string | null {
1109+
grammar = withAwaitYield(grammar);
1110+
const st = analyze(grammar).symtab;
1111+
return emitSoaLexer(grammar, {
1112+
typeKind: st.typeKind, kwLitKind: st.kwLitKind, puLitKind: st.puLitKind,
1113+
KIND_PUNCT: st.KIND_PUNCT, KIND_NAMED_FALLBACK: st.KIND_NAMED_FALLBACK,
1114+
});
1115+
}
1116+
1117+
export function emitJsParser(grammar: CstGrammar, lexSrc: string | null): string {
10961118
// [Await]/[Yield] context: name-fork the body-reachable rule closure into $A/$Y/$AY
10971119
// families (see await-yield-fork.ts). No-op for a grammar with no ctx markers. Done
10981120
// HERE (not at grammar export) so the forks exist ONLY in the parser's rule identity
@@ -1127,11 +1149,8 @@ export function emitParser(grammar: CstGrammar): string {
11271149
// The lexer: EMITTED (specialized, standalone — see emit-lexer.ts) when the grammar
11281150
// is a plain token stream; the data-driven createLexer runtime otherwise
11291151
// (markup/indent/newline state machines stay interpreter-only).
1152+
// `lexSrc` is handed in by the Target façade (emitParser reuses emitLexer) — see emit.ts.
11301153
const st = a.symtab;
1131-
const lexSrc = emitLexer(grammar, {
1132-
typeKind: st.typeKind, kwLitKind: st.kwLitKind, puLitKind: st.puLitKind,
1133-
KIND_PUNCT: st.KIND_PUNCT, KIND_NAMED_FALLBACK: st.KIND_NAMED_FALLBACK,
1134-
});
11351154
e.soa = lexSrc !== null;
11361155
if (!lexSrc) {
11371156
e.emit(`import { createLexer } from ${J(resolveLexerImport())};`);

src/emit-portable.ts

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
// ── emit-portable ──
22
//
3-
// The target-agnostic emitter (issue #6). `emitPortableParser(grammar, target)` derives
3+
// The target-agnostic emitter (issue #6). `emitParser(grammar, target)` (see emit.ts) derives
44
// a COMPLETE, self-contained parser in the target's language from the same CstGrammar the
55
// TS engine uses. It is the agnosticism proof: ONE analysis → ONE intermediate form (IR)
66
// → N language renderings, all producing the byte-identical CST the interpreter does.
@@ -120,17 +120,12 @@ export type ParserIR = {
120120
tpl: TplCfg | null; // null unless the grammar has a template token
121121
};
122122

123-
export interface Target {
124-
name: string;
125-
ext: string; // emitted file extension (no dot)
126-
render(ir: ParserIR): string; // the complete, compilable source
127-
}
128-
129-
export function emitPortableParser(grammar: CstGrammar, target: Target): string {
130-
// Apply the [Await]/[Yield] context fork exactly as createParser does, so `await`/`yield`
131-
// are keywords inside async/generator bodies and identifiers outside — name-forked into
132-
// $A/$Y/$AY rule families. Every other consumer (and the portable parser) sees plain rules.
133-
return target.render(buildIR(withAwaitYield(grammar)));
123+
// The target-agnostic parse plan for a grammar. Applies the [Await]/[Yield] context fork
124+
// exactly as createParser does (so `await`/`yield` are keywords inside async/generator bodies
125+
// and identifiers outside — name-forked into $A/$Y/$AY rule families), then builds the IR each
126+
// portable Target (ts/go/rust) renders. The `Target` contract itself lives in emit.ts.
127+
export function portableIR(grammar: CstGrammar): ParserIR {
128+
return buildIR(withAwaitYield(grammar));
134129
}
135130

136131
// ── buildIR: grammar + analysis → the target-agnostic parse plan ──

src/emit.ts

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
// The emit layer's public surface: exactly two APIs, both parameterized by a `Target`.
2+
//
3+
// emitLexer(grammar, target) → the lexer source for that target
4+
// emitParser(grammar, target) → the parser source for that target, REUSING emitLexer
5+
//
6+
// A `Target` owns BOTH halves, so emitParser(grammar, target) reuses the SAME target's lexer —
7+
// jsTarget's parser embeds jsTarget's SoA-int lexer, goTarget's parser embeds goTarget's
8+
// Tok-list lexer. No cross-target lexer format is shared, so the optimized JS path keeps its
9+
// integer-bitmask token dispatch while the portable targets keep their clean byte scanner.
10+
//
11+
// Targets: `jsTarget` (the optimized SoA parser, emit-parser.ts) and the portable
12+
// `tsTarget`/`goTarget`/`rustTarget` (emit-portable.ts + target-*.ts).
13+
import type { CstGrammar } from './types.ts';
14+
15+
export interface Target {
16+
name: string;
17+
ext: string; // emitted file extension (no dot)
18+
emitLexer(grammar: CstGrammar): string | null; // null ⇒ runtime-lexer fallback (jsTarget markup/indent grammars)
19+
emitParser(grammar: CstGrammar, lexerSrc: string | null): string; // the parser, embedding `lexerSrc`
20+
}
21+
22+
export function emitLexer(grammar: CstGrammar, target: Target): string | null {
23+
return target.emitLexer(grammar);
24+
}
25+
26+
export function emitParser(grammar: CstGrammar, target: Target): string {
27+
return target.emitParser(grammar, emitLexer(grammar, target)); // ← parser reuses lexer
28+
}
29+
30+
export { jsTarget } from './emit-parser.ts';
31+
export { tsTarget } from './target-ts.ts';
32+
export { goTarget } from './target-go.ts';
33+
export { rustTarget } from './target-rust.ts';

src/target-go.ts

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,18 @@
11
// The Go Target for emit-portable. Renders the same language-agnostic ParserIR as tsTarget
22
// into a self-contained Go program (Go stdlib only — the lexer is regex-free, so it compiles
33
// with no module dependencies). Its CST JSON is checked byte-for-byte against the interpreter,
4-
// so `emitPortableParser(grammar, goTarget)` is a real, verified Go parser derived from the
4+
// so `emitParser(grammar, goTarget)` is a real, verified Go parser derived from the
55
// same grammar definition.
66
//
77
// ARENA allocation (to minimise GC pressure, as tsgo does): nodes live in a flat `nodes []Node`,
88
// their children in a flat `kids []int32`, and in-progress children accumulate on a `scratch`
99
// stack. A node is an int32 index, never a heap pointer. Backtracking truncates the three
1010
// slices to saved lengths; the slices keep their capacity across parses (reset to len 0), so a
1111
// warmed parser allocates ~nothing per parse.
12-
import type { ParserIR, RdRule, PrattRule, Step, Bracket, CharRange, LexTok, Target, TplCfg } from './emit-portable.ts';
13-
import type { TokenPattern } from './types.ts';
12+
import type { ParserIR, RdRule, PrattRule, Step, Bracket, CharRange, LexTok, TplCfg } from './emit-portable.ts';
13+
import { portableIR } from './emit-portable.ts';
14+
import type { Target } from './emit.ts';
15+
import type { TokenPattern, CstGrammar } from './types.ts';
1416

1517
const J = (v: unknown) => JSON.stringify(v);
1618
const rangeCond = (v: string, rs: CharRange[]) =>
@@ -290,7 +292,11 @@ ${r.nudSeqs.map((seq) => `\t{ save := pos; sb := len(scratch); nb := len(nodes);
290292
export const goTarget: Target = {
291293
name: 'go',
292294
ext: 'go',
293-
render(ir: ParserIR): string {
295+
emitLexer(grammar: CstGrammar): string {
296+
return lexer(portableIR(grammar));
297+
},
298+
emitParser(grammar: CstGrammar, lexerSrc: string | null): string {
299+
const ir = portableIR(grammar);
294300
const ruleFns = ir.rules.map((r) => (r.kind === 'pratt' ? prattRule(r, ir.tpl) : rdRule(r))).join('\n\n');
295301
const matchTemplate = ir.tpl ? `func matchTemplate() int32 {
296302
\tt := peek()
@@ -344,7 +350,7 @@ var nodes []Node
344350
var kids []int32
345351
var scratch []int32
346352
347-
${lexer(ir)}
353+
${lexerSrc ?? ''}
348354
349355
func peek() *Tok {
350356
\tif pos < len(toks) { return &toks[pos] }

src/target-rust.ts

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
// The Rust Target for emit-portable. Renders the same language-agnostic ParserIR as
22
// tsTarget/goTarget into a self-contained Rust program (no external crates — the lexer is
33
// regex-free, so it compiles with rustc alone, no Cargo/network). Its CST JSON is checked
4-
// byte-for-byte against the interpreter, so `emitPortableParser(grammar, rustTarget)` is a
4+
// byte-for-byte against the interpreter, so `emitParser(grammar, rustTarget)` is a
55
// real, verified Rust parser derived from the same grammar definition.
66
//
77
// Rust ownership note: a CST node is OWNED (moved), unlike the TS/Go pointer trees. In the
@@ -11,8 +11,10 @@
1111
// returns it. Sub-sequence combinators (star/opt/sep) take non-capturing fn pointers
1212
// `fn(&mut Parser, &mut Vec<Cst>) -> bool`, threading the parser + kids as params (so nothing
1313
// is captured, sidestepping the borrow checker).
14-
import type { ParserIR, RdRule, PrattRule, Step, Bracket, CharRange, LexTok, Target, TplCfg } from './emit-portable.ts';
15-
import type { TokenPattern } from './types.ts';
14+
import type { ParserIR, RdRule, PrattRule, Step, Bracket, CharRange, LexTok, TplCfg } from './emit-portable.ts';
15+
import { portableIR } from './emit-portable.ts';
16+
import type { Target } from './emit.ts';
17+
import type { TokenPattern, CstGrammar } from './types.ts';
1618

1719
const J = (v: unknown) => JSON.stringify(v);
1820
const rangeCond = (v: string, rs: CharRange[]) =>
@@ -312,7 +314,11 @@ ${r.nudSeqs.map((seq) => ` { let save = self.pos; let mut kids: Vec<Cst>
312314
export const rustTarget: Target = {
313315
name: 'rust',
314316
ext: 'rs',
315-
render(ir: ParserIR): string {
317+
emitLexer(grammar: CstGrammar): string {
318+
return lexer(portableIR(grammar));
319+
},
320+
emitParser(grammar: CstGrammar, lexerSrc: string | null): string {
321+
const ir = portableIR(grammar);
316322
const ruleFns = ir.rules.map((r) => (r.kind === 'pratt' ? prattRule(r, ir.tpl) : rdRule(r))).join('\n\n');
317323
const matchTemplate = ir.tpl ? ` fn match_template(&mut self) -> Option<Cst> {
318324
let t = self.peek()?;
@@ -350,7 +356,7 @@ impl Cst {
350356
// offset/end inferred from first/last child (children non-empty).
351357
fn node(rule: &'static str, kids: Vec<Cst>) -> Cst { let o = kids[0].offset; let e = kids[kids.len() - 1].end; Cst::node(rule, kids, o, e) }
352358
353-
${lexer(ir)}
359+
${lexerSrc ?? ''}
354360
355361
struct Parser<'a> { toks: Vec<Tok<'a>>, pos: usize, capped: bool, suppress_next: Vec<&'static str>, src: &'a str }
356362
impl<'a> Parser<'a> {

src/target-ts.ts

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,10 @@
44
// index LEDs), and a CST→JSON printer over stdin. It is the reference rendering — its CST
55
// is checked byte-for-byte against the interpreter (createParser), so a divergence in the
66
// portable logic surfaces here before Go/Rust are compiled.
7-
import type { ParserIR, RdRule, PrattRule, Step, Bracket, CharRange, LexTok, Target, TplCfg } from './emit-portable.ts';
7+
import type { ParserIR, RdRule, PrattRule, Step, Bracket, CharRange, LexTok, TplCfg } from './emit-portable.ts';
8+
import { portableIR } from './emit-portable.ts';
9+
import type { Target } from './emit.ts';
10+
import type { CstGrammar } from './types.ts';
811

912
const J = (v: unknown) => JSON.stringify(v);
1013
const rangeCond = (v: string, rs: CharRange[]) =>
@@ -267,7 +270,11 @@ ${r.nudSeqs.map((seq) => ` { const save = pos; const kids: Cst[] = []; if (${se
267270
export const tsTarget: Target = {
268271
name: 'typescript',
269272
ext: 'ts',
270-
render(ir: ParserIR): string {
273+
emitLexer(grammar: CstGrammar): string {
274+
return lexer(portableIR(grammar));
275+
},
276+
emitParser(grammar: CstGrammar, lexerSrc: string | null): string {
277+
const ir = portableIR(grammar);
271278
const ruleFns = ir.rules.map((r) => (r.kind === 'pratt' ? prattRule(r, ir.tpl) : rdRule(r))).join('\n\n');
272279
const matchTemplate = ir.tpl ? `function matchTemplate(): Cst | null {
273280
const t = peek();
@@ -296,7 +303,7 @@ type Leaf = { tokenType: string; offset: number; end: number };
296303
type Node = { rule: string; children: Cst[]; offset: number; end: number };
297304
type Cst = Node | Leaf;
298305
299-
${lexer(ir)}
306+
${lexerSrc ?? ''}
300307
301308
let toks: Tok[] = [];
302309
let pos = 0;

test/cst-match-totality.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
// node test/cst-match-totality.ts
1414
import { existsSync, readFileSync, readdirSync, statSync, writeFileSync } from 'node:fs';
1515
import { join } from 'node:path';
16-
import { emitParser } from '../src/emit-parser.ts';
16+
import { emitParser, jsTarget } from '../src/emit.ts';
1717
import { generateInputs } from './grammar-gen.ts';
1818

1919
const GRAMMARS = ['typescript', 'javascript', 'typescriptreact', 'javascriptreact', 'yaml', 'html'];
@@ -52,7 +52,7 @@ for (const name of GRAMMARS) {
5252
const grammar = (await import(`../${name}.ts`)).default;
5353
const matchers = (await import(`../${name}.cst-match.ts`)).MATCHERS;
5454
const emPath = `/tmp/emitted-totality-${name}.mts`;
55-
writeFileSync(emPath, emitParser(grammar));
55+
writeFileSync(emPath, emitParser(grammar, jsTarget));
5656
const em = (await import(emPath + '?v=' + process.pid)) as Emitted;
5757
let parsed = 0;
5858
for (const input of generateInputs(grammar, { depth: 5, nestDepth: 5, cap: 7, fuzzRounds: 250, maxInputs: 1500, seed: 5 })) {

test/emit-lexer-verify.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,14 @@
99
// node test/emit-lexer-verify.ts # in-repo corpus (+ /tmp/ts-repo if present)
1010
import { readFileSync, writeFileSync } from 'node:fs';
1111
import { createLexer } from '../src/gen-lexer.ts';
12-
import { emitParser } from '../src/emit-parser.ts';
12+
import { emitParser, jsTarget } from '../src/emit.ts';
1313
import { inRepoCorpus, externalTsFiles } from './emit-corpus.ts';
1414

1515
const grammar = (await import('../typescript.ts')).default;
1616

1717
// The reference: createLexer with the SAME intern config the emitted parser bakes.
1818
const EMITTED = '/tmp/emit-lexer-verify-parser.mts';
19-
writeFileSync(EMITTED, emitParser(grammar));
19+
writeFileSync(EMITTED, emitParser(grammar, jsTarget));
2020
const emitted = await import(EMITTED + '?v=' + Date.now());
2121
const src = readFileSync(EMITTED, 'utf-8');
2222
if (src.includes('createLexer(')) {

test/emit-parser-bench.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,14 @@
99
// node test/emit-parser-bench.ts # the 4 bench files, N=20
1010
// node test/emit-parser-bench.ts <N> # custom timed-run count
1111
import { createParser } from '../src/gen-parser.ts';
12-
import { emitParser } from '../src/emit-parser.ts';
12+
import { emitParser, jsTarget } from '../src/emit.ts';
1313
import { readFileSync, writeFileSync } from 'fs';
1414

1515
const grammar = (await import('../typescript.ts')).default;
1616
const oracle = createParser(grammar);
1717

1818
const EMITTED = '/tmp/emitted-parser.mts';
19-
writeFileSync(EMITTED, emitParser(grammar));
19+
writeFileSync(EMITTED, emitParser(grammar, jsTarget));
2020
const emitted = await import(EMITTED + '?v=' + Date.now());
2121

2222
const N = Number(process.argv[2]) || 20;

0 commit comments

Comments
 (0)