Skip to content

Commit c0d84d0

Browse files
committed
emit-portable: template-literal interpolation in all three targets (stage 4)
The portable lexer's second stateful feature: `${…}` interpolation. A `` ` `` opens a span scanned to the next `${` (emit $templateHead) or closing `` ` `` (the whole token, no substitution); a `}` that closes a hole resumes the span (emit $templateMiddle or $templateTail). A templateStack of brace-depths decides which `}` closes the hole versus a nested `{…}` (object/block) or nested template inside it. The parser's Pratt nud sees a $templateHead and assembles head·expr·(middle·expr)*·tail into a synthetic $template node, parsing each hole with the Pratt expression rule. The lexer state machine generalises cleanly alongside the regex one — a grammar can have regex, templates, or both, and they all share one emit() / LexState (Rust: a struct that now also carries the template_stack). examples/templatejs.ts (minijs + templates + a shorthand object so a hole can hold `{…}`) verifies it: no-substitution, adjacent/multiple holes, expressions in holes, NESTED templates, and an object inside a hole (the brace-depth counter) — all ts/go/rust CSTs byte-identical to createParser (gate: 11/11 accept, 4/4 reject per target). Full suite 42/42. Tagged templates (`` tag`…` `` — a postfix-token Pratt LED) are out of scope here; that is a parser-algebra gap, left as remaining work alongside the markup/indent lexers.
1 parent b10cfdd commit c0d84d0

6 files changed

Lines changed: 346 additions & 64 deletions

File tree

examples/templatejs.ts

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
// minijs + TEMPLATE LITERALS — exercises the portable lexer's second STATEFUL feature
2+
// (stage 4): `${…}` interpolation. The lexer splits `` `a${x}b${y}c` `` into
3+
// $templateHead·$templateMiddle·$templateTail around the holes, tracking a brace-depth
4+
// stack so a nested `{…}` (or a nested template) inside a hole doesn't close it; the
5+
// parser assembles the pieces and interpolated expressions into a `$template` node.
6+
import {
7+
token, rule, defineGrammar, left, right, op, prefix, alt,
8+
seq, oneOf, range, star, sep, opt, many, altPattern, noneOf, notFollowedBy,
9+
} from '../src/api.ts';
10+
11+
const digit = range('0', '9');
12+
const idStart = oneOf(range('a', 'z'), range('A', 'Z'), '_', '$');
13+
const idCont = oneOf(range('a', 'z'), range('A', 'Z'), range('0', '9'), '_', '$');
14+
15+
const Ident = token(seq(idStart, star(idCont)), { identifier: true, scope: 'variable' });
16+
const Number_ = token(seq(digit, star(digit)), { scope: 'constant.numeric' });
17+
const Str = token(seq('"', star(altPattern(noneOf('"', '\\'), seq('\\', noneOf('\n')))), '"'), { scope: 'string.quoted.double' });
18+
const LineComment = token(seq('//', star(noneOf('\n'))), { skip: true, scope: 'comment.line' });
19+
20+
// NoSubstitution template: backtick body excludes a real `${` (a `$` not followed by `{`
21+
// stays literal); the `template` config drives the interpolated split in the lexer.
22+
const Template = token(
23+
seq('`', star(altPattern(noneOf('`', '\\', '$'), seq('\\', noneOf('\n')), seq('$', notFollowedBy('{')))), '`'),
24+
{ scope: 'string.template', template: { open: '`', interpOpen: '${', interpClose: '}' } },
25+
);
26+
27+
const jsPrec = [
28+
right('='),
29+
left('||'), left('&&'),
30+
left('+', '-'),
31+
left('*', '/', '%'),
32+
right(prefix('!', '-', '+')),
33+
];
34+
35+
const Expr = rule(($) => [
36+
Number_, Str, Template, Ident,
37+
['(', $, ')'],
38+
['{', opt(sep(Ident, ',')), '}'], // shorthand object — gives a hole a nested `{ … }`
39+
[prefix, $],
40+
[$, op, $],
41+
[$, '(', opt(sep($, ',')), ')'],
42+
[$, '.', Ident],
43+
]);
44+
45+
const Block = rule(($) => [['{', many(Stmt), '}']]);
46+
const Stmt = rule(($) => [
47+
Block,
48+
[alt('var', 'let', 'const'), Ident, opt('=', Expr), ';'],
49+
['if', '(', Expr, ')', Stmt, opt('else', Stmt)],
50+
['return', opt(Expr), ';'],
51+
[Expr, ';'],
52+
]);
53+
const Program = rule(($) => [many(Stmt)]);
54+
55+
export default defineGrammar({
56+
name: 'templatejs',
57+
scopeName: 'source.templatejs',
58+
tokens: { Ident, Number: Number_, Str, Template, LineComment },
59+
prec: jsPrec,
60+
rules: { Expr, Block, Stmt, Program },
61+
});

src/emit-portable.ts

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,13 +80,28 @@ export type RegexCtx = {
8080
postfixAfterValue: string[]; // ambiguous postfix/prefix ops (e.g. `!`): value only in postfix
8181
};
8282

83+
// Template literals with `${…}` interpolation: a STATEFUL lexer split. A `` ` `` opens a
84+
// span scanned to the next `${` (→ $templateHead) or closing `` ` `` (→ the whole token,
85+
// no substitution); a `}` that closes a hole resumes the span (→ $templateMiddle / Tail).
86+
// A `templateStack` of brace-depths tracks which `}` closes the hole vs. a nested `{…}`.
87+
// The parser assembles head·expr·(middle·expr)*·tail into a synthetic `$template` node.
88+
export type TplCfg = {
89+
token: string; // the token flagged `template`; its NoSubstitution form is a plain leaf
90+
open: string; // `` ` ``
91+
interpOpen: string; // `${`
92+
interpClose: string; // `}`
93+
braceOpen: string; // `{` — a nested one deepens the hole, so its `}` is not the closer
94+
interpRule: string; // the rule that parses each `${…}` hole (the Pratt expression rule)
95+
};
96+
8397
export type ParserIR = {
8498
grammarName: string;
8599
entry: string;
86100
tokens: LexTok[]; // for the char scanner, tried in declaration order
87101
puncts: string[]; // punctuation literals, longest-first (maximal munch)
88102
rules: RuleIR[];
89103
regexCtx: RegexCtx | null; // null unless the grammar has a regex token with context
104+
tpl: TplCfg | null; // null unless the grammar has a template token
90105
};
91106

92107
export interface Target {
@@ -163,7 +178,24 @@ function buildIR(grammar: CstGrammar): ParserIR {
163178
};
164179
}
165180

166-
return { grammarName: grammar.name ?? 'grammar', entry: findEntryRule(grammar), tokens, puncts, rules, regexCtx };
181+
// Template literals (only if the grammar declares a template token). The interpolation
182+
// holes are parsed by the Pratt expression rule — the rule that carries operator leds.
183+
let tpl: TplCfg | null = null;
184+
const tplTok = grammar.tokens.find((t) => t.template);
185+
if (tplTok && tplTok.template) {
186+
const prattName = rules.find((r) => r.kind === 'pratt')?.name;
187+
if (!prattName) throw new Error('portable: a template token needs a Pratt expression rule to parse its interpolations');
188+
tpl = {
189+
token: tplTok.name,
190+
open: tplTok.template.open,
191+
interpOpen: tplTok.template.interpOpen,
192+
interpClose: tplTok.template.interpClose,
193+
braceOpen: tplTok.template.interpOpen.slice(-1),
194+
interpRule: prattName,
195+
};
196+
}
197+
198+
return { grammarName: grammar.name ?? 'grammar', entry: findEntryRule(grammar), tokens, puncts, rules, regexCtx, tpl };
167199
}
168200

169201
// Classify a token: a fast-path shape (run/string/line/block) when one cleanly matches,

src/target-go.ts

Lines changed: 77 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
// stack. A node is an int32 index, never a heap pointer. Backtracking truncates the three
1010
// slices to saved lengths; the slices keep their capacity across parses (reset to len 0), so a
1111
// warmed parser allocates ~nothing per parse.
12-
import type { ParserIR, RdRule, PrattRule, Step, Bracket, CharRange, LexTok, Target } from './emit-portable.ts';
12+
import type { ParserIR, RdRule, PrattRule, Step, Bracket, CharRange, LexTok, Target, TplCfg } from './emit-portable.ts';
1313
import type { TokenPattern } from './types.ts';
1414

1515
const J = (v: unknown) => JSON.stringify(v);
@@ -44,11 +44,12 @@ function compilePat(p: TokenPattern, defs: string[]): string {
4444
return name;
4545
}
4646

47-
function scanTok(t: LexTok, defs: string[], rxTok?: string): string {
47+
function scanTok(t: LexTok, defs: string[], rxTok?: string, tplTok?: string): string {
4848
const name = (t as { name: string }).name;
49-
const stateful = rxTok !== undefined;
49+
const stateful = rxTok !== undefined || tplTok !== undefined;
50+
if (tplTok !== undefined && name === tplTok) return ''; // template token scanned by the state machine
5051
const push = (endE: string) => (t.skip ? '' : stateful ? `emit(${J(name)}, src[pos:${endE}], pos, ${endE}); ` : `toks = append(toks, Tok{${J(name)}, src[pos:${endE}], pos, ${endE}}); `);
51-
const gate = stateful && name === rxTok ? '!prevIsValue() && ' : '';
52+
const gate = rxTok !== undefined && name === rxTok ? '!prevIsValue() && ' : '';
5253
if (t.kind === 'run') return `\t\tif ${gate}${rangeCond('c', t.first)} {
5354
\t\t\te := pos + 1
5455
\t\t\tfor e < n { cc := int(src[e]); if !${rangeCond('cc', t.cont)} { break }; e++ }
@@ -77,12 +78,14 @@ function scanTok(t: LexTok, defs: string[], rxTok?: string): string {
7778
function lexer(ir: ParserIR): string {
7879
const defs: string[] = [];
7980
const rx = ir.regexCtx;
80-
const toks = ir.tokens.map((t) => scanTok(t, defs, rx?.regexToken)).join('\n');
81-
const pushPunct = rx ? (p: string) => `emit("", ${J(p)}, pos, pos + ${p.length})` : (p: string) => `toks = append(toks, Tok{"", ${J(p)}, pos, pos + ${p.length}})`;
81+
const tpl = ir.tpl;
82+
const stateful = !!(rx || tpl);
83+
const toks = ir.tokens.map((t) => scanTok(t, defs, rx?.regexToken, tpl?.token)).join('\n');
84+
const pushPunct = stateful ? (p: string) => `emit("", ${J(p)}, pos, pos + ${p.length})` : (p: string) => `toks = append(toks, Tok{"", ${J(p)}, pos, pos + ${p.length}})`;
8285
const puncts = ir.puncts.map((p) =>
8386
`\t\tif strings.HasPrefix(src[pos:], ${J(p)}) { ${pushPunct(p)}; pos += ${p.length}; continue }`).join('\n');
8487
const goMap = (a: string[]) => `map[string]bool{${a.map((x) => `${J(x)}: true`).join(', ')}}`;
85-
const stateBlock = rx ? `\tprevText, prevKind, bpText := "", "", ""
88+
const rxState = rx ? `\tprevText, prevKind, bpText := "", "", ""
8689
\thasPrev, hasPrev2 := false, false
8790
\tparenHead := []bool{}
8891
\tlastClose, lastBang := false, false
@@ -100,27 +103,56 @@ function lexer(ir: ParserIR): string {
100103
\t\tisParenHead := prevText == ")" && lastClose
101104
\t\treturn !isExprKw && !isParenHead && (_divK[prevKind] || _divT[prevText])
102105
\t}
103-
\temit := func(kind, text string, off, end int) {
104-
\t\tif text == "(" {
106+
` : '';
107+
const tplState = tpl ? `\ttemplateStack := []int{}
108+
\tscanTplSpan := func(p int) (bool, int) {
109+
\t\tfor p < n {
110+
\t\t\tif strings.HasPrefix(src[p:], ${J(tpl.interpOpen)}) { return true, p + ${tpl.interpOpen.length} }
111+
\t\t\tif src[p] == 92 { p += 2; continue }
112+
\t\t\tif strings.HasPrefix(src[p:], ${J(tpl.open)}) { return false, p + ${tpl.open.length} }
113+
\t\t\tp++
114+
\t\t}
115+
\t\treturn false, p
116+
\t}
117+
\t_ = scanTplSpan
118+
` : '';
119+
const emitHooks = [
120+
rx ? `\t\tif text == "(" {
105121
\t\t\tisMember := hasPrev2 && _mem[bpText]
106122
\t\t\tparenHead = append(parenHead, !isMember && prevKind == IDENT && _phK[prevText])
107123
\t\t} else if text == ")" {
108124
\t\t\tif len(parenHead) > 0 { lastClose = parenHead[len(parenHead)-1]; parenHead = parenHead[:len(parenHead)-1] } else { lastClose = false }
109125
\t\t}
110-
\t\tif _pav[text] { lastBang = prevIsValue() }
111-
\t\ttoks = append(toks, Tok{kind, text, off, end})
112-
\t\tbpText = prevText; hasPrev2 = hasPrev; prevKind = kind; prevText = text; hasPrev = true
126+
\t\tif _pav[text] { lastBang = prevIsValue() }` : '',
127+
tpl ? `\t\tif len(templateStack) > 0 { if text == ${J(tpl.braceOpen)} { templateStack[len(templateStack)-1]++ } else if text == ${J(tpl.interpClose)} { templateStack[len(templateStack)-1]-- } }` : '',
128+
].filter(Boolean).join('\n');
129+
const emitTail = rx ? `\n\t\tbpText = prevText; hasPrev2 = hasPrev; prevKind = kind; prevText = text; hasPrev = true` : '';
130+
const emitFn = stateful ? `\temit := func(kind, text string, off, end int) {
131+
${emitHooks}
132+
\t\ttoks = append(toks, Tok{kind, text, off, end})${emitTail}
113133
\t}
114-
\t_ = bpText; _ = hasPrev2; _ = lastBang; _ = prevIsValue
134+
\t_ = emit
135+
` : '';
136+
const tplDispatch = tpl ? `\t\tif len(templateStack) > 0 && strings.HasPrefix(src[pos:], ${J(tpl.interpClose)}) && templateStack[len(templateStack)-1] == 0 {
137+
\t\t\ttemplateStack = templateStack[:len(templateStack)-1]
138+
\t\t\tinterp, e := scanTplSpan(pos + ${tpl.interpClose.length})
139+
\t\t\tif interp { emit("$templateMiddle", src[pos:e], pos, e); templateStack = append(templateStack, 0) } else { emit("$templateTail", src[pos:e], pos, e) }
140+
\t\t\tpos = e; continue
141+
\t\t}
142+
\t\tif strings.HasPrefix(src[pos:], ${J(tpl.open)}) {
143+
\t\t\tinterp, e := scanTplSpan(pos + ${tpl.open.length})
144+
\t\t\tif interp { emit("$templateHead", src[pos:e], pos, e); templateStack = append(templateStack, 0) } else { emit(${J(tpl.token)}, src[pos:e], pos, e) }
145+
\t\t\tpos = e; continue
146+
\t\t}
115147
` : '';
116148
return `${defs.length ? 'var _s string\n' + defs.join('\n') + '\n' : ''}func lex(src string) []Tok {
117149
\ttoks := toks[:0]
118150
\tn := len(src)
119151
\tpos := 0
120-
${stateBlock}${defs.length ? '\t_s = src\n' : ''}\tfor pos < n {
152+
${rxState}${tplState}${emitFn}${defs.length ? '\t_s = src\n' : ''}\tfor pos < n {
121153
\t\tc := int(src[pos])
122154
\t\tif c == 32 || c == 9 || c == 10 || c == 13 { pos++; continue }
123-
${toks}
155+
${tplDispatch}${toks}
124156
${puncts}
125157
\t\tpanic(fmt.Sprintf("lex error at %d", pos))
126158
\t}
@@ -151,7 +183,15 @@ ${r.alts.map(alt).join('\n')}
151183
}`;
152184
}
153185

154-
function prattRule(r: PrattRule): string {
186+
function prattRule(r: PrattRule, tpl: TplCfg | null): string {
187+
const tplNud = tpl && r.nudToks.includes(tpl.token)
188+
? `\tif t.Kind == "$templateHead" {
189+
\t\tnode := matchTemplate()
190+
\t\tif node < 0 { return -1 }
191+
\t\tsb := len(scratch); scratch = append(scratch, node)
192+
\t\treturn finish(${J(r.name)}, sb, nodes[node].Offset)
193+
\t}\n`
194+
: '';
155195
const bin = r.binary.map((b) => `${J(b.op)}: {${b.lbp}, ${b.rbp}}`).join(', ');
156196
const pre = r.prefix.map((p) => `${J(p.op)}: ${p.rbp}`).join(', ');
157197
const atoms = r.nudToks.map((k) => `${J(k)}: true`).join(', ');
@@ -192,7 +232,7 @@ ${r.leds.map(ledArm).join('\n')}
192232
func ${r.name}nud() int32 {
193233
\tt := peek()
194234
\tif t == nil { return -1 }
195-
\tif ${r.name}ATOM[t.Kind] {
235+
${tplNud}\tif ${r.name}ATOM[t.Kind] {
196236
\t\tsb := len(scratch); scratch = append(scratch, mkLeaf(t.Kind, t.Off, t.End)); pos++
197237
\t\treturn finish(${J(r.name)}, sb, t.Off)
198238
\t}
@@ -213,7 +253,25 @@ export const goTarget: Target = {
213253
name: 'go',
214254
ext: 'go',
215255
render(ir: ParserIR): string {
216-
const ruleFns = ir.rules.map((r) => (r.kind === 'pratt' ? prattRule(r) : rdRule(r))).join('\n\n');
256+
const ruleFns = ir.rules.map((r) => (r.kind === 'pratt' ? prattRule(r, ir.tpl) : rdRule(r))).join('\n\n');
257+
const matchTemplate = ir.tpl ? `func matchTemplate() int32 {
258+
\tt := peek()
259+
\tif t == nil || t.Kind != "$templateHead" { return -1 }
260+
\tsb := len(scratch); nb := len(nodes); kb := len(kids); save := pos
261+
\tscratch = append(scratch, mkLeaf("$templateHead", t.Off, t.End)); pos++
262+
\tfor {
263+
\t\texpr := parse${ir.tpl.interpRule}()
264+
\t\tif expr < 0 { pos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb]; return -1 }
265+
\t\tscratch = append(scratch, expr)
266+
\t\tnext := peek()
267+
\t\tif next == nil { pos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb]; return -1 }
268+
\t\tif next.Kind == "$templateMiddle" { scratch = append(scratch, mkLeaf("$templateMiddle", next.Off, next.End)); pos++; continue }
269+
\t\tif next.Kind == "$templateTail" { scratch = append(scratch, mkLeaf("$templateTail", next.Off, next.End)); pos++; break }
270+
\t\tpos = save; scratch = scratch[:sb]; nodes = nodes[:nb]; kids = kids[:kb]; return -1
271+
\t}
272+
\treturn finish("$template", sb, t.Off)
273+
}
274+
` : '';
217275
return `// GENERATED by emit-portable.ts (goTarget) — parser for grammar "${ir.grammarName}".
218276
package main
219277
@@ -296,7 +354,7 @@ func altLit(opts [][2]string) bool {
296354
\treturn false
297355
}
298356
299-
${ruleFns}
357+
${matchTemplate}${ruleFns}
300358
301359
func writeJSON(id int32, b *strings.Builder) {
302360
\tnd := &nodes[id]

0 commit comments

Comments
 (0)