diff --git a/.github/workflows/impl-review.yml b/.github/workflows/impl-review.yml index 7ae7179d7c..ae1c2f7930 100644 --- a/.github/workflows/impl-review.yml +++ b/.github/workflows/impl-review.yml @@ -179,6 +179,105 @@ jobs: fi echo "::notice::Found both theme renders — proceeding to review" + # Canvas dimension gate — runs before the AI review. + # The saved PNG must be exactly 3200×1800 (landscape) or 2400×2400 (square) + # within ±16 px (≤0.5 %). On drift, write /tmp/anyplot-canvas-gate.txt with + # a synthetic weakness (named actual+target+signed delta + likely cause). + # ai-quality-review.md reads that file and treats canvas_gate=fail as a + # MUST-flag weakness that forces VQ-05 to 0, which drags the quality_score + # below the repair threshold — re-routing the PR into impl-repair through + # the existing 5-review/4-repair cascade. Never raw `exit 1` here: + # impl-repair.yml is dispatched by ai-rejected label only, so a hard step + # failure would leave a dead PR with no retry path. + - name: Canvas dimension gate + env: + LIBRARY: ${{ steps.pr.outputs.library }} + ATTEMPT: ${{ steps.attempts.outputs.display }} + run: | + # Clean any stale file from a previous run on this runner. + rm -f /tmp/anyplot-canvas-gate.txt + + pip install --quiet pillow + + python3 <<'PY' + import os + import sys + from pathlib import Path + + from PIL import Image + + LIBRARY = os.environ["LIBRARY"] + ATTEMPT = os.environ["ATTEMPT"] + TOLERANCE = 16 # ≤0.5 % on the 3200-axis + TARGETS = [(3200, 1800), (2400, 2400)] + + path = Path("plot_images/plot-light.png") + with Image.open(path) as img: + w, h = img.size + + # Pick the closer target so the repair feedback names the right one. + tw, th = min(TARGETS, key=lambda t: abs(w - t[0]) + abs(h - t[1])) + dw, dh = w - tw, h - th + ok = abs(dw) <= TOLERANCE and abs(dh) <= TOLERANCE + + status = "pass" if ok else "fail" + print( + f"::notice::canvas_gate library={LIBRARY} status={status} " + f"actual={w}x{h} target={tw}x{th} delta={dw:+d}x{dh:+d} attempt={ATTEMPT}" + ) + + if ok: + sys.exit(0) + + # Build the synthetic weakness — include actual, target, signed delta, + # direction, possible aspect mismatch, and library-specific likely cause. + # This is what impl-repair sees in metadata.review.weaknesses. + direction_parts = [] + if dw < 0: + direction_parts.append(f"width is {-dw} px short") + elif dw > 0: + direction_parts.append(f"width is {dw} px over") + if dh < 0: + direction_parts.append(f"height is {-dh} px short") + elif dh > 0: + direction_parts.append(f"height is {dh} px over") + direction = "; ".join(direction_parts) or "axes on target individually but combined off" + + aspect_note = "" + # Aspect-mismatch hint: actual is landscape-ish but closest target is square (or vice versa) + actual_aspect = w / h if h else 1 + if (tw, th) == (2400, 2400) and actual_aspect > 1.2: + aspect_note = " — the implementation rendered a landscape canvas but the closest target is square; reconsider orientation." + elif (tw, th) == (3200, 1800) and actual_aspect < 0.85: + aspect_note = " — the implementation rendered a portrait/square canvas but the closest target is landscape; reconsider orientation." + + # Library-specific likely cause (concrete repair hints, not generic). + causes = { + "matplotlib": "Most likely cause: `bbox_inches='tight'` on `savefig` — remove it (default `bbox_inches=None`).", + "seaborn": "Most likely cause: `bbox_inches='tight'` on `savefig` — remove it (default `bbox_inches=None`).", + "altair": "Most likely cause: vl-convert padding title/legend outside `width`/`height`. Add `configure_view(continuousWidth=…, continuousHeight=…)` + zero `padding` and normalize the saved PNG with the PIL crop/pad snippet in `prompts/library/altair.md`.", + "plotly": "Most likely cause: `autosize=True` or implicit margin growth. Set `autosize=False` and pin `margin=dict(...)` explicitly.", + "bokeh": "Most likely cause: bokeh toolbar adding ~30-50 px above the figure. Set `toolbar_location=None`.", + "highcharts": "Most likely cause: Chrome chrome eating viewport space. Use `Emulation.setDeviceMetricsOverride` via CDP (see `prompts/library/highcharts.md` Save section) — `--window-size` alone is not authoritative in headless mode.", + "pygal": "Check that `pygal.(width=…, height=…)` and the SVG→PNG conversion don't override dims.", + "plotnine": "Check `ggsave(width=…, height=…, units='in', dpi=400)` and `theme(figure_size=…)`; do not pass `bbox_inches='tight'`.", + "letsplot": "Check `ggsize(W, H)` and `ggsave(..., scale=4)` pair — only the two canonical pairs land on target.", + "ggplot2": "Check `ggsave(width=…, height=…, units='in', dpi=400)` with `ragg::agg_png`.", + } + cause = causes.get(LIBRARY, f"Review `prompts/library/{LIBRARY}.md` 'Canvas — hard rule' section.") + + weakness = ( + f"Canvas dimensions drifted from required target. " + f"Actual: {w}×{h}. Closest valid target: {tw}×{th} (±{TOLERANCE} px tolerance). " + f"Signed delta: {dw:+d} × {dh:+d} — {direction}.{aspect_note} " + f"{cause} " + f"Re-render at exactly {tw}×{th}; the post-render gate enforces this." + ) + + Path("/tmp/anyplot-canvas-gate.txt").write_text(weakness, encoding="utf-8") + print(f"::warning::Canvas gate FAILED — wrote synthetic weakness to /tmp/anyplot-canvas-gate.txt: {weakness}") + PY + - name: React with eyes emoji env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/app/eslint.config.js b/app/eslint.config.js index 824a2265dc..1216bc7320 100644 --- a/app/eslint.config.js +++ b/app/eslint.config.js @@ -38,6 +38,7 @@ export default [ requestAnimationFrame: 'readonly', cancelAnimationFrame: 'readonly', performance: 'readonly', + crypto: 'readonly', // DOM types HTMLElement: 'readonly', HTMLDivElement: 'readonly', diff --git a/app/src/components/SpecTabs.tsx b/app/src/components/SpecTabs.tsx index 7773051821..f01bf0f2f8 100644 --- a/app/src/components/SpecTabs.tsx +++ b/app/src/components/SpecTabs.tsx @@ -221,7 +221,7 @@ export function SpecTabs({ const handleTagClick = useCallback( (paramName: string, value: string) => { onTrackEvent?.('tag_click', { param: paramName, value, source: 'spec_detail' }); - navigate(`/?${paramName}=${encodeURIComponent(value)}`); + navigate(`/plots?${paramName}=${encodeURIComponent(value)}`); }, [navigate, onTrackEvent] ); diff --git a/app/src/pages/StatsPage.test.tsx b/app/src/pages/StatsPage.test.tsx index 871109ebc1..68a315fed2 100644 --- a/app/src/pages/StatsPage.test.tsx +++ b/app/src/pages/StatsPage.test.tsx @@ -1,15 +1,17 @@ import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; -import { render, screen, waitFor } from '../test-utils'; +import { render, screen, userEvent, waitFor } from '../test-utils'; import { StatsPage } from './StatsPage'; vi.mock('react-helmet-async', () => ({ Helmet: ({ children }: { children: React.ReactNode }) => <>{children}, })); +const mockTrackEvent = vi.fn(); + vi.mock('../hooks', () => ({ useAnalytics: () => ({ trackPageview: vi.fn(), - trackEvent: vi.fn(), + trackEvent: mockTrackEvent, }), })); @@ -55,7 +57,9 @@ const mockDashboard = { ], tag_distribution: { plot_type: { scatter: 42, line: 30 }, - data_type: { numeric: 80 }, + // "time series" has a space so the tag-link test below can assert + // encodeURIComponent is actually exercised (href -> data=time%20series). + data_type: { numeric: 80, 'time series': 12 }, }, score_distribution: { '50-60': 5, '60-70': 10, '70-80': 20, '80-90': 30, '90-100': 15 }, timeline: [ @@ -118,6 +122,7 @@ function mockFetchError() { describe('StatsPage', () => { beforeEach(() => { vi.restoreAllMocks(); + mockTrackEvent.mockClear(); }); // vi.restoreAllMocks() doesn't undo vi.stubGlobal — without this hook the @@ -247,4 +252,40 @@ describe('StatsPage', () => { expect(screen.getByText(/visitor data unavailable/)).toBeInTheDocument(); }); }); + + it('routes tag links to /plots and fires tag_click on click', async () => { + // Regression: pre-fix the href pointed at /?plot=scatter, which landed + // on LandingPage and silently dropped the filter intent. Filter routing + // lives on /plots after the page split, so the link must target that. + mockFetchSuccess(); + const user = userEvent.setup(); + + render(); + + await waitFor(() => { + expect(screen.getByText('scatter')).toBeInTheDocument(); + }); + + const scatterLink = screen.getByText('scatter').closest('a'); + expect(scatterLink).toHaveAttribute('href', '/plots?plot=scatter'); + + // Tag with a space exercises encodeURIComponent — proves the encoding + // step isn't a no-op. + const timeSeriesLink = screen.getByText('time series').closest('a'); + expect(timeSeriesLink).toHaveAttribute('href', '/plots?data=time%20series'); + + await user.click(scatterLink!); + expect(mockTrackEvent).toHaveBeenCalledWith('tag_click', { + param: 'plot', + value: 'scatter', + source: 'stats', + }); + + await user.click(timeSeriesLink!); + expect(mockTrackEvent).toHaveBeenCalledWith('tag_click', { + param: 'data', + value: 'time series', + source: 'stats', + }); + }); }); diff --git a/app/src/pages/StatsPage.tsx b/app/src/pages/StatsPage.tsx index 619df4c106..a3b58cbeda 100644 --- a/app/src/pages/StatsPage.tsx +++ b/app/src/pages/StatsPage.tsx @@ -466,7 +466,8 @@ export function StatsPage() { { if (param) trackEvent('tag_click', { param, value: tag, source: 'stats' }); }} sx={{ fontFamily: typography.fontFamily, fontSize: size, fontWeight: weight, textDecoration: 'none', px: 0.75, py: 0.25, borderRadius: 0.5, diff --git a/automation/scripts/canvas_gate_report.py b/automation/scripts/canvas_gate_report.py new file mode 100755 index 0000000000..f861207978 --- /dev/null +++ b/automation/scripts/canvas_gate_report.py @@ -0,0 +1,206 @@ +#!/usr/bin/env python3 +# /// script +# requires-python = ">=3.13" +# dependencies = [] +# /// +"""Aggregate canvas_gate ::notice:: log lines from recent impl-review runs. + +Scans the GitHub Actions logs of recent `Impl: Review` workflow runs, parses the +structured `::notice::canvas_gate library= status= actual=WxH +target=WxH delta=+dx+dy attempt=` lines emitted by the gate step in +`.github/workflows/impl-review.yml`, and prints a per-library first-attempt +pass/fail-rate table. + +The goal is to spot libraries whose **prompt** is leaking (high first-attempt +fail rate → wasted compute on repair) so we know when to tighten the library +prompt rather than accept gate firing as steady-state. + +Usage: + uv run automation/scripts/canvas_gate_report.py + uv run automation/scripts/canvas_gate_report.py --limit 200 + uv run automation/scripts/canvas_gate_report.py --since 14d + +Requires `gh` CLI on PATH and authentication for the current repo. +""" + +from __future__ import annotations + +import argparse +import json +import re +import subprocess +import sys +from collections import defaultdict +from dataclasses import dataclass, field +from datetime import datetime, timedelta, timezone + + +LINE_RE = re.compile( + r"canvas_gate\s+library=(?P\S+)\s+status=(?Ppass|fail)\s+" + r"actual=(?P\d+)x(?P\d+)\s+target=(?P\d+)x(?P\d+)\s+" + r"delta=(?P[+-]?\d+)x(?P[+-]?\d+)\s+attempt=(?P\d+)" +) + + +@dataclass +class LibStats: + runs: int = 0 + first_pass: int = 0 + first_fail: int = 0 + deltas: list[tuple[int, int]] = field(default_factory=list) + + @property + def fail_rate(self) -> float: + if not self.runs: + return 0.0 + return self.first_fail / self.runs + + @property + def avg_delta(self) -> tuple[float, float]: + if not self.deltas: + return (0.0, 0.0) + n = len(self.deltas) + return (sum(d[0] for d in self.deltas) / n, sum(d[1] for d in self.deltas) / n) + + +def parse_since(value: str) -> datetime: + """Parse "14d" / "48h" / ISO date into a UTC cutoff. + + Accepts ISO strings with trailing ``Z`` (normalized to ``+00:00``) and + treats naive ISO timestamps (``2026-05-20`` or ``2026-05-20T00:00:00``) as + UTC, so ``--since 2026-05-20`` is unambiguous regardless of the host + timezone. + """ + now = datetime.now(timezone.utc) + m = re.match(r"^(\d+)([dh])$", value) + if m: + n, unit = int(m.group(1)), m.group(2) + delta = timedelta(days=n) if unit == "d" else timedelta(hours=n) + return now - delta + parsed = datetime.fromisoformat(value.replace("Z", "+00:00")) + if parsed.tzinfo is None: + parsed = parsed.replace(tzinfo=timezone.utc) + return parsed.astimezone(timezone.utc) + + +def list_runs(limit: int, since: datetime) -> list[dict]: + """Return recent `Impl: Review` runs as dicts (id, createdAt).""" + cmd = [ + "gh", + "run", + "list", + "--workflow", + "impl-review.yml", + "--limit", + str(limit), + "--json", + "databaseId,createdAt,conclusion", + ] + raw = subprocess.check_output(cmd, text=True) + runs = json.loads(raw) + out = [] + for r in runs: + created = datetime.fromisoformat(r["createdAt"].replace("Z", "+00:00")) + if created < since: + continue + out.append({"id": r["databaseId"], "createdAt": created, "conclusion": r["conclusion"]}) + return out + + +def fetch_log_lines(run_id: int) -> list[str]: + """Return canvas_gate notice lines from one run's log (may be empty).""" + try: + log = subprocess.check_output(["gh", "run", "view", str(run_id), "--log"], text=True, stderr=subprocess.DEVNULL) + except subprocess.CalledProcessError: + return [] + return [line for line in log.splitlines() if "canvas_gate" in line] + + +def parse_lines(lines: list[str]) -> list[dict]: + """Extract structured records from raw log lines.""" + records = [] + for line in lines: + m = LINE_RE.search(line) + if not m: + continue + records.append( + { + "library": m.group("lib"), + "status": m.group("status"), + "actual": (int(m.group("aw")), int(m.group("ah"))), + "target": (int(m.group("tw")), int(m.group("th"))), + "delta": (int(m.group("dw")), int(m.group("dh"))), + "attempt": int(m.group("attempt")), + } + ) + return records + + +def aggregate(records: list[dict]) -> dict[str, LibStats]: + """Build per-library first-attempt stats. `attempt=1` is the first try.""" + stats: dict[str, LibStats] = defaultdict(LibStats) + for r in records: + if r["attempt"] != 1: + continue # we only care about first-attempt rate + s = stats[r["library"]] + s.runs += 1 + if r["status"] == "pass": + s.first_pass += 1 + else: + s.first_fail += 1 + s.deltas.append(r["delta"]) + return stats + + +def render_table(stats: dict[str, LibStats]) -> str: + """Format the per-library report as an aligned text table.""" + if not stats: + return "(no canvas_gate notice lines found in the scanned runs)" + + rows = [] + for lib in sorted(stats): + s = stats[lib] + adw, adh = s.avg_delta + delta_col = "—" if s.first_fail == 0 else f"{adw:+.0f}×{adh:+.0f}" + flag = " ← prompt still leaking" if s.fail_rate > 0.20 else "" + rows.append((lib, s.runs, s.first_pass, s.first_fail, f"{s.fail_rate * 100:5.1f}%", delta_col, flag)) + + header = ("library", "runs", "first_pass", "first_fail", "fail_rate", "avg_delta", "") + widths = [max(len(str(r[i])) for r in [header, *rows]) for i in range(len(header))] + fmt = " ".join(f"{{:<{w}}}" for w in widths) + lines = [fmt.format(*header), fmt.format(*("-" * w for w in widths))] + for r in rows: + lines.append(fmt.format(*r)) + return "\n".join(lines) + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__.splitlines()[0]) + parser.add_argument("--limit", type=int, default=100, help="Max workflow runs to scan (default 100).") + parser.add_argument("--since", default="14d", help="Cutoff window: '14d', '48h', or ISO date.") + args = parser.parse_args() + + cutoff = parse_since(args.since) + runs = list_runs(args.limit, cutoff) + if not runs: + print(f"(no impl-review runs found since {cutoff.isoformat()})") + return 0 + + all_records: list[dict] = [] + for run in runs: + lines = fetch_log_lines(run["id"]) + all_records.extend(parse_lines(lines)) + + stats = aggregate(all_records) + print( + f"# canvas_gate report — {len(runs)} run(s) scanned, {len(all_records)} canvas_gate line(s) since {cutoff.date()}\n" + ) + print(render_table(stats)) + print() + if any(s.fail_rate > 0.20 for s in stats.values()): + print("Any library > 20 % first-attempt fail rate over 14 days warrants tightening that library's prompt.") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/docs/reference/plausible.md b/docs/reference/plausible.md index b67d740ce4..934b7866f0 100644 --- a/docs/reference/plausible.md +++ b/docs/reference/plausible.md @@ -129,7 +129,7 @@ https://anyplot.ai/{spec_id}/{language}/{library}/{category}/{value}/... | `open_interactive` | `spec`, `library` | SpecOverview.tsx, SpecDetailView.tsx | User opens interactive HTML view | | `suggest_spec` | - | SpecsListPage.tsx | User clicks the `spec.suggest()` link on the specs list page. The mirror link on the landing page emits `nav_click` with `source: suggest_spec_link` instead. | | `report_issue` | `spec`, `library`? | SpecPage.tsx | User clicks "report issue" link | -| `tag_click` | `param`, `value`, `source` | SpecTabs.tsx | User clicks a tag chip to filter | +| `tag_click` | `param`, `value`, `source` | SpecTabs.tsx, StatsPage.tsx | User clicks a tag chip to filter (`source` ∈ `spec_detail`, `stats`) | | `theme_toggle` | `to` | MastheadRule.tsx | User cycles tri-state theme mode (`to` ∈ `system`, `light`, `dark`). The cycle order is `system → light → dark → system`. | | `potd_dismiss` | `spec`, `library` | PlotOfTheDay.tsx | User dismisses the plot-of-the-day banner | | `view_mode_change` | `mode`, `library` | SpecDetailView.tsx | User toggles preview ↔ interactive view inside a spec detail. `mode` ∈ `preview`, `interactive`. Fires on every toggle in either direction (cf. `open_interactive`, which only fires when the interactive HTML is opened in a new tab). | @@ -445,7 +445,7 @@ User lands on anyplot.ai | `filter_remove` | `category`, `value` | useFilterState.ts | | `grid_resize` | `size` | ToolbarActions.tsx | | `tab_toggle` | `action`, `tab`, `library` | SpecTabs.tsx | -| `tag_click` | `param`, `value`, `source` | SpecTabs.tsx | +| `tag_click` | `param`, `value`, `source` | SpecTabs.tsx, StatsPage.tsx | | `plot_rotate` | `spec` | SpecsListPage.tsx | | `open_interactive` | `spec`, `library` | SpecOverview.tsx, SpecDetailView.tsx | | `suggest_spec` | - | SpecsListPage.tsx (LandingPage mirror attributed via `nav_click` with `source: suggest_spec_link`) | diff --git a/prompts/library/altair.md b/prompts/library/altair.md index f19fd35fb3..ce02ae1a79 100644 --- a/prompts/library/altair.md +++ b/prompts/library/altair.md @@ -6,7 +6,43 @@ import altair as alt ``` -## Create Chart +## Canvas — hard rule, no deviation + +The saved PNG must be **exactly** one of these two sizes (post-render gate in `impl-review.yml` rejects anything off by more than 16 px and re-triggers repair): + +| Orientation | View dims | scale_factor | Final PNG | +|-------------|----------------|--------------|---------------| +| Landscape | width=800, height=450 | 4.0 | 3200 × 1800 | +| Square | width=600, height=600 | 4.0 | 2400 × 2400 | + +Altair's **view dimensions are NOT the saved PNG dimensions.** `vl-convert` pads the view with title, axis-title, axis-tick-label, and legend extents *outside* `width`/`height`, so a chart with `width=800, height=450` and a long title easily saves at 3404×2120 or 4036×2052. This is the documented cause of every altair drift in the May 2026 fan-out. + +**Two-part fix, both required:** + +1. **Constrain the view** with `configure_view(continuousWidth=…, continuousHeight=…)` and **zero out chart padding** with `.properties(padding={"left":0, "right":0, "top":0, "bottom":0})`. This stops most of the inflation but does not guarantee exact dims when titles or legends are present. + +2. **Normalize the saved PNG to exact target dims as the final step** of the implementation. Even with (1), vl-convert can over- or under-shoot by tens of pixels. After `chart.save(...)`, crop (centered) or pad (with `PAGE_BG`) so the saved file lands on the canonical target. Keep this as **inline code** — no helper function (CQ-01 KISS forbids functions/classes in plot impls): + +```python +# Right after chart.save(f'plot-{THEME}.png', scale_factor=4.0): +from PIL import Image + +TW, TH = 3200, 1800 # or (2400, 2400) for square +_img = Image.open(f'plot-{THEME}.png').convert('RGB') +_w, _h = _img.size +if _w > TW or _h > TH: # crop excess centred + _l = max((_w - TW) // 2, 0) + _t = max((_h - TH) // 2, 0) + _img = _img.crop((_l, _t, _l + min(_w, TW), _t + min(_h, TH))) + _w, _h = _img.size +if _w < TW or _h < TH: # pad shortfall with PAGE_BG + _canvas = Image.new('RGB', (TW, TH), PAGE_BG) + _canvas.paste(_img, ((TW - _w) // 2, (TH - _h) // 2)) + _img = _canvas +_img.save(f'plot-{THEME}.png') +``` + +The HTML save is left untouched — only PNGs are gated. ```python chart = alt.Chart(df).mark_point(size=60).encode( # size ~2-3x default, density-aware @@ -15,7 +51,11 @@ chart = alt.Chart(df).mark_point(size=60).encode( # size ~2-3x default, density ).properties( width=800, height=450, + padding={"left": 0, "right": 0, "top": 0, "bottom": 0}, title=alt.Title(title, fontSize=16) # kept compact for the long mandated title +).configure_view( + continuousWidth=800, + continuousHeight=450, ).configure_axis( labelFontSize=10, titleFontSize=12 @@ -57,9 +97,10 @@ x='date:T' ## Save (PNG) ```python -# Target: 3200 × 1800 px (see default-style-guide.md) -# 800 × scale_factor=4 = 3200, 450 × 4 = 1800 +# Hard target: 3200 × 1800 (landscape) or 2400 × 2400 (square). See "Canvas" above. chart.save(f'plot-{THEME}.png', scale_factor=4.0) +# Follow with the inline crop/pad-to-target block from the "Canvas" section above +# (no helper function — KISS). ``` **Note**: Requires `vl-convert-python` for PNG export. @@ -125,6 +166,8 @@ chart = ( ) chart.save(f'plot-{THEME}.png', scale_factor=4.0) +# Apply the inline crop/pad-to-(3200,1800)-or-(2400,2400) block from the +# "Canvas" section above — MANDATORY, no helper function (KISS / CQ-01). chart.save(f'plot-{THEME}.html') ``` diff --git a/prompts/library/bokeh.md b/prompts/library/bokeh.md index 5321966727..3425e77d42 100644 --- a/prompts/library/bokeh.md +++ b/prompts/library/bokeh.md @@ -8,10 +8,18 @@ from bokeh.models import ColumnDataSource from bokeh.io import output_file, save ``` +## Canvas — hard rule, no deviation + +The saved PNG must be **exactly** one of these two sizes (post-render gate in `impl-review.yml` rejects anything off by more than 16 px and re-triggers repair): + +- **Landscape**: `width=3200, height=1800` +- **Square**: `width=2400, height=2400` + +The Selenium `--window-size` below **must match** these dims. **Do not omit `toolbar_location=None`** — bokeh's default toolbar adds ~30–50 px above the plot, which is what shrank every May 2026 bokeh PNG from 3200×1800 to 3200×1661. The HTML artifact still gets a working toolbar from `output_file(...)`. + ## Create Figure ```python -# Target: 3200 × 1800 px (see default-style-guide.md). # `width` / `height` are the TOTAL canvas; axis labels at the new 36–42pt # native-pixel sizes need explicit `min_border_*` reservations or they get # clipped at the edges of the rendered PNG. Reserve ~150px per side for diff --git a/prompts/library/ggplot2.md b/prompts/library/ggplot2.md index c1c52df561..12c71f2246 100644 --- a/prompts/library/ggplot2.md +++ b/prompts/library/ggplot2.md @@ -65,6 +65,17 @@ p <- ggplot(df, aes(x = col_x, y = col_y)) + theme_minimal(base_size = 8) ``` +## Canvas — hard rule, no deviation + +The saved PNG must be **exactly** one of these two sizes (post-render gate in `impl-review.yml` rejects anything off by more than 16 px and re-triggers repair): + +| Orientation | `ggsave` `width × height` (`units="in"`, `dpi=400`) | Final PNG | +|-------------|------------------------------------------------------|---------------| +| Landscape | `width = 8, height = 4.5` | 3200 × 1800 | +| Square | `width = 6, height = 6` | 2400 × 2400 | + +Pick the orientation that suits the spec. `ragg::agg_png` honours these values without trimming, so no extra tricks are required — just don't deviate. + ## Figure Size & Sizing for 3200×1800 px (starting values — review-loop tunes) ggplot2 inherits sizes via `theme(... base_size = ...)`. Override individual diff --git a/prompts/library/highcharts.md b/prompts/library/highcharts.md index aa99474132..0b9d8ae4ff 100644 --- a/prompts/library/highcharts.md +++ b/prompts/library/highcharts.md @@ -103,19 +103,37 @@ with tempfile.NamedTemporaryFile(mode="w", suffix=".html", delete=False, encodin temp_path = f.name chrome_options = Options() -chrome_options.add_argument("--headless") +chrome_options.add_argument("--headless=new") # MUST be the new headless mode chrome_options.add_argument("--no-sandbox") chrome_options.add_argument("--disable-dev-shm-usage") chrome_options.add_argument("--disable-gpu") -chrome_options.add_argument("--window-size=3200,1800") +chrome_options.add_argument("--hide-scrollbars") # otherwise Chrome reserves ~16 px on the right +chrome_options.add_argument("--window-size=3200,1800") # NOTE: not authoritative — see CDP override below driver = webdriver.Chrome(options=chrome_options) +# Force the inner viewport to exactly W×H. `--window-size` alone gets eaten by +# Chrome chrome (toolbar/scrollbar leftovers in headless mode), which is what +# left every May 2026 highcharts screenshot at 3200×1661 instead of 3200×1800. +# `setDeviceMetricsOverride` makes the viewport authoritative. +driver.execute_cdp_cmd( + "Emulation.setDeviceMetricsOverride", + {"width": 3200, "height": 1800, "deviceScaleFactor": 1, "mobile": False}, +) driver.get(f"file://{temp_path}") time.sleep(5) # Wait for chart to render driver.save_screenshot(f"plot-{THEME}.png") driver.quit() Path(temp_path).unlink() # Clean up temp file + +# Belt-and-braces: even with the CDP override, an occasional ±1–2 px rounding +# can occur. Pin the saved PNG to exact dims so the post-render gate is happy. +from PIL import Image +_img = Image.open(f"plot-{THEME}.png").convert("RGB") +if _img.size != (3200, 1800): # or (2400, 2400) for square charts + _norm = Image.new("RGB", (3200, 1800), PAGE_BG) + _norm.paste(_img, ((3200 - _img.size[0]) // 2, (1800 - _img.size[1]) // 2)) + _norm.save(f"plot-{THEME}.png") ``` ## Sizing for 3200×1800 px (starting values — review-loop tunes) @@ -132,10 +150,21 @@ chart.options.plot_options = { See `prompts/default-style-guide.md` "Proportional Sizing" for review criteria. -**IMPORTANT:** Three places encode the canvas size — keep them in sync: -1. Selenium `--window-size=3200,1800` -2. HTML `
` -3. `chart.options.chart = {'width': 3200, 'height': 1800, ...}` (see below) +## Canvas — hard rule, no deviation + +The saved PNG must be **exactly** one of these two sizes (post-render gate in `impl-review.yml` rejects anything off by more than 16 px and re-triggers repair): + +- **Landscape**: 3200 × 1800 +- **Square**: 2400 × 2400 + +**Four places encode the canvas size — keep all of them in sync** (in the May 2026 fan-out only Selenium and one HTML attribute were aligned, which is why every highcharts PNG saved at 3200×1661): + +1. Selenium `--window-size=3200,1800` (or 2400,2400) +2. CDP `Emulation.setDeviceMetricsOverride` — **this is the authoritative one**; `--window-size` alone is not (Chrome chrome eats ~139 px in headless mode) +3. HTML `
` +4. `chart.options.chart = {'width': 3200, 'height': 1800, ...}` (see below) + +If you can't get the screenshot to land on exact dims, the PIL pad-or-crop snippet in the export code above is the final safety net. Do not remove it. ## Output Files diff --git a/prompts/library/letsplot.md b/prompts/library/letsplot.md index aeeffdf857..803aa991d6 100644 --- a/prompts/library/letsplot.md +++ b/prompts/library/letsplot.md @@ -19,10 +19,21 @@ plot = ( ) ``` +## Canvas — hard rule, no deviation + +The saved PNG must be **exactly** one of these two sizes (post-render gate in `impl-review.yml` rejects anything off by more than 16 px and re-triggers repair): + +| Orientation | `ggsize` | `ggsave` `scale` | Final PNG | +|-------------|----------|------------------|---------------| +| Landscape | `(800, 450)` | `scale=4` | 3200 × 1800 | +| Square | `(600, 600)` | `scale=4` | 2400 × 2400 | + +Don't deviate from these pairs (e.g. `ggsize(900, 500)` lands at 3600×2000, well outside the ±16 px gate). Pick the orientation that suits the spec. + ## Figure Size & Sizing for 3200×1800 px (starting values — review-loop tunes) ```python -# Base size (scaled 4x on export = 3200 × 1800 px) +# Base size (scaled 4x on export = 3200 × 1800 px) — see "Canvas" above plot = plot + ggsize(800, 450) # Text and element sizes diff --git a/prompts/library/matplotlib.md b/prompts/library/matplotlib.md index 3ae143baa8..7c13f49539 100644 --- a/prompts/library/matplotlib.md +++ b/prompts/library/matplotlib.md @@ -18,13 +18,22 @@ import matplotlib.pyplot as plt from matplotlib.figure import Figure ``` -## Create Figure +## Canvas — hard rule, no deviation + +The saved PNG must be **exactly** one of these two sizes (post-render gate in `impl-review.yml` rejects anything off by more than 16 px and re-triggers repair): ```python -# Target: 3200 × 1800 px (8 × 400dpi). See default-style-guide.md "Visual Sizing Defaults". -fig, ax = plt.subplots(figsize=(8, 4.5), dpi=400) +# Landscape — default for most plots (16:9) +fig, ax = plt.subplots(figsize=(8, 4.5), dpi=400) # → 3200 × 1800 px + +# Square — only for symmetric plots (pie, radar, heatmap, maze, …) +fig, ax = plt.subplots(figsize=(6, 6), dpi=400) # → 2400 × 2400 px ``` +Pick one based on the spec; do not invent a third aspect. Multi-panel layouts must keep the figure dims constant (use `gridspec` / `subplot_mosaic` to subdivide internally). + +**Do not use `bbox_inches="tight"` on `savefig`.** It silently trims the canvas (typically ~30–50 px on each axis → e.g. 3200×1800 becomes 3160×1760) and was the documented cause of every matplotlib drift in the May 2026 fan-out. Use `bbox_inches=None` (the explicit default) instead, and let `figsize × dpi` produce the exact target. If you need to control padding, use `fig.subplots_adjust(left=…, right=…, top=…, bottom=…)` *inside* the figure. + ## Plot Methods Use **Axes methods** (not pyplot): @@ -42,7 +51,7 @@ plt.scatter(x, y) ## Save ```python -plt.savefig(f'plot-{THEME}.png', dpi=400, bbox_inches='tight') +plt.savefig(f'plot-{THEME}.png', dpi=400) # bbox_inches MUST stay default (None) — see "Canvas" above ``` ## Sizing for 3200×1800 px (starting values — adjust per plot, review-loop tunes) @@ -147,7 +156,7 @@ if leg: ax.annotate(..., color=INK, bbox=dict(facecolor=ELEVATED_BG, edgecolor=INK_SOFT, alpha=0.9)) -plt.savefig(f'plot-{THEME}.png', dpi=400, bbox_inches='tight', facecolor=PAGE_BG) +plt.savefig(f'plot-{THEME}.png', dpi=400, facecolor=PAGE_BG) # do NOT add bbox_inches='tight' ``` ## Output Files diff --git a/prompts/library/plotly.md b/prompts/library/plotly.md index f5d1a73f5d..47ea1d1417 100644 --- a/prompts/library/plotly.md +++ b/prompts/library/plotly.md @@ -19,10 +19,32 @@ fig.add_trace(go.Scatter(x=x, y=y)) fig = px.scatter(df, x='col_x', y='col_y') ``` +## Canvas — hard rule, no deviation + +The saved PNG must be **exactly** one of these two sizes (post-render gate in `impl-review.yml` rejects anything off by more than 16 px and re-triggers repair): + +| Orientation | write_image kwargs | Final PNG | +|-------------|--------------------------------------------|---------------| +| Landscape | `width=800, height=450, scale=4` | 3200 × 1800 | +| Square | `width=600, height=600, scale=4` | 2400 × 2400 | + +`fig.update_layout(autosize=True)` is **forbidden** — it overrides `width`/`height` and produces variable output. Always pass `autosize=False` together with explicit `width`, `height`. Pin the layout margins explicitly so titles/legends don't push the inner plot off-center and the long mandated title never gets clipped: + +```python +fig.update_layout( + autosize=False, + margin=dict(l=80, r=40, t=80, b=60), # tweak ±20 if needed; never remove +) +``` + +Pick landscape or square based on the spec's content — same decision rule as every other library in the catalog. + ## Layout & Sizing for 3200×1800 px (starting values — review-loop tunes) ```python fig.update_layout( + autosize=False, + margin=dict(l=80, r=40, t=80, b=60), # Title kept compact — the long mandated "{spec-id} · python · plotly · anyplot.ai" # title would overflow at 22+px on this canvas. title=dict(text=title, font=dict(size=16)), @@ -41,8 +63,9 @@ See `prompts/default-style-guide.md` "Proportional Sizing" for review criteria. ## Save (PNG) ```python -# Target: 3200 × 1800 px (800 × scale=4). See default-style-guide.md. -fig.write_image(f'plot-{THEME}.png', width=800, height=450, scale=4) +# Hard target: 3200 × 1800 (landscape) or 2400 × 2400 (square). See "Canvas" above. +fig.write_image(f'plot-{THEME}.png', width=800, height=450, scale=4) # landscape +# fig.write_image(f'plot-{THEME}.png', width=600, height=600, scale=4) # square ``` **Note**: Requires `kaleido` for PNG export. diff --git a/prompts/library/plotnine.md b/prompts/library/plotnine.md index b6557d39a6..a7d8a5c5d6 100644 --- a/prompts/library/plotnine.md +++ b/prompts/library/plotnine.md @@ -45,6 +45,17 @@ plot = ( ) ``` +## Canvas — hard rule, no deviation + +The saved PNG must be **exactly** one of these two sizes (post-render gate in `impl-review.yml` rejects anything off by more than 16 px and re-triggers repair): + +| Orientation | `figure_size` | `ggsave` `width × height` (`units='in'`, `dpi=400`) | Final PNG | +|-------------|---------------|------------------------------------------------------|---------------| +| Landscape | `(8, 4.5)` | `width=8, height=4.5` | 3200 × 1800 | +| Square | `(6, 6)` | `width=6, height=6` | 2400 × 2400 | + +Pick the orientation that suits the spec — same decision the other libraries make. plotnine's matplotlib backend respects `figure_size`/`width`/`height` directly, so no extra tricks are required as long as you don't pass `bbox_inches='tight'` to `ggsave` (`ggsave` doesn't expose it, but if you fall back to `plt.savefig`, do **not** add it — same trim risk as matplotlib/seaborn). + ## Figure Size & Sizing for 3200×1800 px (starting values — review-loop tunes) ```python diff --git a/prompts/library/pygal.md b/prompts/library/pygal.md index 669a0be752..2ba59b0fe3 100644 --- a/prompts/library/pygal.md +++ b/prompts/library/pygal.md @@ -7,10 +7,16 @@ import pygal from pygal.style import Style ``` +## Canvas — hard rule, no deviation + +The saved PNG must be **exactly** one of these two sizes (post-render gate in `impl-review.yml` rejects anything off by more than 16 px and re-triggers repair): + +- **Landscape**: `width=3200, height=1800` +- **Square**: `width=2400, height=2400` + ## Create Chart ```python -# Target: 3200 × 1800 px (see default-style-guide.md) chart = pygal.Bar( width=3200, height=1800, diff --git a/prompts/library/seaborn.md b/prompts/library/seaborn.md index 3e221ceb47..8925c26a2a 100644 --- a/prompts/library/seaborn.md +++ b/prompts/library/seaborn.md @@ -19,11 +19,30 @@ import matplotlib.pyplot as plt from matplotlib.figure import Figure ``` -## Create Figure +## Canvas — hard rule, no deviation + +The saved PNG must be **exactly** one of these two sizes (post-render gate in `impl-review.yml` rejects anything off by more than 16 px and re-triggers repair): + +```python +# Landscape — default (16:9) +fig, ax = plt.subplots(figsize=(8, 4.5), dpi=400) # → 3200 × 1800 px + +# Square — only for symmetric plots (heatmap, jointplot, pairplot, etc.) +fig, ax = plt.subplots(figsize=(6, 6), dpi=400) # → 2400 × 2400 px +``` + +**Do not use `bbox_inches="tight"` on `savefig`.** It silently trims the canvas (~30–50 px per axis → 3200×1800 becomes ~3160×1760) and was the documented cause of every seaborn drift in the May 2026 fan-out. Use `bbox_inches=None` (the explicit default) and let `figsize × dpi` produce the exact target. If you need to control padding, use `fig.subplots_adjust(...)` inside the figure or `sns.despine(...)` for spine layout. + +For figure-level seaborn plots (`relplot`, `catplot`, `displot`), pass the **same** canvas size via `height=` / `aspect=`: ```python -# Target: 3200 × 1800 px (8 × 400dpi). See default-style-guide.md "Visual Sizing Defaults". -fig, ax = plt.subplots(figsize=(8, 4.5), dpi=400) +# Landscape: aspect=16/9 ≈ 1.778, height controls the figure's height in inches; pick height=4.5 → width=8 +g = sns.relplot(..., height=4.5, aspect=16/9) +g.figure.set_dpi(400) + +# Square: aspect=1, height=6 → 2400×2400 +g = sns.relplot(..., height=6, aspect=1) +g.figure.set_dpi(400) ``` ## Plot Methods @@ -40,7 +59,7 @@ fig = g.figure ## Save ```python -plt.savefig(f'plot-{THEME}.png', dpi=400, bbox_inches='tight') +plt.savefig(f'plot-{THEME}.png', dpi=400) # bbox_inches MUST stay default (None) — see "Canvas" above ``` ## Sizing for 3200×1800 px (starting values — adjust per plot, review-loop tunes) @@ -149,7 +168,7 @@ sns.set_theme( ) # After plotting -plt.savefig(f'plot-{THEME}.png', dpi=400, bbox_inches='tight', facecolor=PAGE_BG) +plt.savefig(f'plot-{THEME}.png', dpi=400, facecolor=PAGE_BG) # do NOT add bbox_inches='tight' ``` ## Output Files diff --git a/prompts/workflow-prompts/ai-quality-review.md b/prompts/workflow-prompts/ai-quality-review.md index 46be522341..80cdab1381 100644 --- a/prompts/workflow-prompts/ai-quality-review.md +++ b/prompts/workflow-prompts/ai-quality-review.md @@ -65,6 +65,24 @@ For `plot-dark.png` (background should be `#1A1A17`): A plot that's perfect in one theme but unreadable in the other still **fails** — both renders must pass. Be strict: a plot that ships to the website broken on dark mode is worse than one that fails review and gets repaired. +### 5c2. MANDATORY: Canvas dimension gate (if present) + +The workflow's pre-check step (`impl-review.yml` → "Canvas dimension gate") measures the saved `plot-light.png` against the two canonical canvas sizes (3200×1800 landscape, 2400×2400 square, ±16 px tolerance). When the gate fails, it writes the synthetic weakness to `/tmp/anyplot-canvas-gate.txt`. + +**Before scoring, check whether `/tmp/anyplot-canvas-gate.txt` exists:** + +```bash +ls -la /tmp/anyplot-canvas-gate.txt 2>/dev/null && cat /tmp/anyplot-canvas-gate.txt +``` + +If it **does** exist: +- The file contains a single paragraph: `Canvas dimensions drifted from required target. Actual: WxH. Closest valid target: TWxTH (±16 px tolerance). Signed delta: ±dx × ±dy — direction. Most likely cause: …` +- **Copy that paragraph verbatim into your `weaknesses` array as the FIRST item.** Do not paraphrase; the repair model needs the literal "actual=WxH" and signed delta numbers to know which knob to turn and which direction. +- **Set VQ-05 (Layout & Canvas) to 0/4 regardless of other observations.** Canvas drift is a hard rule; the quality_score must drop low enough to route the PR into impl-repair through the existing 5-review/4-repair cascade. +- Keep scoring the other categories honestly — useful signal for repair is good signal — but do **not** lift VQ-05 just because the visual proportions look fine inside the wrong-sized canvas. + +If it does **not** exist: the gate passed; no canvas-specific action — score VQ-05 normally based on the visual proportional checks in 5d below. + ### 5d. MANDATORY: Proportional Sizing Check (both renders) Visually estimate from each PNG — no pixel measurement needed. These are soft proportional checks **without hard thresholds**. Violations cost points in the existing VQ-01 (Text Legibility), VQ-02 (No Overlap), and VQ-05 (Layout & Canvas) categories rather than triggering a separate pass/fail item. A single visual problem can reduce points in multiple categories simultaneously (holistic, not strict). diff --git a/prompts/workflow-prompts/impl-generate-claude.md b/prompts/workflow-prompts/impl-generate-claude.md index 9754460326..0588a4c883 100644 --- a/prompts/workflow-prompts/impl-generate-claude.md +++ b/prompts/workflow-prompts/impl-generate-claude.md @@ -21,6 +21,21 @@ The `{EXT}` value depends on `{LANGUAGE}`: - SPEC_ID: {SPEC_ID} - Regeneration: {IS_REGENERATION} +## Step 0: Canvas dimensions (HARD CONTRACT — applies before anything else) + +The saved `plot-light.png` / `plot-dark.png` **must** end up at exactly one of these two pixel sizes: + +| Orientation | Pixels | Use case | +|-------------|-------------|----------| +| Landscape | 3200 × 1800 | Default — most plots (16:9) | +| Square | 2400 × 2400 | Symmetric plots: pie, radar, heatmaps, maze, crossword, anything with no preferred horizontal axis (1:1) | + +Pick the orientation that suits the spec's content; do **not** average between them, do **not** invent a third aspect ratio. The exact knobs to set for your library are in `prompts/library/{LIBRARY}.md` ("Canvas — hard rule, no deviation"). Use those numbers verbatim. + +**If regenerating: the previous implementation's `figsize` / `dpi` / `width` / `height` / `scale_factor` values are HISTORICAL. Do NOT carry them forward, even if every other line in the previous code is fine.** Your very first edit to the previous file is to overwrite those values to the canonical pair from the library prompt. Everything else (fonts, colours, layout, data scenario) comes after that one edit is in place. + +A post-render gate in `impl-review.yml` measures the saved PNG dimensions and rejects anything off-target by more than 16 px on either axis, then routes the PR into `impl-repair.yml`. Repair attempts are capped — drift past 4 attempts deletes the implementation from `main`. The cheapest path is to land on target on the first render. Step 3 below includes a PIL self-check you must run before committing. + ## Step 1: Read the rules (quickly) Read these files to understand the requirements: @@ -41,7 +56,8 @@ When regenerating an existing implementation, you MUST read these BEFORE writing - Preserve the bits listed under "Strengths" unchanged. - Address every bullet under "Weaknesses" and each ❌ item in the criteria checklist. -- **Base style ALWAYS wins.** If anything in `prompts/default-style-guide.md` or `prompts/library/{LIBRARY}.md` differs from the previous implementation, update the previous code to match. This explicitly includes — but is not limited to — **canvas size** (`figsize`/`dpi` for matplotlib/seaborn/plotnine/ggplot2, `width`/`height`/`scale_factor` for plotly/altair/letsplot, `width`/`height` for bokeh/highcharts/pygal), **font sizes** (title, axis labels, tick labels, legend), **marker and line sizes**, **palette** (Okabe-Ito positions), **theme tokens** (background, INK, INK_SOFT, ELEVATED_BG, GRID), and **chrome** (spines, gridlines, legend frame). The previous review may not have flagged the old values because they were valid at the time — that does NOT make them current. Always re-read the library prompt's "Sizing" section and the style guide's "Visual Sizing Defaults" table on every regen and align. +- **Canvas size: the Step 0 contract is non-negotiable on regen.** The previous file's `figsize` / `dpi` / `width` / `height` / `scale_factor` values are **historical**, never current — overwrite them to the canonical pair from `prompts/library/{LIBRARY}.md` as your *first* edit, before touching anything else. The post-render gate checks this and re-triggers repair on drift; do not let that fire. +- **Base style wins on everything else.** If anything in `prompts/default-style-guide.md` or `prompts/library/{LIBRARY}.md` differs from the previous implementation, update the previous code to match. This includes **font sizes** (title, axis labels, tick labels, legend), **marker and line sizes**, **palette** (Okabe-Ito positions), **theme tokens** (background, INK, INK_SOFT, ELEVATED_BG, GRID), and **chrome** (spines, gridlines, legend frame). The previous review may not have flagged the old values because they were valid at the time — that does NOT make them current. Always re-read the library prompt's "Sizing" section and the style guide's "Visual Sizing Defaults" table on every regen and align. - Do NOT discard working structure / data generation / layout choices that the previous review did not flag. - Your deliverable is a refined version of the previous file, not a fresh rewrite from the spec. @@ -142,6 +158,26 @@ ANYPLOT_THEME=dark Rscript {LIBRARY}.R Both runs must succeed and produce `plot-light.png` / `plot-dark.png` (plus `plot-light.html` / `plot-dark.html` for interactive libs). If either fails, fix and try again (max 3 attempts). +### Step 3b: Canvas dimension self-check (Step 0 contract verification) + +After both renders succeed, run this check against the light PNG. It catches Step 0 violations *before* the PR goes to review: + +```bash +source .venv/bin/activate +python -c " +from PIL import Image +import sys +p = 'plots/{SPEC_ID}/implementations/{LANGUAGE}/plot-light.png' +w, h = Image.open(p).size +targets = [(3200, 1800), (2400, 2400)] +ok = any(abs(w-tw) <= 16 and abs(h-th) <= 16 for tw, th in targets) +print(f'canvas {w}x{h} ' + ('OK' if ok else 'DRIFTED — adjust library-specific canvas knobs and re-render')) +sys.exit(0 if ok else 1) +" +``` + +If this exits non-zero, you have **not** satisfied the Step 0 contract. Re-read `prompts/library/{LIBRARY}.md` "Canvas — hard rule" and apply the exact code there; common causes are listed in that file (e.g. matplotlib/seaborn `bbox_inches="tight"` shaves ~40 px; bokeh's default toolbar leaves ~140 px; altair's vl-convert pads outside `width`/`height`). Re-render and re-check until OK. Do not skip this and rely on the post-render gate to catch it — repair cycles cost compute. + ## Step 4: Visual self-check (BOTH renders) Look at `plot-light.png` AND `plot-dark.png`: diff --git a/tests/unit/automation/scripts/test_canvas_gate_report.py b/tests/unit/automation/scripts/test_canvas_gate_report.py new file mode 100644 index 0000000000..c78d1412f0 --- /dev/null +++ b/tests/unit/automation/scripts/test_canvas_gate_report.py @@ -0,0 +1,150 @@ +"""Tests for automation.scripts.canvas_gate_report log parser + aggregator. + +Locks the structured-log contract between the gate step in +.github/workflows/impl-review.yml and the monitoring script: if either side +drifts the regex breaks and these tests fail. +""" + +from automation.scripts.canvas_gate_report import LINE_RE, LibStats, aggregate, parse_lines, render_table + + +class TestLineRegex: + """The structured-log contract from impl-review.yml's gate step.""" + + def test_parses_pass_line(self): + line = ( + "2026-05-20T14:42:01Z ::notice::canvas_gate library=ggplot2 status=pass " + "actual=3200x1800 target=3200x1800 delta=+0x+0 attempt=1" + ) + m = LINE_RE.search(line) + assert m is not None + assert m.group("lib") == "ggplot2" + assert m.group("status") == "pass" + assert m.group("aw") == "3200" + assert m.group("dw") == "+0" + assert m.group("attempt") == "1" + + def test_parses_fail_with_negative_delta(self): + line = ( + "::notice::canvas_gate library=highcharts status=fail actual=3200x1661 " + "target=3200x1800 delta=+0x-139 attempt=1" + ) + m = LINE_RE.search(line) + assert m is not None + assert m.group("lib") == "highcharts" + assert m.group("status") == "fail" + assert m.group("dh") == "-139" + + def test_unsigned_delta_is_accepted(self): + # The gate emits signed ints but some int formatters may drop the +. + line = ( + "::notice::canvas_gate library=matplotlib status=fail actual=2325x2323 " + "target=2400x2400 delta=-75x-77 attempt=2" + ) + m = LINE_RE.search(line) + assert m is not None + assert m.group("dw") == "-75" + assert m.group("dh") == "-77" + + def test_ignores_non_matching_notice_lines(self): + line = "::notice::Found both theme renders — proceeding to review" + assert LINE_RE.search(line) is None + + +class TestParseLines: + def test_extracts_multiple_records(self): + lines = [ + "::notice::canvas_gate library=ggplot2 status=pass actual=3200x1800 target=3200x1800 delta=+0x+0 attempt=1", + "noise line", + "::notice::canvas_gate library=altair status=fail actual=3404x2120 target=3200x1800 delta=+204x+320 attempt=1", + ] + records = parse_lines(lines) + assert len(records) == 2 + assert records[0]["library"] == "ggplot2" + assert records[0]["status"] == "pass" + assert records[1]["library"] == "altair" + assert records[1]["delta"] == (204, 320) + + +class TestAggregate: + def test_first_attempt_only(self): + records = [ + # First attempts: 1 pass + 1 fail for matplotlib + { + "library": "matplotlib", + "status": "pass", + "delta": (0, 0), + "attempt": 1, + "actual": (3200, 1800), + "target": (3200, 1800), + }, + { + "library": "matplotlib", + "status": "fail", + "delta": (-40, -40), + "attempt": 1, + "actual": (3160, 1760), + "target": (3200, 1800), + }, + # Repair attempts: must be ignored + { + "library": "matplotlib", + "status": "pass", + "delta": (0, 0), + "attempt": 2, + "actual": (3200, 1800), + "target": (3200, 1800), + }, + # First attempt for altair: 1 fail + { + "library": "altair", + "status": "fail", + "delta": (204, 320), + "attempt": 1, + "actual": (3404, 2120), + "target": (3200, 1800), + }, + ] + stats = aggregate(records) + assert set(stats.keys()) == {"matplotlib", "altair"} + assert stats["matplotlib"].runs == 2 + assert stats["matplotlib"].first_pass == 1 + assert stats["matplotlib"].first_fail == 1 + assert stats["matplotlib"].fail_rate == 0.5 + # Only failing deltas are recorded + assert stats["matplotlib"].deltas == [(-40, -40)] + assert stats["altair"].first_fail == 1 + assert stats["altair"].fail_rate == 1.0 + + def test_empty_records_returns_no_stats(self): + assert aggregate([]) == {} + + +class TestRenderTable: + def test_returns_no_data_message_when_empty(self): + out = render_table({}) + assert "no canvas_gate" in out.lower() + + def test_flags_libraries_over_threshold(self): + stats = { + "altair": LibStats(runs=10, first_pass=2, first_fail=8, deltas=[(200, 300)] * 8), + "ggplot2": LibStats(runs=10, first_pass=10, first_fail=0, deltas=[]), + } + table = render_table(stats) + assert "altair" in table + assert "prompt still leaking" in table + # ggplot2's 0% should NOT be flagged + ggplot_line = [line for line in table.splitlines() if line.startswith("ggplot2")][0] + assert "prompt still leaking" not in ggplot_line + + +class TestLibStatsProperties: + def test_fail_rate_with_zero_runs(self): + assert LibStats().fail_rate == 0.0 + + def test_avg_delta_handles_empty(self): + assert LibStats().avg_delta == (0.0, 0.0) + + def test_avg_delta_computes_mean(self): + s = LibStats(deltas=[(100, 200), (200, 400)]) + assert s.avg_delta == (150.0, 300.0)