From 95da061615748dab4e462ea66c492410fa49aa90 Mon Sep 17 00:00:00 2001 From: ahd-weekly-eval Date: Mon, 22 Jun 2026 12:49:04 +0000 Subject: [PATCH] docs(evals): weekly run 2026-06-22 Automated weekly eval against Cloudflare Workers AI OSS roster. See the report for per-cell numbers. --- docs/evals/weekly/2026-06-22.md | 95 ++++++ docs/evals/weekly/2026-06-22.replay.json | 372 +++++++++++++++++++++++ 2 files changed, 467 insertions(+) create mode 100644 docs/evals/weekly/2026-06-22.md create mode 100644 docs/evals/weekly/2026-06-22.replay.json diff --git a/docs/evals/weekly/2026-06-22.md b/docs/evals/weekly/2026-06-22.md new file mode 100644 index 0000000..2174a44 --- /dev/null +++ b/docs/evals/weekly/2026-06-22.md @@ -0,0 +1,95 @@ +# ahd eval · swiss-editorial · 2026-06-22T12:49:04.342Z + +```yaml ahd-replay +schema_version: 1 +kind: eval-live +ahd_version: 0.11.0 +ahd_commit: 2fd291864ef39c898f6e0dc31f49973f108ae445 +git_dirty: true +node_version: v20.20.2 +platform: linux-x64 +invoked_at: 2026-06-22T12:17:41.483Z +token: + path: /home/runner/work/ahd/ahd/tokens/swiss-editorial.yml + hash: sha256:380a3d833d94 +brief: + path: briefs/landing.yml + hash: sha256:8b7d42759643 +sampling: + n: 30 + temperature: null + seed: null +models: + - id: @cf/google/gemma-4-26b-a4b-it + provider: cloudflare-workers-ai + provider_request_ids: 54 captured + - id: @cf/meta/llama-4-scout-17b-16e-instruct + provider: cloudflare-workers-ai + provider_request_ids: 60 captured + - id: @cf/mistralai/mistral-small-3.1-24b-instruct + provider: cloudflare-workers-ai + provider_request_ids: 60 captured + - id: @cf/openai/gpt-oss-120b + provider: cloudflare-workers-ai + provider_request_ids: 55 captured + - id: @cf/qwen/qwen3-30b-a3b-fp8 + provider: cloudflare-workers-ai + provider_request_ids: 60 captured +conditions: + requested: [raw, compiled] + effective: [raw, compiled] +``` + +Replay this run: + +```sh +git checkout 2fd291864ef3 +npm ci && npm run build +/opt/hostedtoolcache/node/20.20.2/x64/bin/node /home/runner/work/ahd/ahd/bin/ahd.js eval-live swiss-editorial --brief briefs/landing.yml --models cf:@cf/google/gemma-4-26b-a4b-it,cf:@cf/meta/llama-4-scout-17b-16e-instruct,cf:@cf/mistralai/mistral-small-3.1-24b-instruct,cf:@cf/openai/gpt-oss-120b,cf:@cf/qwen/qwen3-30b-a3b-fp8 --n 30 --sample-concurrency 6 --out evals --report docs/evals/weekly/2026-06-22.md +``` + +## Run + +- Brief: `briefs/landing.yml` +- Samples per cell: **30** +- Max tokens: 12000 +- Models: + - `@cf/google/gemma-4-26b-a4b-it` (cloudflare-workers-ai) · spec `cf:@cf/google/gemma-4-26b-a4b-it` + - `@cf/meta/llama-4-scout-17b-16e-instruct` (cloudflare-workers-ai) · spec `cf:@cf/meta/llama-4-scout-17b-16e-instruct` + - `@cf/mistralai/mistral-small-3.1-24b-instruct` (cloudflare-workers-ai) · spec `cf:@cf/mistralai/mistral-small-3.1-24b-instruct` + - `@cf/openai/gpt-oss-120b` (cloudflare-workers-ai) · spec `cf:@cf/openai/gpt-oss-120b` + - `@cf/qwen/qwen3-30b-a3b-fp8` (cloudflare-workers-ai) · spec `cf:@cf/qwen/qwen3-30b-a3b-fp8` + +## Per-model slop reduction + +| model | raw attempted → scored | compiled attempted → scored | raw mean tells | compiled mean tells | Δ | reduction | +|---|---:|---:|---:|---:|---:|---:| +| `@cf/google/gemma-4-26b-a4b-it` | 30 → 28 | 30 → 26 | 2.61 | 1.12 | 1.49 | 57.2% | +| `@cf/meta/llama-4-scout-17b-16e-instruct` | 30 → 30 | 30 → 30 | 2.03 | 2.00 | 0.03 | 1.6% | +| `@cf/mistralai/mistral-small-3.1-24b-instruct` | 30 → 30 | 30 → 30 | 3.30 | 1.17 | 2.13 | 64.6% | +| `@cf/openai/gpt-oss-120b` | 30 → 29 | 30 → 26 | 3.24 | 0.88 | 2.36 | 72.7% | +| `@cf/qwen/qwen3-30b-a3b-fp8` | 30 → 30 | 30 → 30 | 1.90 | 2.03 | -0.13 | -7.0% | + +## Per-tell frequency (scored samples only) + +| tell | @cf/google/gemma-4-26b-a4b-it/raw | @cf/google/gemma-4-26b-a4b-it/compiled | @cf/meta/llama-4-scout-17b-16e-instruct/raw | @cf/meta/llama-4-scout-17b-16e-instruct/compiled | @cf/mistralai/mistral-small-3.1-24b-instruct/raw | @cf/mistralai/mistral-small-3.1-24b-instruct/compiled | @cf/openai/gpt-oss-120b/raw | @cf/openai/gpt-oss-120b/compiled | @cf/qwen/qwen3-30b-a3b-fp8/raw | @cf/qwen/qwen3-30b-a3b-fp8/compiled | +|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:| +| ahd/a11y/heading-skip | 0% | 8% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | +| ahd/body-measure | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 10% | +| ahd/line-height-per-size | 86% | 0% | 3% | 100% | 47% | 23% | 100% | 8% | 67% | 37% | +| ahd/no-default-grotesque | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 4% | 0% | 0% | +| ahd/no-em-dashes-in-prose | 0% | 0% | 0% | 0% | 0% | 0% | 3% | 4% | 0% | 0% | +| ahd/no-flat-dark-mode | 4% | 0% | 0% | 0% | 3% | 0% | 0% | 0% | 7% | 0% | +| ahd/radius-hierarchy | 57% | 8% | 3% | 100% | 100% | 0% | 83% | 4% | 13% | 53% | +| ahd/require-named-grid | 0% | 0% | 97% | 0% | 100% | 57% | 28% | 0% | 7% | 10% | +| ahd/require-type-pairing | 18% | 0% | 100% | 0% | 80% | 0% | 66% | 0% | 83% | 0% | +| ahd/tracking-per-size | 0% | 15% | 0% | 0% | 0% | 27% | 0% | 0% | 0% | 0% | +| ahd/weight-variety | 96% | 81% | 0% | 0% | 0% | 10% | 45% | 69% | 13% | 93% | + +## Caveats +- Scoring runs the deterministic AHD linter (38 source-level rules) over every sample that passes a basic HTML sanity check. +- Counts reported per cell: attempted (runs initiated) / errored (API / runtime errors) / extractionFailed (response contained no usable HTML) / scored (linted). A large gap between attempted and scored is a signal that the model is struggling with the instruction, not that it passed the taxonomy. +- Raw condition: the brief is expanded as plain prose (intent + audience + surfaces + mustInclude + mustAvoid) with no AHD system prompt, no style token, no forbidden list. Compiled condition: same brief plus the AHD-compiled system prompt. The only thing that differs between conditions is the AHD intervention. +- Vision-only tells (14 rules in the critic) are not scored in this pipeline; run the critic on rendered screenshots for full taxonomy coverage. +- Tells-per-page is a proxy metric: a thin page has little surface for rules to fire against. Read the Δ alongside the actual rendered HTML, not in isolation. +- Model versions change. See the run manifest for exact canonical model ids. \ No newline at end of file diff --git a/docs/evals/weekly/2026-06-22.replay.json b/docs/evals/weekly/2026-06-22.replay.json new file mode 100644 index 0000000..2025e57 --- /dev/null +++ b/docs/evals/weekly/2026-06-22.replay.json @@ -0,0 +1,372 @@ +{ + "schema_version": 1, + "kind": "eval-live", + "ahd_version": "0.11.0", + "ahd_commit": "2fd291864ef39c898f6e0dc31f49973f108ae445", + "git_dirty": true, + "node_version": "v20.20.2", + "platform": "linux-x64", + "invoked_at": "2026-06-22T12:17:41.483Z", + "argv": [ + "/opt/hostedtoolcache/node/20.20.2/x64/bin/node", + "/home/runner/work/ahd/ahd/bin/ahd.js", + "eval-live", + "swiss-editorial", + "--brief", + "briefs/landing.yml", + "--models", + "cf:@cf/google/gemma-4-26b-a4b-it,cf:@cf/meta/llama-4-scout-17b-16e-instruct,cf:@cf/mistralai/mistral-small-3.1-24b-instruct,cf:@cf/openai/gpt-oss-120b,cf:@cf/qwen/qwen3-30b-a3b-fp8", + "--n", + "30", + "--sample-concurrency", + "6", + "--out", + "evals", + "--report", + "docs/evals/weekly/2026-06-22.md" + ], + "token": { + "path": "/home/runner/work/ahd/ahd/tokens/swiss-editorial.yml", + "hash": "sha256:380a3d833d9463dbc681df7465993ab6413d8e77188f55f97359f60dc4b746b1" + }, + "brief": { + "path": "briefs/landing.yml", + "hash": "sha256:8b7d4275964399a91e6ddec525151ba672ed9c4721279a37ec16ab3450493a4c" + }, + "sampling": { + "n": 30, + "temperature": null, + "seed": null + }, + "models": [ + { + "id": "@cf/google/gemma-4-26b-a4b-it", + "provider": "cloudflare-workers-ai", + "provider_request_ids": [ + "a0fb30faeea1c99c-IAD", + "a0fb30fade2ef28b-IAD", + "a0fb30faee2c3450-IAD", + "a0fb30fadda67044-IAD", + "a0fb30fae8c3391f-IAD", + "a0fb31b64e0dc99c-IAD", + "a0fb30fada47d6d8-IAD", + "a0fb31ebe9747044-IAD", + "a0fb31fc2f9d391f-IAD", + "a0fb329e9ccec99c-IAD", + "a0fb33550ef4391f-IAD", + "a0fb34192ac6391f-IAD", + "a0fb32d46ad7d6d8-IAD", + "a0fb333addee7044-IAD", + "a0fb3378dac4c99c-IAD", + "a0fb34d1ac39391f-IAD", + "a0fb34c7ca64ea42-IAD", + "a0fb34c028623929-IAD", + "a0fb35c6ff2a391f-IAD", + "a0fb34fb3b60d6d8-IAD", + "a0fb351669b77044-IAD", + "a0fb355eff7cc99c-IAD", + "a0fb369a3d01d6d8-IAD", + "a0fb3663fff8391f-IAD", + "a0fb37177ce77044-IAD", + "a0fb3622e938ea42-IAD", + "a0fb36578f3651c4-IAD", + "a0fb37768db8c99c-IAD", + "a0fb3953181a56cd-IAD", + "a0fb395319fac5ae-IAD", + "a0fb395318af3929-IAD", + "a0fb39531caed62d-IAD", + "a0fb3a52187c56cd-IAD", + "a0fb3ae428e6c5ae-IAD", + "a0fb3b5a6f80d62d-IAD", + "a0fb3aeb0be53929-IAD", + "a0fb3c416c0156cd-IAD", + "a0fb3c410a71dc40-IAD", + "a0fb3ce87d7fc655-IAD", + "a0fb3d18bc5c3929-IAD", + "a0fb3e1a5811c5ae-IAD", + "a0fb3db9ca4e56cd-IAD", + "a0fb3f2f0ad97044-IAD", + "a0fb3ebd9af2c655-IAD", + "a0fb3f6e0990d6d8-IAD", + "a0fb3fe9fd0c56cd-IAD", + "a0fb3f8318c0f28b-IAD", + "a0fb3ecb1c963929-IAD", + "a0fb40a95a98c655-IAD", + "a0fb411ffeedc99c-IAD", + "a0fb4088fa181f6d-IAD", + "a0fb413ceb2c56cd-IAD", + "a0fb417b6bf63929-IAD", + "a0fb4158ac0bf28b-IAD" + ] + }, + { + "id": "@cf/meta/llama-4-scout-17b-16e-instruct", + "provider": "cloudflare-workers-ai", + "provider_request_ids": [ + "a0fb4380cad0c5ae-IAD", + "a0fb4380cc06c99c-IAD", + "a0fb4380bc31f28b-IAD", + "a0fb4380ce61d62d-IAD", + "a0fb4380ccfd3929-IAD", + "a0fb4380cfb57044-IAD", + "a0fb43bb3ea5c5ae-IAD", + "a0fb43bb3cb7c99c-IAD", + "a0fb43ce9dc6d62d-IAD", + "a0fb43ce8b5bf28b-IAD", + "a0fb43cecce93929-IAD", + "a0fb43cecdd57044-IAD", + "a0fb43fb8949c5ae-IAD", + "a0fb4408ddc9c99c-IAD", + "a0fb4413d9247044-IAD", + "a0fb4413482ed62d-IAD", + "a0fb44139b723929-IAD", + "a0fb4413986df28b-IAD", + "a0fb443ff8fec5ae-IAD", + "a0fb4444fca6c99c-IAD", + "a0fb444c4bbc7044-IAD", + "a0fb445d1b43d62d-IAD", + "a0fb445d8b0bf28b-IAD", + "a0fb445d7f5d3929-IAD", + "a0fb4485f980c5ae-IAD", + "a0fb44880806c99c-IAD", + "a0fb449178887044-IAD", + "a0fb449aefe0d62d-IAD", + "a0fb449b7eb4f28b-IAD", + "a0fb44a1ae9d3929-IAD", + "a0fb44dc78a0d62d-IAD", + "a0fb44dc7bc5f28b-IAD", + "a0fb44dc7ac1c5ae-IAD", + "a0fb44dc8b53c99c-IAD", + "a0fb44dc7b0f7044-IAD", + "a0fb44dc7cfd3929-IAD", + "a0fb45358fa1f28b-IAD", + "a0fb45356ec5d62d-IAD", + "a0fb454d6abcc99c-IAD", + "a0fb454ead047044-IAD", + "a0fb4550297e3929-IAD", + "a0fb453a4874c5ae-IAD", + "a0fb458089c5d62d-IAD", + "a0fb457ebd63f28b-IAD", + "a0fb459f98707044-IAD", + "a0fb45a2dfc0c5ae-IAD", + "a0fb45a0afa03929-IAD", + "a0fb459e7c79c99c-IAD", + "a0fb45cef989d62d-IAD", + "a0fb45e64dd1f28b-IAD", + "a0fb45e95b7d7044-IAD", + "a0fb45ec7cdcc5ae-IAD", + "a0fb45f7c9ef3929-IAD", + "a0fb4612b976d62d-IAD", + "a0fb45fd5aeec99c-IAD", + "a0fb46367b5a7044-IAD", + "a0fb4624790ef28b-IAD", + "a0fb4651de35c5ae-IAD", + "a0fb465d5b353929-IAD", + "a0fb46602e2ad62d-IAD" + ] + }, + { + "id": "@cf/mistralai/mistral-small-3.1-24b-instruct", + "provider": "cloudflare-workers-ai", + "provider_request_ids": [ + "a0fb46d71b3fc99c-IAD", + "a0fb46d6f878d62d-IAD", + "a0fb46d71b1bea42-IAD", + "a0fb46d71ae1c502-IAD", + "a0fb46d71ff0f28b-IAD", + "a0fb46d70e044b11-IAD", + "a0fb4727b833c99c-IAD", + "a0fb472c1f4dd62d-IAD", + "a0fb47304b81ea42-IAD", + "a0fb473afce5c502-IAD", + "a0fb4756b9af4b11-IAD", + "a0fb4755c89df28b-IAD", + "a0fb4777990cc99c-IAD", + "a0fb47815bced62d-IAD", + "a0fb47909e77ea42-IAD", + "a0fb479b0f3ac502-IAD", + "a0fb47ccd88c4b11-IAD", + "a0fb47cda94ef28b-IAD", + "a0fb47d0e909c99c-IAD", + "a0fb47f9985dea42-IAD", + "a0fb47de5ab4d62d-IAD", + "a0fb480f285bc502-IAD", + "a0fb48288c714b11-IAD", + "a0fb483e28edf28b-IAD", + "a0fb48413ef4c99c-IAD", + "a0fb48646967d62d-IAD", + "a0fb485cecf6ea42-IAD", + "a0fb488ffb4f4b11-IAD", + "a0fb48974913f28b-IAD", + "a0fb488a2eccc502-IAD", + "a0fb48fdc986f28b-IAD", + "a0fb48fdcd18c502-IAD", + "a0fb48fdc9bc4b11-IAD", + "a0fb48fdeda056cd-IAD", + "a0fb48fdea19703c-IAD", + "a0fb48fde83851c4-IAD", + "a0fb49942b4af28b-IAD", + "a0fb49c2aa7d703c-IAD", + "a0fb49b47ec94b11-IAD", + "a0fb49c26c8256cd-IAD", + "a0fb49ae99e8c502-IAD", + "a0fb49ddafcb51c4-IAD", + "a0fb4a86f861703c-IAD", + "a0fb4a9ff8f656cd-IAD", + "a0fb4a98ff4c4b11-IAD", + "a0fb4a606d19f28b-IAD", + "a0fb4aa7ccc3c502-IAD", + "a0fb4b496c314b11-IAD", + "a0fb4b325bf3703c-IAD", + "a0fb4b65fcc7f28b-IAD", + "a0fb4aeb686351c4-IAD", + "a0fb4b9f9c29c502-IAD", + "a0fb4b3f4d9556cd-IAD", + "a0fb4bdba9584b11-IAD", + "a0fb4c144cfbf28b-IAD", + "a0fb4c24cbdb51c4-IAD", + "a0fb4c072849703c-IAD", + "a0fb4c4d0bddc502-IAD", + "a0fb4c77fd2156cd-IAD", + "a0fb4c907bcd4b11-IAD" + ] + }, + { + "id": "@cf/openai/gpt-oss-120b", + "provider": "cloudflare-workers-ai", + "provider_request_ids": [ + "a0fb4d99bac44b11-IAD", + "a0fb4d99db4f7044-IAD", + "a0fb4d99d99d3450-IAD", + "a0fb4d99df23e5c7-IAD", + "a0fb4d99da66ea42-IAD", + "a0fb4d99cc17f28b-IAD", + "a0fb4dd09d5f4b11-IAD", + "a0fb4dd988f97044-IAD", + "a0fb4df089733450-IAD", + "a0fb4e03ca48f28b-IAD", + "a0fb4e0f68584b11-IAD", + "a0fb4e256d133450-IAD", + "a0fb4df0cdaae5c7-IAD", + "a0fb4e6f3b044b11-IAD", + "a0fb4ea65f074b11-IAD", + "a0fb4e9198843450-IAD", + "a0fb4df16d27ea42-IAD", + "a0fb4e313db3f28b-IAD", + "a0fb4ea3ef9de5c7-IAD", + "a0fb4f58fefeea42-IAD", + "a0fb4f5f8ca6f28b-IAD", + "a0fb4f919813ea42-IAD", + "a0fb4fc8ed78f28b-IAD", + "a0fb4f723c4be5c7-IAD", + "a0fb4fe21e31ea42-IAD", + "a0fb500d88f9f28b-IAD", + "a0fb4ef37e3f3450-IAD", + "a0fb4eef2b814b11-IAD", + "a0fb5013b8f8e5c7-IAD", + "a0fb50e06b66c99c-IAD", + "a0fb50e0687ad62d-IAD", + "a0fb50e06be33929-IAD", + "a0fb50e04db67044-IAD", + "a0fb5134fb13c99c-IAD", + "a0fb5142f83ed62d-IAD", + "a0fb51549d7c3929-IAD", + "a0fb51942afb7044-IAD", + "a0fb521ee9cf7044-IAD", + "a0fb519a7bf5c99c-IAD", + "a0fb51f66a4c3929-IAD", + "a0fb50e06a03d705-IAD", + "a0fb52839dc27044-IAD", + "a0fb52fd1d41d705-IAD", + "a0fb530e5f007044-IAD", + "a0fb53745e377044-IAD", + "a0fb53ce5ddbc5ae-IAD", + "a0fb53d96f5d703c-IAD", + "a0fb5298cf2bc99c-IAD", + "a0fb54358ae0f28b-IAD", + "a0fb54de28ed56cd-IAD", + "a0fb549d681ae5c7-IAD", + "a0fb5536c8c356cd-IAD", + "a0fb5512de9df28b-IAD", + "a0fb53590a03d705-IAD", + "a0fb54733bb1703c-IAD" + ] + }, + { + "id": "@cf/qwen/qwen3-30b-a3b-fp8", + "provider": "cloudflare-workers-ai", + "provider_request_ids": [ + "a0fb58305e8156cd-IAD", + "a0fb58304d3a3929-IAD", + "a0fb58305c163450-IAD", + "a0fb58305be9d705-IAD", + "a0fb58305ab9c99c-IAD", + "a0fb583059aef28b-IAD", + "a0fb58e2886a56cd-IAD", + "a0fb59024f0a3929-IAD", + "a0fb59555c353929-IAD", + "a0fb5909de74e5c7-IAD", + "a0fb59091fe83450-IAD", + "a0fb59464cdf56cd-IAD", + "a0fb59130d07c99c-IAD", + "a0fb59267ef4f28b-IAD", + "a0fb59b02c403929-IAD", + "a0fb59c7a9233450-IAD", + "a0fb59cd889b56cd-IAD", + "a0fb59c3391fe5c7-IAD", + "a0fb5a2b185f3450-IAD", + "a0fb59fc7b68f28b-IAD", + "a0fb59de9ea7c99c-IAD", + "a0fb5a6fdcd0e5c7-IAD", + "a0fb5a0c98a83929-IAD", + "a0fb5aa5ea02f28b-IAD", + "a0fb5aa6c8bfc99c-IAD", + "a0fb5ab6db1c3929-IAD", + "a0fb5aef484ef28b-IAD", + "a0fb5a4c4ce056cd-IAD", + "a0fb5aa8fe07e5c7-IAD", + "a0fb5a87aebc3450-IAD", + "a0fb5b703c638e06-IAD", + "a0fb5b700d713450-IAD", + "a0fb5b702b9ed62d-IAD", + "a0fb5b701bf7e5c7-IAD", + "a0fb5b702e0f7044-IAD", + "a0fb5b702d39c5ae-IAD", + "a0fb5bdc7ead8e06-IAD", + "a0fb5bef9d0ad62d-IAD", + "a0fb5be82e303450-IAD", + "a0fb5befea0de5c7-IAD", + "a0fb5bf82bce7044-IAD", + "a0fb5c447b20c5ae-IAD", + "a0fb5c5bae2bd62d-IAD", + "a0fb5c6c4abf7044-IAD", + "a0fb5c5ad9d78e06-IAD", + "a0fb5c688b44e5c7-IAD", + "a0fb5c6088553450-IAD", + "a0fb5cda39c28e06-IAD", + "a0fb5c986ab2c5ae-IAD", + "a0fb5ce64b16e5c7-IAD", + "a0fb5cd0eeae7044-IAD", + "a0fb5cfc4fc63450-IAD", + "a0fb5d57880c8e06-IAD", + "a0fb5cd0a933d62d-IAD", + "a0fb5d75eee23450-IAD", + "a0fb5d64db84c5ae-IAD", + "a0fb5d670fc8e5c7-IAD", + "a0fb5d6ceea07044-IAD", + "a0fb5de8fca0d62d-IAD", + "a0fb5ddf18048e06-IAD" + ] + } + ], + "conditions": { + "requested": [ + "raw", + "compiled" + ], + "effective": [ + "raw", + "compiled" + ] + } +}