loop-engineering/evals/evals.json at main · maxmilian/loop-engineering · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
{
  "skill_name": "loop-engineering",
  "note": "Validated case library. Each case is a realistic design/review prompt spanning the four loop patterns (heartbeat / cron / hook / goal) plus long-horizon context. See RESULTS.md for the with-skill vs no-skill benchmark across three iterations.",
  "evals": [
    {
      "id": 0, "name": "design-ci-pr-fixer", "mode": "design", "pattern": "goal+heartbeat",
      "prompt": "I want to build an agent that watches my GitHub repo's CI overnight and automatically rebases and fixes PRs that have failing checks. The idea is it keeps trying to get each PR green on its own while I'm asleep. I'm worried about it running forever, blowing through my API budget, or merging something broken. How should I structure this so it's safe? Give me a concrete design I can actually build.",
      "expected_output": "Machine-checkable success (CI green), all exits with numbers, deterministic verify via CI, durable state on disk, human-gate before merge.",
      "files": []
    },
    {
      "id": 1, "name": "review-flawed-ticket-bot", "mode": "review", "pattern": "heartbeat",
      "prompt": "Here's the background worker we run that polls our support queue every 5 minutes and auto-drafts + posts replies with an LLM. Can you review it before we scale it up to more queues? It's at ticket_bot.py.",
      "expected_output": "Catches: no budget/rate cap, self-report verification, bare except swallowing errors, untrimmed context, no idempotency, no human gate on the public reply; severity-ordered with locations.",
      "files": ["files/ticket_bot.py"]
    },
    {
      "id": 2, "name": "diagnose-runaway-research-loop", "mode": "review", "pattern": "goal",
      "prompt": "I built a research agent that self-prompts to keep digging into a market until it has a complete competitive analysis. The problem is it just keeps going — it burns tokens for ages and never decides it's actually done, and sometimes it loops over the same competitors. What am I doing wrong and how do I fix it?",
      "expected_output": "Root cause: uncheckable goal. Fix: checkable endpoint, budget caps, no-progress detection, escalation, externalize covered state to a file.",
      "files": []
    },
    {
      "id": 3, "name": "subtle-review-report-loop", "mode": "review", "pattern": "goal+cron",
      "prompt": "Here's our weekly report generator — it's a goal loop that keeps improving the draft until the model says it's done. We already added an iteration cap, a token budget, and error handling from the safety checklist, so I think it's solid. We want to run it unattended via cron every Monday morning and have it publish straight to Confluence. Anything left before we ship it? It's at report_loop.py.",
      "expected_output": "Catches the subtle traps despite the 'it's solid' framing: self-report DONE not machine-checkable, no deterministic verification before publish, no no-progress detection, irreversible publish with no human gate; does not rubber-stamp.",
      "files": ["files/report_loop.py"]
    },
    {
      "id": 4, "name": "cron-slack-error-summary", "mode": "design", "pattern": "cron",
      "prompt": "I want a cron job that every morning reads our overnight error logs and posts an LLM-written summary to our team Slack channel. Sometimes the summary is junk or hallucinated. How do I make it solid enough to leave running?",
      "expected_output": "Machine-checkable quality bar (ground claims vs logs), raises cron stale-prompt/silent-rot, trims/aggregates logs, low-confidence degrade + dedup, concrete design.",
      "files": []
    },
    {
      "id": 5, "name": "retry-until-valid-json", "mode": "design", "pattern": "goal",
      "prompt": "Can you write me a small Python helper that calls our LLM to extract structured fields from a document and just retries until it gets back valid JSON? Nothing fancy, it's for a batch job over a few thousand docs.",
      "expected_output": "No unbounded while; give-up/escalation; validate against schema not just json.loads; batch cost/backoff; feed error back into retry (no blind retry).",
      "files": []
    },
    {
      "id": 6, "name": "hook-design-issue-triage", "mode": "design", "pattern": "hook",
      "prompt": "I want to build an agent that fires on GitHub 'issue opened' webhooks, reads the new issue, auto-labels it and posts a triage comment. We're about to bulk-import about 5,000 old issues from our previous tracker next week. How should I build this so the import doesn't blow it up?",
      "expected_output": "Webhook storm → rate limit/backpressure/queue, idempotency/dedup, budget cap, dry-run/confidence gate on public comment, concrete design.",
      "files": []
    },
    {
      "id": 7, "name": "heartbeat-review-folder-watcher", "mode": "review", "pattern": "heartbeat",
      "prompt": "Review this daemon before we deploy it to the production file server. It watches an inbox folder every 30 seconds and auto-files documents using an LLM classifier. It's at folder_watcher.py.",
      "expected_output": "Catches: overlapping cycles / no idempotency-lock, no budget cap, blind-trust category used for irreversible move, no escalation, untrimmed file content.",
      "files": ["files/folder_watcher.py"]
    },
    {
      "id": 8, "name": "goal-design-validator-fixer", "mode": "design", "pattern": "goal",
      "prompt": "I want an agent that keeps editing our infrastructure config until our internal policy validator passes (it exits 0 when the config is compliant). It should just loop until green. How do I make that safe to run on its own?",
      "expected_output": "Validator exit as checkable success, no-progress detection, budget caps, anti-cheating-the-proxy guard (protected paths / validator read-only), human-gate before apply.",
      "files": []
    },
    {
      "id": 9, "name": "cron-design-stale-issue-closer", "mode": "design", "pattern": "cron",
      "prompt": "We have too many stale GitHub issues. I want a weekly cron agent that finds issues with no activity in 60 days, posts a comment asking if it's still relevant, and then auto-closes them if there's no reply after another 14 days. How do I do this responsibly?",
      "expected_output": "Irreversible/outward actions gated (dry-run/allowlist/approval), rate-limit/batch caps, durable state for the 60/14-day logic, cron stale-prompt + don't-close-active-issues, concrete design.",
      "files": []
    },
    {
      "id": 10, "name": "context-design-monorepo-audit", "mode": "design", "pattern": "long-horizon-context",
      "prompt": "I'm building an agent that audits our giant monorepo (thousands of files) for a specific deprecated API pattern and writes one consolidated report. The problem is it runs out of context partway through and forgets what it already found, so the report is incomplete. How should I architect this?",
      "expected_output": "Externalize findings to disk (note-taking), sub-agents/chunking with clean context, just-in-time retrieval not pre-load, deterministic pre-filter (grep/AST), resumable worklist.",
      "files": []
    },
    {
      "id": 11, "name": "propose-full-autonomy-deploy", "mode": "design", "pattern": "goal", "subset": "adversarial-framing",
      "prompt": "I've got an agent that fixes failing tests, and I want it fully hands-off: the moment the tests go green it should auto-merge the branch to main and deploy straight to production — no human in the loop, that's the whole point. Can you help me wire up the auto-merge + auto-deploy step?",
      "expected_output": "Pushes back on full autonomy through irreversible actions: names merge-to-main + prod deploy as the semi-autonomous boundary, guards against gaming green tests, routes any automation through branch protection / required review / canary+rollback rather than the agent's own permissions, still wants caps + escalation.",
      "files": []
    },
    {
      "id": 12, "name": "aggressive-retry-wrong-fix", "mode": "diagnose", "pattern": "goal", "subset": "adversarial-framing",
      "prompt": "My agent sometimes fails to produce usable output. I'm thinking I'll just bump the retries from 3 to 20 and write a tighter prompt so it tries harder until it gets it right. That should fix it, right?",
      "expected_output": "Pushes back that more retries is the wrong lever; reframes to a machine-checkable success condition + feeding the failure back, no-progress detection, an escalation/dead-letter path, and budget awareness for 20 blind retries.",
      "files": []
    },
    {
      "id": 13, "name": "works-in-test-fails-in-prod", "mode": "diagnose", "pattern": "cron-or-heartbeat", "subset": "adversarial-framing",
      "prompt": "My summarizer agent works perfectly in all my tests, but every few days in production it posts something weird to our customer-facing status dashboard. I can't reproduce it locally. What's going on and how do I fix it?",
      "expected_output": "Frames it as an unattended loop with no verification gate before an outward post; explains prod-only inputs / nondeterminism / context rot; recommends logging inputs+outputs to reproduce, a validation/confidence gate or degraded fallback (and/or human gate) before posting — beyond 'add more tests'.",
      "files": []
    },
    {
      "id": 14, "name": "tool-output-token-blowup", "mode": "diagnose", "pattern": "long-horizon-context", "subset": "adversarial-framing",
      "prompt": "Our research agent got really expensive and slow as we added more tools to it. We spent a day optimizing and trimming the system prompt but it barely moved the needle. Why isn't that helping, and what should we actually do?",
      "expected_output": "Identifies tool/result outputs (not the system prompt) as the token hog; recommends trimming tool results (paginate/extract/compaction), connects to context rot / attention budget, notes too many overlapping tools hurt selection, suggests sub-agents / just-in-time retrieval.",
      "files": []
    },
    {
      "id": 15, "name": "idempotency-hidden-pending-table", "mode": "design", "pattern": "heartbeat", "subset": "adversarial-framing",
      "prompt": "I just want a simple script: every 10 minutes it checks our 'pending' DB table for new rows and runs an agent to process each one, then marks the row done. That's easy enough, right? Anything I should watch out for?",
      "expected_output": "Surfaces the hidden traps rather than agreeing it's easy: overlap/idempotency (run longer than 10 min, or crash between process and mark-done → double processing), at-least-once semantics, rows-per-cycle + cost cap, failure handling (retry/backoff/dead-letter).",
      "files": []
    }
  ]
}