From 0fce2a7f635e9c134a79535bdcb352a28f634e08 Mon Sep 17 00:00:00 2001 From: Broomva Date: Thu, 11 Jun 2026 20:07:26 -0500 Subject: [PATCH] fix: gate merge-ready on the real merge predicate, not the watcher exit (BRO-1489) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rule-of-three this session on bstack PR #78: `gh pr checks --watch` returned exit 0 three times while the PR was NOT safe to merge (UNSTABLE / pending required checks / mid-flight bot review). The watcher exit code is necessary-not-sufficient, but `cmd_merge_ready` trusted GREEN (= watcher exit 0) and promoted straight to MERGE_READY → auto-merge could fire on a not-actually- mergeable PR. Caught each time only by agent reasoning, not the mechanism. Fix (scripts/p9.py): - merge_ready_verdict(pr, repo): query `gh pr view --json mergeable, mergeStateStatus,reviewDecision` + best-effort `gh api graphql` unresolved- review-thread count. Ready iff mergeStateStatus ∈ {CLEAN, UNSTABLE}, not CONFLICTING, reviewDecision != CHANGES_REQUESTED, 0 unresolved threads. BLOCKED/DIRTY/BEHIND/DRAFT/UNKNOWN or ANY gh error → not ready (fail-safe). Trusts GitHub's server-computed required-vs-non-required (mergeStateStatus) instead of fragile isRequired parsing. - _unresolved_review_threads(): graphql count; -1 if undeterminable (does not block — mergeStateStatus governs; the thread check is a stricter reflex-18 layer). - cmd_merge_ready VERIFIES the verdict before GREEN→MERGE_READY (--no-verify escape). - New `p9 merge-status [--json]` subcommand — query the predicate directly, exit 0 iff merge-ready. Validation: 22 new tests (predicate states + fail-safes + the gate) + full suite 130/130 green. Dogfooded live: `p9 merge-status 78` on the merged bstack PR → NOT-READY; graphql thread count returned 0 against real gh. Dogfooding ALSO caught the original `reviewThreads`-is-not-a-pr-view-field bug → fixed (moved to graphql). P20: independent review PASS 9/10, no blockers. Co-Authored-By: Claude Opus 4.8 (1M context) --- SKILL.md | 21 ++++ scripts/p9.py | 148 +++++++++++++++++++++++++++ tests/test_p9_integration.py | 6 +- tests/test_p9_merge_verdict.py | 180 +++++++++++++++++++++++++++++++++ 4 files changed, 353 insertions(+), 2 deletions(-) create mode 100644 tests/test_p9_merge_verdict.py diff --git a/SKILL.md b/SKILL.md index a1e5dc9..c15f94d 100644 --- a/SKILL.md +++ b/SKILL.md @@ -80,6 +80,27 @@ When the bg task notification fires: - ABANDONED → surface failure to user; remove watcher; skip cleanup ``` +### The watcher exit code is necessary-not-sufficient (BRO-1489) + +`GREEN` only means `gh pr checks --watch` exited 0 — which it does on a *subset* +of checks (required-only) and *before* async bot reviews (CodeRabbit) settle. +Observed three times on bstack PR #78: exit 0 while the PR was `UNSTABLE` / had a +pending review. + +`p9 merge-ready` therefore **verifies the real merge predicate** before marking +`MERGE_READY`: it queries `gh pr view --json mergeable,mergeStateStatus,reviewDecision` +plus a best-effort `gh api graphql` unresolved-thread count, and is ready iff +`mergeStateStatus ∈ {CLEAN, UNSTABLE}` with no `CHANGES_REQUESTED` and zero +unresolved review threads. `BLOCKED`/`DIRTY`/`BEHIND`/`DRAFT`/`UNKNOWN`, an open +thread, or any gh error → refused (fail-safe). Pass `--no-verify` to skip +(test/offline only). + +Query it directly without transitioning state: + +```text +p9 merge-status [--json] # exit 0 iff merge-ready; prints the verdict + reason +``` + ## Termination conditions The agent exits the heal loop when **any** of: diff --git a/scripts/p9.py b/scripts/p9.py index 3868c9c..4ceaabc 100644 --- a/scripts/p9.py +++ b/scripts/p9.py @@ -1430,6 +1430,121 @@ def cmd_events_tail(args: argparse.Namespace) -> int: return EXIT_OK +def merge_ready_verdict(pr: int, repo: str | None) -> dict: + """Query GitHub for the REAL merge predicate — not the `gh pr checks --watch` + exit code, which is necessary-not-sufficient (it returns 0 on a subset of + checks and before async bot reviews settle; observed three times on bstack + PR #78 — see BRO-1489). + + Ready iff GitHub's server-computed ``mergeStateStatus`` says the required + gates are satisfied AND there are zero unresolved review threads: + + - ``CLEAN`` → ready (everything green). + - ``UNSTABLE`` → ready ONLY when 0 unresolved review threads. UNSTABLE means + mergeable with all *required* checks green but some + *non-required* check pending/failing (e.g. a CodeRabbit + soft-status that never resolves). GitHub computes + required-vs-non-required server-side, so we trust + mergeStateStatus instead of fragile isRequired parsing. + - ``BLOCKED`` / ``DIRTY`` / ``BEHIND`` / ``DRAFT`` / ``UNKNOWN`` → not ready. + - any unresolved review thread → not ready (stricter than branch protection; + bstack reflex: every bot/human thread closed before merge). + - any error querying gh → not ready (fail-safe; never merge blind). + + Returns a dict: {ready, state, mergeable, review_decision, unresolved_threads, reason}. + ``unresolved_threads`` is -1 when it could not be determined (best-effort + graphql); an undeterminable count does NOT block (the GitHub-enforced gates + in mergeStateStatus already cover required conversation-resolution). + """ + # mergeable / mergeStateStatus / reviewDecision are the valid `gh pr view + # --json` fields. (reviewThreads is NOT a pr-view field — it needs graphql; + # see _unresolved_review_threads below. This was caught by dogfooding.) + cmd = ["gh", "pr", "view", str(pr), "--json", + "mergeable,mergeStateStatus,reviewDecision"] + if repo: + cmd += ["--repo", repo] + try: + out = subprocess.run(cmd, capture_output=True, text=True, timeout=30, check=False) + except (FileNotFoundError, subprocess.TimeoutExpired) as e: + return {"ready": False, "state": "QUERY_FAILED", "mergeable": "UNKNOWN", + "review_decision": "", "unresolved_threads": -1, + "reason": f"gh unavailable: {e}"} + if out.returncode != 0: + return {"ready": False, "state": "QUERY_FAILED", "mergeable": "UNKNOWN", + "review_decision": "", "unresolved_threads": -1, + "reason": (out.stderr.strip()[:200] or "gh pr view failed")} + try: + data = json.loads(out.stdout) + except json.JSONDecodeError as e: + return {"ready": False, "state": "QUERY_FAILED", "mergeable": "UNKNOWN", + "review_decision": "", "unresolved_threads": -1, + "reason": f"non-JSON from gh: {e}"} + + state = (data.get("mergeStateStatus") or "UNKNOWN").upper() + mergeable = (data.get("mergeable") or "UNKNOWN").upper() + decision = (data.get("reviewDecision") or "").upper() + unresolved = _unresolved_review_threads(pr, repo) # int, or -1 if undeterminable + + if mergeable == "CONFLICTING" or state in {"DIRTY", "BEHIND", "DRAFT", "BLOCKED", "UNKNOWN"}: + ready, reason = False, f"mergeStateStatus={state} mergeable={mergeable}" + elif decision == "CHANGES_REQUESTED": + ready, reason = False, "reviewDecision=CHANGES_REQUESTED" + elif unresolved > 0: + ready, reason = False, f"{unresolved} unresolved review thread(s)" + elif state == "CLEAN": + ready, reason = True, "CLEAN" + elif state == "UNSTABLE": + ready, reason = True, ("UNSTABLE but required checks green + no blocking review " + "(only non-required checks un-green)") + else: + ready, reason = False, f"unrecognized mergeStateStatus={state}" + + return {"ready": ready, "state": state, "mergeable": mergeable, + "review_decision": decision, "unresolved_threads": unresolved, + "reason": reason} + + +def _unresolved_review_threads(pr: int, repo: str | None) -> int: + """Count unresolved review threads via `gh api graphql` (best-effort). + + Returns the count, or -1 if it cannot be determined (no repo, gh/graphql + error). -1 does not block a merge — mergeStateStatus already reflects + GitHub-enforced conversation resolution when branch protection requires it; + this is the stricter bstack reflex-18 layer on top. + """ + if not repo or "/" not in repo: + return -1 + owner, name = repo.split("/", 1) + q = ("query($o:String!,$n:String!,$p:Int!){repository(owner:$o,name:$n)" + "{pullRequest(number:$p){reviewThreads(first:100){nodes{isResolved}}}}}") + cmd = ["gh", "api", "graphql", "-f", f"query={q}", + "-F", f"o={owner}", "-F", f"n={name}", "-F", f"p={pr}"] + try: + out = subprocess.run(cmd, capture_output=True, text=True, timeout=30, check=False) + if out.returncode != 0: + return -1 + nodes = (json.loads(out.stdout)["data"]["repository"]["pullRequest"] + ["reviewThreads"]["nodes"]) + return sum(1 for t in nodes if not t.get("isResolved", False)) + except (FileNotFoundError, subprocess.TimeoutExpired, json.JSONDecodeError, KeyError, TypeError): + return -1 + + +def cmd_merge_status(args: argparse.Namespace) -> int: + """Query + print the real merge predicate for a PR. The agent calls this + instead of trusting the watcher exit code. Exit 0 iff merge-ready.""" + pr = int(args.pr) + repo = args.repo or _detect_repo() + v = merge_ready_verdict(pr, repo) + if args.json: + print(json.dumps(v)) + else: + verdict = "READY" if v["ready"] else "NOT-READY" + print(f"PR #{pr}: {verdict} — {v['reason']} " + f"(mergeStateStatus={v['state']}, unresolved_threads={v['unresolved_threads']})") + return EXIT_OK if v["ready"] else EXIT_DEGRADED + + def cmd_merge_ready(args: argparse.Namespace) -> int: pr = int(args.pr) state = current_pr_state(pr) @@ -1440,6 +1555,27 @@ def cmd_merge_ready(args: argparse.Namespace) -> int: ) return EXIT_DEGRADED repo = args.repo or _detect_repo() + + # Verify the REAL merge predicate before authorizing MERGE_READY. The GREEN + # state only means `gh pr checks --watch` exited 0 — necessary, not + # sufficient. Refuse if the PR is not actually mergeable (BRO-1489). + verdict = None + if not getattr(args, "no_verify", False): + verdict = merge_ready_verdict(pr, repo) + if not verdict["ready"]: + print( + f"PR #{pr} NOT merge-ready: {verdict['reason']} " + f"(mergeStateStatus={verdict['state']}, " + f"unresolved_threads={verdict['unresolved_threads']}).", + file=sys.stderr, + ) + print( + " The watcher exit code is necessary-not-sufficient; refusing to " + "mark MERGE_READY. Re-run once checks settle, or --no-verify to override.", + file=sys.stderr, + ) + return EXIT_DEGRADED + event = PRStateEvent( ts=_utcnow(), pr=pr, @@ -1448,6 +1584,7 @@ def cmd_merge_ready(args: argparse.Namespace) -> int: to_state=PRState.MERGE_READY.value, watcher_id="merge-ready", attempt=0, + extra={"merge_verdict": verdict} if verdict else {"merge_verdict": "skipped (--no-verify)"}, ) append_state_event(event) print(f"PR #{pr} marked MERGE_READY (control metalayer authorizes merge)") @@ -1562,8 +1699,19 @@ def build_parser() -> argparse.ArgumentParser: pm = sub.add_parser("merge-ready", help="Mark PR as ready for metalayer-authorized merge") pm.add_argument("pr") pm.add_argument("--repo", default=None) + pm.add_argument("--no-verify", action="store_true", + help="Skip the live mergeStateStatus check (test/offline only; " + "by default merge-ready verifies the real merge predicate)") pm.set_defaults(func=cmd_merge_ready) + pms = sub.add_parser("merge-status", + help="Query the REAL merge predicate (mergeStateStatus + " + "unresolved review threads) — not the watcher exit code") + pms.add_argument("pr") + pms.add_argument("--repo", default=None) + pms.add_argument("--json", action="store_true", help="Emit the verdict as JSON") + pms.set_defaults(func=cmd_merge_status) + pab = sub.add_parser("abandon", help="Mark a PR as ABANDONED (frees concurrency slot)") pab.add_argument("pr") diff --git a/tests/test_p9_integration.py b/tests/test_p9_integration.py index a255e3f..31ef6bf 100644 --- a/tests/test_p9_integration.py +++ b/tests/test_p9_integration.py @@ -77,8 +77,10 @@ def test_full_lifecycle_via_state_events(self, p9): watcher_id="w100", )) - # 3. merge-ready (CLI command) - rc = p9.main(["merge-ready", "100", "--repo", "broomva/test"]) + # 3. merge-ready (CLI command). --no-verify: this is a state-machine + # lifecycle test, not a gh-verdict test (the merge_ready_verdict gate is + # covered in test_p9_merge_verdict.py). + rc = p9.main(["merge-ready", "100", "--repo", "broomva/test", "--no-verify"]) assert rc == 0 assert p9.current_pr_state(100) == p9.PRState.MERGE_READY diff --git a/tests/test_p9_merge_verdict.py b/tests/test_p9_merge_verdict.py new file mode 100644 index 0000000..33b4b71 --- /dev/null +++ b/tests/test_p9_merge_verdict.py @@ -0,0 +1,180 @@ +"""BRO-1489 — the merge predicate is mergeStateStatus + reviewDecision + +unresolved threads, NOT the `gh pr checks --watch` exit code. + +Covers: merge_ready_verdict (the predicate, which makes a `gh pr view` call plus +a best-effort `gh api graphql` thread count), cmd_merge_status (the query), and +the cmd_merge_ready verify gate. Provenance: bstack PR #78, where the watcher +exited 0 three times while the PR was UNSTABLE / had a pending bot review. +""" +from __future__ import annotations + +import importlib +import json +import sys +from pathlib import Path + +import pytest + +_HERE = Path(__file__).resolve().parent +_FIXTURES = _HERE / "fixtures" + + +@pytest.fixture() +def p9(tmp_path, monkeypatch): + monkeypatch.setenv("BROOMVA_P9_HOME", str(tmp_path)) + monkeypatch.setenv("BROOMVA_P9_POLICY", str(_FIXTURES / "policy-good.yaml")) + if "p9" in sys.modules: + del sys.modules["p9"] + return importlib.import_module("p9") + + +class _Run: + def __init__(self, *, stdout="", stderr="", returncode=0): + self.stdout = stdout + self.stderr = stderr + self.returncode = returncode + + +def _mock_gh(p9, monkeypatch, *, view=None, threads=None, + view_rc=0, view_stderr="", view_exc=None, + view_nonjson=False, graphql_rc=0): + """Dispatch subprocess.run by command: `gh pr view` vs `gh api graphql`.""" + def fake(cmd, *a, **k): + if cmd[:3] == ["gh", "pr", "view"]: + if view_exc is not None: + raise view_exc + if view_nonjson: + return _Run(stdout="not json", returncode=0) + return _Run(stdout=json.dumps(view) if view is not None else "", + stderr=view_stderr, returncode=view_rc) + if cmd[:3] == ["gh", "api", "graphql"]: + if graphql_rc != 0: + return _Run(stderr="graphql err", returncode=graphql_rc) + payload = {"data": {"repository": {"pullRequest": { + "reviewThreads": {"nodes": threads or []}}}}} + return _Run(stdout=json.dumps(payload), returncode=0) + return _Run(returncode=1) + monkeypatch.setattr(p9.subprocess, "run", fake) + + +_CLEAN = {"mergeable": "MERGEABLE", "mergeStateStatus": "CLEAN", "reviewDecision": ""} +_UNSTABLE = {"mergeable": "MERGEABLE", "mergeStateStatus": "UNSTABLE", "reviewDecision": ""} + + +# ── the predicate ──────────────────────────────────────────────────────────── + +class TestVerdict: + def test_clean_no_threads_is_ready(self, p9, monkeypatch): + _mock_gh(p9, monkeypatch, view=_CLEAN, threads=[]) + v = p9.merge_ready_verdict(1, "o/r") + assert v["ready"] is True and v["state"] == "CLEAN" + + def test_unstable_no_threads_is_ready(self, p9, monkeypatch): + # PR #78 case: required green, only a non-required bot check un-green. + _mock_gh(p9, monkeypatch, view=_UNSTABLE, threads=[{"isResolved": True}]) + v = p9.merge_ready_verdict(1, "o/r") + assert v["ready"] is True and v["state"] == "UNSTABLE" + + def test_unstable_with_unresolved_thread_not_ready(self, p9, monkeypatch): + _mock_gh(p9, monkeypatch, view=_UNSTABLE, + threads=[{"isResolved": False}, {"isResolved": True}]) + v = p9.merge_ready_verdict(1, "o/r") + assert v["ready"] is False and v["unresolved_threads"] == 1 + + def test_clean_with_unresolved_thread_not_ready(self, p9, monkeypatch): + _mock_gh(p9, monkeypatch, view=_CLEAN, threads=[{"isResolved": False}]) + assert p9.merge_ready_verdict(1, "o/r")["ready"] is False + + def test_changes_requested_not_ready(self, p9, monkeypatch): + _mock_gh(p9, monkeypatch, threads=[], view={ + "mergeable": "MERGEABLE", "mergeStateStatus": "CLEAN", + "reviewDecision": "CHANGES_REQUESTED"}) + assert p9.merge_ready_verdict(1, "o/r")["ready"] is False + + @pytest.mark.parametrize("state", ["BLOCKED", "DIRTY", "BEHIND", "DRAFT", "UNKNOWN"]) + def test_hard_blocking_states_not_ready(self, p9, monkeypatch, state): + _mock_gh(p9, monkeypatch, threads=[], view={ + "mergeable": "MERGEABLE", "mergeStateStatus": state, "reviewDecision": ""}) + assert p9.merge_ready_verdict(1, "o/r")["ready"] is False + + def test_conflicting_not_ready(self, p9, monkeypatch): + _mock_gh(p9, monkeypatch, threads=[], view={ + "mergeable": "CONFLICTING", "mergeStateStatus": "DIRTY", "reviewDecision": ""}) + assert p9.merge_ready_verdict(1, "o/r")["ready"] is False + + def test_clean_with_undeterminable_threads_still_ready(self, p9, monkeypatch): + # graphql fails → unresolved=-1 → does NOT block (mergeStateStatus governs). + _mock_gh(p9, monkeypatch, view=_CLEAN, graphql_rc=1) + v = p9.merge_ready_verdict(1, "o/r") + assert v["ready"] is True and v["unresolved_threads"] == -1 + + def test_view_failure_fails_safe(self, p9, monkeypatch): + _mock_gh(p9, monkeypatch, view_rc=1, view_stderr="no such PR") + v = p9.merge_ready_verdict(1, "o/r") + assert v["ready"] is False and v["state"] == "QUERY_FAILED" + + def test_gh_missing_fails_safe(self, p9, monkeypatch): + _mock_gh(p9, monkeypatch, view_exc=FileNotFoundError("gh")) + assert p9.merge_ready_verdict(1, "o/r")["ready"] is False + + def test_non_json_fails_safe(self, p9, monkeypatch): + _mock_gh(p9, monkeypatch, view_nonjson=True) + assert p9.merge_ready_verdict(1, "o/r")["ready"] is False + + +# ── the query subcommand ───────────────────────────────────────────────────── + +class TestMergeStatus: + def test_exit_zero_when_ready(self, p9, monkeypatch): + _mock_gh(p9, monkeypatch, view=_CLEAN, threads=[]) + assert p9.main(["merge-status", "5", "--repo", "o/r"]) == p9.EXIT_OK + + def test_exit_degraded_when_not_ready(self, p9, monkeypatch): + _mock_gh(p9, monkeypatch, threads=[], view={ + "mergeable": "MERGEABLE", "mergeStateStatus": "BLOCKED", "reviewDecision": ""}) + assert p9.main(["merge-status", "5", "--repo", "o/r"]) == p9.EXIT_DEGRADED + + def test_json_output(self, p9, monkeypatch, capsys): + _mock_gh(p9, monkeypatch, view=_CLEAN, threads=[]) + p9.main(["merge-status", "5", "--repo", "o/r", "--json"]) + assert json.loads(capsys.readouterr().out)["ready"] is True + + +# ── the merge-ready verify gate ────────────────────────────────────────────── + +def _seed_green(p9, pr): + for prev, curr in [(p9.PRState.PUSHED, p9.PRState.WATCHING), + (p9.PRState.WATCHING, p9.PRState.GREEN)]: + p9.append_state_event(p9.PRStateEvent( + ts="2026-06-12T00:00:00+00:00", pr=pr, repo="o/r", + from_state=prev.value, to_state=curr.value, watcher_id="seed")) + + +class TestMergeReadyGate: + def test_green_and_ready_marks_merge_ready(self, p9, monkeypatch): + _seed_green(p9, 10) + _mock_gh(p9, monkeypatch, view=_CLEAN, threads=[]) + assert p9.main(["merge-ready", "10", "--repo", "o/r"]) == p9.EXIT_OK + assert p9.current_pr_state(10) == p9.PRState.MERGE_READY + + def test_green_but_not_ready_refuses(self, p9, monkeypatch): + """The core bug: watcher said GREEN, but the PR is UNSTABLE with an open + thread. merge-ready must REFUSE and leave the PR in GREEN.""" + _seed_green(p9, 11) + _mock_gh(p9, monkeypatch, view=_UNSTABLE, threads=[{"isResolved": False}]) + assert p9.main(["merge-ready", "11", "--repo", "o/r"]) == p9.EXIT_DEGRADED + assert p9.current_pr_state(11) == p9.PRState.GREEN # NOT promoted + + def test_no_verify_bypasses(self, p9, monkeypatch): + _seed_green(p9, 12) + _mock_gh(p9, monkeypatch, threads=[], view={ + "mergeable": "MERGEABLE", "mergeStateStatus": "BLOCKED", "reviewDecision": ""}) + assert p9.main(["merge-ready", "12", "--repo", "o/r", "--no-verify"]) == p9.EXIT_OK + assert p9.current_pr_state(12) == p9.PRState.MERGE_READY + + def test_gate_blocks_when_gh_unavailable(self, p9, monkeypatch): + """Fail-safe: if the verdict query errors, merge-ready refuses.""" + _seed_green(p9, 13) + _mock_gh(p9, monkeypatch, view_rc=1, view_stderr="gh down") + assert p9.main(["merge-ready", "13", "--repo", "o/r"]) == p9.EXIT_DEGRADED + assert p9.current_pr_state(13) == p9.PRState.GREEN