From 7f63d4e26f1a69fc9860e3cb286c6bad5751461a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kov=C3=A1cs=20P=C3=A9ter?= Date: Mon, 22 Jun 2026 12:50:02 +0200 Subject: [PATCH 01/15] feat: add unified-diff parser for tokenless diff provider (#2457) --- pr_agent/git_providers/diff_parsing.py | 95 ++++++++++++++++++++++++++ requirements.txt | 1 + tests/unittest/test_diff_parsing.py | 67 ++++++++++++++++++ 3 files changed, 163 insertions(+) create mode 100644 pr_agent/git_providers/diff_parsing.py create mode 100644 tests/unittest/test_diff_parsing.py diff --git a/pr_agent/git_providers/diff_parsing.py b/pr_agent/git_providers/diff_parsing.py new file mode 100644 index 0000000000..15f6cc7993 --- /dev/null +++ b/pr_agent/git_providers/diff_parsing.py @@ -0,0 +1,95 @@ +from unidiff import PatchSet + +from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo +from pr_agent.log import get_logger + + +def _strip_prefix(path: str) -> str: + if path is None: + return None + if path.startswith(("a/", "b/")): + return path[2:] + return path + + +def parse_unified_diff(diff_text: str) -> list[FilePatchInfo]: + """Parse a unified diff into FilePatchInfo objects (patch + metadata only). + + base_file / head_file are left empty here; the provider fills them from the + working tree and by reverse-applying the patch. Binary files are skipped. + """ + patch_set = PatchSet(diff_text) + files: list[FilePatchInfo] = [] + for pf in patch_set: + if pf.is_binary_file: + get_logger().info(f"Skipping binary file in diff: {pf.path}") + continue + if pf.is_added_file: + edit_type = EDIT_TYPE.ADDED + elif pf.is_removed_file: + edit_type = EDIT_TYPE.DELETED + elif pf.is_rename: + edit_type = EDIT_TYPE.RENAMED + else: + edit_type = EDIT_TYPE.MODIFIED + + if pf.is_removed_file: # target is /dev/null: use source path as the name + filename = _strip_prefix(pf.source_file) + else: + filename = _strip_prefix(pf.target_file) + old_filename = _strip_prefix(pf.source_file) if pf.is_rename else None + + files.append( + FilePatchInfo( + base_file="", + head_file="", + patch=str(pf), + filename=filename, + edit_type=edit_type, + old_filename=old_filename, + ) + ) + return files + + +def reconstruct_base_file(head_file_str: str, patch_str: str) -> str: + """Reverse-apply a single-file unified diff to head (new) content to recover + base (original) content. Returns "" if the patch does not cleanly apply.""" + try: + patch_set = PatchSet(patch_str) + except Exception as e: + get_logger().info(f"Could not parse patch for base reconstruction: {e}") + return "" + if len(patch_set) != 1: + return "" + + head_lines = head_file_str.splitlines() + base_lines: list[str] = [] + head_idx = 0 # 0-based cursor into head_lines + + for hunk in patch_set[0]: + hunk_head_start = hunk.target_start - 1 # 1-based -> 0-based + if hunk_head_start < head_idx or hunk_head_start > len(head_lines): + return "" # out-of-order / out-of-bounds hunk + base_lines.extend(head_lines[head_idx:hunk_head_start]) + head_idx = hunk_head_start + + for line in hunk: + value = line.value.rstrip("\n") + if line.is_context: + if head_idx >= len(head_lines) or head_lines[head_idx] != value: + return "" + base_lines.append(head_lines[head_idx]) + head_idx += 1 + elif line.is_added: # present only in head: verify + skip + if head_idx >= len(head_lines) or head_lines[head_idx] != value: + return "" + head_idx += 1 + elif line.is_removed: # present only in base: emit, don't consume head + base_lines.append(value) + + base_lines.extend(head_lines[head_idx:]) + result = "\n".join(base_lines) + if head_file_str.endswith("\n") and result: + result += "\n" + return result diff --git a/requirements.txt b/requirements.txt index 3d06a555ea..dbf326b939 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,6 +27,7 @@ retry==0.9.2 starlette-context==0.3.6 tiktoken==0.12.0 ujson==5.8.0 +unidiff==0.7.5 uvicorn==0.22.0 tenacity==8.2.3 a2a-sdk[http-server]==1.0.3 diff --git a/tests/unittest/test_diff_parsing.py b/tests/unittest/test_diff_parsing.py new file mode 100644 index 0000000000..863a76639a --- /dev/null +++ b/tests/unittest/test_diff_parsing.py @@ -0,0 +1,67 @@ +from pr_agent.algo.types import EDIT_TYPE +from pr_agent.git_providers.diff_parsing import parse_unified_diff + +MODIFY_DIFF = """diff --git a/foo.py b/foo.py +index 1111111..2222222 100644 +--- a/foo.py ++++ b/foo.py +@@ -1,3 +1,3 @@ + line1 +-line2 ++line2-changed + line3 +""" + +ADD_DIFF = """diff --git a/new.py b/new.py +new file mode 100644 +index 0000000..3333333 +--- /dev/null ++++ b/new.py +@@ -0,0 +1,2 @@ ++hello ++world +""" + +DELETE_DIFF = """diff --git a/gone.py b/gone.py +deleted file mode 100644 +index 4444444..0000000 +--- a/gone.py ++++ /dev/null +@@ -1,1 +0,0 @@ +-bye +""" + +RENAME_DIFF = """diff --git a/old.py b/renamed.py +similarity index 100% +rename from old.py +rename to renamed.py +""" + + +def test_parse_modify(): + files = parse_unified_diff(MODIFY_DIFF) + assert len(files) == 1 + f = files[0] + assert f.filename == "foo.py" + assert f.edit_type == EDIT_TYPE.MODIFIED + assert f.old_filename is None + assert "line2-changed" in f.patch + + +def test_parse_add(): + f = parse_unified_diff(ADD_DIFF)[0] + assert f.filename == "new.py" + assert f.edit_type == EDIT_TYPE.ADDED + + +def test_parse_delete(): + f = parse_unified_diff(DELETE_DIFF)[0] + assert f.filename == "gone.py" + assert f.edit_type == EDIT_TYPE.DELETED + + +def test_parse_rename(): + f = parse_unified_diff(RENAME_DIFF)[0] + assert f.filename == "renamed.py" + assert f.edit_type == EDIT_TYPE.RENAMED + assert f.old_filename == "old.py" From 5f0c0791f4e6110ea5d6b6652674301ed3845c5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kov=C3=A1cs=20P=C3=A9ter?= Date: Mon, 22 Jun 2026 12:50:35 +0200 Subject: [PATCH 02/15] feat: reconstruct base file by reverse-applying diff (#2457) --- tests/unittest/test_diff_parsing.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/unittest/test_diff_parsing.py b/tests/unittest/test_diff_parsing.py index 863a76639a..e88b3198bd 100644 --- a/tests/unittest/test_diff_parsing.py +++ b/tests/unittest/test_diff_parsing.py @@ -65,3 +65,27 @@ def test_parse_rename(): assert f.filename == "renamed.py" assert f.edit_type == EDIT_TYPE.RENAMED assert f.old_filename == "old.py" + + +from pr_agent.git_providers.diff_parsing import reconstruct_base_file + +_PATCH = """--- a/foo.py ++++ b/foo.py +@@ -1,3 +1,3 @@ + line1 +-line2 ++line2-changed + line3 +""" + +HEAD = "line1\nline2-changed\nline3\n" +BASE = "line1\nline2\nline3\n" + + +def test_reconstruct_base_success(): + assert reconstruct_base_file(HEAD, _PATCH) == BASE + + +def test_reconstruct_base_drift_returns_empty(): + drifted_head = "completely\ndifferent\ncontent\n" + assert reconstruct_base_file(drifted_head, _PATCH) == "" From 2fe67733d630561d40a8d0556b75ca4d88366287 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kov=C3=A1cs=20P=C3=A9ter?= Date: Mon, 22 Jun 2026 12:55:47 +0200 Subject: [PATCH 03/15] refactor: harden diff base reconstruction (CRLF, newline, tests) (#2457) --- pr_agent/git_providers/diff_parsing.py | 6 +-- tests/unittest/test_diff_parsing.py | 70 ++++++++++++++++++++++++-- 2 files changed, 70 insertions(+), 6 deletions(-) diff --git a/pr_agent/git_providers/diff_parsing.py b/pr_agent/git_providers/diff_parsing.py index 15f6cc7993..45e2caae15 100644 --- a/pr_agent/git_providers/diff_parsing.py +++ b/pr_agent/git_providers/diff_parsing.py @@ -4,7 +4,7 @@ from pr_agent.log import get_logger -def _strip_prefix(path: str) -> str: +def _strip_prefix(path: str | None) -> str | None: if path is None: return None if path.startswith(("a/", "b/")): @@ -75,7 +75,7 @@ def reconstruct_base_file(head_file_str: str, patch_str: str) -> str: head_idx = hunk_head_start for line in hunk: - value = line.value.rstrip("\n") + value = line.value.rstrip("\r\n") if line.is_context: if head_idx >= len(head_lines) or head_lines[head_idx] != value: return "" @@ -90,6 +90,6 @@ def reconstruct_base_file(head_file_str: str, patch_str: str) -> str: base_lines.extend(head_lines[head_idx:]) result = "\n".join(base_lines) - if head_file_str.endswith("\n") and result: + if head_file_str.endswith("\n"): result += "\n" return result diff --git a/tests/unittest/test_diff_parsing.py b/tests/unittest/test_diff_parsing.py index e88b3198bd..a4b715f18b 100644 --- a/tests/unittest/test_diff_parsing.py +++ b/tests/unittest/test_diff_parsing.py @@ -1,5 +1,5 @@ from pr_agent.algo.types import EDIT_TYPE -from pr_agent.git_providers.diff_parsing import parse_unified_diff +from pr_agent.git_providers.diff_parsing import parse_unified_diff, reconstruct_base_file MODIFY_DIFF = """diff --git a/foo.py b/foo.py index 1111111..2222222 100644 @@ -67,8 +67,6 @@ def test_parse_rename(): assert f.old_filename == "old.py" -from pr_agent.git_providers.diff_parsing import reconstruct_base_file - _PATCH = """--- a/foo.py +++ b/foo.py @@ -1,3 +1,3 @@ @@ -89,3 +87,69 @@ def test_reconstruct_base_success(): def test_reconstruct_base_drift_returns_empty(): drifted_head = "completely\ndifferent\ncontent\n" assert reconstruct_base_file(drifted_head, _PATCH) == "" + + +# --- new tests for reconstruct_base_file --- + +_MULTI_HUNK_PATCH = """--- a/bar.py ++++ b/bar.py +@@ -1,3 +1,3 @@ + alpha +-beta ++beta-new + gamma +@@ -5,3 +5,3 @@ + delta +-epsilon ++epsilon-new + zeta +""" + +_MULTI_HUNK_HEAD = "alpha\nbeta-new\ngamma\n\ndelta\nepsilon-new\nzeta\n" +_MULTI_HUNK_BASE = "alpha\nbeta\ngamma\n\ndelta\nepsilon\nzeta\n" + + +def test_reconstruct_base_multi_hunk(): + """Base is correctly reconstructed across two hunks.""" + result = reconstruct_base_file(_MULTI_HUNK_HEAD, _MULTI_HUNK_PATCH) + assert result == _MULTI_HUNK_BASE + + +_NO_TRAILING_NL_PATCH = """--- a/noeol.py ++++ b/noeol.py +@@ -1,3 +1,3 @@ + line1 +-line2 ++line2-changed + line3 +""" + +_NO_TRAILING_NL_HEAD = "line1\nline2-changed\nline3" # no trailing newline +_NO_TRAILING_NL_BASE = "line1\nline2\nline3" # no trailing newline expected + + +def test_reconstruct_base_no_trailing_newline(): + """Result has no trailing newline when head has none.""" + result = reconstruct_base_file(_NO_TRAILING_NL_HEAD, _NO_TRAILING_NL_PATCH) + assert result == _NO_TRAILING_NL_BASE + assert not result.endswith("\n") + + +# A diff that added a file (base was empty); reversed: head is the added content, +# base should be empty. Because head ends with "\n", our fix appends "\n" to the +# empty join, so the result is "\n". +_ADD_FILE_PATCH = """--- /dev/null ++++ b/new.py +@@ -0,0 +1,2 @@ ++hello ++world +""" + +_ADD_FILE_HEAD = "hello\nworld\n" + + +def test_reconstruct_base_add_to_empty(): + """Reversing an add-file patch yields an empty (or lone-newline) base.""" + result = reconstruct_base_file(_ADD_FILE_HEAD, _ADD_FILE_PATCH) + # head ends with "\n", so the trailing-newline guard appends "\n" to "". + assert result == "\n" From 3a7f74e4e671a01f353f7442a424e616c4c6af48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kov=C3=A1cs=20P=C3=A9ter?= Date: Mon, 22 Jun 2026 13:00:36 +0200 Subject: [PATCH 04/15] feat: add DiffGitProvider with stdout/file output (#2457) --- pr_agent/git_providers/__init__.py | 4 +- pr_agent/git_providers/diff_provider.py | 136 ++++++++++++++++++++++++ tests/unittest/test_diff_provider.py | 57 ++++++++++ 3 files changed, 196 insertions(+), 1 deletion(-) create mode 100644 pr_agent/git_providers/diff_provider.py create mode 100644 tests/unittest/test_diff_provider.py diff --git a/pr_agent/git_providers/__init__.py b/pr_agent/git_providers/__init__.py index 055cdbf140..1b52f26247 100644 --- a/pr_agent/git_providers/__init__.py +++ b/pr_agent/git_providers/__init__.py @@ -13,6 +13,7 @@ from pr_agent.git_providers.gitlab_provider import GitLabProvider from pr_agent.git_providers.local_git_provider import LocalGitProvider from pr_agent.git_providers.gitea_provider import GiteaProvider +from pr_agent.git_providers.diff_provider import DiffGitProvider _GIT_PROVIDERS = { 'github': GithubProvider, @@ -23,7 +24,8 @@ 'codecommit': CodeCommitProvider, 'local': LocalGitProvider, 'gerrit': GerritProvider, - 'gitea': GiteaProvider + 'gitea': GiteaProvider, + 'diff': DiffGitProvider, } diff --git a/pr_agent/git_providers/diff_provider.py b/pr_agent/git_providers/diff_provider.py new file mode 100644 index 0000000000..d3b50ef850 --- /dev/null +++ b/pr_agent/git_providers/diff_provider.py @@ -0,0 +1,136 @@ +import os +from collections import Counter +from typing import List, Optional + +from pr_agent.algo.types import FilePatchInfo +from pr_agent.config_loader import get_settings +from pr_agent.git_providers.diff_parsing import (parse_unified_diff, + reconstruct_base_file) +from pr_agent.git_providers.git_provider import GitProvider +from pr_agent.log import get_logger + + +class PullRequestMimic: + def __init__(self, title: str, diff_files: List[FilePatchInfo]): + self.title = title + self.diff_files = diff_files + + +class DiffGitProvider(GitProvider): + """Tokenless provider that reviews a raw unified diff (stdin/file). + + The diff text and optional output path are read from global settings + (diff.content, diff.output_path). The pr_url arg is an ignored sentinel. + """ + + def __init__(self, pr_url=None, incremental=False): + diff_text = get_settings().get("diff.content", None) + if not diff_text or not str(diff_text).strip(): + raise ValueError("No diff content provided for the 'diff' git provider") + self.diff_text = diff_text + self.output_path = get_settings().get("diff.output_path", None) + self.diff_files = None + self.pr = PullRequestMimic(self.get_pr_title(), self.get_diff_files()) + # inline code comments are not supported for the diff provider + get_settings().pr_reviewer.inline_code_comments = False + + def get_diff_files(self) -> List[FilePatchInfo]: + if self.diff_files is not None: + return self.diff_files + files = parse_unified_diff(self.diff_text) + for f in files: + head = "" + if f.filename and os.path.isfile(f.filename): + try: + with open(f.filename, "r", encoding="utf-8") as fh: + head = fh.read() + except Exception as e: + get_logger().info(f"Could not read working-tree file {f.filename}: {e}") + f.head_file = head + f.base_file = reconstruct_base_file(head, f.patch) if head else "" + self.diff_files = files + return files + + def get_files(self) -> List[str]: + return [f.filename for f in self.get_diff_files()] + + def _write_output(self, content: str): + print(content) + if self.output_path: + try: + with open(self.output_path, "w", encoding="utf-8") as fh: + fh.write(content) + except Exception as e: + get_logger().error(f"Failed to write output to {self.output_path}: {e}") + + def publish_comment(self, pr_comment: str, is_temporary: bool = False): + if is_temporary: + return # don't emit "Preparing review..." placeholders to stdout + self._write_output(pr_comment) + + def publish_description(self, pr_title: str, pr_body: str): + self._write_output(f"{pr_title}\n\n{pr_body}") + + def is_supported(self, capability: str) -> bool: + if capability in ["get_issue_comments", "create_inline_comment", + "publish_inline_comments", "get_labels"]: + return False + return True + + def get_languages(self): + files = [f.filename for f in self.get_diff_files() if f.filename] + lang_count = Counter(os.path.splitext(name)[1].lstrip(".").lower() for name in files) + total = sum(lang_count.values()) or 1 + return {lang: count / total * 100 for lang, count in lang_count.items()} + + def get_pr_title(self): + return "Local diff review" + + def get_pr_description_full(self): + return "" + + def get_user_id(self): + return -1 + + def get_pr_branch(self): + return "" + + # ---- unsupported publish operations (no-op or NotImplementedError) ---- + def publish_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str, original_suggestion=None): + raise NotImplementedError("Inline comments are not supported by the diff provider") + + def publish_inline_comments(self, comments: list): + raise NotImplementedError("Inline comments are not supported by the diff provider") + + def publish_code_suggestion(self, body: str, relevant_file: str, relevant_lines_start: int, relevant_lines_end: int): + raise NotImplementedError("Code suggestions are not supported by the diff provider") + + def publish_code_suggestions(self, code_suggestions: list) -> bool: + raise NotImplementedError("Code suggestions are not supported by the diff provider") + + def publish_labels(self, labels): + pass + + def remove_initial_comment(self): + pass + + def remove_comment(self, comment): + pass + + def add_eyes_reaction(self, issue_comment_id: int, disable_eyes: bool = False) -> Optional[int]: + pass + + def remove_reaction(self, issue_comment_id: int, reaction_id: int) -> bool: + pass + + def get_commit_messages(self): + return "" + + def get_repo_settings(self): + return None + + def get_issue_comments(self): + raise NotImplementedError("Issue comments are not supported by the diff provider") + + def get_pr_labels(self, update=False): + return [] diff --git a/tests/unittest/test_diff_provider.py b/tests/unittest/test_diff_provider.py new file mode 100644 index 0000000000..a578d72617 --- /dev/null +++ b/tests/unittest/test_diff_provider.py @@ -0,0 +1,57 @@ +from pr_agent.algo.types import EDIT_TYPE +from pr_agent.config_loader import get_settings +from pr_agent.git_providers import _GIT_PROVIDERS +from pr_agent.git_providers.diff_provider import DiffGitProvider + +DIFF = """diff --git a/foo.py b/foo.py +index 1111111..2222222 100644 +--- a/foo.py ++++ b/foo.py +@@ -1,3 +1,3 @@ + line1 +-line2 ++line2-changed + line3 +""" + + +def test_registered(): + assert _GIT_PROVIDERS["diff"] is DiffGitProvider + + +def test_get_diff_files(monkeypatch): + get_settings().set("diff.content", DIFF) + get_settings().set("diff.output_path", None) + provider = DiffGitProvider(None) + files = provider.get_diff_files() + assert len(files) == 1 + assert files[0].filename == "foo.py" + assert files[0].edit_type == EDIT_TYPE.MODIFIED + + +def test_publish_comment_to_stdout(capsys): + get_settings().set("diff.content", DIFF) + get_settings().set("diff.output_path", None) + provider = DiffGitProvider(None) + provider.publish_comment("# Review\nlooks good") + captured = capsys.readouterr() + assert "looks good" in captured.out + + +def test_publish_comment_to_file(tmp_path): + out = tmp_path / "review.md" + get_settings().set("diff.content", DIFF) + get_settings().set("diff.output_path", str(out)) + provider = DiffGitProvider(None) + provider.publish_comment("# Review\nsaved") + assert "saved" in out.read_text(encoding="utf-8") + + +def test_empty_diff_raises(): + get_settings().set("diff.content", "") + get_settings().set("diff.output_path", None) + try: + DiffGitProvider(None) + assert False, "expected ValueError" + except ValueError: + pass From eb2bf46c71bffad703c9e281140900b5d123d8aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kov=C3=A1cs=20P=C3=A9ter?= Date: Mon, 22 Jun 2026 13:05:23 +0200 Subject: [PATCH 05/15] fix: guard malformed diff parsing in DiffGitProvider (#2457) --- pr_agent/git_providers/diff_provider.py | 5 ++++- tests/unittest/test_diff_provider.py | 21 +++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/pr_agent/git_providers/diff_provider.py b/pr_agent/git_providers/diff_provider.py index d3b50ef850..4cad34c372 100644 --- a/pr_agent/git_providers/diff_provider.py +++ b/pr_agent/git_providers/diff_provider.py @@ -37,7 +37,10 @@ def __init__(self, pr_url=None, incremental=False): def get_diff_files(self) -> List[FilePatchInfo]: if self.diff_files is not None: return self.diff_files - files = parse_unified_diff(self.diff_text) + try: + files = parse_unified_diff(self.diff_text) + except Exception as e: + raise ValueError(f"Failed to parse the provided diff: {e}") from e for f in files: head = "" if f.filename and os.path.isfile(f.filename): diff --git a/tests/unittest/test_diff_provider.py b/tests/unittest/test_diff_provider.py index a578d72617..fcc719bcaf 100644 --- a/tests/unittest/test_diff_provider.py +++ b/tests/unittest/test_diff_provider.py @@ -55,3 +55,24 @@ def test_empty_diff_raises(): assert False, "expected ValueError" except ValueError: pass + + +def test_temporary_comment_not_emitted(capsys): + get_settings().set("diff.content", DIFF) + get_settings().set("diff.output_path", None) + provider = DiffGitProvider(None) + provider.publish_comment("Preparing review...", is_temporary=True) + captured = capsys.readouterr() + assert "Preparing review" not in captured.out + + +def test_malformed_diff_raises_valueerror(): + # A hunk with no file header triggers UnidiffParseError inside parse_unified_diff, + # which the provider must re-raise as ValueError with a clear message. + get_settings().set("diff.content", "@@ -1,3 +1,3 @@\n line1\n-line2\n+line2-changed\n line3\n") + get_settings().set("diff.output_path", None) + try: + DiffGitProvider(None) + assert False, "expected ValueError" + except ValueError: + pass From e4868f837e052a695ab77e924c0ec6926704b8dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kov=C3=A1cs=20P=C3=A9ter?= Date: Mon, 22 Jun 2026 13:11:48 +0200 Subject: [PATCH 06/15] feat: CLI --stdin/--diff-file/--output for tokenless diff mode (#2457) --- pr_agent/cli.py | 26 ++++++++++++++++++++++++-- tests/unittest/test_cli_diff_mode.py | 15 +++++++++++++++ 2 files changed, 39 insertions(+), 2 deletions(-) create mode 100644 tests/unittest/test_cli_diff_mode.py diff --git a/pr_agent/cli.py b/pr_agent/cli.py index 246105af20..a1e29b447c 100644 --- a/pr_agent/cli.py +++ b/pr_agent/cli.py @@ -1,6 +1,7 @@ import argparse import asyncio import os +import sys from pr_agent.agent.pr_agent import PRAgent, commands from pr_agent.algo.utils import get_version @@ -65,6 +66,12 @@ def set_parser(): "Repo-local .pr_agent.toml overrides values set here." ), ) + parser.add_argument('--diff-file', dest='diff_file', type=str, default=None, + help='Path to a unified diff file to review (tokenless local mode)') + parser.add_argument('--stdin', action='store_true', default=False, + help='Read a unified diff from stdin (tokenless local mode)') + parser.add_argument('--output', dest='output', type=str, default=None, + help='Write the result to this file (in addition to stdout)') parser.add_argument('command', type=str, help='The', choices=commands, default='review') parser.add_argument('rest', nargs=argparse.REMAINDER, default=[]) return parser @@ -83,7 +90,21 @@ def run(inargs=None, args=None): parser = set_parser() if not args: args = parser.parse_args(inargs) - if not args.pr_url and not args.issue_url: + diff_mode = getattr(args, "stdin", False) or getattr(args, "diff_file", None) + if diff_mode: + if args.stdin and args.diff_file: + parser.error("--stdin and --diff-file are mutually exclusive") + if args.diff_file: + with open(args.diff_file, "r", encoding="utf-8") as fh: + diff_content = fh.read() + else: + diff_content = sys.stdin.read() + if not diff_content.strip(): + parser.error("No diff content received (empty stdin/file)") + get_settings().set("config.git_provider", "diff") + get_settings().set("diff.content", diff_content) + get_settings().set("diff.output_path", getattr(args, "output", None)) + elif not args.pr_url and not args.issue_url: parser.print_help() return @@ -106,7 +127,8 @@ async def inner(): if args.issue_url: result = await asyncio.create_task(PRAgent().handle_request(args.issue_url, [command] + args.rest)) else: - result = await asyncio.create_task(PRAgent().handle_request(args.pr_url, [command] + args.rest)) + target = args.pr_url if args.pr_url else "local_diff" + result = await asyncio.create_task(PRAgent().handle_request(target, [command] + args.rest)) if get_settings().litellm.get("enable_callbacks", False): # There may be additional events on the event queue from the run above. If there are give them time to complete. diff --git a/tests/unittest/test_cli_diff_mode.py b/tests/unittest/test_cli_diff_mode.py new file mode 100644 index 0000000000..223a6f13b4 --- /dev/null +++ b/tests/unittest/test_cli_diff_mode.py @@ -0,0 +1,15 @@ +from pr_agent.cli import set_parser + + +def test_parser_has_diff_flags(): + parser = set_parser() + args = parser.parse_args(["--diff-file", "x.diff", "--output", "out.md", "review"]) + assert args.diff_file == "x.diff" + assert args.output == "out.md" + assert args.command == "review" + + +def test_parser_stdin_flag(): + parser = set_parser() + args = parser.parse_args(["--stdin", "review"]) + assert args.stdin is True From 4416ce27ab90f98eb1e56937eb4d83b6e0b0065d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kov=C3=A1cs=20P=C3=A9ter?= Date: Mon, 22 Jun 2026 13:15:25 +0200 Subject: [PATCH 07/15] test: end-to-end coverage for tokenless diff provider (#2457) --- tests/unittest/test_diff_mode_e2e.py | 30 ++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 tests/unittest/test_diff_mode_e2e.py diff --git a/tests/unittest/test_diff_mode_e2e.py b/tests/unittest/test_diff_mode_e2e.py new file mode 100644 index 0000000000..6f63660cb8 --- /dev/null +++ b/tests/unittest/test_diff_mode_e2e.py @@ -0,0 +1,30 @@ +import pytest + +from pr_agent.config_loader import get_settings +from pr_agent.git_providers.diff_provider import DiffGitProvider + +DIFF = """diff --git a/foo.py b/foo.py +index 1111111..2222222 100644 +--- a/foo.py ++++ b/foo.py +@@ -1,3 +1,3 @@ + line1 +-line2 ++line2-changed + line3 +""" + + +def test_provider_end_to_end_files_and_output(capsys): + get_settings().set("diff.content", DIFF) + get_settings().set("diff.output_path", None) + provider = DiffGitProvider(None) + + # files parsed and content reconstructed where working tree is absent + files = provider.get_diff_files() + assert files[0].filename == "foo.py" + assert files[0].base_file == "" # foo.py not on disk -> patch-only fallback + + # output reaches stdout + provider.publish_comment("## PR Review\n- finding one") + assert "finding one" in capsys.readouterr().out From 8306df744f8001dfd6b0b038fff913246268cc4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kov=C3=A1cs=20P=C3=A9ter?= Date: Mon, 22 Jun 2026 13:21:35 +0200 Subject: [PATCH 08/15] test: strengthen diff provider e2e (base reconstruction + mocked-LLM) (#2457) --- tests/unittest/test_diff_mode_e2e.py | 92 ++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/tests/unittest/test_diff_mode_e2e.py b/tests/unittest/test_diff_mode_e2e.py index 6f63660cb8..6c4fbc0c50 100644 --- a/tests/unittest/test_diff_mode_e2e.py +++ b/tests/unittest/test_diff_mode_e2e.py @@ -1,7 +1,9 @@ import pytest +from unittest.mock import AsyncMock, MagicMock from pr_agent.config_loader import get_settings from pr_agent.git_providers.diff_provider import DiffGitProvider +from pr_agent.algo.ai_handlers.base_ai_handler import BaseAiHandler DIFF = """diff --git a/foo.py b/foo.py index 1111111..2222222 100644 @@ -23,8 +25,98 @@ def test_provider_end_to_end_files_and_output(capsys): # files parsed and content reconstructed where working tree is absent files = provider.get_diff_files() assert files[0].filename == "foo.py" + # The causal condition: head_file is empty because foo.py is not on disk + assert files[0].head_file == "" + # Consequence: base_file is also empty (patch-only fallback, no reconstruction) assert files[0].base_file == "" # foo.py not on disk -> patch-only fallback # output reaches stdout provider.publish_comment("## PR Review\n- finding one") assert "finding one" in capsys.readouterr().out + + +def test_base_file_reconstructed_from_working_tree(tmp_path, monkeypatch): + """When the working-tree file exists, head_file is populated and base_file + is reconstructed from it by reversing the diff patch.""" + # The HEAD (current) content of foo.py — line2 has already been changed + head_content = "line1\nline2-changed\nline3\n" + foo = tmp_path / "foo.py" + foo.write_text(head_content, encoding="utf-8") + + # Change cwd so the provider's relative-path lookup resolves into tmp_path + monkeypatch.chdir(tmp_path) + + get_settings().set("diff.content", DIFF) + get_settings().set("diff.output_path", None) + provider = DiffGitProvider(None) + + files = provider.get_diff_files() + assert files[0].filename == "foo.py" + + # head_file must be the working-tree content (non-empty) + assert files[0].head_file != "", "head_file should be populated from the working-tree file" + assert "line2-changed" in files[0].head_file + + # base_file must be reconstructed (non-empty) + assert files[0].base_file != "", "base_file should be reconstructed by reversing the patch" + # The original (pre-change) content must contain the old line, not the new one + assert "line2" in files[0].base_file + assert "line2-changed" not in files[0].base_file + + +@pytest.mark.asyncio +async def test_review_command_through_diff_provider_mocked_llm(monkeypatch): + """Integration test: drives PRReviewer with a fake AI handler through the + diff provider. We assert that the AI handler is invoked with a prompt that + contains the changed content from the diff, proving end-to-end wiring from + diff -> DiffGitProvider -> PRReviewer -> LLM boundary. + + We do NOT attempt to parse back a full schema-valid review YAML from the + fake response, because the exact expected schema is prone to breaking with + prompt changes. The 'handler was called with the diff content' assertion is + the meaningful claim here. + """ + from pr_agent.tools.pr_reviewer import PRReviewer + + # --- configure provider --- + get_settings().set("config.git_provider", "diff") + get_settings().set("diff.content", DIFF) + get_settings().set("diff.output_path", None) + # Disable publish so we don't need a real comment sink + get_settings().set("config.publish_output", False) + + # --- fake AI handler --- + # PRReviewer calls ai_handler() (a factory/partial), so we pass a class + # whose constructor returns the fake instance. + calls = [] # records (system, user) for each chat_completion call + + class FakeAiHandler(BaseAiHandler): + def __init__(self): + # No super().__init__() — BaseAiHandler is abstract, nothing to call + self.main_pr_language = None + + @property + def deployment_id(self): + return "fake" + + async def chat_completion(self, model: str, system: str, user: str, + temperature: float = 0.2, img_path: str = None): + calls.append({"system": system, "user": user}) + # Return a minimal response; PRReviewer will store this as self.prediction + # and then attempt _prepare_pr_review(). If parsing fails the run() + # method catches the exception gracefully, so the test won't error out. + return ("## Review\nNo major issues detected.", "stop") + + reviewer = PRReviewer("local_diff", ai_handler=FakeAiHandler, args=[]) + await reviewer.run() + + # The AI handler must have been called at least once + assert len(calls) >= 1, "FakeAiHandler.chat_completion was never called — end-to-end wiring is broken" + + # The prompt sent to the LLM must contain content derived from the diff + # (the changed line "line2-changed" appears in the diff hunk) + all_prompt_text = " ".join(c["system"] + c["user"] for c in calls) + assert "line2" in all_prompt_text, ( + "The diff content ('line2') was not found in the prompt sent to the AI handler. " + "End-to-end wiring from diff -> provider -> reviewer -> LLM is broken." + ) From 5b83516e9a30af515fe3e65aa5aec3af5d4941e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kov=C3=A1cs=20P=C3=A9ter?= Date: Mon, 22 Jun 2026 13:25:32 +0200 Subject: [PATCH 09/15] docs: document tokenless local diff mode (#2457) --- docs/docs/usage-guide/tokenless_diff_mode.md | 75 ++++++++++++++++++++ docs/mkdocs.yml | 1 + 2 files changed, 76 insertions(+) create mode 100644 docs/docs/usage-guide/tokenless_diff_mode.md diff --git a/docs/docs/usage-guide/tokenless_diff_mode.md b/docs/docs/usage-guide/tokenless_diff_mode.md new file mode 100644 index 0000000000..08cc7559c2 --- /dev/null +++ b/docs/docs/usage-guide/tokenless_diff_mode.md @@ -0,0 +1,75 @@ +## Tokenless local diff mode + +Run PR-Agent against a raw unified diff with no platform API token and no PR URL. +Results are printed to stdout (and optionally saved to a file). This suits security-first +or air-gapped environments where HTTP access tokens are avoided, and enables pre-push +hooks or CI pipelines that operate on a diff artifact rather than a live pull request. + +## Usage + +Pipe a diff directly from stdin: + +```bash +git diff main...feature-branch | python -m pr_agent.cli --stdin review +``` + +Or pass a diff file and save the output alongside stdout: + +```bash +git diff main...feature-branch > changes.diff +python -m pr_agent.cli --diff-file changes.diff --output review.md review +``` + +### Flags + +| Flag | Description | +|---|---| +| `--stdin` | Read a unified diff from stdin | +| `--diff-file ` | Read a unified diff from a file | +| `--output ` | Write the result to a file in addition to stdout | + +`--stdin` and `--diff-file` are mutually exclusive. At least one must be provided to +enter tokenless diff mode; omitting both falls back to the normal `--pr_url` flow. + +### Supported commands + +`review`, `improve`, `describe`, and `ask` are supported. Commands that require live +platform interaction (such as `update_changelog` or `similar_issue`) are not meaningful +in this mode. + +## How it works + +1. **Diff parsing** — the unified diff is parsed locally into per-file patch objects. + Binary files are skipped automatically. + +2. **Working-tree enrichment** — when PR-Agent is run inside the repository working tree, + it reads each changed file from disk and reverse-applies the diff to reconstruct the + base (pre-change) file content. Having both the base and head versions available gives + the LLM full file context, which produces higher-quality analysis. + +3. **Patch-only fallback** — if a changed file cannot be found on disk (e.g. the diff was + generated elsewhere, or the file was deleted), PR-Agent falls back to patch-only mode + for that file. The review still runs; it simply has less context. + +4. **Output** — the result is written to stdout. If `--output ` is given, it is + also written to that file (UTF-8, overwritten on each run). + +No platform token, no PR URL, and no internet access are required for the diff processing +step itself. An LLM API key is still needed unless you configure a local model. + +## Difference from the `local` git provider + +The existing `git_provider = "local"` mode (invoked with `--pr_url`) computes a diff +by comparing branches in a local Git repository and requires a clean working tree. +The diff mode is different in the following ways: + +| | `local` provider | `diff` provider (this page) | +|---|---|---| +| Input | Branch names in a local repo | A unified diff supplied via stdin or file | +| Working tree required | Yes (clean) | No | +| Platform token required | No | No | +| Output | GitHub-style comment published locally | stdout (+ optional file) | +| Inline comments | Supported | Not supported | + +Use the `diff` provider when you already have a diff artifact (e.g. from a CI step or +`git format-patch`) and want a zero-configuration, token-free review. diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index a448a44074..cfc72e806a 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -17,6 +17,7 @@ nav: - Managing Mail Notifications: 'usage-guide/mail_notifications.md' - Changing a Model: 'usage-guide/changing_a_model.md' - Additional Configurations: 'usage-guide/additional_configurations.md' + - Tokenless Diff Mode: 'usage-guide/tokenless_diff_mode.md' - Frequently Asked Questions: 'faq/index.md' - Tools: - 'tools/index.md' From daeb714da7945c30e08451d53bdfb2784b9341ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kov=C3=A1cs=20P=C3=A9ter?= Date: Mon, 22 Jun 2026 13:36:03 +0200 Subject: [PATCH 10/15] fix: harden DiffGitProvider capability gating and file reads (#2457) - Add publish_file_comments to is_supported() deny-list so describe walkthrough is not silently dropped with inline_file_summary=true - Guard get_diff_files() against path traversal: reject absolute paths and relative paths that resolve outside the repo root; log and fall back to patch-only mode instead of raising - Fix tokenless_diff_mode.md comparison table: local provider inline comments are Not supported (raises NotImplementedError) - Promote page title from H2 to H1 to match other usage-guide pages - Add tests: test_publish_file_comments_not_supported, test_path_traversal_file_not_read --- docs/docs/usage-guide/tokenless_diff_mode.md | 4 +-- pr_agent/git_providers/diff_provider.py | 27 +++++++++++++++----- tests/unittest/test_diff_provider.py | 27 ++++++++++++++++++++ 3 files changed, 49 insertions(+), 9 deletions(-) diff --git a/docs/docs/usage-guide/tokenless_diff_mode.md b/docs/docs/usage-guide/tokenless_diff_mode.md index 08cc7559c2..197f125b0e 100644 --- a/docs/docs/usage-guide/tokenless_diff_mode.md +++ b/docs/docs/usage-guide/tokenless_diff_mode.md @@ -1,4 +1,4 @@ -## Tokenless local diff mode +# Tokenless local diff mode Run PR-Agent against a raw unified diff with no platform API token and no PR URL. Results are printed to stdout (and optionally saved to a file). This suits security-first @@ -69,7 +69,7 @@ The diff mode is different in the following ways: | Working tree required | Yes (clean) | No | | Platform token required | No | No | | Output | GitHub-style comment published locally | stdout (+ optional file) | -| Inline comments | Supported | Not supported | +| Inline comments | Not supported | Not supported | Use the `diff` provider when you already have a diff artifact (e.g. from a CI step or `git format-patch`) and want a zero-configuration, token-free review. diff --git a/pr_agent/git_providers/diff_provider.py b/pr_agent/git_providers/diff_provider.py index 4cad34c372..97fa72121a 100644 --- a/pr_agent/git_providers/diff_provider.py +++ b/pr_agent/git_providers/diff_provider.py @@ -41,14 +41,26 @@ def get_diff_files(self) -> List[FilePatchInfo]: files = parse_unified_diff(self.diff_text) except Exception as e: raise ValueError(f"Failed to parse the provided diff: {e}") from e + root = os.path.realpath(os.getcwd()) for f in files: head = "" - if f.filename and os.path.isfile(f.filename): - try: - with open(f.filename, "r", encoding="utf-8") as fh: - head = fh.read() - except Exception as e: - get_logger().info(f"Could not read working-tree file {f.filename}: {e}") + if f.filename: + if os.path.isabs(f.filename): + get_logger().info( + f"Skipping absolute path in diff (unsafe): {f.filename}" + ) + else: + candidate = os.path.realpath(os.path.join(root, f.filename)) + if candidate != root and not candidate.startswith(root + os.sep): + get_logger().info( + f"Skipping path that escapes repo root (path traversal): {f.filename}" + ) + elif os.path.isfile(candidate): + try: + with open(candidate, "r", encoding="utf-8") as fh: + head = fh.read() + except Exception as e: + get_logger().info(f"Could not read working-tree file {f.filename}: {e}") f.head_file = head f.base_file = reconstruct_base_file(head, f.patch) if head else "" self.diff_files = files @@ -76,7 +88,8 @@ def publish_description(self, pr_title: str, pr_body: str): def is_supported(self, capability: str) -> bool: if capability in ["get_issue_comments", "create_inline_comment", - "publish_inline_comments", "get_labels"]: + "publish_inline_comments", "publish_file_comments", + "get_labels"]: return False return True diff --git a/tests/unittest/test_diff_provider.py b/tests/unittest/test_diff_provider.py index fcc719bcaf..44d5253e75 100644 --- a/tests/unittest/test_diff_provider.py +++ b/tests/unittest/test_diff_provider.py @@ -66,6 +66,33 @@ def test_temporary_comment_not_emitted(capsys): assert "Preparing review" not in captured.out +def test_publish_file_comments_not_supported(): + get_settings().set("diff.content", DIFF) + get_settings().set("diff.output_path", None) + provider = DiffGitProvider(None) + assert provider.is_supported("publish_file_comments") is False + + +def test_path_traversal_file_not_read(): + # A diff whose target path escapes the repo root must NOT be read from disk. + traversal_diff = ( + "diff --git a/../../evil.txt b/../../evil.txt\n" + "index 0000000..1111111 100644\n" + "--- a/../../evil.txt\n" + "+++ b/../../evil.txt\n" + "@@ -1,1 +1,1 @@\n" + "-old\n" + "+new\n" + ) + get_settings().set("diff.content", traversal_diff) + get_settings().set("diff.output_path", None) + provider = DiffGitProvider(None) + files = provider.get_diff_files() + assert len(files) == 1 + assert files[0].head_file == "" + assert files[0].base_file == "" + + def test_malformed_diff_raises_valueerror(): # A hunk with no file header triggers UnidiffParseError inside parse_unified_diff, # which the provider must re-raise as ValueError with a clear message. From 566cbaf49720288db21a0754461f49968258e7ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kov=C3=A1cs=20P=C3=A9ter?= Date: Mon, 22 Jun 2026 13:39:48 +0200 Subject: [PATCH 11/15] test: make path-traversal guard test a real sentinel (#2457) --- tests/unittest/test_diff_provider.py | 40 +++++++++++++++++++++------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/tests/unittest/test_diff_provider.py b/tests/unittest/test_diff_provider.py index 44d5253e75..a4b3e593c1 100644 --- a/tests/unittest/test_diff_provider.py +++ b/tests/unittest/test_diff_provider.py @@ -73,24 +73,46 @@ def test_publish_file_comments_not_supported(): assert provider.is_supported("publish_file_comments") is False -def test_path_traversal_file_not_read(): - # A diff whose target path escapes the repo root must NOT be read from disk. +def test_path_traversal_file_not_read(tmp_path, monkeypatch): + # SENTINEL TEST: this test FAILS if the path-traversal guard in + # DiffGitProvider.get_diff_files() is removed. + # + # Without the guard, os.path.isfile("../secret.txt") would be True + # (because we create the file below) and the provider would read its + # contents into head_file. With the guard in place the path escapes + # the repo root so it is rejected and head_file stays "". + # + # Setup: an inner "repo" dir is the working directory; the secret file + # lives one level up (reachable via "../secret.txt" traversal). + repo = tmp_path / "repo" + repo.mkdir() + secret = tmp_path / "secret.txt" + secret.write_text("TOP SECRET\n", encoding="utf-8") + + # Make the provider believe the repo root is `repo`. + monkeypatch.chdir(repo) + traversal_diff = ( - "diff --git a/../../evil.txt b/../../evil.txt\n" + "diff --git a/../secret.txt b/../secret.txt\n" "index 0000000..1111111 100644\n" - "--- a/../../evil.txt\n" - "+++ b/../../evil.txt\n" + "--- a/../secret.txt\n" + "+++ b/../secret.txt\n" "@@ -1,1 +1,1 @@\n" - "-old\n" - "+new\n" + "-TOP SECRET\n" + "+REPLACED\n" ) get_settings().set("diff.content", traversal_diff) get_settings().set("diff.output_path", None) provider = DiffGitProvider(None) files = provider.get_diff_files() assert len(files) == 1 - assert files[0].head_file == "" - assert files[0].base_file == "" + # Guard must block the read: both fields must remain empty strings. + assert files[0].head_file == "", ( + "Path-traversal guard failed: head_file was read from ../secret.txt" + ) + assert files[0].base_file == "", ( + "Path-traversal guard failed: base_file was populated from traversal path" + ) def test_malformed_diff_raises_valueerror(): From 304384df318200420dc7d41d5343d1cbfcdad704 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kov=C3=A1cs=20P=C3=A9ter?= Date: Mon, 22 Jun 2026 17:11:16 +0200 Subject: [PATCH 12/15] fix(diff-provider): address Qodo review findings on tokenless diff mode - cli: handle OSError/UnicodeDecodeError when reading --diff-file, failing fast with a clear parser.error instead of an uncaught traceback - diff_parsing: catch the specific UnidiffParseError (not bare Exception) in reconstruct_base_file; keep an empty reconstructed base as "" (never "\n") so downstream extend_patch() treats it as having no original file - diff_provider: narrow exception handling to UnidiffParseError for parsing and (OSError, UnicodeDecodeError) for working-tree reads; resolve diff paths against the repository root (_find_repository_root) instead of the raw CWD so enrichment still works when run from a subdirectory - tests: update add-to-empty base expectation to ""; add regression tests for subdirectory enrichment and missing --diff-file fail-fast; remove unused imports (F401) --- pr_agent/cli.py | 9 ++++++-- pr_agent/git_providers/diff_parsing.py | 8 +++++-- pr_agent/git_providers/diff_provider.py | 13 +++++++---- tests/unittest/test_cli_diff_mode.py | 14 +++++++++++- tests/unittest/test_diff_mode_e2e.py | 29 ++++++++++++++++++++++++- tests/unittest/test_diff_parsing.py | 11 +++++----- 6 files changed, 69 insertions(+), 15 deletions(-) diff --git a/pr_agent/cli.py b/pr_agent/cli.py index a1e29b447c..fae960eb6a 100644 --- a/pr_agent/cli.py +++ b/pr_agent/cli.py @@ -95,8 +95,13 @@ def run(inargs=None, args=None): if args.stdin and args.diff_file: parser.error("--stdin and --diff-file are mutually exclusive") if args.diff_file: - with open(args.diff_file, "r", encoding="utf-8") as fh: - diff_content = fh.read() + try: + with open(args.diff_file, "r", encoding="utf-8") as fh: + diff_content = fh.read() + except OSError as e: + parser.error(f"Could not read --diff-file '{args.diff_file}': {e}") + except UnicodeDecodeError as e: + parser.error(f"--diff-file '{args.diff_file}' is not valid UTF-8 text: {e}") else: diff_content = sys.stdin.read() if not diff_content.strip(): diff --git a/pr_agent/git_providers/diff_parsing.py b/pr_agent/git_providers/diff_parsing.py index 45e2caae15..dc145d4138 100644 --- a/pr_agent/git_providers/diff_parsing.py +++ b/pr_agent/git_providers/diff_parsing.py @@ -1,4 +1,5 @@ from unidiff import PatchSet +from unidiff.errors import UnidiffParseError from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo from pr_agent.log import get_logger @@ -57,7 +58,7 @@ def reconstruct_base_file(head_file_str: str, patch_str: str) -> str: base (original) content. Returns "" if the patch does not cleanly apply.""" try: patch_set = PatchSet(patch_str) - except Exception as e: + except UnidiffParseError as e: get_logger().info(f"Could not parse patch for base reconstruction: {e}") return "" if len(patch_set) != 1: @@ -90,6 +91,9 @@ def reconstruct_base_file(head_file_str: str, patch_str: str) -> str: base_lines.extend(head_lines[head_idx:]) result = "\n".join(base_lines) - if head_file_str.endswith("\n"): + # Preserve a trailing newline only when the base actually has content; an + # empty base (e.g. reversing an add-file patch) must stay "" so downstream + # extend_patch() correctly treats it as having no original file. + if base_lines and head_file_str.endswith("\n"): result += "\n" return result diff --git a/pr_agent/git_providers/diff_provider.py b/pr_agent/git_providers/diff_provider.py index 97fa72121a..eca23da8fe 100644 --- a/pr_agent/git_providers/diff_provider.py +++ b/pr_agent/git_providers/diff_provider.py @@ -2,8 +2,10 @@ from collections import Counter from typing import List, Optional +from unidiff.errors import UnidiffParseError + from pr_agent.algo.types import FilePatchInfo -from pr_agent.config_loader import get_settings +from pr_agent.config_loader import _find_repository_root, get_settings from pr_agent.git_providers.diff_parsing import (parse_unified_diff, reconstruct_base_file) from pr_agent.git_providers.git_provider import GitProvider @@ -39,9 +41,12 @@ def get_diff_files(self) -> List[FilePatchInfo]: return self.diff_files try: files = parse_unified_diff(self.diff_text) - except Exception as e: + except UnidiffParseError as e: raise ValueError(f"Failed to parse the provided diff: {e}") from e - root = os.path.realpath(os.getcwd()) + # Resolve diff paths against the actual repository root (not the raw CWD) + # so working-tree enrichment still works when run from a subdirectory. + repo_root = _find_repository_root() + root = os.path.realpath(str(repo_root) if repo_root else os.getcwd()) for f in files: head = "" if f.filename: @@ -59,7 +64,7 @@ def get_diff_files(self) -> List[FilePatchInfo]: try: with open(candidate, "r", encoding="utf-8") as fh: head = fh.read() - except Exception as e: + except (OSError, UnicodeDecodeError) as e: get_logger().info(f"Could not read working-tree file {f.filename}: {e}") f.head_file = head f.base_file = reconstruct_base_file(head, f.patch) if head else "" diff --git a/tests/unittest/test_cli_diff_mode.py b/tests/unittest/test_cli_diff_mode.py index 223a6f13b4..c10af477ac 100644 --- a/tests/unittest/test_cli_diff_mode.py +++ b/tests/unittest/test_cli_diff_mode.py @@ -1,4 +1,6 @@ -from pr_agent.cli import set_parser +import pytest + +from pr_agent.cli import run, set_parser def test_parser_has_diff_flags(): @@ -13,3 +15,13 @@ def test_parser_stdin_flag(): parser = set_parser() args = parser.parse_args(["--stdin", "review"]) assert args.stdin is True + + +def test_missing_diff_file_fails_fast(tmp_path, capsys): + """A non-existent --diff-file must exit cleanly via parser.error (SystemExit) + with a clear message, not crash with an uncaught OSError traceback.""" + missing = tmp_path / "does-not-exist.diff" + with pytest.raises(SystemExit): + run(inargs=["--diff-file", str(missing), "review"]) + err = capsys.readouterr().err + assert "Could not read --diff-file" in err diff --git a/tests/unittest/test_diff_mode_e2e.py b/tests/unittest/test_diff_mode_e2e.py index 6c4fbc0c50..dbd5914cbe 100644 --- a/tests/unittest/test_diff_mode_e2e.py +++ b/tests/unittest/test_diff_mode_e2e.py @@ -1,5 +1,4 @@ import pytest -from unittest.mock import AsyncMock, MagicMock from pr_agent.config_loader import get_settings from pr_agent.git_providers.diff_provider import DiffGitProvider @@ -38,6 +37,9 @@ def test_provider_end_to_end_files_and_output(capsys): def test_base_file_reconstructed_from_working_tree(tmp_path, monkeypatch): """When the working-tree file exists, head_file is populated and base_file is reconstructed from it by reversing the diff patch.""" + # Mark tmp_path as a repository root so _find_repository_root() resolves + # diff paths against it. + (tmp_path / ".git").mkdir() # The HEAD (current) content of foo.py — line2 has already been changed head_content = "line1\nline2-changed\nline3\n" foo = tmp_path / "foo.py" @@ -64,6 +66,31 @@ def test_base_file_reconstructed_from_working_tree(tmp_path, monkeypatch): assert "line2-changed" not in files[0].base_file +def test_base_file_reconstructed_when_run_from_subdirectory(tmp_path, monkeypatch): + """Regression for the CWD-vs-repo-root bug: enrichment must still find + working-tree files when the CLI is run from a subdirectory of the repo.""" + # tmp_path is the repo root; foo.py lives at the root, matching the diff path. + (tmp_path / ".git").mkdir() + foo = tmp_path / "foo.py" + foo.write_text("line1\nline2-changed\nline3\n", encoding="utf-8") + + # Run from a nested subdirectory, not the repo root. + subdir = tmp_path / "pkg" / "sub" + subdir.mkdir(parents=True) + monkeypatch.chdir(subdir) + + get_settings().set("diff.content", DIFF) + get_settings().set("diff.output_path", None) + provider = DiffGitProvider(None) + + files = provider.get_diff_files() + assert files[0].filename == "foo.py" + # Despite running from a subdir, the file is resolved against the repo root. + assert files[0].head_file != "", "head_file should be found via repo root, not CWD" + assert files[0].base_file != "", "base_file should be reconstructed from repo-root file" + assert "line2" in files[0].base_file + + @pytest.mark.asyncio async def test_review_command_through_diff_provider_mocked_llm(monkeypatch): """Integration test: drives PRReviewer with a fake AI handler through the diff --git a/tests/unittest/test_diff_parsing.py b/tests/unittest/test_diff_parsing.py index a4b715f18b..a6adb28d20 100644 --- a/tests/unittest/test_diff_parsing.py +++ b/tests/unittest/test_diff_parsing.py @@ -136,8 +136,8 @@ def test_reconstruct_base_no_trailing_newline(): # A diff that added a file (base was empty); reversed: head is the added content, -# base should be empty. Because head ends with "\n", our fix appends "\n" to the -# empty join, so the result is "\n". +# base must be exactly "" — never "\n" — so downstream extend_patch() correctly +# treats the original file as non-existent. _ADD_FILE_PATCH = """--- /dev/null +++ b/new.py @@ -0,0 +1,2 @@ @@ -149,7 +149,8 @@ def test_reconstruct_base_no_trailing_newline(): def test_reconstruct_base_add_to_empty(): - """Reversing an add-file patch yields an empty (or lone-newline) base.""" + """Reversing an add-file patch yields a truly empty base, not a lone newline.""" result = reconstruct_base_file(_ADD_FILE_HEAD, _ADD_FILE_PATCH) - # head ends with "\n", so the trailing-newline guard appends "\n" to "". - assert result == "\n" + # Even though head ends with "\n", an empty base must stay "" (no trailing + # newline appended) so it is falsy for extend_patch(). + assert result == "" From dfb19b228e888598fa9d1483a5db61587a9e5828 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kov=C3=A1cs=20P=C3=A9ter?= Date: Tue, 23 Jun 2026 07:21:47 +0200 Subject: [PATCH 13/15] fix(diff-provider): resolve Qodo follow-up findings on tokenless diff mode Addresses the new findings raised after the previous fix round: - improve crash (correctness): DiffGitProvider now implements publish_code_suggestions()/publish_code_suggestion() to render suggestions as a markdown document to stdout/--output instead of raising NotImplementedError, so `--stdin/--diff-file improve` works as documented - CWD file-disclosure risk (security): when no .git repository root is detected, working-tree enrichment is disabled (patch-only) instead of reading files from an arbitrary current directory - broad except (reliability): _write_output() now catches only (OSError, UnicodeError) and re-raises, since --output is an explicit request whose failure must not be swallowed - style: double quotes for the new CLI args; wrapped the long publish_inline_comment signature to satisfy Ruff - docs: note that improve renders suggestions as markdown in this mode - tests: add coverage for code-suggestion rendering (and empty no-op), no-repo-root patch-only mode; harden the path-traversal sentinel with a real .git root so it isolates the traversal guard --- docs/docs/usage-guide/tokenless_diff_mode.md | 8 +-- pr_agent/cli.py | 12 ++--- pr_agent/git_providers/diff_provider.py | 51 ++++++++++++++++---- tests/unittest/test_diff_provider.py | 50 ++++++++++++++++++- 4 files changed, 100 insertions(+), 21 deletions(-) diff --git a/docs/docs/usage-guide/tokenless_diff_mode.md b/docs/docs/usage-guide/tokenless_diff_mode.md index 197f125b0e..39a3952498 100644 --- a/docs/docs/usage-guide/tokenless_diff_mode.md +++ b/docs/docs/usage-guide/tokenless_diff_mode.md @@ -33,9 +33,11 @@ enter tokenless diff mode; omitting both falls back to the normal `--pr_url` flo ### Supported commands -`review`, `improve`, `describe`, and `ask` are supported. Commands that require live -platform interaction (such as `update_changelog` or `similar_issue`) are not meaningful -in this mode. +`review`, `improve`, `describe`, and `ask` are supported. Because there is no hosting +platform to push to, `improve` renders its code suggestions as a single markdown +document to stdout (and to `--output`, if given) instead of as committable inline +suggestions. Commands that require live platform interaction (such as +`update_changelog` or `similar_issue`) are not meaningful in this mode. ## How it works diff --git a/pr_agent/cli.py b/pr_agent/cli.py index fae960eb6a..d3dbc35b36 100644 --- a/pr_agent/cli.py +++ b/pr_agent/cli.py @@ -66,12 +66,12 @@ def set_parser(): "Repo-local .pr_agent.toml overrides values set here." ), ) - parser.add_argument('--diff-file', dest='diff_file', type=str, default=None, - help='Path to a unified diff file to review (tokenless local mode)') - parser.add_argument('--stdin', action='store_true', default=False, - help='Read a unified diff from stdin (tokenless local mode)') - parser.add_argument('--output', dest='output', type=str, default=None, - help='Write the result to this file (in addition to stdout)') + parser.add_argument("--diff-file", dest="diff_file", type=str, default=None, + help="Path to a unified diff file to review (tokenless local mode)") + parser.add_argument("--stdin", action="store_true", default=False, + help="Read a unified diff from stdin (tokenless local mode)") + parser.add_argument("--output", dest="output", type=str, default=None, + help="Write the result to this file (in addition to stdout)") parser.add_argument('command', type=str, help='The', choices=commands, default='review') parser.add_argument('rest', nargs=argparse.REMAINDER, default=[]) return parser diff --git a/pr_agent/git_providers/diff_provider.py b/pr_agent/git_providers/diff_provider.py index eca23da8fe..6792475233 100644 --- a/pr_agent/git_providers/diff_provider.py +++ b/pr_agent/git_providers/diff_provider.py @@ -45,11 +45,19 @@ def get_diff_files(self) -> List[FilePatchInfo]: raise ValueError(f"Failed to parse the provided diff: {e}") from e # Resolve diff paths against the actual repository root (not the raw CWD) # so working-tree enrichment still works when run from a subdirectory. + # If there is no detectable .git root, disable enrichment entirely and + # run patch-only: reading files from an arbitrary CWD could disclose + # unrelated local files to the LLM. repo_root = _find_repository_root() - root = os.path.realpath(str(repo_root) if repo_root else os.getcwd()) + root = os.path.realpath(str(repo_root)) if repo_root else None + if root is None: + get_logger().info( + "No repository root (.git) found; running in patch-only mode " + "(working-tree enrichment disabled)." + ) for f in files: head = "" - if f.filename: + if root is not None and f.filename: if os.path.isabs(f.filename): get_logger().info( f"Skipping absolute path in diff (unsafe): {f.filename}" @@ -77,11 +85,14 @@ def get_files(self) -> List[str]: def _write_output(self, content: str): print(content) if self.output_path: + # --output is always an explicit user request, so a write failure + # must surface (fail fast) rather than be silently swallowed. try: with open(self.output_path, "w", encoding="utf-8") as fh: fh.write(content) - except Exception as e: + except (OSError, UnicodeError) as e: get_logger().error(f"Failed to write output to {self.output_path}: {e}") + raise def publish_comment(self, pr_comment: str, is_temporary: bool = False): if is_temporary: @@ -116,19 +127,39 @@ def get_user_id(self): def get_pr_branch(self): return "" + # ---- code suggestions: rendered to stdout/--output (no hosting platform) ---- + def publish_code_suggestion(self, body: str, relevant_file: str, + relevant_lines_start: int, relevant_lines_end: int): + location = f"{relevant_file}:{relevant_lines_start}-{relevant_lines_end}" + self._write_output(f"### {location}\n\n{body}") + + def publish_code_suggestions(self, code_suggestions: list) -> bool: + # The 'improve' tool calls this unconditionally; render the suggestions + # as a single markdown document to stdout/--output instead of pushing + # them to a (non-existent) hosting platform. + if not code_suggestions: + return True + sections = ["## Code suggestions", ""] + for s in code_suggestions: + relevant_file = s.get("relevant_file", "") + start = s.get("relevant_lines_start", "") + end = s.get("relevant_lines_end", "") + location = f"{relevant_file}:{start}-{end}".strip(":-") + if location: + sections.append(f"### {location}") + sections.append(s.get("body", "")) + sections.append("") + self._write_output("\n".join(sections).rstrip() + "\n") + return True + # ---- unsupported publish operations (no-op or NotImplementedError) ---- - def publish_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str, original_suggestion=None): + def publish_inline_comment(self, body: str, relevant_file: str, + relevant_line_in_file: str, original_suggestion=None): raise NotImplementedError("Inline comments are not supported by the diff provider") def publish_inline_comments(self, comments: list): raise NotImplementedError("Inline comments are not supported by the diff provider") - def publish_code_suggestion(self, body: str, relevant_file: str, relevant_lines_start: int, relevant_lines_end: int): - raise NotImplementedError("Code suggestions are not supported by the diff provider") - - def publish_code_suggestions(self, code_suggestions: list) -> bool: - raise NotImplementedError("Code suggestions are not supported by the diff provider") - def publish_labels(self, labels): pass diff --git a/tests/unittest/test_diff_provider.py b/tests/unittest/test_diff_provider.py index a4b3e593c1..24ebecfef8 100644 --- a/tests/unittest/test_diff_provider.py +++ b/tests/unittest/test_diff_provider.py @@ -82,10 +82,13 @@ def test_path_traversal_file_not_read(tmp_path, monkeypatch): # contents into head_file. With the guard in place the path escapes # the repo root so it is rejected and head_file stays "". # - # Setup: an inner "repo" dir is the working directory; the secret file - # lives one level up (reachable via "../secret.txt" traversal). + # Setup: an inner "repo" dir is a real repo root (has .git) and is the + # working directory; the secret file lives one level up (reachable via + # "../secret.txt" traversal). The .git marker ensures working-tree + # enrichment is active, so this isolates the traversal guard itself. repo = tmp_path / "repo" repo.mkdir() + (repo / ".git").mkdir() secret = tmp_path / "secret.txt" secret.write_text("TOP SECRET\n", encoding="utf-8") @@ -125,3 +128,46 @@ def test_malformed_diff_raises_valueerror(): assert False, "expected ValueError" except ValueError: pass + + +def test_no_repo_root_disables_enrichment(tmp_path, monkeypatch): + # When run outside any git repo (no .git ancestor), enrichment must be + # disabled and the provider must not read working-tree files even if a + # file with the diff's name happens to exist in the CWD. + decoy = tmp_path / "foo.py" + decoy.write_text("line1\nline2-changed\nline3\n", encoding="utf-8") + monkeypatch.chdir(tmp_path) + + get_settings().set("diff.content", DIFF) + get_settings().set("diff.output_path", None) + provider = DiffGitProvider(None) + files = provider.get_diff_files() + assert files[0].head_file == "", ( + "Enrichment must be disabled when no .git root is found (patch-only)" + ) + assert files[0].base_file == "" + + +def test_publish_code_suggestions_renders_to_stdout(capsys): + get_settings().set("diff.content", DIFF) + get_settings().set("diff.output_path", None) + provider = DiffGitProvider(None) + suggestions = [ + {"body": "**Suggestion:** use a constant", "relevant_file": "foo.py", + "relevant_lines_start": 2, "relevant_lines_end": 2}, + ] + # The 'improve' tool calls this unconditionally; it must not crash and must + # render the suggestions to stdout. + assert provider.publish_code_suggestions(suggestions) is True + out = capsys.readouterr().out + assert "Code suggestions" in out + assert "foo.py:2-2" in out + assert "use a constant" in out + + +def test_publish_code_suggestions_empty_is_noop(capsys): + get_settings().set("diff.content", DIFF) + get_settings().set("diff.output_path", None) + provider = DiffGitProvider(None) + assert provider.publish_code_suggestions([]) is True + assert capsys.readouterr().out.strip() == "" From a05bce305821f58c0418b6a222ffa835feb6d059 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kov=C3=A1cs=20P=C3=A9ter?= Date: Tue, 23 Jun 2026 07:41:20 +0200 Subject: [PATCH 14/15] fix(diff-provider): resolve Qodo third-round findings on tokenless diff mode - incremental crash (correctness): DiffGitProvider.get_incremental_commits() now disables incremental mode (-i has no meaning for a standalone diff), preventing a TypeError when PRReviewer takes len() of an empty commits_range - extra-config override (correctness): provider selection now forces the diff provider whenever diff content is loaded, so an extra/repo config setting config.git_provider can no longer silently derail tokenless mode - suppressed output (correctness): CLI diff mode forces config.publish_output =True so stdout/--output is always produced even if publishing was disabled - global mutation (reliability): removed the dead pr_reviewer.inline_code_comments=False write in __init__ (never read anywhere; was copied from LocalGitProvider) - test isolation (reliability): added an autouse fixture to each diff test module that snapshots and restores the mutated global settings keys - tests: added coverage for incremental-disabled, diff-content-forces-provider, and publish_output forcing --- pr_agent/cli.py | 3 ++ pr_agent/git_providers/__init__.py | 5 +++ pr_agent/git_providers/diff_provider.py | 13 +++++-- tests/unittest/test_cli_diff_mode.py | 46 +++++++++++++++++++++++++ tests/unittest/test_diff_mode_e2e.py | 15 ++++++++ tests/unittest/test_diff_provider.py | 40 +++++++++++++++++++++ 6 files changed, 120 insertions(+), 2 deletions(-) diff --git a/pr_agent/cli.py b/pr_agent/cli.py index d3dbc35b36..e8e065b3d0 100644 --- a/pr_agent/cli.py +++ b/pr_agent/cli.py @@ -109,6 +109,9 @@ def run(inargs=None, args=None): get_settings().set("config.git_provider", "diff") get_settings().set("diff.content", diff_content) get_settings().set("diff.output_path", getattr(args, "output", None)) + # Diff mode's whole purpose is to emit the result to stdout/--output, so + # force publishing on even if a config/env set publish_output=false. + get_settings().set("config.publish_output", True) elif not args.pr_url and not args.issue_url: parser.print_help() return diff --git a/pr_agent/git_providers/__init__.py b/pr_agent/git_providers/__init__.py index 1b52f26247..fb119cc745 100644 --- a/pr_agent/git_providers/__init__.py +++ b/pr_agent/git_providers/__init__.py @@ -59,6 +59,11 @@ def get_git_provider_with_context(pr_url) -> GitProvider: else: try: provider_id = get_settings().config.git_provider + # Tokenless diff mode is keyed on loaded diff content; it must not be + # overridden by an extra/repo config file that sets a different + # git_provider (apply_repo_settings merges those before this call). + if get_settings().get("diff.content", None): + provider_id = "diff" if provider_id not in _GIT_PROVIDERS: raise ValueError(f"Unknown git provider: {provider_id}") git_provider = _GIT_PROVIDERS[provider_id](pr_url) diff --git a/pr_agent/git_providers/diff_provider.py b/pr_agent/git_providers/diff_provider.py index 6792475233..eaa6d29ae6 100644 --- a/pr_agent/git_providers/diff_provider.py +++ b/pr_agent/git_providers/diff_provider.py @@ -33,8 +33,6 @@ def __init__(self, pr_url=None, incremental=False): self.output_path = get_settings().get("diff.output_path", None) self.diff_files = None self.pr = PullRequestMimic(self.get_pr_title(), self.get_diff_files()) - # inline code comments are not supported for the diff provider - get_settings().pr_reviewer.inline_code_comments = False def get_diff_files(self) -> List[FilePatchInfo]: if self.diff_files is not None: @@ -82,6 +80,17 @@ def get_diff_files(self) -> List[FilePatchInfo]: def get_files(self) -> List[str]: return [f.filename for f in self.get_diff_files()] + def get_incremental_commits(self, incremental): + # A standalone diff has no commit history, so incremental review (-i) is + # not applicable. Disable it explicitly to avoid a TypeError downstream + # (PRReviewer would otherwise call len() on an unpopulated commits_range). + if getattr(incremental, "is_incremental", False): + get_logger().info( + "Incremental review is not supported in tokenless diff mode; " + "running a full review instead." + ) + incremental.is_incremental = False + def _write_output(self, content: str): print(content) if self.output_path: diff --git a/tests/unittest/test_cli_diff_mode.py b/tests/unittest/test_cli_diff_mode.py index c10af477ac..01d98285e9 100644 --- a/tests/unittest/test_cli_diff_mode.py +++ b/tests/unittest/test_cli_diff_mode.py @@ -1,6 +1,21 @@ import pytest from pr_agent.cli import run, set_parser +from pr_agent.config_loader import get_settings + +# Keys the run()-based test mutates on the process-wide settings singleton; +# saved and restored around every test so global state never leaks. +_SETTINGS_KEYS = ["diff.content", "diff.output_path", + "config.git_provider", "config.publish_output"] + + +@pytest.fixture(autouse=True) +def _restore_settings(): + s = get_settings() + saved = {k: s.get(k, None) for k in _SETTINGS_KEYS} + yield + for k, v in saved.items(): + s.set(k, v) def test_parser_has_diff_flags(): @@ -25,3 +40,34 @@ def test_missing_diff_file_fails_fast(tmp_path, capsys): run(inargs=["--diff-file", str(missing), "review"]) err = capsys.readouterr().err assert "Could not read --diff-file" in err + + +_DIFF = ( + "diff --git a/foo.py b/foo.py\n" + "index 1111111..2222222 100644\n" + "--- a/foo.py\n" + "+++ b/foo.py\n" + "@@ -1,3 +1,3 @@\n" + " line1\n-line2\n+line2-changed\n line3\n" +) + + +def test_diff_mode_forces_publish_output(monkeypatch): + """Diff mode must force config.publish_output=True so stdout/--output is + never suppressed by a config/env that disabled publishing.""" + import io + + import pr_agent.cli as cli + + get_settings().set("config.publish_output", False) + captured = {} + + class FakeAgent: + async def handle_request(self, target, request, notify=None): + captured["publish_output"] = get_settings().config.publish_output + return True + + monkeypatch.setattr(cli, "PRAgent", FakeAgent) + monkeypatch.setattr("sys.stdin", io.StringIO(_DIFF)) + run(inargs=["--stdin", "review"]) + assert captured["publish_output"] is True diff --git a/tests/unittest/test_diff_mode_e2e.py b/tests/unittest/test_diff_mode_e2e.py index dbd5914cbe..748988ab8f 100644 --- a/tests/unittest/test_diff_mode_e2e.py +++ b/tests/unittest/test_diff_mode_e2e.py @@ -4,6 +4,21 @@ from pr_agent.git_providers.diff_provider import DiffGitProvider from pr_agent.algo.ai_handlers.base_ai_handler import BaseAiHandler +# Keys these tests mutate on the process-wide settings singleton; saved and +# restored around every test so global state never leaks between tests. +_SETTINGS_KEYS = ["diff.content", "diff.output_path", + "config.git_provider", "config.publish_output"] + + +@pytest.fixture(autouse=True) +def _restore_settings(): + s = get_settings() + saved = {k: s.get(k, None) for k in _SETTINGS_KEYS} + yield + for k, v in saved.items(): + s.set(k, v) + + DIFF = """diff --git a/foo.py b/foo.py index 1111111..2222222 100644 --- a/foo.py diff --git a/tests/unittest/test_diff_provider.py b/tests/unittest/test_diff_provider.py index 24ebecfef8..191c96eeea 100644 --- a/tests/unittest/test_diff_provider.py +++ b/tests/unittest/test_diff_provider.py @@ -1,8 +1,25 @@ +import pytest + from pr_agent.algo.types import EDIT_TYPE from pr_agent.config_loader import get_settings from pr_agent.git_providers import _GIT_PROVIDERS from pr_agent.git_providers.diff_provider import DiffGitProvider +# Keys these tests mutate on the process-wide settings singleton; saved and +# restored around every test so global state never leaks between tests. +_SETTINGS_KEYS = ["diff.content", "diff.output_path", + "config.git_provider", "config.publish_output"] + + +@pytest.fixture(autouse=True) +def _restore_settings(): + s = get_settings() + saved = {k: s.get(k, None) for k in _SETTINGS_KEYS} + yield + for k, v in saved.items(): + s.set(k, v) + + DIFF = """diff --git a/foo.py b/foo.py index 1111111..2222222 100644 --- a/foo.py @@ -171,3 +188,26 @@ def test_publish_code_suggestions_empty_is_noop(capsys): provider = DiffGitProvider(None) assert provider.publish_code_suggestions([]) is True assert capsys.readouterr().out.strip() == "" + + +def test_incremental_review_disabled(): + # -i has no meaning for a standalone diff; the provider must disable it so + # PRReviewer never takes the incremental path (which would TypeError). + from pr_agent.git_providers.git_provider import IncrementalPR + get_settings().set("diff.content", DIFF) + get_settings().set("diff.output_path", None) + provider = DiffGitProvider(None) + incremental = IncrementalPR(is_incremental=True) + provider.get_incremental_commits(incremental) + assert incremental.is_incremental is False + + +def test_diff_content_forces_diff_provider(): + # Even if config.git_provider points elsewhere (e.g. set by extra config), + # the presence of loaded diff content must select the diff provider. + from pr_agent.git_providers import get_git_provider_with_context + get_settings().set("config.git_provider", "github") + get_settings().set("diff.content", DIFF) + get_settings().set("diff.output_path", None) + provider = get_git_provider_with_context("local_diff") + assert isinstance(provider, DiffGitProvider) From ae473c48e87c80a4ac8d6afebabd156f6fdd931e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kov=C3=A1cs=20P=C3=A9ter?= Date: Tue, 23 Jun 2026 08:06:40 +0200 Subject: [PATCH 15/15] fix(diff-provider): hunk-only patches + isort/test-isolation (Qodo round 4) - patch misparsed as hunks (correctness): the stored FilePatchInfo.patch was str(pf) from unidiff, which keeps the 'diff --git'/'index'/'---'/'+++' headers before the first '@@'. The shared hunk/line-number converter treats '+'/'-' lines as content, so those headers were emitted as a bogus leading hunk with invalid line numbers into the LLM prompt. Added to_hunk_only_patch() and normalize f.patch after base reconstruction (which still needs the full headers to parse), matching platform providers. - isort (I001): collapsed the two-name diff_parsing import to one line and sorted imports in the diff test modules; verified with ruff. - test isolation: tests now mutate settings only through an autouse `cfg` fixture that snapshots and restores the diff-mode keys (covers run()'s indirect mutations too); no bare get_settings().set() in test bodies. - test style: replaced `assert False` with pytest.raises (B011). - tests: added a hunk-only-patch assertion. --- pr_agent/git_providers/diff_parsing.py | 15 +++ pr_agent/git_providers/diff_provider.py | 6 +- tests/unittest/test_cli_diff_mode.py | 23 +++-- tests/unittest/test_diff_mode_e2e.py | 48 +++++----- tests/unittest/test_diff_provider.py | 120 +++++++++++++----------- 5 files changed, 127 insertions(+), 85 deletions(-) diff --git a/pr_agent/git_providers/diff_parsing.py b/pr_agent/git_providers/diff_parsing.py index dc145d4138..c33d48696d 100644 --- a/pr_agent/git_providers/diff_parsing.py +++ b/pr_agent/git_providers/diff_parsing.py @@ -13,6 +13,21 @@ def _strip_prefix(path: str | None) -> str | None: return path +def to_hunk_only_patch(patch_str: str) -> str: + """Drop file-header lines ('diff --git', 'index', '---', '+++') that precede + the first '@@' hunk header. + + Platform providers (GitHub, GitLab, ...) store hunk-only patches, and the + shared hunk/line-number converter treats any '+'/'-' line as content. Left + in, the '---'/'+++' headers would be emitted as a bogus leading hunk with + invalid line numbers. Returns "" when there is no hunk (e.g. rename-only).""" + lines = patch_str.splitlines(keepends=True) + for i, line in enumerate(lines): + if line.startswith("@@"): + return "".join(lines[i:]) + return "" + + def parse_unified_diff(diff_text: str) -> list[FilePatchInfo]: """Parse a unified diff into FilePatchInfo objects (patch + metadata only). diff --git a/pr_agent/git_providers/diff_provider.py b/pr_agent/git_providers/diff_provider.py index eaa6d29ae6..25db207e30 100644 --- a/pr_agent/git_providers/diff_provider.py +++ b/pr_agent/git_providers/diff_provider.py @@ -6,8 +6,7 @@ from pr_agent.algo.types import FilePatchInfo from pr_agent.config_loader import _find_repository_root, get_settings -from pr_agent.git_providers.diff_parsing import (parse_unified_diff, - reconstruct_base_file) +from pr_agent.git_providers.diff_parsing import parse_unified_diff, reconstruct_base_file, to_hunk_only_patch from pr_agent.git_providers.git_provider import GitProvider from pr_agent.log import get_logger @@ -74,6 +73,9 @@ def get_diff_files(self) -> List[FilePatchInfo]: get_logger().info(f"Could not read working-tree file {f.filename}: {e}") f.head_file = head f.base_file = reconstruct_base_file(head, f.patch) if head else "" + # Reconstruction needs the full patch (with --- /+++ headers); the + # rest of the pipeline expects hunk-only patches, so normalize after. + f.patch = to_hunk_only_patch(f.patch) self.diff_files = files return files diff --git a/tests/unittest/test_cli_diff_mode.py b/tests/unittest/test_cli_diff_mode.py index 01d98285e9..2d91a67839 100644 --- a/tests/unittest/test_cli_diff_mode.py +++ b/tests/unittest/test_cli_diff_mode.py @@ -3,19 +3,26 @@ from pr_agent.cli import run, set_parser from pr_agent.config_loader import get_settings -# Keys the run()-based test mutates on the process-wide settings singleton; -# saved and restored around every test so global state never leaks. +# Keys run() mutates on the process-wide settings singleton, directly or via the +# diff-mode CLI path. Snapshotted and restored around every test (autouse) so +# state never leaks, even when run() sets keys the test never touches itself. _SETTINGS_KEYS = ["diff.content", "diff.output_path", "config.git_provider", "config.publish_output"] @pytest.fixture(autouse=True) -def _restore_settings(): +def cfg(): + """Restore all diff-mode settings keys after each test, and expose a setter + so tests mutate settings through the fixture rather than bare set() calls.""" s = get_settings() saved = {k: s.get(k, None) for k in _SETTINGS_KEYS} - yield - for k, v in saved.items(): - s.set(k, v) + + def _set(key, value): + s.set(key, value) + + yield _set + for key, value in saved.items(): + s.set(key, value) def test_parser_has_diff_flags(): @@ -52,14 +59,14 @@ def test_missing_diff_file_fails_fast(tmp_path, capsys): ) -def test_diff_mode_forces_publish_output(monkeypatch): +def test_diff_mode_forces_publish_output(cfg, monkeypatch): """Diff mode must force config.publish_output=True so stdout/--output is never suppressed by a config/env that disabled publishing.""" import io import pr_agent.cli as cli - get_settings().set("config.publish_output", False) + cfg("config.publish_output", False) captured = {} class FakeAgent: diff --git a/tests/unittest/test_diff_mode_e2e.py b/tests/unittest/test_diff_mode_e2e.py index 748988ab8f..ec5a073120 100644 --- a/tests/unittest/test_diff_mode_e2e.py +++ b/tests/unittest/test_diff_mode_e2e.py @@ -1,22 +1,28 @@ import pytest +from pr_agent.algo.ai_handlers.base_ai_handler import BaseAiHandler from pr_agent.config_loader import get_settings from pr_agent.git_providers.diff_provider import DiffGitProvider -from pr_agent.algo.ai_handlers.base_ai_handler import BaseAiHandler -# Keys these tests mutate on the process-wide settings singleton; saved and -# restored around every test so global state never leaks between tests. +# Diff-mode settings keys these tests mutate on the process-wide singleton. _SETTINGS_KEYS = ["diff.content", "diff.output_path", "config.git_provider", "config.publish_output"] @pytest.fixture(autouse=True) -def _restore_settings(): +def cfg(): + """Restore all diff-mode settings keys after each test (autouse) and expose a + setter so tests mutate settings through the fixture rather than bare set() + calls. Keeps the process-wide settings singleton from leaking between tests.""" s = get_settings() saved = {k: s.get(k, None) for k in _SETTINGS_KEYS} - yield - for k, v in saved.items(): - s.set(k, v) + + def _set(key, value): + s.set(key, value) + + yield _set + for key, value in saved.items(): + s.set(key, value) DIFF = """diff --git a/foo.py b/foo.py @@ -31,9 +37,9 @@ def _restore_settings(): """ -def test_provider_end_to_end_files_and_output(capsys): - get_settings().set("diff.content", DIFF) - get_settings().set("diff.output_path", None) +def test_provider_end_to_end_files_and_output(cfg, capsys): + cfg("diff.content", DIFF) + cfg("diff.output_path", None) provider = DiffGitProvider(None) # files parsed and content reconstructed where working tree is absent @@ -49,7 +55,7 @@ def test_provider_end_to_end_files_and_output(capsys): assert "finding one" in capsys.readouterr().out -def test_base_file_reconstructed_from_working_tree(tmp_path, monkeypatch): +def test_base_file_reconstructed_from_working_tree(cfg, tmp_path, monkeypatch): """When the working-tree file exists, head_file is populated and base_file is reconstructed from it by reversing the diff patch.""" # Mark tmp_path as a repository root so _find_repository_root() resolves @@ -63,8 +69,8 @@ def test_base_file_reconstructed_from_working_tree(tmp_path, monkeypatch): # Change cwd so the provider's relative-path lookup resolves into tmp_path monkeypatch.chdir(tmp_path) - get_settings().set("diff.content", DIFF) - get_settings().set("diff.output_path", None) + cfg("diff.content", DIFF) + cfg("diff.output_path", None) provider = DiffGitProvider(None) files = provider.get_diff_files() @@ -81,7 +87,7 @@ def test_base_file_reconstructed_from_working_tree(tmp_path, monkeypatch): assert "line2-changed" not in files[0].base_file -def test_base_file_reconstructed_when_run_from_subdirectory(tmp_path, monkeypatch): +def test_base_file_reconstructed_when_run_from_subdirectory(cfg, tmp_path, monkeypatch): """Regression for the CWD-vs-repo-root bug: enrichment must still find working-tree files when the CLI is run from a subdirectory of the repo.""" # tmp_path is the repo root; foo.py lives at the root, matching the diff path. @@ -94,8 +100,8 @@ def test_base_file_reconstructed_when_run_from_subdirectory(tmp_path, monkeypatc subdir.mkdir(parents=True) monkeypatch.chdir(subdir) - get_settings().set("diff.content", DIFF) - get_settings().set("diff.output_path", None) + cfg("diff.content", DIFF) + cfg("diff.output_path", None) provider = DiffGitProvider(None) files = provider.get_diff_files() @@ -107,7 +113,7 @@ def test_base_file_reconstructed_when_run_from_subdirectory(tmp_path, monkeypatc @pytest.mark.asyncio -async def test_review_command_through_diff_provider_mocked_llm(monkeypatch): +async def test_review_command_through_diff_provider_mocked_llm(cfg, monkeypatch): """Integration test: drives PRReviewer with a fake AI handler through the diff provider. We assert that the AI handler is invoked with a prompt that contains the changed content from the diff, proving end-to-end wiring from @@ -121,11 +127,11 @@ async def test_review_command_through_diff_provider_mocked_llm(monkeypatch): from pr_agent.tools.pr_reviewer import PRReviewer # --- configure provider --- - get_settings().set("config.git_provider", "diff") - get_settings().set("diff.content", DIFF) - get_settings().set("diff.output_path", None) + cfg("config.git_provider", "diff") + cfg("diff.content", DIFF) + cfg("diff.output_path", None) # Disable publish so we don't need a real comment sink - get_settings().set("config.publish_output", False) + cfg("config.publish_output", False) # --- fake AI handler --- # PRReviewer calls ai_handler() (a factory/partial), so we pass a class diff --git a/tests/unittest/test_diff_provider.py b/tests/unittest/test_diff_provider.py index 191c96eeea..cb981acd4b 100644 --- a/tests/unittest/test_diff_provider.py +++ b/tests/unittest/test_diff_provider.py @@ -5,19 +5,25 @@ from pr_agent.git_providers import _GIT_PROVIDERS from pr_agent.git_providers.diff_provider import DiffGitProvider -# Keys these tests mutate on the process-wide settings singleton; saved and -# restored around every test so global state never leaks between tests. +# Diff-mode settings keys these tests mutate on the process-wide singleton. _SETTINGS_KEYS = ["diff.content", "diff.output_path", "config.git_provider", "config.publish_output"] @pytest.fixture(autouse=True) -def _restore_settings(): +def cfg(): + """Restore all diff-mode settings keys after each test (autouse) and expose a + setter so tests mutate settings through the fixture rather than bare set() + calls. Keeps the process-wide settings singleton from leaking between tests.""" s = get_settings() saved = {k: s.get(k, None) for k in _SETTINGS_KEYS} - yield - for k, v in saved.items(): - s.set(k, v) + + def _set(key, value): + s.set(key, value) + + yield _set + for key, value in saved.items(): + s.set(key, value) DIFF = """diff --git a/foo.py b/foo.py @@ -36,9 +42,9 @@ def test_registered(): assert _GIT_PROVIDERS["diff"] is DiffGitProvider -def test_get_diff_files(monkeypatch): - get_settings().set("diff.content", DIFF) - get_settings().set("diff.output_path", None) +def test_get_diff_files(cfg): + cfg("diff.content", DIFF) + cfg("diff.output_path", None) provider = DiffGitProvider(None) files = provider.get_diff_files() assert len(files) == 1 @@ -46,51 +52,60 @@ def test_get_diff_files(monkeypatch): assert files[0].edit_type == EDIT_TYPE.MODIFIED -def test_publish_comment_to_stdout(capsys): - get_settings().set("diff.content", DIFF) - get_settings().set("diff.output_path", None) +def test_get_diff_files_patch_is_hunk_only(cfg): + # The stored patch must not carry the 'diff --git'/'index'/'---'/'+++' + # headers, which the shared hunk converter would misparse as a bogus hunk. + cfg("diff.content", DIFF) + cfg("diff.output_path", None) + provider = DiffGitProvider(None) + patch = provider.get_diff_files()[0].patch + assert patch.startswith("@@") + assert "diff --git" not in patch + assert "+++ b/foo.py" not in patch + + +def test_publish_comment_to_stdout(cfg, capsys): + cfg("diff.content", DIFF) + cfg("diff.output_path", None) provider = DiffGitProvider(None) provider.publish_comment("# Review\nlooks good") captured = capsys.readouterr() assert "looks good" in captured.out -def test_publish_comment_to_file(tmp_path): +def test_publish_comment_to_file(cfg, tmp_path): out = tmp_path / "review.md" - get_settings().set("diff.content", DIFF) - get_settings().set("diff.output_path", str(out)) + cfg("diff.content", DIFF) + cfg("diff.output_path", str(out)) provider = DiffGitProvider(None) provider.publish_comment("# Review\nsaved") assert "saved" in out.read_text(encoding="utf-8") -def test_empty_diff_raises(): - get_settings().set("diff.content", "") - get_settings().set("diff.output_path", None) - try: +def test_empty_diff_raises(cfg): + cfg("diff.content", "") + cfg("diff.output_path", None) + with pytest.raises(ValueError): DiffGitProvider(None) - assert False, "expected ValueError" - except ValueError: - pass -def test_temporary_comment_not_emitted(capsys): - get_settings().set("diff.content", DIFF) - get_settings().set("diff.output_path", None) +def test_temporary_comment_not_emitted(cfg, capsys): + cfg("diff.content", DIFF) + cfg("diff.output_path", None) provider = DiffGitProvider(None) provider.publish_comment("Preparing review...", is_temporary=True) captured = capsys.readouterr() assert "Preparing review" not in captured.out -def test_publish_file_comments_not_supported(): - get_settings().set("diff.content", DIFF) - get_settings().set("diff.output_path", None) +def test_publish_file_comments_not_supported(cfg): + cfg("diff.content", DIFF) + cfg("diff.output_path", None) provider = DiffGitProvider(None) assert provider.is_supported("publish_file_comments") is False -def test_path_traversal_file_not_read(tmp_path, monkeypatch): +def test_path_traversal_file_not_read(cfg, tmp_path, monkeypatch): # SENTINEL TEST: this test FAILS if the path-traversal guard in # DiffGitProvider.get_diff_files() is removed. # @@ -121,8 +136,8 @@ def test_path_traversal_file_not_read(tmp_path, monkeypatch): "-TOP SECRET\n" "+REPLACED\n" ) - get_settings().set("diff.content", traversal_diff) - get_settings().set("diff.output_path", None) + cfg("diff.content", traversal_diff) + cfg("diff.output_path", None) provider = DiffGitProvider(None) files = provider.get_diff_files() assert len(files) == 1 @@ -135,19 +150,16 @@ def test_path_traversal_file_not_read(tmp_path, monkeypatch): ) -def test_malformed_diff_raises_valueerror(): +def test_malformed_diff_raises_valueerror(cfg): # A hunk with no file header triggers UnidiffParseError inside parse_unified_diff, # which the provider must re-raise as ValueError with a clear message. - get_settings().set("diff.content", "@@ -1,3 +1,3 @@\n line1\n-line2\n+line2-changed\n line3\n") - get_settings().set("diff.output_path", None) - try: + cfg("diff.content", "@@ -1,3 +1,3 @@\n line1\n-line2\n+line2-changed\n line3\n") + cfg("diff.output_path", None) + with pytest.raises(ValueError): DiffGitProvider(None) - assert False, "expected ValueError" - except ValueError: - pass -def test_no_repo_root_disables_enrichment(tmp_path, monkeypatch): +def test_no_repo_root_disables_enrichment(cfg, tmp_path, monkeypatch): # When run outside any git repo (no .git ancestor), enrichment must be # disabled and the provider must not read working-tree files even if a # file with the diff's name happens to exist in the CWD. @@ -155,8 +167,8 @@ def test_no_repo_root_disables_enrichment(tmp_path, monkeypatch): decoy.write_text("line1\nline2-changed\nline3\n", encoding="utf-8") monkeypatch.chdir(tmp_path) - get_settings().set("diff.content", DIFF) - get_settings().set("diff.output_path", None) + cfg("diff.content", DIFF) + cfg("diff.output_path", None) provider = DiffGitProvider(None) files = provider.get_diff_files() assert files[0].head_file == "", ( @@ -165,9 +177,9 @@ def test_no_repo_root_disables_enrichment(tmp_path, monkeypatch): assert files[0].base_file == "" -def test_publish_code_suggestions_renders_to_stdout(capsys): - get_settings().set("diff.content", DIFF) - get_settings().set("diff.output_path", None) +def test_publish_code_suggestions_renders_to_stdout(cfg, capsys): + cfg("diff.content", DIFF) + cfg("diff.output_path", None) provider = DiffGitProvider(None) suggestions = [ {"body": "**Suggestion:** use a constant", "relevant_file": "foo.py", @@ -182,32 +194,32 @@ def test_publish_code_suggestions_renders_to_stdout(capsys): assert "use a constant" in out -def test_publish_code_suggestions_empty_is_noop(capsys): - get_settings().set("diff.content", DIFF) - get_settings().set("diff.output_path", None) +def test_publish_code_suggestions_empty_is_noop(cfg, capsys): + cfg("diff.content", DIFF) + cfg("diff.output_path", None) provider = DiffGitProvider(None) assert provider.publish_code_suggestions([]) is True assert capsys.readouterr().out.strip() == "" -def test_incremental_review_disabled(): +def test_incremental_review_disabled(cfg): # -i has no meaning for a standalone diff; the provider must disable it so # PRReviewer never takes the incremental path (which would TypeError). from pr_agent.git_providers.git_provider import IncrementalPR - get_settings().set("diff.content", DIFF) - get_settings().set("diff.output_path", None) + cfg("diff.content", DIFF) + cfg("diff.output_path", None) provider = DiffGitProvider(None) incremental = IncrementalPR(is_incremental=True) provider.get_incremental_commits(incremental) assert incremental.is_incremental is False -def test_diff_content_forces_diff_provider(): +def test_diff_content_forces_diff_provider(cfg): # Even if config.git_provider points elsewhere (e.g. set by extra config), # the presence of loaded diff content must select the diff provider. from pr_agent.git_providers import get_git_provider_with_context - get_settings().set("config.git_provider", "github") - get_settings().set("diff.content", DIFF) - get_settings().set("diff.output_path", None) + cfg("config.git_provider", "github") + cfg("diff.content", DIFF) + cfg("diff.output_path", None) provider = get_git_provider_with_context("local_diff") assert isinstance(provider, DiffGitProvider)