diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index d2278ad..1fa0788 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -25,11 +25,16 @@ jobs: # run pre-commit ci lite for automated fixes - uses: pre-commit-ci/lite-action@v1.1.0 if: ${{ !cancelled() }} - # Test that the hooks from `pre-commit-hooks.yaml` - # are working as expected. + # Test that the hooks from `pre-commit-hooks.yaml` are working as + # expected by running the check hook against known-compliant fixtures. + # The non-compliant fixtures under tests/data are intentional violations + # and are exercised through pre-commit and prek by the pytest suite. - name: run local hook run: | - pre-commit try-repo . --all + pre-commit try-repo . check --files \ + tests/data/2_true_neg.md \ + tests/data/4_true_neg.rst \ + tests/data/5_policy_compliant.md run_tests: strategy: matrix: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9a23038..613ed60 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,7 +22,7 @@ repos: args: ["--keep", "mdformat", "--keep", "pre-commit-update"] - repo: https://github.com/tox-dev/pyproject-fmt - rev: "v2.23.0" + rev: "v2.25.0" hooks: - id: pyproject-fmt - repo: https://github.com/codespell-project/codespell diff --git a/.pre-commit-hooks.yaml b/.pre-commit-hooks.yaml index 2f307d5..b57a310 100644 --- a/.pre-commit-hooks.yaml +++ b/.pre-commit-hooks.yaml @@ -5,7 +5,9 @@ description: Check if each line in the given file contains only one sentence. entry: onesentence check language: python - types: [text, markdown, rst] + # `types` is an AND-intersection, so `text` matches Markdown, reST, and plain + # text alike; the `files` regex narrows that to the extensions we support. + types: [text] files: \.(md|rst|txt)$ # fix - id: fix @@ -13,5 +15,5 @@ description: Fix files to ensure each line contains only one sentence. 
entry: onesentence fix language: python - types: [text, markdown, rst] + types: [text] files: \.(md|rst|txt)$ diff --git a/README.md b/README.md index 1a5e9f8..6d0e42d 100644 --- a/README.md +++ b/README.md @@ -33,16 +33,21 @@ The `onesentence` tool provides a command-line interface for checking and fixing #### Commands ```bash - onesentence check <file_path> + onesentence check <file_path> [<file_path> ...] ``` -This command checks if the specified file adheres to the "one sentence per line" rule. It will return a non-zero exit code if any violations are found. +This command checks whether each given file adheres to the "one sentence per line" rule. +One or more files may be passed (for example, the filenames pre-commit hands to a hook). +It returns a non-zero exit code if any file has a violation. ```bash - onesentence fix <file_path> [<dest_path>] + onesentence fix <file_path> [<file_path> ...] [--output <dest_path>] ``` -This command corrects the specified file by splitting lines with multiple sentences into separate lines. If a dest_path is provided, the corrected file will be written to that path; otherwise, the original file will be overwritten. +This command corrects each given file by splitting lines with multiple sentences onto separate lines. +By default every file is corrected in place, so processing many files never lets one file overwrite another. +Pass `--output <dest_path>` to write a single corrected file to a separate destination; this is only valid with exactly one input file. +It returns a non-zero exit code if any file required changes.
## Pre-commit hook @@ -51,10 +56,28 @@ Install this pre-commit hook into your project with a block like the following: ```yaml repos: - repo: https://github.com/CU-DBMI/onesentence - rev: v0.0.1 + rev: v0.1.1 hooks: # run checks - id: check # run checks and fixes where possible - id: fix ``` + +### Using onesentence with a Markdown formatter + +If you also run a Markdown formatter such as +[`mdformat`](https://github.com/executablebooks/mdformat), configure it to +preserve existing line breaks so it does not undo the one-sentence-per-line +splitting. +For `mdformat` this means using `--wrap=keep` (the default), and notably **not** +`--wrap=no` or a fixed wrap width, either of which would rejoin sentences onto a +single line. + +```yaml + - repo: https://github.com/executablebooks/mdformat + rev: 0.7.22 + hooks: + - id: mdformat + args: ["--wrap=keep"] +``` diff --git a/pyproject.toml b/pyproject.toml index 39cdd3e..d6fc0d8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,13 +26,11 @@ scripts.onesentence = "onesentence.cli:trigger" [dependency-groups] dev = [ "coverage>=7.6.12", + "mdformat>=0.7.22", + "pre-commit>=4", + "prek>=0.4.5", "pytest>=8.3.5", ] [tool.setuptools_scm] root = "." - -[tool.uv] -dev-dependencies = [ - "pre-commit>=4.0.0", -] diff --git a/src/onesentence/analyze.py b/src/onesentence/analyze.py index 0104c31..7a964de 100644 --- a/src/onesentence/analyze.py +++ b/src/onesentence/analyze.py @@ -2,9 +2,91 @@ Module for checking for one sentence per line and related. """ -import pysbd import re -from typing import Optional +from typing import List, Optional + +import pysbd + +# A single segmenter is reused across calls; constructing one per line is +# needless overhead and the segmenter is stateless between ``segment`` calls. +_SEGMENTER = pysbd.Segmenter(language="en", clean=False) + +# Non-prose Markdown / reStructuredText structures that are exempt from the +# "one sentence per line" rule. These are matched against the stripped line. 
+_HEADING_RE = re.compile(r"^#{1,6}(\s|$)") # ATX heading, incl. "### 1. Foo" +_RULE_OR_UNDERLINE_RE = re.compile( + r"^[=\-~`#*_]+$" +) # setext underline, thematic break, or emphasis-only line +_RST_DIRECTIVE_RE = re.compile(r"^\.\.\s+\w+::") # ".. directive::" +_LINK_DEFINITION_RE = re.compile( + r"^\[[^\]]+\]:\s+\S" +) # link reference definition, e.g. "[id]: https://example.com" +_TABLE_ROW_RE = re.compile(r"^\|") # "| cell | cell |" +_TABLE_SEPARATOR_RE = re.compile( + r"^\|?[\s:|-]*\|[\s:|-]*$" +) # "|---|:--:|" style separators + +# Prose constructs whose internal punctuation must not be read as sentence +# boundaries; each is replaced with a single opaque token before segmentation. +_INLINE_CODE_RE = re.compile(r"``[^`]*``|`[^`]*`") # `code` / ``code`` +_INLINE_LINK_RE = re.compile(r"!?\[[^\]]*\]\([^)]*\)") # [text](url) / ![alt](url) +_REFERENCE_LINK_RE = re.compile(r"\[[^\]]*\]\[[^\]]*\]") # [text][ref] +_AUTOLINK_RE = re.compile(r"<[^>\s]+>") # <https://example.com> +# Bare URLs, stopping before any trailing sentence punctuation so a period that +# actually ends the sentence is preserved. +_BARE_URL_RE = re.compile( + r"\b(?:https?|ftp)://[^\s]+?(?=[.,!?;:'\")\]]*(?:\s|$))" + r"|\bwww\.[^\s]+?(?=[.,!?;:'\")\]]*(?:\s|$))" +) +# Remaining markup (emphasis, heading markers, pipes, ...) that is not part of +# sentence structure. +_NON_SENTENCE_CHARS_RE = re.compile(r"[^a-zA-Z0-9\s.,!?\'\"()\-]") + + +def _is_structural_line(stripped: str) -> bool: + """ + Return True for non-prose Markdown/reST lines exempt from the rule. + + Args: + stripped (str): The line with surrounding whitespace removed. + + Returns: + bool: True if the line is a heading, horizontal rule / underline, + reST directive, link reference definition, or table row/separator.
+ """ + return bool( + _HEADING_RE.match(stripped) + or _RULE_OR_UNDERLINE_RE.match(stripped) + or _RST_DIRECTIVE_RE.match(stripped) + or _LINK_DEFINITION_RE.match(stripped) + or _TABLE_ROW_RE.match(stripped) + or _TABLE_SEPARATOR_RE.match(stripped) + ) + + +def _mask_inline_constructs(text: str) -> str: + """ + Replace inline code, links, and URLs with opaque tokens. + + This keeps their internal punctuation (dots in URLs, abbreviations inside + code, ...) from being treated as sentence boundaries while leaving the + surrounding prose intact for segmentation. + + Args: + text (str): The stripped line to mask. + + Returns: + str: The line with inline constructs and stray markup removed. + """ + # Order matters: code first, then links (which may wrap URLs), then any + # remaining autolinks / bare URLs. + text = _INLINE_CODE_RE.sub("INLINECODE", text) + text = _INLINE_LINK_RE.sub("LINK", text) + text = _REFERENCE_LINK_RE.sub("LINK", text) + text = _AUTOLINK_RE.sub("URL", text) + text = _BARE_URL_RE.sub("URL", text) + return _NON_SENTENCE_CHARS_RE.sub("", text) + def is_single_sentence(line: str, ignore_block: bool) -> bool: """ @@ -29,34 +111,28 @@ def is_single_sentence(line: str, ignore_block: bool) -> bool: if ignore_block: return True - # Additional filtering for common reST and Markdown formatting - if re.match(r'^[=\-~`#\*]+$', line.strip()): - return True - if re.match(r'^\.\.\s+\w+::', line.strip()): + stripped = line.strip() + + # Ignore non-prose structures (headings, rules, link definitions, tables, + # reST directives). Numbered headings such as "### 1. Foo" are covered here. 
+ if _is_structural_line(stripped): return True # Allow multiple sentences in list items, their continuations, and blockquotes - if re.match(r'^\s*[-*+]\s+', line): # Unordered list item + if re.match(r"^\s*[-*+]\s+", line): # Unordered list item return True - if re.match(r'^\s*\d+\.\s+', line): # Ordered list item + if re.match(r"^\s*\d+\.\s+", line): # Ordered list item return True - if re.match(r'^\s+\S', line): # Indented continuation of a list item + if re.match(r"^\s+\S", line): # Indented continuation of a list item return True - if re.match(r'^>\s*', line): # Blockquote + if re.match(r"^>\s*", line): # Blockquote return True - line = line.strip() - - # Mask inline code spans so their content doesn't trigger false sentence breaks - # Double backticks (reST) must be matched before single backticks (Markdown) - line = re.sub(r'``[^`]*``|`[^`]*`', 'INLINECODE', line) + # Mask inline code, links, and URLs so their punctuation does not trigger + # false sentence breaks, then count the remaining sentences. 
+ masked = _mask_inline_constructs(stripped) + return len(_SEGMENTER.segment(masked)) == 1 - # Remove special characters that do not pertain to sentence structure - line = re.sub(r'[^a-zA-Z0-9\s.,!?\'"()\-]', '', line) - - segmenter = pysbd.Segmenter(language="en", clean=False) - sentences = segmenter.segment(line) - return len(sentences) == 1 def check_file_for_one_sentence_per_line(file_path: str) -> bool: """ @@ -71,9 +147,9 @@ def check_file_for_one_sentence_per_line(file_path: str) -> bool: all_single_sentences = True ignore_block = False in_code_block = False - with open(file_path, 'r') as file: + with open(file_path, "r") as file: for line_number, line in enumerate(file, start=1): - if line.strip().startswith('```'): + if line.strip().startswith("```"): in_code_block = not in_code_block continue if "noqa: onesentence-start" in line: @@ -82,12 +158,15 @@ def check_file_for_one_sentence_per_line(file_path: str) -> bool: if "noqa: onesentence-end" in line: ignore_block = False continue - if not is_single_sentence(line.rstrip('\n'), ignore_block or in_code_block): + if not is_single_sentence(line.rstrip("\n"), ignore_block or in_code_block): print(f"Failed: line {line_number}: {line.strip()}") all_single_sentences = False return all_single_sentences -def correct_file_for_one_sentence_per_line(file_path: str, dest_path: Optional[str] = None) -> bool: + +def correct_file_for_one_sentence_per_line( + file_path: str, dest_path: Optional[str] = None +) -> bool: """ Check if each line in the given file contains only one sentence. If not, correct the file by replacing the contents with correctly segmented sentences. 
@@ -105,16 +184,16 @@ def correct_file_for_one_sentence_per_line(file_path: str, dest_path: Optional[s all_single_sentences = True ignore_block = False in_code_block = False - corrected_lines = [] - - segmenter = pysbd.Segmenter(language="en", clean=False) + corrected_lines: List[str] = [] - with open(file_path, 'r') as file: + with open(file_path, "r") as file: for line_number, line in enumerate(file, start=1): - original_indent = re.match(r'^\s*', line).group() # Capture the original indentation + original_indent = re.match( + r"^\s*", line + ).group() # Capture the original indentation stripped_line = line.strip() - if stripped_line.startswith('```'): + if stripped_line.startswith("```"): in_code_block = not in_code_block corrected_lines.append(line.rstrip()) continue @@ -126,17 +205,19 @@ def correct_file_for_one_sentence_per_line(file_path: str, dest_path: Optional[s ignore_block = False corrected_lines.append(line.rstrip()) continue - if not is_single_sentence(line.rstrip('\n'), ignore_block or in_code_block): + if not is_single_sentence(line.rstrip("\n"), ignore_block or in_code_block): print(f"Failed: line {line_number}: {stripped_line}") all_single_sentences = False if not ignore_block: - sentences = segmenter.segment(stripped_line) + sentences = _SEGMENTER.segment(stripped_line) # Detect and move lines with only Markdown characters to the end of the second-to-last line - if sentences and re.match(r'^[=\-~`#\*]+$', sentences[-1]): + if sentences and re.match(r"^[=\-~`#\*]+$", sentences[-1]): markdown_line = sentences.pop() if sentences: sentences[-1] += markdown_line - corrected_lines.extend([original_indent + sentence.strip() for sentence in sentences]) + corrected_lines.extend( + [original_indent + sentence.strip() for sentence in sentences] + ) else: corrected_lines.append(line.rstrip()) else: @@ -147,8 +228,8 @@ def correct_file_for_one_sentence_per_line(file_path: str, dest_path: Optional[s dest_path = file_path # Write the corrected content back to 
the file - with open(dest_path, 'w') as file: + with open(dest_path, "w") as file: for corrected_line in corrected_lines: - file.write(corrected_line + '\n') + file.write(corrected_line + "\n") return all_single_sentences diff --git a/src/onesentence/cli.py b/src/onesentence/cli.py index 0a15c63..bc98142 100644 --- a/src/onesentence/cli.py +++ b/src/onesentence/cli.py @@ -2,42 +2,83 @@ CLI for onesentence """ -import fire import sys from typing import Optional -from onesentence.analyze import check_file_for_one_sentence_per_line, correct_file_for_one_sentence_per_line + +import fire + +from onesentence.analyze import ( + check_file_for_one_sentence_per_line, + correct_file_for_one_sentence_per_line, +) + class OneSentenceCheckCLI: - def check(self, file_path: str) -> bool: + def check(self, *file_paths: str) -> None: """ - Check if each line in the given file contains only one sentence. + Check that each line in the given file(s) contains only one sentence. + + Accepts one or more file paths (for example, the filenames pre-commit + passes to a hook). Every file is checked independently. Args: - file_path (str): The path to the file to check. + *file_paths (str): One or more paths to files to check. - Returns: - bool: True if all lines contain only one sentence, False otherwise. + Exits: + 0 if every file is compliant, 1 if any file has a violation, + 2 on usage errors (no files provided). 
""" - result = check_file_for_one_sentence_per_line(file_path= file_path) - if result: - sys.exit(0) - else: - sys.exit(1) - def fix(self, file_path: str, dest_path: Optional[str]=None) -> bool: + if not file_paths: + print("error: no input files provided", file=sys.stderr) + sys.exit(2) + + all_single_sentences = True + for file_path in file_paths: + if not check_file_for_one_sentence_per_line(file_path=file_path): + all_single_sentences = False + + sys.exit(0 if all_single_sentences else 1) + + def fix(self, *file_paths: str, output: Optional[str] = None) -> None: """ - Fix each line in the given file contains more than one sentence. + Fix lines that contain more than one sentence in the given file(s). + + Accepts one or more file paths. By default each file is corrected in + place, so processing many files never lets one file overwrite another. + Pass ``--output`` to write a single corrected file to a separate path; + this is only valid with exactly one input file. Args: - file_path (str): The path to the file to check. + *file_paths (str): One or more paths to files to fix. + output (str): Optional destination path for the corrected file. + Only valid when exactly one input file is given. - Returns: - bool: True if all lines contain only one sentence, False otherwise. + Exits: + 0 if every file was already compliant, 1 if any file required + changes, 2 on usage errors (no files, or ``--output`` with more + than one input file). 
""" - result = correct_file_for_one_sentence_per_line(file_path=file_path, dest_path=dest_path) - if result: - sys.exit(0) - else: - sys.exit(1) + if not file_paths: + print("error: no input files provided", file=sys.stderr) + sys.exit(2) + + if output is not None and len(file_paths) != 1: + print( + "error: --output requires exactly one input file " + f"(received {len(file_paths)})", + file=sys.stderr, + ) + sys.exit(2) + + all_single_sentences = True + for file_path in file_paths: + if not correct_file_for_one_sentence_per_line( + file_path=file_path, dest_path=output + ): + all_single_sentences = False + + sys.exit(0 if all_single_sentences else 1) + def trigger(): """ diff --git a/tests/data/5_policy_compliant.md b/tests/data/5_policy_compliant.md new file mode 100644 index 0000000..8ff0a09 --- /dev/null +++ b/tests/data/5_policy_compliant.md @@ -0,0 +1,25 @@ +# Security Policy + +## 1. Reporting a Vulnerability + +Please report responsibly. +Email the maintainers. + +### 1.2 Triage Process + +We triage within 48 hours. +We then assign a severity. + +| Severity | SLA | +| -------- | ------ | +| High | 1 day | +| Low | 1 week | + +See the [docs] for details. +Visit https://example.com/security for more. + +[docs]: https://example.com/docs "Documentation" + +``` +This is code. It has two sentences but must be ignored. 
+``` diff --git a/tests/test_cli.py b/tests/test_cli.py index 6845593..096dccd 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -58,7 +58,7 @@ def test_cli_fix_file(tmp_path, file_path, fixed_path, expected_returncode): """ dest_path = tmp_path / "test_file.md" - _, _, returncode = run_cli_command(["onesentence", "fix", str(file_path), str(dest_path)]) + _, _, returncode = run_cli_command(["onesentence", "fix", str(file_path), "--output", str(dest_path)]) assert returncode == expected_returncode @@ -73,3 +73,117 @@ def test_cli_fix_file(tmp_path, file_path, fixed_path, expected_returncode): comparison_dest_content = file.read() assert dest_content == comparison_dest_content + + +def test_cli_check_multiple_files(tmp_path): + """ + check accepts multiple paths and fails if any single file has a violation. + """ + good = tmp_path / "good.md" + good.write_text("One good sentence.\nAnother good one.\n") + bad = tmp_path / "bad.md" + bad.write_text("One. Two.\n") + + # All compliant -> 0 + _, _, returncode = run_cli_command(["onesentence", "check", str(good)]) + assert returncode == 0 + + # Any violation across the set -> 1 + _, _, returncode = run_cli_command(["onesentence", "check", str(good), str(bad)]) + assert returncode == 1 + + +def test_cli_fix_multiple_files_independently(tmp_path): + """ + fix corrects each file in place without letting one overwrite another. + """ + a = tmp_path / "a.md" + a.write_text("Alpha one. Alpha two.\n") + b = tmp_path / "b.md" + b.write_text("Beta one. Beta two.\n") + + _, _, returncode = run_cli_command(["onesentence", "fix", str(a), str(b)]) + assert returncode == 1 # both required changes + + assert a.read_text() == "Alpha one.\nAlpha two.\n" + assert b.read_text() == "Beta one.\nBeta two.\n" + + +def test_cli_fix_output_rejects_multiple_files(tmp_path): + """ + --output is only valid with a single input; using it with several inputs is + a usage error and must not write or modify anything. 
+ """ + a = tmp_path / "a.md" + a.write_text("Alpha one. Alpha two.\n") + b = tmp_path / "b.md" + b.write_text("Beta one. Beta two.\n") + out = tmp_path / "out.md" + + _, _, returncode = run_cli_command( + ["onesentence", "fix", str(a), str(b), "--output", str(out)] + ) + assert returncode == 2 + assert not out.exists() + # inputs are left untouched + assert a.read_text() == "Alpha one. Alpha two.\n" + assert b.read_text() == "Beta one. Beta two.\n" + + +def test_cli_fix_output_single_file_leaves_source_untouched(tmp_path): + """ + fix --output writes the corrected content to the destination and leaves the + source file unchanged. + """ + src = tmp_path / "src.md" + src.write_text("Alpha. Beta.\n") + out = tmp_path / "out.md" + + _, _, returncode = run_cli_command( + ["onesentence", "fix", str(src), "--output", str(out)] + ) + assert returncode == 1 + assert src.read_text() == "Alpha. Beta.\n" + assert out.read_text() == "Alpha.\nBeta.\n" + + +@pytest.mark.parametrize("subcommand", ["check", "fix"]) +def test_cli_no_input_files_is_usage_error(subcommand): + """ + Running check/fix with no file arguments is a usage error (exit code 2). + """ + _, _, returncode = run_cli_command(["onesentence", subcommand]) + assert returncode == 2 + + +def test_cli_fix_is_idempotent(tmp_path): + """ + Fixing a file once makes subsequent checks pass and further fixes no-ops. + """ + doc = tmp_path / "doc.md" + doc.write_text("One. Two. Three.\n") + + _, _, returncode = run_cli_command(["onesentence", "fix", str(doc)]) + assert returncode == 1 + after_first_fix = doc.read_text() + assert after_first_fix == "One.\nTwo.\nThree.\n" + + # A check now passes. + _, _, returncode = run_cli_command(["onesentence", "check", str(doc)]) + assert returncode == 0 + + # A second fix changes nothing and reports success. 
+ _, _, returncode = run_cli_command(["onesentence", "fix", str(doc)]) + assert returncode == 0 + assert doc.read_text() == after_first_fix + + +def test_cli_check_compliant_policy_document_returns_zero(): + """ + Checking a compliant document full of headings, tables, link definitions, + URLs, and code returns a zero exit code. + """ + _, _, returncode = run_cli_command( + ["onesentence", "check", "tests/data/5_policy_compliant.md"] + ) + assert returncode == 0 diff --git a/tests/test_integration.py b/tests/test_integration.py new file mode 100644 index 0000000..703ef96 --- /dev/null +++ b/tests/test_integration.py @@ -0,0 +1,171 @@ +""" +Integration tests for onesentence. + +These exercise onesentence together with the tools it is meant to be used +alongside: the ``mdformat`` formatter and the ``pre-commit`` / ``prek`` hook +runners. Each test is skipped when the relevant tool is not installed. +""" + +import os +import shutil +import subprocess +import sys +from pathlib import Path + +import pytest +import yaml + +REPO_ROOT = Path(__file__).resolve().parent.parent +MANIFEST = REPO_ROOT / ".pre-commit-hooks.yaml" + + +def _resolve_tool(name: str): + """ + Resolve a CLI tool to an absolute path. + + Prefer the script that ships next to the active interpreter (the venv used + to run the tests) so we bypass any shims on PATH; fall back to PATH. + Returns None when the tool cannot be found. 
+ """ + candidate = os.path.join(os.path.dirname(sys.executable), name) + if os.path.isfile(candidate): + return candidate + return shutil.which(name) + + +ONESENTENCE = _resolve_tool("onesentence") + + +def _run(cmd, cwd=None): + return subprocess.run(cmd, cwd=cwd, capture_output=True, text=True, check=False) + + +@pytest.mark.skipif( + _resolve_tool("mdformat") is None or ONESENTENCE is None, + reason="mdformat (or onesentence) is not installed", +) +def test_mdformat_and_onesentence_are_idempotent(tmp_path): + """ + Repeated ``mdformat --wrap=keep`` + ``onesentence fix`` runs converge to a + fixed point: once stable, further rounds change nothing and the document is + one-sentence-per-line compliant. + """ + mdformat = _resolve_tool("mdformat") + doc = tmp_path / "doc.md" + doc.write_text( + "# Title\n" + "\n" + "## 1. Section\n" + "\n" + "First sentence. Second sentence. Third sentence.\n" + "\n" + "See https://example.com/x for details. Then continue onward.\n" + ) + + def round_trip(): + _run([mdformat, "--wrap=keep", str(doc)]) + return _run([ONESENTENCE, "fix", str(doc)]) + + # Iterate until the content stops changing. + previous = None + for _ in range(6): + round_trip() + current = doc.read_text() + if current == previous: + break + previous = current + else: + pytest.fail("mdformat + onesentence did not reach a fixed point") + + stable = doc.read_text() + + # Once stable, another round changes nothing and reports success (0). + result = round_trip() + assert doc.read_text() == stable + assert result.returncode == 0 + + # The stable document is one-sentence-per-line compliant. + assert _run([ONESENTENCE, "check", str(doc)]).returncode == 0 + + # And the multi-sentence prose really was split onto separate lines. + assert "First sentence.\nSecond sentence.\nThird sentence." 
in stable + + +def _check_hook_from_manifest(): + """Load the ``check`` hook definition from the project's hook manifest.""" + hooks = yaml.safe_load(MANIFEST.read_text()) + return next(hook for hook in hooks if hook["id"] == "check") + + +def _write_consumer_repo(tmp_path): + """ + Create a throwaway git repo configured to run onesentence as a local hook, + deriving the file-selection rules (``types`` / ``files``) from the project's + actual hook manifest so the test validates the manifest itself. + """ + hook = _check_hook_from_manifest() + _run(["git", "init", "-q", str(tmp_path)]) + _run(["git", "-C", str(tmp_path), "config", "user.email", "test@example.com"]) + _run(["git", "-C", str(tmp_path), "config", "user.name", "test"]) + + config = { + "repos": [ + { + "repo": "local", + "hooks": [ + { + "id": "onesentence-check", + "name": hook["name"], + # Use the resolved absolute entry so the hook does not + # depend on onesentence being on PATH. + "entry": f"{ONESENTENCE} check", + "language": "system", + "types": hook["types"], + "files": hook["files"], + } + ], + } + ] + } + (tmp_path / ".pre-commit-config.yaml").write_text(yaml.safe_dump(config)) + + # A genuine violation in a Markdown file... + (tmp_path / "bad.md").write_text("# Title\n\nOne sentence. Two sentence.\n") + # ...a compliant reStructuredText file... + (tmp_path / "ok.rst").write_text("Title\n=====\n\nA single sentence.\n") + # ...and a non-matching source file that must be ignored entirely. + (tmp_path / "code.py").write_text("x = 1 # One. 
Two.\n") + _run(["git", "-C", str(tmp_path), "add", "-A"]) + + +@pytest.mark.skipif(ONESENTENCE is None, reason="onesentence is not installed") +@pytest.mark.parametrize("runner_name", ["pre-commit", "prek"]) +def test_hook_runs_through_runner(runner_name, tmp_path): + """ + The hook's file matching and exit codes behave correctly under both + pre-commit and prek: matching files are checked, a violation fails, a + compliant file passes, and non-matching files are skipped. + """ + runner = _resolve_tool(runner_name) + if runner is None: + pytest.skip(f"{runner_name} is not installed") + + _write_consumer_repo(tmp_path) + + def run_on(*filenames): + cmd = [runner, "run", "--files", *[str(tmp_path / f) for f in filenames]] + return _run(cmd, cwd=tmp_path) + + # The Markdown file with two sentences on one line fails the hook. This + # only happens if the file actually matched (the file-selection fix). + bad = run_on("bad.md") + assert bad.returncode != 0, bad.stdout + bad.stderr + + # The compliant reST file passes. + ok = run_on("ok.rst") + assert ok.returncode == 0, ok.stdout + ok.stderr + + # The Python file does not match and is skipped, so the run succeeds even + # though its comment contains two sentences. + code = run_on("code.py") + assert code.returncode == 0, code.stdout + code.stderr diff --git a/tests/test_utils.py b/tests/test_utils.py index eb12330..bae45d6 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -20,6 +20,28 @@ ("The string `foo. Bar baz` is invalid.", True), ("> This is a blockquote. It has two sentences.", True), (" continuation of a list item. With two sentences.", True), + # Headings are structural and exempt, including numbered headings that pysbd + # would otherwise split on (e.g. "### 1. Foo" -> ["### 1. ", "Foo"]). + ("# Heading", True), + ("## 1. Numbered Section", True), + ("### 1.2 Subsection With Two. Looking Parts", True), + # Horizontal rules / setext underlines. 
+ ("---", True), + ("=====", True), + # Table rows and separators are structural. + ("| Severity | SLA |", True), + ("| -------- | --- |", True), + # Link reference definitions are structural. + ("[docs]: https://example.com/docs", True), + ('[docs]: https://example.com/docs "Documentation"', True), + # URLs and Markdown links must not trigger false sentence breaks. + ("Visit https://example.com/path for details.", True), + ("See the [guide](https://example.com/guide) for more.", True), + ("Contact us at .", True), + ("An image ![alt text](image.png) sits here.", True), + # ...but a genuine sentence break after a URL/link is still detected. + ("See https://example.com/a. Then continue here.", False), + ("Read [the docs](https://example.com/y). Then proceed onward.", False), ]) def test_is_single_sentence(line, expected): assert is_single_sentence(line, ignore_block=False) == expected @@ -37,6 +59,35 @@ def test_check_file_for_single_sentences(tmp_path, file_content, expected): file_path.write_text(file_content) assert check_file_for_one_sentence_per_line(file_path) == expected +def test_check_compliant_policy_document(): + """ + A realistic policy document made of headings (including numbered ones), + a table, a link reference definition, URLs, and a code block must be + reported as compliant. + """ + assert check_file_for_one_sentence_per_line("tests/data/5_policy_compliant.md") is True + + +def test_headings_and_tables_are_not_flagged_but_prose_is(tmp_path): + """ + Structural Markdown (headings, tables, link defs) is exempt, while genuine + multi-sentence prose lines are still caught. + """ + content = ( + "# Title\n\n" + "## 1. Numbered Heading\n\n" + "| Col A | Col B |\n" + "| ----- | ----- |\n" + "| 1 | 2 |\n\n" + "This line is fine.\n" + "This line has two sentences. That is a violation.\n" + ) + file_path = tmp_path / "doc.md" + file_path.write_text(content) + # Only the genuine prose violation should fail the check. 
+ assert check_file_for_one_sentence_per_line(str(file_path)) is False + + @pytest.mark.parametrize("file_content, expected_content, expected_returncode", [ ( "This is a single sentence.\nAnother single sentence.\n", diff --git a/uv.lock b/uv.lock index cb69a39..51da1e3 100644 --- a/uv.lock +++ b/uv.lock @@ -1,6 +1,10 @@ version = 1 revision = 3 requires-python = ">=3.9" +resolution-markers = [ + "python_full_version >= '3.10'", + "python_full_version < '3.10'", +] [[package]] name = "cfgv" @@ -179,6 +183,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e0/86/c4395700f3c5475424fb5c41e20c16be28d10c904aee4d005ba3217fc8e7/identify-2.6.2-py2.py3-none-any.whl", hash = "sha256:c097384259f49e372f4ea00a19719d95ae27dd5ff0fd77ad630aa891306b82f3", size = 98982, upload-time = "2024-11-09T18:11:35.861Z" }, ] +[[package]] +name = "importlib-metadata" +version = "8.7.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "zipp", marker = "python_full_version < '3.10'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f3/49/3b30cad09e7771a4982d9975a8cbf64f00d4a1ececb53297f1d9a7be1b10/importlib_metadata-8.7.1.tar.gz", hash = "sha256:49fef1ae6440c182052f407c8d34a68f72efc36db9ca90dc0113398f2fdde8bb", size = 57107, upload-time = "2025-12-21T10:00:19.278Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fa/5e/f8e9a1d23b9c20a551a8a02ea3637b4642e22c2626e3a13a9a29cdea99eb/importlib_metadata-8.7.1-py3-none-any.whl", hash = "sha256:5a1f80bf1daa489495071efbb095d75a634cf28a8bc299581244063b53176151", size = 27865, upload-time = "2025-12-21T10:00:18.329Z" }, +] + [[package]] name = "iniconfig" version = "2.0.0" @@ -188,6 +204,78 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ef/a6/62565a6e1cf69e10f5727360368e451d4b7f58beeac6173dc9db836a5b46/iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374", size = 5892, upload-time = 
"2023-01-07T11:08:09.864Z" }, ] +[[package]] +name = "markdown-it-py" +version = "3.0.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.10'", +] +dependencies = [ + { name = "mdurl", marker = "python_full_version < '3.10'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/38/71/3b932df36c1a044d397a1f92d1cf91ee0a503d91e470cbd670aa66b07ed0/markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb", size = 74596, upload-time = "2023-06-03T06:41:14.443Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/42/d7/1ec15b46af6af88f19b8e5ffea08fa375d433c998b8a7639e76935c14f1f/markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1", size = 87528, upload-time = "2023-06-03T06:41:11.019Z" }, +] + +[[package]] +name = "markdown-it-py" +version = "4.2.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.10'", +] +dependencies = [ + { name = "mdurl", marker = "python_full_version >= '3.10'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/06/ff/7841249c247aa650a76b9ee4bbaeae59370dc8bfd2f6c01f3630c35eb134/markdown_it_py-4.2.0.tar.gz", hash = "sha256:04a21681d6fbb623de53f6f364d352309d4094dd4194040a10fd51833e418d49", size = 82454, upload-time = "2026-05-07T12:08:28.36Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b3/81/4da04ced5a082363ecfa159c010d200ecbd959ae410c10c0264a38cac0f5/markdown_it_py-4.2.0-py3-none-any.whl", hash = "sha256:9f7ebbcd14fe59494226453aed97c1070d83f8d24b6fc3a3bcf9a38092641c4a", size = 91687, upload-time = "2026-05-07T12:08:27.182Z" }, +] + +[[package]] +name = "mdformat" +version = "0.7.22" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.10'", +] +dependencies = [ + { name = "importlib-metadata", marker = 
"python_full_version < '3.10'" }, + { name = "markdown-it-py", version = "3.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, + { name = "tomli", marker = "python_full_version < '3.10'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/fc/eb/b5cbf2484411af039a3d4aeb53a5160fae25dd8c84af6a4243bc2f3fedb3/mdformat-0.7.22.tar.gz", hash = "sha256:eef84fa8f233d3162734683c2a8a6222227a229b9206872e6139658d99acb1ea", size = 34610, upload-time = "2025-01-30T18:00:51.418Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f2/6f/94a7344f6d634fe3563bea8b33bccedee37f2726f7807e9a58440dc91627/mdformat-0.7.22-py3-none-any.whl", hash = "sha256:61122637c9e1d9be1329054f3fa216559f0d1f722b7919b060a8c2a4ae1850e5", size = 34447, upload-time = "2025-01-30T18:00:48.708Z" }, +] + +[[package]] +name = "mdformat" +version = "1.0.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.10'", +] +dependencies = [ + { name = "markdown-it-py", version = "4.2.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "tomli", marker = "python_full_version == '3.10.*'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3f/05/32b5e14b192b0a8a309f32232c580aefedd9d06017cb8fe8fce34bec654c/mdformat-1.0.0.tar.gz", hash = "sha256:4954045fcae797c29f86d4ad879e43bb151fa55dbaf74ac6eaeacf1d45bb3928", size = 56953, upload-time = "2025-10-16T12:05:03.695Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/54/9a/8fe71b95985ca7a4001effbcc58e5a07a1f2a2884203f74dcf48a3b08315/mdformat-1.0.0-py3-none-any.whl", hash = "sha256:bca015d65a1d063a02e885a91daee303057bc7829c2cd37b2075a50dbb65944b", size = 53288, upload-time = "2025-10-16T12:05:02.607Z" }, +] + +[[package]] +name = "mdurl" +version = "0.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729, upload-time = "2022-08-14T12:40:10.846Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" }, +] + [[package]] name = "nodeenv" version = "1.9.1" @@ -208,7 +296,10 @@ dependencies = [ [package.dev-dependencies] dev = [ { name = "coverage" }, + { name = "mdformat", version = "0.7.22", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, + { name = "mdformat", version = "1.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, { name = "pre-commit" }, + { name = "prek" }, { name = "pytest" }, ] @@ -221,7 +312,9 @@ requires-dist = [ [package.metadata.requires-dev] dev = [ { name = "coverage", specifier = ">=7.6.12" }, + { name = "mdformat", specifier = ">=0.7.22" }, { name = "pre-commit", specifier = ">=4.0.0" }, + { name = "prek", specifier = ">=0.4.5" }, { name = "pytest", specifier = ">=8.3.5" }, ] @@ -268,6 +361,30 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/16/8f/496e10d51edd6671ebe0432e33ff800aa86775d2d147ce7d43389324a525/pre_commit-4.0.1-py2.py3-none-any.whl", hash = "sha256:efde913840816312445dc98787724647c65473daefe420785f885e8ed9a06878", size = 218713, upload-time = "2024-10-08T16:09:35.726Z" }, ] +[[package]] +name = "prek" +version = "0.4.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2f/65/23866f43521d31173879aa74bb3a2df50ab7f3f74cdb4eaa31b8f446c7ca/prek-0.4.5.tar.gz", hash = 
"sha256:2be7bcf839de19a0144ed5a5aadf73bc5899cf6823bb1c58cf1d45ae389c201a", size = 482566, upload-time = "2026-06-15T11:36:48.299Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f4/cb/a9eedf9a35ca6ec72f12af2b4392d7f757bb24863b7b7af4523f939cf3fa/prek-0.4.5-py3-none-linux_armv6l.whl", hash = "sha256:f7517774c72b001573520dc7111156779fd3e5b4452c11f09ff53c71a067e835", size = 5618105, upload-time = "2026-06-15T11:36:21.998Z" }, + { url = "https://files.pythonhosted.org/packages/30/a7/c96c06f17db7da0a57be2be4c229aa00b525bca8001c9c765663b339cbb7/prek-0.4.5-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:aca9fa995536036a0171bcf7a4db96dc0a14f480054eda1d7d1c2e7739650993", size = 5972998, upload-time = "2026-06-15T11:36:41.12Z" }, + { url = "https://files.pythonhosted.org/packages/28/f1/721695355cdaa44be6f091e3a77fb9c72ed60289520f78b2f8c9a7197bdd/prek-0.4.5-py3-none-macosx_11_0_arm64.whl", hash = "sha256:66877ff21ae9d548f0f7e56fab8e65f1500a74a810e7749188c3f35a4a1b911b", size = 5525098, upload-time = "2026-06-15T11:36:30.127Z" }, + { url = "https://files.pythonhosted.org/packages/9b/1b/a334e1bb5361b49adf52b5ac7b6532018940f9f0f253437e8f43c3c1f7f3/prek-0.4.5-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:50697089a86a78d16f087c1912a2f3bc2bea82319a220fac52cc8e3ec9fc0426", size = 5793732, upload-time = "2026-06-15T11:36:35.745Z" }, + { url = "https://files.pythonhosted.org/packages/28/8c/aff94d276e91207a87cedff7cfefdd4aca20444137cca77bf53fffebe77a/prek-0.4.5-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:590427a42a3c1e5064487a0dc91167ae0c8a52168e77f574758ef9b138fcfd61", size = 5521719, upload-time = "2026-06-15T11:36:39.383Z" }, + { url = "https://files.pythonhosted.org/packages/4f/73/cfb0c5c909442050a8357e26233f7e511ba8e0d2f4b0bdc460065d62beb6/prek-0.4.5-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1fd98b986767dafdb6b4305b563ee5a3a8f13bd3c78b98d708626815ea9f147f", 
size = 5922623, upload-time = "2026-06-15T11:36:18.063Z" }, + { url = "https://files.pythonhosted.org/packages/0a/ad/ff9d26551ba80d190bd08c6341176a5d56d4e6de9c2ebf077793d4adbb78/prek-0.4.5-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fccd11613ae92619d1ecda0ab3359ceebeb38898909ec84a8d383733d12158cc", size = 6722071, upload-time = "2026-06-15T11:36:43.086Z" }, + { url = "https://files.pythonhosted.org/packages/d3/43/11d1dfd66c919953fe89ae2fdedd4f413ee923883043816d35982177bb75/prek-0.4.5-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14109d37b33e5529db41a3539d4f8f72d295f6eeddede3964994d898b8cec05c", size = 6176454, upload-time = "2026-06-15T11:36:33.803Z" }, + { url = "https://files.pythonhosted.org/packages/d7/d4/9749f25c2e0ee5225f812457b888acef301e0ccce64bebcda2ac1d04abee/prek-0.4.5-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:40d262418105b2ede9836593a1927fc927cc8093c432e998640964102196996e", size = 5791133, upload-time = "2026-06-15T11:36:23.891Z" }, + { url = "https://files.pythonhosted.org/packages/c7/72/5e0344bab1eacf813a5b1b082cb4c6253930096166dad51c1cccee0a4f83/prek-0.4.5-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:a586d14c3b852fdee1c3dcd0b9cb0915db9f9d054334b854fd9470bf68edf129", size = 5658098, upload-time = "2026-06-15T11:36:44.862Z" }, + { url = "https://files.pythonhosted.org/packages/be/a5/1f406e0362dd0f18ba09a562d50d7c04a70ac05d350b1ab6fba36ca3e9f0/prek-0.4.5-py3-none-musllinux_1_1_armv7l.whl", hash = "sha256:a8ed0d28f3e7790e4402a9324c386509066df6e67cc587f7406f9a245b97b7e8", size = 5498634, upload-time = "2026-06-15T11:36:31.828Z" }, + { url = "https://files.pythonhosted.org/packages/c7/df/b0cbf0fa527330188390b7b6c8d279cd5e509923262d0a6c5cc44bbdf103/prek-0.4.5-py3-none-musllinux_1_1_i686.whl", hash = "sha256:86f76bd3d2ecf6fd9034d75c62ff4c786eb11d0dd0a1f79bbb4343b023e12769", size = 5784840, upload-time = "2026-06-15T11:36:37.481Z" }, + { url = 
"https://files.pythonhosted.org/packages/9d/d7/977ee3c622c906677dd94187a00392ce2dd76035486b3a3b1b5a5267dd34/prek-0.4.5-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:e491a1a4641d91d8b03dcce5588397e76d2a5b432c9b0a6c70475972b4512ab4", size = 6300384, upload-time = "2026-06-15T11:36:27.602Z" }, + { url = "https://files.pythonhosted.org/packages/79/fa/43b1d761381dc1c7eeb8f2235c66e902970d4b2bff2dec0f02836c085769/prek-0.4.5-py3-none-win32.whl", hash = "sha256:7546989b2403c96137bd79d19ebfe21facb87266cefe819db2458c3b9b23f350", size = 5287935, upload-time = "2026-06-15T11:36:20.293Z" }, + { url = "https://files.pythonhosted.org/packages/f5/fe/59b5eb3124f5a4cc255a93857b9ab42402635b273f157e91de23bfa40e8f/prek-0.4.5-py3-none-win_amd64.whl", hash = "sha256:8b2ac9227504371d97338215b344184cb0b31ca94113515a3a90c509c6c5a707", size = 5682560, upload-time = "2026-06-15T11:36:25.865Z" }, + { url = "https://files.pythonhosted.org/packages/97/0e/589ff0eab9034909b1ec8654ee03483797305fb743b3554ce6140d82da9d/prek-0.4.5-py3-none-win_arm64.whl", hash = "sha256:646a86a1a082dbd99fed96314b1064f5644bb34c1f4037a63547a18e2160fb86", size = 5509019, upload-time = "2026-06-15T11:36:46.595Z" }, +] + [[package]] name = "pygments" version = "2.19.2" @@ -417,3 +534,12 @@ sdist = { url = "https://files.pythonhosted.org/packages/8c/b3/7b6a79c5c8cf6d90e wheels = [ { url = "https://files.pythonhosted.org/packages/ae/92/78324ff89391e00c8f4cf6b8526c41c6ef36b4ea2d2c132250b1a6fc2b8d/virtualenv-20.27.1-py3-none-any.whl", hash = "sha256:f11f1b8a29525562925f745563bfd48b189450f61fb34c4f9cc79dd5aa32a1f4", size = 3117838, upload-time = "2024-10-28T18:00:19.994Z" }, ] + +[[package]] +name = "zipp" +version = "3.23.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/30/21/093488dfc7cc8964ded15ab726fad40f25fd3d788fd741cc1c5a17d78ee8/zipp-3.23.1.tar.gz", hash = "sha256:32120e378d32cd9714ad503c1d024619063ec28aad2248dc6672ad13edfa5110", size = 25965, 
upload-time = "2026-04-13T23:21:46.6Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/08/8a/0861bec20485572fbddf3dfba2910e38fe249796cb73ecdeb74e07eeb8d3/zipp-3.23.1-py3-none-any.whl", hash = "sha256:0b3596c50a5c700c9cb40ba8d86d9f2cc4807e9bedb06bcdf7fac85633e444dc", size = 10378, upload-time = "2026-04-13T23:21:45.386Z" }, +]