CU-DBMI · d33bs · Jun 21, 2026 · Jun 21, 2026 · Jun 21, 2026
diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml
@@ -25,11 +25,16 @@ jobs:
       # run pre-commit ci lite for automated fixes
       - uses: pre-commit-ci/lite-action@v1.1.0
         if: ${{ !cancelled() }}
-      # Test that the hooks from `pre-commit-hooks.yaml`
-      # are working as expected.
+      # Test that the hooks from `.pre-commit-hooks.yaml` are working as
+      # expected by running the check hook against known-compliant fixtures.
+      # The non-compliant fixtures under tests/data are intentional violations
+      # and are exercised through pre-commit and prek by the pytest suite.
       - name: run local hook
         run: |
-          pre-commit try-repo . --all
+          pre-commit try-repo . check --files \
+            tests/data/2_true_neg.md \
+            tests/data/4_true_neg.rst \
+            tests/data/5_policy_compliant.md
   run_tests:
     strategy:
       matrix:

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -22,7 +22,7 @@ repos:
         args: ["--keep", "mdformat", "--keep", "pre-commit-update"]
 
 -   repo: https://github.com/tox-dev/pyproject-fmt
-    rev: "v2.23.0"
+    rev: "v2.25.0"
     hooks:
     -   id: pyproject-fmt
 -   repo: https://github.com/codespell-project/codespell

diff --git a/.pre-commit-hooks.yaml b/.pre-commit-hooks.yaml
@@ -5,13 +5,15 @@
   description: Check if each line in the given file contains only one sentence.
   entry: onesentence check
   language: python
-  types: [text, markdown, rst]
+  # `types` tags are ANDed together, so `[text, markdown, rst]` matched no file;
+  # `text` alone covers Markdown, reST, and plain text, narrowed by `files`.
+  types: [text]
   files: \.(md|rst|txt)$
 # fix
 - id: fix
   name: One Sentence Per Line Fix
   description: Fix files to ensure each line contains only one sentence.
   entry: onesentence fix
   language: python
-  types: [text, markdown, rst]
+  types: [text]
   files: \.(md|rst|txt)$
diff --git a/README.md b/README.md
@@ -33,16 +33,21 @@ The `onesentence` tool provides a command-line interface for checking and fixing
 #### Commands
 
 ```bash
-  onesentence check <file_path>
+  onesentence check <file_path> [<file_path> ...]
 ```
 
-This command checks if the specified file adheres to the "one sentence per line" rule. It will return a non-zero exit code if any violations are found.
+This command checks whether each given file adheres to the "one sentence per line" rule.
+One or more files may be passed (for example, the filenames pre-commit hands to a hook).
+It returns a non-zero exit code if any file has a violation.
 
 ```bash
-  onesentence fix <file_path> [<dest_path>]
+  onesentence fix <file_path> [<file_path> ...] [--output <path>]
 ```
 
-This command corrects the specified file by splitting lines with multiple sentences into separate lines. If a dest_path is provided, the corrected file will be written to that path; otherwise, the original file will be overwritten.
+This command corrects each given file by splitting lines with multiple sentences onto separate lines.
+By default every file is corrected in place, so processing many files never lets one file overwrite another.
+Pass `--output <path>` to write a single corrected file to a separate destination; this is only valid with exactly one input file.
+It returns a non-zero exit code if any file required changes.
 
 ## Pre-commit hook
 
@@ -51,10 +56,28 @@ Install this pre-commit hook into your project with a block like the following:
 ```yaml
 repos:
   - repo: https://github.com/CU-DBMI/onesentence
-    rev: v0.0.1
+    rev: v0.1.1
     hooks:
         # run checks
         - id: check
         # run checks and fixes where possible
         - id: fix
 ```
+
+### Using onesentence with a Markdown formatter
+
+If you also run a Markdown formatter such as
+[`mdformat`](https://github.com/executablebooks/mdformat), configure it to
+preserve existing line breaks so it does not undo the one-sentence-per-line
+splitting.
+For `mdformat` this means using `--wrap=keep` (the default), and notably **not**
+`--wrap=no` or a fixed wrap width, either of which would rejoin sentences onto a
+single line.
+
+```yaml
+  - repo: https://github.com/executablebooks/mdformat
+    rev: 0.7.22
+    hooks:
+        - id: mdformat
+          args: ["--wrap=keep"]
+```
diff --git a/pyproject.toml b/pyproject.toml
@@ -26,13 +26,11 @@ scripts.onesentence = "onesentence.cli:trigger"
 [dependency-groups]
 dev = [
   "coverage>=7.6.12",
+  "mdformat>=0.7.22",
+  "pre-commit>=4",
+  "prek>=0.4.5",
   "pytest>=8.3.5",
 ]
 
 [tool.setuptools_scm]
 root = "."
-
-[tool.uv]
-dev-dependencies = [
-  "pre-commit>=4.0.0",
-]
diff --git a/src/onesentence/analyze.py b/src/onesentence/analyze.py
@@ -2,9 +2,91 @@
 Module for checking for one sentence per line and related.
 """
 
-import pysbd
 import re
-from typing import Optional
+from typing import List, Optional
+
+import pysbd
+
+# A single segmenter is reused across calls; constructing one per line is
+# needless overhead and the segmenter is stateless between ``segment`` calls.
+_SEGMENTER = pysbd.Segmenter(language="en", clean=False)
+
+# Non-prose Markdown / reStructuredText structures that are exempt from the
+# "one sentence per line" rule.  These are matched against the stripped line.
+_HEADING_RE = re.compile(r"^#{1,6}(\s|$)")  # ATX heading, incl. "### 1. Foo"
+_RULE_OR_UNDERLINE_RE = re.compile(
+    r"^[=\-~`#*_]+$"
+)  # setext underline, thematic break, or emphasis-only line
+_RST_DIRECTIVE_RE = re.compile(r"^\.\.\s+\w+::")  # ".. directive::"
+_LINK_DEFINITION_RE = re.compile(
+    r"^\[[^\]]+\]:\s+\S"
+)  # link reference definition, e.g. "[id]: https://example.com"
+_TABLE_ROW_RE = re.compile(r"^\|")  # "| cell | cell |"
+_TABLE_SEPARATOR_RE = re.compile(
+    r"^\|?[\s:|-]*\|[\s:|-]*$"
+)  # "|---|:--:|" style separators
+
+# Prose constructs whose internal punctuation must not be read as sentence
+# boundaries; each is replaced with a single opaque token before segmentation.
+_INLINE_CODE_RE = re.compile(r"``[^`]*``|`[^`]*`")  # `code` / ``code``
+_INLINE_LINK_RE = re.compile(r"!?\[[^\]]*\]\([^)]*\)")  # [text](url) / ![alt](url)
+_REFERENCE_LINK_RE = re.compile(r"\[[^\]]*\]\[[^\]]*\]")  # [text][ref]
+_AUTOLINK_RE = re.compile(r"<[^>\s]+>")  # <https://example.com>
+# Bare URLs, stopping before any trailing sentence punctuation so a period that
+# actually ends the sentence is preserved.
+_BARE_URL_RE = re.compile(
+    r"\b(?:https?|ftp)://[^\s]+?(?=[.,!?;:'\")\]]*(?:\s|$))"
+    r"|\bwww\.[^\s]+?(?=[.,!?;:'\")\]]*(?:\s|$))"
+)
+# Remaining markup (emphasis, heading markers, pipes, ...) that is not part of
+# sentence structure.
+_NON_SENTENCE_CHARS_RE = re.compile(r"[^a-zA-Z0-9\s.,!?\'\"()\-]")
+
+
+def _is_structural_line(stripped: str) -> bool:
+    """
+    Return True for non-prose Markdown/reST lines exempt from the rule.
+
+    Args:
+        stripped (str): The line with surrounding whitespace removed.
+
+    Returns:
+        bool: True if the line is a heading, horizontal rule / underline,
+        reST directive, link reference definition, or table row/separator.
+    """
+    return bool(
+        _HEADING_RE.match(stripped)
+        or _RULE_OR_UNDERLINE_RE.match(stripped)
+        or _RST_DIRECTIVE_RE.match(stripped)
+        or _LINK_DEFINITION_RE.match(stripped)
+        or _TABLE_ROW_RE.match(stripped)
+        or _TABLE_SEPARATOR_RE.match(stripped)
+    )
+
+
+def _mask_inline_constructs(text: str) -> str:
+    """
+    Replace inline code, links, and URLs with opaque tokens.
+
+    This keeps their internal punctuation (dots in URLs, abbreviations inside
+    code, ...) from being treated as sentence boundaries while leaving the
+    surrounding prose intact for segmentation.
+
+    Args:
+        text (str): The stripped line to mask.
+
+    Returns:
+        str: The line with inline constructs masked and stray markup removed.
+    """
+    # Order matters: code first, then links (which may wrap URLs), then any
+    # remaining autolinks / bare URLs.
+    text = _INLINE_CODE_RE.sub("INLINECODE", text)
+    text = _INLINE_LINK_RE.sub("LINK", text)
+    text = _REFERENCE_LINK_RE.sub("LINK", text)
+    text = _AUTOLINK_RE.sub("URL", text)
+    text = _BARE_URL_RE.sub("URL", text)
+    return _NON_SENTENCE_CHARS_RE.sub("", text)
+
 
 def is_single_sentence(line: str, ignore_block: bool) -> bool:
     """
@@ -29,34 +111,28 @@ def is_single_sentence(line: str, ignore_block: bool) -> bool:
     if ignore_block:
         return True
 
-    # Additional filtering for common reST and Markdown formatting
-    if re.match(r'^[=\-~`#\*]+$', line.strip()):
-        return True
-    if re.match(r'^\.\.\s+\w+::', line.strip()):
+    stripped = line.strip()
+
+    # Ignore non-prose structures (headings, rules, link definitions, tables,
+    # reST directives). Numbered headings such as "### 1. Foo" are covered here.
+    if _is_structural_line(stripped):
         return True
 
     # Allow multiple sentences in list items, their continuations, and blockquotes
-    if re.match(r'^\s*[-*+]\s+', line):  # Unordered list item
+    if re.match(r"^\s*[-*+]\s+", line):  # Unordered list item
         return True
-    if re.match(r'^\s*\d+\.\s+', line):  # Ordered list item
+    if re.match(r"^\s*\d+\.\s+", line):  # Ordered list item
         return True
-    if re.match(r'^\s+\S', line):  # Indented continuation of a list item
+    if re.match(r"^\s+\S", line):  # Indented continuation of a list item
         return True
-    if re.match(r'^>\s*', line):  # Blockquote
+    if re.match(r"^>\s*", line):  # Blockquote
         return True
 
-    line = line.strip()
-
-    # Mask inline code spans so their content doesn't trigger false sentence breaks
-    # Double backticks (reST) must be matched before single backticks (Markdown)
-    line = re.sub(r'``[^`]*``|`[^`]*`', 'INLINECODE', line)
+    # Mask inline code, links, and URLs so their punctuation does not trigger
+    # false sentence breaks, then count the remaining sentences.
+    masked = _mask_inline_constructs(stripped)
+    return len(_SEGMENTER.segment(masked)) == 1
 
-    # Remove special characters that do not pertain to sentence structure
-    line = re.sub(r'[^a-zA-Z0-9\s.,!?\'"()\-]', '', line)
-
-    segmenter = pysbd.Segmenter(language="en", clean=False)
-    sentences = segmenter.segment(line)
-    return len(sentences) == 1
 
 def check_file_for_one_sentence_per_line(file_path: str) -> bool:
     """
@@ -71,9 +147,9 @@ def check_file_for_one_sentence_per_line(file_path: str) -> bool:
     all_single_sentences = True
     ignore_block = False
     in_code_block = False
-    with open(file_path, 'r') as file:
+    with open(file_path, "r") as file:
         for line_number, line in enumerate(file, start=1):
-            if line.strip().startswith('```'):
+            if line.strip().startswith("```"):
                 in_code_block = not in_code_block
                 continue
             if "noqa: onesentence-start" in line:
@@ -82,12 +158,15 @@ def check_file_for_one_sentence_per_line(file_path: str) -> bool:
             if "noqa: onesentence-end" in line:
                 ignore_block = False
                 continue
-            if not is_single_sentence(line.rstrip('\n'), ignore_block or in_code_block):
+            if not is_single_sentence(line.rstrip("\n"), ignore_block or in_code_block):
                 print(f"Failed: line {line_number}: {line.strip()}")
                 all_single_sentences = False
     return all_single_sentences
 
-def correct_file_for_one_sentence_per_line(file_path: str, dest_path: Optional[str] = None) -> bool:
+
+def correct_file_for_one_sentence_per_line(
+    file_path: str, dest_path: Optional[str] = None
+) -> bool:
     """
     Check if each line in the given file contains only one sentence.
     If not, correct the file by replacing the contents with correctly segmented sentences.
@@ -105,16 +184,16 @@ def correct_file_for_one_sentence_per_line(file_path: str, dest_path: Optional[s
     all_single_sentences = True
     ignore_block = False
     in_code_block = False
-    corrected_lines = []
-
-    segmenter = pysbd.Segmenter(language="en", clean=False)
+    corrected_lines: List[str] = []
 
-    with open(file_path, 'r') as file:
+    with open(file_path, "r") as file:
         for line_number, line in enumerate(file, start=1):
-            original_indent = re.match(r'^\s*', line).group()  # Capture the original indentation
+            original_indent = re.match(
+                r"^\s*", line
+            ).group()  # Capture the original indentation
             stripped_line = line.strip()
 
-            if stripped_line.startswith('```'):
+            if stripped_line.startswith("```"):
                 in_code_block = not in_code_block
                 corrected_lines.append(line.rstrip())
                 continue
@@ -126,17 +205,19 @@ def correct_file_for_one_sentence_per_line(file_path: str, dest_path: Optional[s
                 ignore_block = False
                 corrected_lines.append(line.rstrip())
                 continue
-            if not is_single_sentence(line.rstrip('\n'), ignore_block or in_code_block):
+            if not is_single_sentence(line.rstrip("\n"), ignore_block or in_code_block):
                 print(f"Failed: line {line_number}: {stripped_line}")
                 all_single_sentences = False
                 if not ignore_block:
-                    sentences = segmenter.segment(stripped_line)
+                    sentences = _SEGMENTER.segment(stripped_line)
                     # Detect and move lines with only Markdown characters to the end of the second-to-last line
-                    if sentences and re.match(r'^[=\-~`#\*]+$', sentences[-1]):
+                    if sentences and re.match(r"^[=\-~`#\*]+$", sentences[-1]):
                         markdown_line = sentences.pop()
                         if sentences:
                             sentences[-1] += markdown_line
-                    corrected_lines.extend([original_indent + sentence.strip() for sentence in sentences])
+                    corrected_lines.extend(
+                        [original_indent + sentence.strip() for sentence in sentences]
+                    )
                 else:
                     corrected_lines.append(line.rstrip())
             else:
@@ -147,8 +228,8 @@ def correct_file_for_one_sentence_per_line(file_path: str, dest_path: Optional[s
         dest_path = file_path
 
     # Write the corrected content back to the file
-    with open(dest_path, 'w') as file:
+    with open(dest_path, "w") as file:
         for corrected_line in corrected_lines:
-            file.write(corrected_line + '\n')
+            file.write(corrected_line + "\n")
 
     return all_single_sentences