From f9076bb67e51f277550a0e73e2277fdda7765a9c Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 18 May 2026 19:05:41 +0000 Subject: [PATCH 1/3] Add repo hygiene: packaging, CI, security, community files - pyproject.toml makes the project pip-installable with a substack-link-checker console entry point - GitHub Actions CI workflow (ruff lint, multi-version Python smoke tests, build artifact) - SECURITY.md with vulnerability reporting and session-cookie guidance - CONTRIBUTING.md, CODE_OF_CONDUCT.md, CHANGELOG.md - Issue and PR templates under .github/ - Dependabot for pip and github-actions - .env.example documenting supported env vars - Fix incorrect clone URL in README, add CI badge --- .env.example | 10 +++ .github/ISSUE_TEMPLATE/bug_report.yml | 51 +++++++++++++++ .github/ISSUE_TEMPLATE/config.yml | 5 ++ .github/ISSUE_TEMPLATE/feature_request.yml | 23 +++++++ .github/PULL_REQUEST_TEMPLATE.md | 20 ++++++ .github/dependabot.yml | 17 +++++ .github/workflows/ci.yml | 64 +++++++++++++++++++ CHANGELOG.md | 28 +++++++++ CODE_OF_CONDUCT.md | 46 ++++++++++++++ CONTRIBUTING.md | 42 +++++++++++++ README.md | 13 +++- SECURITY.md | 49 +++++++++++++++ pyproject.toml | 72 ++++++++++++++++++++++ 13 files changed, 437 insertions(+), 3 deletions(-) create mode 100644 .env.example create mode 100644 .github/ISSUE_TEMPLATE/bug_report.yml create mode 100644 .github/ISSUE_TEMPLATE/config.yml create mode 100644 .github/ISSUE_TEMPLATE/feature_request.yml create mode 100644 .github/PULL_REQUEST_TEMPLATE.md create mode 100644 .github/dependabot.yml create mode 100644 .github/workflows/ci.yml create mode 100644 CHANGELOG.md create mode 100644 CODE_OF_CONDUCT.md create mode 100644 CONTRIBUTING.md create mode 100644 SECURITY.md create mode 100644 pyproject.toml diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..94b8fda --- /dev/null +++ b/.env.example @@ -0,0 +1,10 @@ +# Copy to .env (gitignored) and fill in. 
The CLI does not auto-load this +# file; these are documented for your own shell setup, e.g.: +# export $(grep -v '^#' .env | xargs) + +# Your Substack site, e.g. https://example.substack.com +SUBSTACK_BASE_URL= + +# Substack session cookie (substack.sid). Treat like a password. +# Get it from your browser DevTools -> Application -> Cookies after logging in. +SUBSTACK_COOKIE= diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 0000000..2abd140 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,51 @@ +name: Bug report +description: Report a problem with the link checker +title: "[Bug]: " +labels: [bug] +body: + - type: markdown + attributes: + value: | + Thanks for taking the time to file a bug! **Do not include your + Substack session cookie** anywhere in this report. Redact it from + any commands or logs you paste. + - type: textarea + id: what-happened + attributes: + label: What happened? + description: A clear description of the bug and what you expected instead. + validations: + required: true + - type: textarea + id: command + attributes: + label: Command you ran + description: Paste the command (with cookie value redacted as `REDACTED`). + render: bash + validations: + required: true + - type: textarea + id: output + attributes: + label: Output / error + description: Full output or stack trace. Redact any cookie values. + render: shell + - type: input + id: python + attributes: + label: Python version + placeholder: "e.g. 3.11.5" + validations: + required: true + - type: input + id: os + attributes: + label: Operating system + placeholder: "e.g. macOS 14.4, Ubuntu 22.04, Windows 11" + validations: + required: true + - type: input + id: version + attributes: + label: Tool version / commit + placeholder: "e.g. 
0.1.0 or commit hash" diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000..03bbf37 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,5 @@ +blank_issues_enabled: false +contact_links: + - name: Security vulnerability + url: https://github.com/jcddc83/substack-broken-link-checker/security/advisories/new + about: Please report security issues privately, not in public issues. See SECURITY.md. diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml new file mode 100644 index 0000000..e026fe9 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -0,0 +1,23 @@ +name: Feature request +description: Suggest a new feature or improvement +title: "[Feature]: " +labels: [enhancement] +body: + - type: textarea + id: problem + attributes: + label: Problem + description: What problem would this solve? What's the use case? + validations: + required: true + - type: textarea + id: proposal + attributes: + label: Proposed solution + description: How might this work? CLI flags, behavior, output format, etc. 
+ validations: + required: true + - type: textarea + id: alternatives + attributes: + label: Alternatives considered diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..51f18c2 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,20 @@ +## Summary + + + +## Type of change + +- [ ] Bug fix +- [ ] New feature +- [ ] Refactor / cleanup +- [ ] Docs only +- [ ] CI / tooling + +## Checklist + +- [ ] I ran `ruff check .` and `ruff format .` +- [ ] I ran `pytest` (if applicable) +- [ ] I verified `python substack_link_checker.py --help` still works +- [ ] I updated `README.md` / `USAGE.md` for any CLI changes +- [ ] I updated `CHANGELOG.md` +- [ ] No secrets / session cookies are included in this PR diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..bb9e7c3 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,17 @@ +version: 2 +updates: + - package-ecosystem: pip + directory: "/" + schedule: + interval: weekly + open-pull-requests-limit: 5 + labels: + - dependencies + + - package-ecosystem: github-actions + directory: "/" + schedule: + interval: weekly + labels: + - dependencies + - github-actions diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..1bd4c4d --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,64 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - name: Install ruff + run: pip install ruff + - name: Lint + run: ruff check . + - name: Format check + run: ruff format --check . 
+ + test: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + python-version: ["3.8", "3.10", "3.12"] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: pip + - name: Install + run: | + python -m pip install --upgrade pip + pip install -e ".[dev]" + - name: Import smoke test + run: | + python -c "import substack_link_checker; print('import OK')" + python substack_link_checker.py --help + - name: Run tests + run: pytest -q + continue-on-error: true + + build: + runs-on: ubuntu-latest + needs: [lint, test] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - name: Build distributions + run: | + pip install build + python -m build + - uses: actions/upload-artifact@v4 + with: + name: dist + path: dist/ diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..f77fa53 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,28 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +### Added +- `pyproject.toml` making the project pip-installable with a + `substack-link-checker` console entry point. +- GitHub Actions CI workflow (`.github/workflows/ci.yml`) running ruff + lint, multi-version Python smoke tests, and a build step. +- `SECURITY.md` documenting vulnerability reporting and safe handling of + Substack session cookies. +- `CONTRIBUTING.md` and `CODE_OF_CONDUCT.md`. +- Issue and pull request templates under `.github/`. +- Dependabot configuration for weekly dependency and Actions updates. +- `.env.example` documenting the supported environment variables. 
+ +### Fixed +- Corrected the clone URL in `README.md` (was `substack-link-checker`, + now `substack-broken-link-checker`). + +## [0.1.0] - 2026-05-18 + +Initial public release. diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..03a6a55 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,46 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, caste, color, religion, or sexual +identity and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment: + +- Demonstrating empathy and kindness toward others +- Being respectful of differing opinions, viewpoints, and experiences +- Giving and gracefully accepting constructive feedback +- Accepting responsibility and apologizing to those affected by mistakes +- Focusing on what is best for the overall community + +Examples of unacceptable behavior: + +- The use of sexualized language or imagery, and sexual attention or advances +- Trolling, insulting or derogatory comments, and personal or political attacks +- Public or private harassment +- Publishing others' private information without their explicit permission +- Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the project maintainers through GitHub. All complaints will be +reviewed and investigated promptly and fairly. 
+ +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.1, available at +https://www.contributor-covenant.org/version/2/1/code_of_conduct.html. + +[homepage]: https://www.contributor-covenant.org diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..bfd4cc6 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,42 @@ +# Contributing + +Thanks for your interest in improving the Substack Broken Link Checker. + +## Development setup + +```bash +git clone https://github.com/jcddc83/substack-broken-link-checker.git +cd substack-broken-link-checker +python -m venv .venv +source .venv/bin/activate # Windows: .venv\Scripts\activate +pip install -e ".[dev]" +``` + +## Before opening a PR + +- Run the linter: `ruff check .` +- Run the formatter: `ruff format .` +- Run the tests: `pytest` +- Verify the CLI still launches: `python substack_link_checker.py --help` + +## Filing issues + +Please use the issue templates under `.github/ISSUE_TEMPLATE/`. For bugs, +include your Python version, OS, the command you ran (with the cookie +value redacted), and the full error output. + +## Reporting security issues + +See [SECURITY.md](SECURITY.md). Do **not** file public issues for security +vulnerabilities. + +## Pull requests + +- Keep PRs focused — one logical change per PR. +- Update the `README.md` and `USAGE.md` if you change CLI behavior. +- Add tests for new behavior where practical. +- Match the style of the surrounding code. + +## Code of Conduct + +This project follows the [Contributor Covenant](CODE_OF_CONDUCT.md). 
diff --git a/README.md b/README.md index a3bbf3e..362a60d 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,7 @@ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/) +[![CI](https://github.com/jcddc83/substack-broken-link-checker/actions/workflows/ci.yml/badge.svg)](https://github.com/jcddc83/substack-broken-link-checker/actions/workflows/ci.yml) A fast, async Python tool to find broken links in your Substack newsletter archive. @@ -38,11 +39,17 @@ python substack_link_checker.py --base-url https://YOUR.substack.com --url-file ## Installation ```bash -git clone https://github.com/jcddc83/substack-link-checker.git -cd substack-link-checker +git clone https://github.com/jcddc83/substack-broken-link-checker.git +cd substack-broken-link-checker pip install -r requirements.txt ``` +Or install as a package (provides a `substack-link-checker` CLI): + +```bash +pip install git+https://github.com/jcddc83/substack-broken-link-checker.git +``` + **Requirements**: Python 3.8+ ## Authentication (Optional) @@ -209,4 +216,4 @@ MIT License - see [LICENSE](LICENSE) file. ## Contributing -Issues and pull requests welcome at [github.com/jcddc83/substack-link-checker](https://github.com/jcddc83/substack-link-checker). +Issues and pull requests welcome at [github.com/jcddc83/substack-broken-link-checker](https://github.com/jcddc83/substack-broken-link-checker). See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines and [SECURITY.md](SECURITY.md) for reporting security issues. diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..1e9f748 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,49 @@ +# Security Policy + +## Reporting a Vulnerability + +If you discover a security vulnerability in this project, please report it +privately rather than opening a public issue. 
+ +- Preferred: use GitHub's [private vulnerability reporting](https://github.com/jcddc83/substack-broken-link-checker/security/advisories/new). +- Alternatively, open a minimal public issue requesting a private contact channel — do not include exploit details. + +Please include: + +- A description of the issue and its impact +- Steps to reproduce +- Affected versions / commit +- Any suggested mitigation + +You can expect an initial response within 7 days. + +## Supported Versions + +Only the latest release on `main` receives security fixes. + +## Handling Session Cookies + +This tool accepts a Substack session cookie (`substack.sid`) via the +`--cookie` flag in order to access bot-protected or paywalled posts. +**Treat this value like a password.** + +Recommended practices: + +- Do **not** commit cookies to source control or paste them into public + logs, screenshots, or issue reports. +- Prefer passing the cookie via an environment variable or a local file + ignored by `.gitignore` rather than your shell history. +- Rotate the cookie by logging out and back in if you suspect it was + exposed. Substack session cookies typically expire after a few weeks. +- The tool sends the cookie only to the `--base-url` you specify. Verify + that URL before running. + +If you find the tool logging the cookie value to disk or transmitting it +to any host other than the configured Substack domain, please report it +through the channels above. + +## Dependencies + +Dependencies are declared with minimum version bounds in `pyproject.toml` and +monitored via Dependabot (`.github/dependabot.yml`). Please update to the +latest release to pick up upstream security fixes. 
diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..f82db0b --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,72 @@ +[build-system] +requires = ["setuptools>=64", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "substack-broken-link-checker" +version = "0.1.0" +description = "A fast, async Python tool to find broken links in your Substack newsletter archive." +readme = "README.md" +license = { file = "LICENSE" } +requires-python = ">=3.8" +authors = [{ name = "jcddc83" }] +keywords = ["substack", "link-checker", "broken-links", "async", "newsletter", "seo"] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: End Users/Desktop", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Internet :: WWW/HTTP", + "Topic :: Software Development :: Quality Assurance", +] +dependencies = [ + "aiohttp>=3.9.0", + "beautifulsoup4>=4.12.0", + "requests>=2.31.0", + "lxml>=4.9.0", + "openpyxl>=3.1.0", + "pandas>=2.0.0", +] + +[project.optional-dependencies] +dev = [ + "pytest>=7.0", + "pytest-asyncio>=0.21", + "ruff>=0.1.0", + "mypy>=1.0", +] + +[project.urls] +Homepage = "https://github.com/jcddc83/substack-broken-link-checker" +Issues = "https://github.com/jcddc83/substack-broken-link-checker/issues" +Repository = "https://github.com/jcddc83/substack-broken-link-checker" + +[project.scripts] +substack-link-checker = "substack_link_checker:main" + +[tool.setuptools] +py-modules = [ + "substack_link_checker", + "compare_posts", + "demo_link_checker", + "fetch_archive_urls", + "import_checked_posts", +] + +[tool.ruff] +line-length = 100 +target-version = "py38" + +[tool.ruff.lint] +select = ["E", "F", "W", "I", "B", 
"UP"] +ignore = ["E501"] + +[tool.pytest.ini_options] +testpaths = ["tests"] +asyncio_mode = "auto" From cc60b3c991c5298e51fd0db85a0e336a73044126 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 18 May 2026 19:13:22 +0000 Subject: [PATCH 2/3] Correct version and changelog to match live v1.0.0 release - pyproject.toml: 0.1.0 -> 1.0.0 to match the existing git tag and GitHub Release - CHANGELOG.md: replace fabricated 0.1.0 entry with a real 1.0.0 - 2026-01-01 section sourced from the v1.0.0 release notes; keep Unreleased section for the audit-branch additions - SECURITY.md: clarify the private-advisory link may 404 if the setting is not enabled, and reinforce no-exploit-details rule --- CHANGELOG.md | 29 +++++++++++++++++++++++++++-- SECURITY.md | 8 ++++++-- pyproject.toml | 2 +- 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f77fa53..e935947 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,31 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Corrected the clone URL in `README.md` (was `substack-link-checker`, now `substack-broken-link-checker`). -## [0.1.0] - 2026-05-18 +## [1.0.0] - 2026-01-01 -Initial public release. +Major rewrite of the Substack broken link checker with significant +performance improvements and new features. See the +[GitHub Release](https://github.com/jcddc83/substack-broken-link-checker/releases/tag/v1.0.0) +for the full announcement. + +### Added +- Async concurrent link checking with `aiohttp` (10-20x faster than + sequential). +- Smart link caching — the same URL across multiple posts is checked once. +- Retry logic with exponential backoff for transient failures. +- Incremental scanning: `--history-file` to track checked posts and + `--only-new` to skip ones already covered. +- `import_checked_posts.py` to import previous results from Excel/CSV. 
+- Domain filtering: `--skip-domains` / `--skip-domains-file` to assume OK + for bot-blocking sites; `--broken-domains` / `--broken-domains-file` to + auto-flag known broken domains. +- `--cookie` flag for Substack session cookie authentication (works with + paywalled / bot-protected content). +- Helper scripts: `compare_posts.py` to find unchecked posts, + `fetch_archive_urls.py` as an archive-page fallback, and + `run_link_checker.ps1` for Windows Task Scheduler automation. +- Complete `README.md` / `USAGE.md` rewrite with security considerations + and expanded troubleshooting. + +[Unreleased]: https://github.com/jcddc83/substack-broken-link-checker/compare/v1.0.0...HEAD +[1.0.0]: https://github.com/jcddc83/substack-broken-link-checker/releases/tag/v1.0.0 diff --git a/SECURITY.md b/SECURITY.md index 1e9f748..8b26573 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -5,8 +5,12 @@ If you discover a security vulnerability in this project, please report it privately rather than opening a public issue. -- Preferred: use GitHub's [private vulnerability reporting](https://github.com/jcddc83/substack-broken-link-checker/security/advisories/new). -- Alternatively, open a minimal public issue requesting a private contact channel — do not include exploit details. +- Preferred: use GitHub's [private vulnerability reporting](https://github.com/jcddc83/substack-broken-link-checker/security/advisories/new) + if it is enabled on this repository. +- If that link returns a 404 (private reporting not enabled), open a + **minimal** public issue asking the maintainer for a private contact + channel — do **not** include exploit details, proof-of-concept code, or + any session cookies in the public issue. 
Please include: diff --git a/pyproject.toml b/pyproject.toml index f82db0b..394dbf0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "substack-broken-link-checker" -version = "0.1.0" +version = "1.0.0" description = "A fast, async Python tool to find broken links in your Substack newsletter archive." readme = "README.md" license = { file = "LICENSE" } From 2ff2899fe38d4de07e22edf2785165c36d48d4e2 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 18 May 2026 19:18:50 +0000 Subject: [PATCH 3/3] B2 + B1: cookie audit fixes and initial pytest suite Cookie handling (B2): - main() now reads SUBSTACK_COOKIE env var; --cookie still wins if both are set so users can override ad-hoc. This is the path README and .env.example were already promising. - --cookie help text now warns about shell-history / ps aux exposure and points at the env var. - README: env var is documented as the recommended path; --cookie is shown as the fallback. - SECURITY.md: explicit guidance on env-var-vs-CLI; documents that the cookie is .substack.com-scoped on the requests session and that the outbound aiohttp link-check session is cookie-less. Tests (B1): - New tests/ suite with 29 tests covering: - should_skip_domain / is_broken_domain (exact, subdomain, lookalike, case-insensitive, malformed URL) - load_domains_from_file (comments, blanks, whitespace, missing file) - generate_report (empty results skip file; CSV header + rows; comma escaping in titles) - load/save history round-trip + corrupt-JSON recovery + filter_unchecked_posts - cookie handling: scoped to substack.com, not leaked via _log, history file, or repr; env-var fallback and CLI override - pytest now runs without continue-on-error in CI. - Also: ruff format auto-applied across the repo (cosmetic only), so CI's `ruff format --check .` passes. 
--- .github/workflows/ci.yml | 1 - CHANGELOG.md | 12 +- README.md | 11 +- SECURITY.md | 14 +- compare_posts.py | 49 ++-- demo_link_checker.py | 18 +- fetch_archive_urls.py | 24 +- import_checked_posts.py | 50 ++-- substack_link_checker.py | 332 ++++++++++++++------------- tests/__init__.py | 0 tests/conftest.py | 8 + tests/test_cookie_handling.py | 128 +++++++++++ tests/test_domain_filtering.py | 84 +++++++ tests/test_history.py | 58 +++++ tests/test_load_domains_from_file.py | 28 +++ tests/test_report_generation.py | 61 +++++ 16 files changed, 640 insertions(+), 238 deletions(-) create mode 100644 tests/__init__.py create mode 100644 tests/conftest.py create mode 100644 tests/test_cookie_handling.py create mode 100644 tests/test_domain_filtering.py create mode 100644 tests/test_history.py create mode 100644 tests/test_load_domains_from_file.py create mode 100644 tests/test_report_generation.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1bd4c4d..a65b5b7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -44,7 +44,6 @@ jobs: python substack_link_checker.py --help - name: Run tests run: pytest -q - continue-on-error: true build: runs-on: ubuntu-latest diff --git a/CHANGELOG.md b/CHANGELOG.md index e935947..01f8e4b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `pyproject.toml` making the project pip-installable with a `substack-link-checker` console entry point. - GitHub Actions CI workflow (`.github/workflows/ci.yml`) running ruff - lint, multi-version Python smoke tests, and a build step. + lint, multi-version Python smoke tests, a real `pytest` suite, and a + build step. +- Initial `pytest` test suite (`tests/`) covering domain filtering, + CSV report generation, history persistence, the + `load_domains_from_file` helper, and cookie-handling guarantees. 
- `SECURITY.md` documenting vulnerability reporting and safe handling of Substack session cookies. - `CONTRIBUTING.md` and `CODE_OF_CONDUCT.md`. @@ -19,6 +23,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Dependabot configuration for weekly dependency and Actions updates. - `.env.example` documenting the supported environment variables. +### Security +- `SUBSTACK_COOKIE` environment variable is now supported as a safer + alternative to the `--cookie` CLI flag (which leaks the cookie into + shell history and `ps aux`). README and `SECURITY.md` updated to + recommend the env-var path. + ### Fixed - Corrected the clone URL in `README.md` (was `substack-link-checker`, now `substack-broken-link-checker`). diff --git a/README.md b/README.md index 362a60d..1863add 100644 --- a/README.md +++ b/README.md @@ -59,13 +59,22 @@ If Substack blocks your requests or you need to check paywalled content, use you 1. Log into your Substack in a browser 2. Open Developer Tools (F12) → Application → Cookies 3. Find the `substack.sid` cookie and copy its value -4. Use it with the `--cookie` flag: +4. Provide it via the `SUBSTACK_COOKIE` environment variable (recommended) or the `--cookie` flag: ```bash +# Recommended: env var (keeps cookie out of shell history / ps aux) +export SUBSTACK_COOKIE="your-substack-sid-cookie-value" +python substack_link_checker.py --base-url https://YOUR.substack.com --year 2024 + +# Alternative: --cookie flag (visible in process listings) python substack_link_checker.py --base-url https://YOUR.substack.com --year 2024 \ --cookie "your-substack-sid-cookie-value" ``` +**Security:** Treat the session cookie like a password. Prefer the env var +so it does not end up in your shell history or in `ps aux`. See +[SECURITY.md](SECURITY.md) for full guidance. + **Note:** Your session cookie expires after a few weeks. If you start getting 403 errors, get a fresh cookie from your browser. 
## Usage diff --git a/SECURITY.md b/SECURITY.md index 8b26573..f3709f8 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -35,12 +35,18 @@ Recommended practices: - Do **not** commit cookies to source control or paste them into public logs, screenshots, or issue reports. -- Prefer passing the cookie via an environment variable or a local file - ignored by `.gitignore` rather than your shell history. +- Prefer the `SUBSTACK_COOKIE` environment variable over the `--cookie` + CLI flag. CLI arguments are visible in shell history (`~/.bash_history`, + `~/.zsh_history`) and in process listings (`ps aux`), where any other + user on the machine can read them. - Rotate the cookie by logging out and back in if you suspect it was exposed. Substack session cookies typically expire after a few weeks. -- The tool sends the cookie only to the `--base-url` you specify. Verify - that URL before running. +- The cookie is scoped to `.substack.com` and will be sent by the + synchronous `requests` session to any `*.substack.com` host the tool + fetches. In normal use that is only your own Substack (the + `--base-url`), but be aware of this if you point the tool elsewhere. +- Outbound link checks use a **separate** `aiohttp` session with no + cookies attached, so external links are checked anonymously. 
If you find the tool logging the cookie value to disk or transmitting it to any host other than the configured Substack domain, please report it diff --git a/compare_posts.py b/compare_posts.py index 57c2ae6..3669659 100644 --- a/compare_posts.py +++ b/compare_posts.py @@ -3,34 +3,38 @@ import json import sys -import requests import xml.etree.ElementTree as ET +import requests + + def get_sitemap_posts(base_url): """Get all post URLs from the sitemap.""" sitemap_url = f"{base_url}/sitemap.xml" - headers = {'User-Agent': 'Mozilla/5.0'} - + headers = {"User-Agent": "Mozilla/5.0"} + response = requests.get(sitemap_url, headers=headers, timeout=10) response.raise_for_status() - + root = ET.fromstring(response.content) - namespace = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'} - - urls = root.findall('.//ns:url/ns:loc', namespace) + namespace = {"ns": "http://www.sitemaps.org/schemas/sitemap/0.9"} + + urls = root.findall(".//ns:url/ns:loc", namespace) # Filter to only /p/ posts (not /about, /archive, etc.) 
- posts = [url.text for url in urls if '/p/' in url.text] + posts = [url.text for url in urls if "/p/" in url.text] return posts + def load_history(history_file): """Load checked posts from history file.""" try: - with open(history_file, 'r') as f: + with open(history_file) as f: data = json.load(f) - return set(data.get('checked_posts', {}).keys()) + return set(data.get("checked_posts", {}).keys()) except FileNotFoundError: return set() + def main(): if len(sys.argv) < 2: print("Usage: python compare_posts.py [history-file]") @@ -43,29 +47,30 @@ def main(): print(f"Fetching posts from {base_url}/sitemap.xml...") sitemap_posts = get_sitemap_posts(base_url) checked_posts = load_history(history_file) - + sitemap_set = set(sitemap_posts) unchecked = sitemap_set - checked_posts checked = sitemap_set & checked_posts - - print(f"\n{'='*50}") - print(f"COMPARISON RESULTS") - print(f"{'='*50}") + + print(f"\n{'=' * 50}") + print("COMPARISON RESULTS") + print(f"{'=' * 50}") print(f"Total posts in sitemap: {len(sitemap_posts)}") print(f"Already checked: {len(checked)}") print(f"Not yet checked: {len(unchecked)}") - print(f"{'='*50}\n") - + print(f"{'=' * 50}\n") + if unchecked: print("UNCHECKED POSTS:") for url in sorted(unchecked): print(f" {url}") - + # Optionally save unchecked to file - with open('unchecked_posts.txt', 'w') as f: + with open("unchecked_posts.txt", "w") as f: for url in sorted(unchecked): - f.write(url + '\n') - print(f"\nSaved unchecked URLs to: unchecked_posts.txt") + f.write(url + "\n") + print("\nSaved unchecked URLs to: unchecked_posts.txt") + -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/demo_link_checker.py b/demo_link_checker.py index baa15a1..7e87125 100644 --- a/demo_link_checker.py +++ b/demo_link_checker.py @@ -6,7 +6,9 @@ """ import asyncio + import aiohttp + from substack_link_checker import SubstackLinkChecker @@ -24,11 +26,7 @@ async def demo_check_links(): ] # Initialize checker (base_url doesn't matter for 
direct link tests) - checker = SubstackLinkChecker( - base_url="https://example.substack.com", - timeout=10, - verbose=True - ) + checker = SubstackLinkChecker(base_url="https://example.substack.com", timeout=10, verbose=True) print("=" * 60) print("SUBSTACK LINK CHECKER - DEMO") @@ -38,10 +36,8 @@ async def demo_check_links(): # Create aiohttp session for checking connector = aiohttp.TCPConnector(limit=5, ssl=True) async with aiohttp.ClientSession( - connector=connector, - headers=checker.DEFAULT_HEADERS + connector=connector, headers=checker.DEFAULT_HEADERS ) as session: - for url, description in test_urls: print(f"Testing: {description}") print(f" URL: {url}") @@ -58,7 +54,9 @@ async def demo_check_links(): print("Demo complete!") print() print("To check your own Substack, run:") - print(" python substack_link_checker.py --base-url https://YOUR-SUBSTACK.substack.com --year 2024") + print( + " python substack_link_checker.py --base-url https://YOUR-SUBSTACK.substack.com --year 2024" + ) def main(): @@ -66,5 +64,5 @@ def main(): asyncio.run(demo_check_links()) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/fetch_archive_urls.py b/fetch_archive_urls.py index 1c362cb..8b77b1d 100644 --- a/fetch_archive_urls.py +++ b/fetch_archive_urls.py @@ -10,8 +10,8 @@ python fetch_archive_urls.py https://example.substack.com 2024 """ -import re import sys + import requests from bs4 import BeautifulSoup @@ -30,8 +30,8 @@ def fetch_archive_urls(base_url, year=None): archive_url = f"{base_url.rstrip('/')}/archive" headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', - 'Accept': 'text/html,application/xhtml+xml', + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", + "Accept": "text/html,application/xhtml+xml", } print(f"Fetching archive from {archive_url}...") @@ -44,16 +44,16 @@ def fetch_archive_urls(base_url, year=None): print(f"\nTry visiting {archive_url} in your browser and manually 
copying URLs.") return [] - soup = BeautifulSoup(response.text, 'html.parser') + soup = BeautifulSoup(response.text, "html.parser") # Find all links that look like posts (/p/ pattern) post_urls = set() - for link in soup.find_all('a', href=True): - href = link['href'] - if '/p/' in href: + for link in soup.find_all("a", href=True): + href = link["href"] + if "/p/" in href: # Make absolute URL if relative - if href.startswith('/'): - href = base_url.rstrip('/') + href + if href.startswith("/"): + href = base_url.rstrip("/") + href # Filter by year if specified if year: year_str = str(year) @@ -85,13 +85,13 @@ def main(): if urls: # Save to file filename = f"archive_urls{'_' + str(year) if year else ''}.txt" - with open(filename, 'w') as f: + with open(filename, "w") as f: for url in urls: - f.write(url + '\n') + f.write(url + "\n") print(f"\nSaved to: {filename}") print("\nTo check these posts, run:") print(f" python substack_link_checker.py --base-url {base_url} --url-file {filename}") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/import_checked_posts.py b/import_checked_posts.py index b18de8b..a136b49 100644 --- a/import_checked_posts.py +++ b/import_checked_posts.py @@ -14,14 +14,15 @@ import sys from datetime import datetime + def load_existing_history(history_file): """Load existing history file if it exists.""" if os.path.exists(history_file): try: - with open(history_file, 'r', encoding='utf-8') as f: + with open(history_file, encoding="utf-8") as f: data = json.load(f) - return data.get('checked_posts', {}) - except (json.JSONDecodeError, IOError) as e: + return data.get("checked_posts", {}) + except (OSError, json.JSONDecodeError) as e: print(f"Warning: Could not load existing history: {e}") return {} @@ -41,7 +42,7 @@ def import_from_excel(excel_file): # Find the Post URL column (case-insensitive) url_column = None for col in df.columns: - if 'post url' in col.lower() or 'post_url' in col.lower(): + if "post url" in 
col.lower() or "post_url" in col.lower(): url_column = col break @@ -62,13 +63,13 @@ def import_from_csv(csv_file): print(f"Reading CSV file: {csv_file}") urls = set() - with open(csv_file, 'r', encoding='utf-8') as f: + with open(csv_file, encoding="utf-8") as f: reader = csv.DictReader(f) # Find the Post URL column (case-insensitive) url_column = None for col in reader.fieldnames: - if 'post url' in col.lower() or 'post_url' in col.lower(): + if "post url" in col.lower() or "post_url" in col.lower(): url_column = col break @@ -77,8 +78,8 @@ def import_from_csv(csv_file): sys.exit(1) for row in reader: - url = row.get(url_column, '').strip() - if url and url.startswith('http'): + url = row.get(url_column, "").strip() + if url and url.startswith("http"): urls.add(url) print(f"Found {len(urls)} unique post URLs") @@ -87,32 +88,27 @@ def import_from_csv(csv_file): def save_history(history_file, checked_posts): """Save updated history to file.""" - data = { - 'last_updated': datetime.now().isoformat(), - 'checked_posts': checked_posts - } - with open(history_file, 'w', encoding='utf-8') as f: + data = {"last_updated": datetime.now().isoformat(), "checked_posts": checked_posts} + with open(history_file, "w", encoding="utf-8") as f: json.dump(data, f, indent=2) print(f"Saved history with {len(checked_posts)} checked posts to {history_file}") def main(): parser = argparse.ArgumentParser( - description='Import checked posts from Excel/CSV into history file.' - ) - parser.add_argument( - 'input_file', - help='Excel (.xlsx) or CSV file with Post URL column' + description="Import checked posts from Excel/CSV into history file." 
) + parser.add_argument("input_file", help="Excel (.xlsx) or CSV file with Post URL column") parser.add_argument( - '--history-file', '-H', - default='checked_posts.json', - help='History file to update (default: checked_posts.json)' + "--history-file", + "-H", + default="checked_posts.json", + help="History file to update (default: checked_posts.json)", ) parser.add_argument( - '--date', - default=datetime.now().strftime('%Y-%m-%dT00:00:00'), - help='Date to use for imported posts (default: today)' + "--date", + default=datetime.now().strftime("%Y-%m-%dT00:00:00"), + help="Date to use for imported posts (default: today)", ) args = parser.parse_args() @@ -123,9 +119,9 @@ def main(): print(f"Existing history: {existing_count} posts") # Import from file - if args.input_file.endswith('.xlsx') or args.input_file.endswith('.xls'): + if args.input_file.endswith(".xlsx") or args.input_file.endswith(".xls"): urls = import_from_excel(args.input_file) - elif args.input_file.endswith('.csv'): + elif args.input_file.endswith(".csv"): urls = import_from_csv(args.input_file) else: print("Error: File must be .xlsx, .xls, or .csv") @@ -145,5 +141,5 @@ def main(): save_history(args.history_file, checked_posts) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/substack_link_checker.py b/substack_link_checker.py index fb3dae7..80398d8 100644 --- a/substack_link_checker.py +++ b/substack_link_checker.py @@ -18,11 +18,11 @@ import re import sys import time -from dataclasses import dataclass, field +import xml.etree.ElementTree as ET +from dataclasses import dataclass from datetime import datetime from typing import Dict, List, Optional, Set, Tuple from urllib.parse import urljoin, urlparse -import xml.etree.ElementTree as ET import aiohttp import requests @@ -32,6 +32,7 @@ @dataclass class LinkCheckResult: """Result of checking a single link.""" + is_broken: bool error_type: str from_cache: bool = False @@ -40,6 +41,7 @@ class LinkCheckResult: @dataclass class 
BrokenLinkRecord: """Record of a broken link for reporting.""" + post_title: str post_url: str broken_link: str @@ -58,26 +60,32 @@ class SubstackLinkChecker: """ DEFAULT_HEADERS = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', - 'Accept-Language': 'en-US,en;q=0.5', - 'DNT': '1', - 'Connection': 'keep-alive', - 'Upgrade-Insecure-Requests': '1' + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.5", + "DNT": "1", + "Connection": "keep-alive", + "Upgrade-Insecure-Requests": "1", } # Soft 404 detection patterns SOFT_404_PATTERNS = [ - '404 error', 'page not found', 'not found', '404', - 'page doesn\'t exist', 'page does not exist', - 'no longer available', 'has been removed', - 'couldn\'t find', 'could not find' + "404 error", + "page not found", + "not found", + "404", + "page doesn't exist", + "page does not exist", + "no longer available", + "has been removed", + "couldn't find", + "could not find", ] # Default domains to skip (known bot-blockers) DEFAULT_SKIP_DOMAINS = [ - 'wikipedia.org', - 'en.wikipedia.org', + "wikipedia.org", + "en.wikipedia.org", ] def __init__( @@ -90,7 +98,7 @@ def __init__( verbose: bool = False, skip_domains: Optional[List[str]] = None, broken_domains: Optional[List[str]] = None, - cookie: Optional[str] = None + cookie: Optional[str] = None, ): """ Initialize the link checker. 
@@ -106,7 +114,7 @@ def __init__( broken_domains: List of domains to auto-flag as broken without checking (e.g., ['local.example.com']) cookie: Substack session cookie (substack.sid) for authenticated access """ - self.base_url = base_url.rstrip('/') + self.base_url = base_url.rstrip("/") self.timeout = timeout self.concurrency = concurrency self.max_retries = max_retries @@ -121,7 +129,7 @@ def __init__( # Set authentication cookie if provided if cookie: - self.session.cookies.set('substack.sid', cookie, domain='.substack.com') + self.session.cookies.set("substack.sid", cookie, domain=".substack.com") # Global link cache: url -> LinkCheckResult self.link_cache: Dict[str, LinkCheckResult] = {} @@ -131,13 +139,13 @@ def __init__( # Statistics self.stats = { - 'total_links_checked': 0, - 'cache_hits': 0, - 'broken_links': 0, - 'retries': 0, - 'posts_skipped': 0, - 'links_skipped': 0, - 'links_auto_broken': 0 + "total_links_checked": 0, + "cache_hits": 0, + "broken_links": 0, + "retries": 0, + "posts_skipped": 0, + "links_skipped": 0, + "links_auto_broken": 0, } # History tracking @@ -149,15 +157,18 @@ def load_history(self, history_file: str) -> None: self.history_file = history_file if os.path.exists(history_file): try: - with open(history_file, 'r', encoding='utf-8') as f: + with open(history_file, encoding="utf-8") as f: data = json.load(f) - self.checked_posts = data.get('checked_posts', {}) - self._log(f"Loaded history: {len(self.checked_posts)} previously checked posts", force=True) - except (json.JSONDecodeError, IOError) as e: + self.checked_posts = data.get("checked_posts", {}) + self._log( + f"Loaded history: {len(self.checked_posts)} previously checked posts", + force=True, + ) + except (OSError, json.JSONDecodeError) as e: print(f"Warning: Could not load history file: {e}") self.checked_posts = {} else: - self._log(f"No history file found, starting fresh", force=True) + self._log("No history file found, starting fresh", force=True) def save_history(self) 
-> None: """Save checked posts history to JSON file.""" @@ -165,14 +176,11 @@ def save_history(self) -> None: return try: - data = { - 'last_updated': datetime.now().isoformat(), - 'checked_posts': self.checked_posts - } - with open(self.history_file, 'w', encoding='utf-8') as f: + data = {"last_updated": datetime.now().isoformat(), "checked_posts": self.checked_posts} + with open(self.history_file, "w", encoding="utf-8") as f: json.dump(data, f, indent=2) self._log(f"Saved history: {len(self.checked_posts)} checked posts", force=True) - except IOError as e: + except OSError as e: print(f"Warning: Could not save history file: {e}") def mark_post_checked(self, post_url: str) -> None: @@ -183,7 +191,7 @@ def filter_unchecked_posts(self, post_urls: List[str]) -> List[str]: """Filter out posts that have already been checked.""" unchecked = [url for url in post_urls if url not in self.checked_posts] skipped = len(post_urls) - len(unchecked) - self.stats['posts_skipped'] = skipped + self.stats["posts_skipped"] = skipped if skipped > 0: self._log(f"Skipping {skipped} previously checked posts", force=True) return unchecked @@ -197,7 +205,7 @@ def should_skip_domain(self, url: str) -> bool: domain = parsed.netloc.lower() # Check if domain matches or is a subdomain of any skip domain for skip_domain in self.skip_domains: - if domain == skip_domain or domain.endswith('.' + skip_domain): + if domain == skip_domain or domain.endswith("." + skip_domain): return True return False except Exception: @@ -212,7 +220,7 @@ def is_broken_domain(self, url: str) -> bool: domain = parsed.netloc.lower() # Check if domain matches or is a subdomain of any broken domain for broken_domain in self.broken_domains: - if domain == broken_domain or domain.endswith('.' + broken_domain): + if domain == broken_domain or domain.endswith("." 
+ broken_domain): return True return False except Exception: @@ -233,16 +241,16 @@ def fetch_sitemap(self) -> List[str]: response.raise_for_status() root = ET.fromstring(response.content) - namespace = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'} + namespace = {"ns": "http://www.sitemaps.org/schemas/sitemap/0.9"} # Check if this is a sitemap index - sitemaps = root.findall('.//ns:sitemap/ns:loc', namespace) + sitemaps = root.findall(".//ns:sitemap/ns:loc", namespace) if sitemaps: self._log(f"Found sitemap index with {len(sitemaps)} sitemaps", force=True) return [sitemap.text for sitemap in sitemaps] # Otherwise, get URLs from this sitemap - urls = root.findall('.//ns:url/ns:loc', namespace) + urls = root.findall(".//ns:url/ns:loc", namespace) return [url.text for url in urls] except requests.exceptions.RequestException as e: @@ -268,7 +276,7 @@ def get_post_urls_from_year_sitemap(self, year: int, limit: Optional[int] = None # If we got a sitemap index, fetch the year-specific one year_sitemap = None for url in all_urls: - if str(year) in url and 'sitemap' in url: + if str(year) in url and "sitemap" in url: year_sitemap = url break @@ -279,8 +287,8 @@ def get_post_urls_from_year_sitemap(self, year: int, limit: Optional[int] = None response.raise_for_status() root = ET.fromstring(response.content) - namespace = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'} - urls = root.findall('.//ns:url/ns:loc', namespace) + namespace = {"ns": "http://www.sitemaps.org/schemas/sitemap/0.9"} + urls = root.findall(".//ns:url/ns:loc", namespace) post_urls = [url.text for url in urls] if limit: @@ -302,10 +310,9 @@ def load_urls_from_file(self, file_path: str, limit: Optional[int] = None) -> Li self._log(f"Loading URLs from {file_path}...", force=True) try: - with open(file_path, 'r') as f: + with open(file_path) as f: urls = [ - line.strip() for line in f - if line.strip() and line.strip().startswith('http') + line.strip() for line in f if line.strip() and 
line.strip().startswith("http") ] if limit: @@ -316,7 +323,7 @@ def load_urls_from_file(self, file_path: str, limit: Optional[int] = None) -> Li except FileNotFoundError: print(f"Error: File not found: {file_path}") return [] - except IOError as e: + except OSError as e: print(f"Error loading URLs from file: {e}") return [] @@ -328,33 +335,37 @@ def extract_links_from_post(self, post_url: str) -> Tuple[str, List[str]]: response = self.session.get(post_url, timeout=self.timeout) response.raise_for_status() - soup = BeautifulSoup(response.text, 'html.parser') + soup = BeautifulSoup(response.text, "html.parser") # Get post title - title_tag = soup.find('h1') or soup.find('title') + title_tag = soup.find("h1") or soup.find("title") title = title_tag.get_text(strip=True) if title_tag else "Unknown Title" # Extract all links from the post content - content_area = soup.find('article') or soup.find('div', class_=re.compile('post|article|content')) + content_area = soup.find("article") or soup.find( + "div", class_=re.compile("post|article|content") + ) if content_area: - links = [a['href'] for a in content_area.find_all('a', href=True)] + links = [a["href"] for a in content_area.find_all("a", href=True)] else: - links = [a['href'] for a in soup.find_all('a', href=True)] + links = [a["href"] for a in soup.find_all("a", href=True)] # Filter and normalize links external_links = [] for link in links: # Skip anchors, mailto, tel, etc. - if link.startswith('#') or link.startswith('mailto:') or link.startswith('tel:'): + if link.startswith("#") or link.startswith("mailto:") or link.startswith("tel:"): continue # Skip Substack internal links (comments, share, etc.) 
- if 'substack.com' in link and any(x in link for x in ['/subscribe', '/comments', '/share']): + if "substack.com" in link and any( + x in link for x in ["/subscribe", "/comments", "/share"] + ): continue # Make relative URLs absolute - if link.startswith('/') or not link.startswith('http'): + if link.startswith("/") or not link.startswith("http"): link = urljoin(post_url, link) external_links.append(link) @@ -375,9 +386,7 @@ def extract_links_from_post(self, post_url: str) -> Tuple[str, List[str]]: return "Error fetching post", [] async def _check_link_once( - self, - session: aiohttp.ClientSession, - link: str + self, session: aiohttp.ClientSession, link: str ) -> Tuple[bool, str, bool]: """ Check a link once (no retries). @@ -389,7 +398,7 @@ async def _check_link_once( link, timeout=aiohttp.ClientTimeout(total=self.timeout), allow_redirects=True, - ssl=True + ssl=True, ) as response: # Check for HTTP errors if response.status == 404: @@ -405,8 +414,8 @@ async def _check_link_once( # Check for soft 404s in the page title try: content = await response.text() - soup = BeautifulSoup(content, 'html.parser') - title = soup.find('title') + soup = BeautifulSoup(content, "html.parser") + title = soup.find("title") if title: title_text = title.get_text().lower() if any(phrase in title_text for phrase in self.SOFT_404_PATTERNS): @@ -423,7 +432,7 @@ async def _check_link_once( except aiohttp.ClientConnectorError as e: error_str = str(e) # DNS failures are not retryable - if 'Name or service not known' in error_str or 'nodename nor servname' in error_str: + if "Name or service not known" in error_str or "nodename nor servname" in error_str: return True, "DNS Failure", False # Other connection errors might be transient return True, f"Connection Error: {error_str[:80]}", True @@ -433,9 +442,7 @@ async def _check_link_once( return True, f"Unknown Error: {str(e)[:80]}", False async def check_link_with_retry( - self, - session: aiohttp.ClientSession, - link: str + self, session: 
aiohttp.ClientSession, link: str ) -> LinkCheckResult: """ Check a link with retry logic and exponential backoff. @@ -444,22 +451,22 @@ async def check_link_with_retry( """ # Auto-flag known broken domains without checking if self.is_broken_domain(link): - self.stats['links_auto_broken'] += 1 - self.stats['broken_links'] += 1 + self.stats["links_auto_broken"] += 1 + self.stats["broken_links"] += 1 return LinkCheckResult(True, "Known broken domain") # Skip domains that block bots (assume OK) if self.should_skip_domain(link): - self.stats['links_skipped'] += 1 + self.stats["links_skipped"] += 1 return LinkCheckResult(False, "Skipped (bot-blocking domain)") # Check cache first if link in self.link_cache: - self.stats['cache_hits'] += 1 + self.stats["cache_hits"] += 1 cached = self.link_cache[link] return LinkCheckResult(cached.is_broken, cached.error_type, from_cache=True) - self.stats['total_links_checked'] += 1 + self.stats["total_links_checked"] += 1 delay = self.retry_delay last_error = "Unknown" @@ -478,21 +485,20 @@ async def check_link_with_retry( break # Exponential backoff - self.stats['retries'] += 1 - self._log(f" Retry {attempt + 1}/{self.max_retries} for {link[:60]}... (waiting {delay:.1f}s)") + self.stats["retries"] += 1 + self._log( + f" Retry {attempt + 1}/{self.max_retries} for {link[:60]}... (waiting {delay:.1f}s)" + ) await asyncio.sleep(delay) delay *= 2 # Exponential backoff result = LinkCheckResult(True, last_error) self.link_cache[link] = result - self.stats['broken_links'] += 1 + self.stats["broken_links"] += 1 return result async def check_links_batch( - self, - links: List[str], - post_title: str, - post_url: str + self, links: List[str], post_title: str, post_url: str ) -> List[BrokenLinkRecord]: """ Check a batch of links concurrently. 
@@ -503,8 +509,7 @@ async def check_links_batch( connector = aiohttp.TCPConnector(limit=self.concurrency, ssl=True) async with aiohttp.ClientSession( - connector=connector, - headers=self.DEFAULT_HEADERS + connector=connector, headers=self.DEFAULT_HEADERS ) as session: # Create semaphore to limit concurrency semaphore = asyncio.Semaphore(self.concurrency) @@ -529,12 +534,14 @@ async def check_with_semaphore(link: str) -> Tuple[str, LinkCheckResult]: if result.is_broken: cache_note = " (cached)" if result.from_cache else "" self._log(f" ✗ BROKEN{cache_note}: {link[:70]}... ({result.error_type})") - broken_records.append(BrokenLinkRecord( - post_title=post_title, - post_url=post_url, - broken_link=link, - error_type=result.error_type - )) + broken_records.append( + BrokenLinkRecord( + post_title=post_title, + post_url=post_url, + broken_link=link, + error_type=result.error_type, + ) + ) return broken_records @@ -543,7 +550,7 @@ async def check_post_links_async(self, post_url: str): title, links = self.extract_links_from_post(post_url) if not links: - self._log(f" No links found in this post\n") + self._log(" No links found in this post\n") return # Count how many are cached @@ -557,7 +564,7 @@ async def check_post_links_async(self, post_url: str): self._log(f" Found {len(broken_records)} broken links in this post\n") - def generate_report(self, output_file: str = 'broken_links_report.csv'): + def generate_report(self, output_file: str = "broken_links_report.csv"): """Generate a CSV report of broken links.""" print(f"\n{'=' * 50}") print("SUMMARY") @@ -575,19 +582,20 @@ def generate_report(self, output_file: str = 'broken_links_report.csv'): print(f"\nGenerating report: {output_file}") - with open(output_file, 'w', newline='', encoding='utf-8') as f: + with open(output_file, "w", newline="", encoding="utf-8") as f: writer = csv.DictWriter( - f, - fieldnames=['post_title', 'post_url', 'broken_link', 'error_type'] + f, fieldnames=["post_title", "post_url", 
"broken_link", "error_type"] ) writer.writeheader() for record in self.results: - writer.writerow({ - 'post_title': record.post_title, - 'post_url': record.post_url, - 'broken_link': record.broken_link, - 'error_type': record.error_type - }) + writer.writerow( + { + "post_title": record.post_title, + "post_url": record.post_url, + "broken_link": record.broken_link, + "error_type": record.error_type, + } + ) print(f"Report generated with {len(self.results)} broken links") @@ -595,10 +603,10 @@ async def run_async( self, year: Optional[int] = None, limit: Optional[int] = None, - output_file: str = 'broken_links_report.csv', + output_file: str = "broken_links_report.csv", url_file: Optional[str] = None, history_file: Optional[str] = None, - only_new: bool = False + only_new: bool = False, ): """ Main async entry point to run the link checker. @@ -622,15 +630,15 @@ async def run_async( self.load_history(history_file) print(f"History file: {history_file}") if only_new: - print(f"Mode: Only new posts (skipping previously checked)") + print("Mode: Only new posts (skipping previously checked)") # Get post URLs if url_file: - print(f"Input: File") + print("Input: File") print(f"URL file: {url_file}") post_urls = self.load_urls_from_file(url_file, limit) elif year: - print(f"Input: Sitemap") + print("Input: Sitemap") print(f"Year: {year}") post_urls = self.get_post_urls_from_year_sitemap(year, limit) else: @@ -644,7 +652,9 @@ async def run_async( if only_new and history_file: original_count = len(post_urls) post_urls = self.filter_unchecked_posts(post_urls) - print(f"Posts to check: {len(post_urls)} new (skipped {original_count - len(post_urls)} already checked)") + print( + f"Posts to check: {len(post_urls)} new (skipped {original_count - len(post_urls)} already checked)" + ) print(f"{'=' * 50}\n") @@ -678,10 +688,10 @@ def run( self, year: Optional[int] = None, limit: Optional[int] = None, - output_file: str = 'broken_links_report.csv', + output_file: str = 
"broken_links_report.csv", url_file: Optional[str] = None, history_file: Optional[str] = None, - only_new: bool = False + only_new: bool = False, ): """ Synchronous wrapper for run_async. @@ -694,7 +704,7 @@ def run( def parse_args() -> argparse.Namespace: """Parse command-line arguments.""" parser = argparse.ArgumentParser( - description='Check for broken links in Substack newsletter posts.', + description="Check for broken links in Substack newsletter posts.", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: @@ -721,96 +731,94 @@ def parse_args() -> argparse.Namespace: %(prog)s --base-url https://example.substack.com --url-file posts.txt \\ --skip-domains wikipedia.org ko-fi.com \\ --broken-domains local.example.com defunct.site.com - """ + """, ) # Required arguments parser.add_argument( - '--base-url', '-b', + "--base-url", + "-b", required=True, - help='Base URL of the Substack (e.g., https://example.substack.com)' + help="Base URL of the Substack (e.g., https://example.substack.com)", ) # Input source (one required) input_group = parser.add_mutually_exclusive_group(required=True) input_group.add_argument( - '--year', '-y', - type=int, - help='Year to check posts from (uses sitemap)' + "--year", "-y", type=int, help="Year to check posts from (uses sitemap)" ) input_group.add_argument( - '--url-file', '-f', - help='Path to file containing post URLs (one per line)' + "--url-file", "-f", help="Path to file containing post URLs (one per line)" ) # Optional arguments + parser.add_argument("--limit", "-l", type=int, help="Maximum number of posts to check") parser.add_argument( - '--limit', '-l', - type=int, - help='Maximum number of posts to check' - ) - parser.add_argument( - '--output', '-o', - default='broken_links_report.csv', - help='Output CSV filename (default: broken_links_report.csv)' + "--output", + "-o", + default="broken_links_report.csv", + help="Output CSV filename (default: broken_links_report.csv)", ) parser.add_argument( - 
'--concurrency', '-c', + "--concurrency", + "-c", type=int, default=10, - help='Maximum concurrent requests (default: 10)' + help="Maximum concurrent requests (default: 10)", ) parser.add_argument( - '--timeout', '-t', - type=int, - default=10, - help='Request timeout in seconds (default: 10)' + "--timeout", "-t", type=int, default=10, help="Request timeout in seconds (default: 10)" ) parser.add_argument( - '--max-retries', '-r', + "--max-retries", + "-r", type=int, default=3, - help='Maximum retry attempts for transient failures (default: 3)' - ) - parser.add_argument( - '--verbose', '-v', - action='store_true', - help='Enable verbose output' + help="Maximum retry attempts for transient failures (default: 3)", ) + parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose output") # History tracking arguments parser.add_argument( - '--history-file', '-H', - help='Path to JSON file for tracking checked posts (enables incremental scanning)' + "--history-file", + "-H", + help="Path to JSON file for tracking checked posts (enables incremental scanning)", ) parser.add_argument( - '--only-new', - action='store_true', - help='Only check posts not in history (requires --history-file)' + "--only-new", + action="store_true", + help="Only check posts not in history (requires --history-file)", ) parser.add_argument( - '--skip-domains', '-S', - nargs='+', - default=['wikipedia.org'], - help='Domains to skip checking and assume OK (default: wikipedia.org). Use --skip-domains none to check all.' + "--skip-domains", + "-S", + nargs="+", + default=["wikipedia.org"], + help="Domains to skip checking and assume OK (default: wikipedia.org). 
Use --skip-domains none to check all.", ) parser.add_argument( - '--skip-domains-file', - help='File containing domains to skip (one per line)' + "--skip-domains-file", help="File containing domains to skip (one per line)" ) parser.add_argument( - '--broken-domains', '-B', - nargs='+', + "--broken-domains", + "-B", + nargs="+", default=[], - help='Domains to auto-flag as broken without checking (e.g., local.example.com)' + help="Domains to auto-flag as broken without checking (e.g., local.example.com)", ) parser.add_argument( - '--broken-domains-file', - help='File containing domains to auto-flag as broken (one per line)' + "--broken-domains-file", + help="File containing domains to auto-flag as broken (one per line)", ) parser.add_argument( - '--cookie', '-C', - help='Substack session cookie (substack.sid) for authenticated access to paywalled content' + "--cookie", + "-C", + help=( + "Substack session cookie (substack.sid) for authenticated access " + "to paywalled content. WARNING: passing this on the command line " + "exposes it in shell history and process listings; prefer the " + "SUBSTACK_COOKIE environment variable instead." 
+ ), ) return parser.parse_args() @@ -820,17 +828,17 @@ def load_domains_from_file(file_path: str) -> List[str]: """Load domains from a text file (one domain per line).""" domains = [] try: - with open(file_path, 'r', encoding='utf-8') as f: + with open(file_path, encoding="utf-8") as f: for line in f: line = line.strip() # Skip empty lines and comments - if line and not line.startswith('#'): + if line and not line.startswith("#"): domains.append(line) return domains except FileNotFoundError: print(f"Warning: Domain file not found: {file_path}") return [] - except IOError as e: + except OSError as e: print(f"Warning: Error reading domain file: {e}") return [] @@ -846,7 +854,7 @@ def main(): # Handle skip_domains: 'none' means check all domains # Merge command-line domains with file domains - skip_domains = [] if args.skip_domains == ['none'] else list(args.skip_domains) + skip_domains = [] if args.skip_domains == ["none"] else list(args.skip_domains) if args.skip_domains_file: file_domains = load_domains_from_file(args.skip_domains_file) skip_domains.extend(file_domains) @@ -859,6 +867,10 @@ def main(): broken_domains.extend(file_domains) broken_domains = broken_domains if broken_domains else None + # Prefer SUBSTACK_COOKIE env var; --cookie takes precedence if both set + # so users can override an exported env var ad-hoc. 
+ cookie = args.cookie or os.environ.get("SUBSTACK_COOKIE") + checker = SubstackLinkChecker( base_url=args.base_url, timeout=args.timeout, @@ -867,7 +879,7 @@ def main(): verbose=args.verbose, skip_domains=skip_domains, broken_domains=broken_domains, - cookie=args.cookie + cookie=cookie, ) checker.run( @@ -876,9 +888,9 @@ def main(): output_file=args.output, url_file=args.url_file, history_file=args.history_file, - only_new=args.only_new + only_new=args.only_new, ) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..651f282 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,8 @@ +"""Pytest configuration: make the top-level scripts importable.""" + +import sys +from pathlib import Path + +ROOT = Path(__file__).resolve().parent.parent +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) diff --git a/tests/test_cookie_handling.py b/tests/test_cookie_handling.py new file mode 100644 index 0000000..30077ff --- /dev/null +++ b/tests/test_cookie_handling.py @@ -0,0 +1,128 @@ +"""Tests verifying safe cookie handling. + +These tests pin down the SECURITY.md promises: +- The cookie is scoped to .substack.com on the requests session. +- The cookie value never appears in any logged output (verbose or not). +- The async outbound link-check session is constructed without cookies. 
+""" + +from substack_link_checker import SubstackLinkChecker + +COOKIE_SENTINEL = "s%3Asentinelcookievalue1234567890.signaturepart" + + +def test_cookie_set_on_requests_session_with_substack_domain(): + checker = SubstackLinkChecker( + base_url="https://example.substack.com", + cookie=COOKIE_SENTINEL, + ) + jar = checker.session.cookies + matching = [c for c in jar if c.name == "substack.sid"] + assert len(matching) == 1 + assert matching[0].value == COOKIE_SENTINEL + assert matching[0].domain.endswith("substack.com") + + +def test_no_cookie_means_empty_jar(): + checker = SubstackLinkChecker(base_url="https://example.substack.com") + assert [c for c in checker.session.cookies if c.name == "substack.sid"] == [] + + +def test_cookie_not_emitted_by_log(capsys): + """`_log` with verbose=True must never include the cookie value.""" + checker = SubstackLinkChecker( + base_url="https://example.substack.com", + verbose=True, + cookie=COOKIE_SENTINEL, + ) + checker._log("processing https://example.substack.com/p/one") + checker._log("forced message", force=True) + out = capsys.readouterr().out + assert COOKIE_SENTINEL not in out + + +def test_cookie_not_emitted_when_saving_history(tmp_path): + """History JSON must not leak the cookie.""" + checker = SubstackLinkChecker( + base_url="https://example.substack.com", + cookie=COOKIE_SENTINEL, + ) + history = tmp_path / "history.json" + checker.load_history(str(history)) + checker.mark_post_checked("https://example.substack.com/p/one") + checker.save_history() + assert COOKIE_SENTINEL not in history.read_text(encoding="utf-8") + + +def test_repr_does_not_leak_cookie(): + """The default repr of the checker should not expose the cookie.""" + checker = SubstackLinkChecker( + base_url="https://example.substack.com", + cookie=COOKIE_SENTINEL, + ) + assert COOKIE_SENTINEL not in repr(checker) + + +def test_env_var_used_when_cli_flag_absent(monkeypatch, tmp_path): + """`main()` should read SUBSTACK_COOKIE when --cookie is not 
passed.""" + import substack_link_checker as m + + monkeypatch.setenv("SUBSTACK_COOKIE", COOKIE_SENTINEL) + monkeypatch.setattr( + "sys.argv", + [ + "substack_link_checker.py", + "--base-url", + "https://example.substack.com", + "--url-file", + str(tmp_path / "empty.txt"), + ], + ) + (tmp_path / "empty.txt").write_text("") + + captured = {} + + class FakeChecker: + def __init__(self, **kwargs): + captured.update(kwargs) + + def run(self, **kwargs): + pass + + monkeypatch.setattr(m, "SubstackLinkChecker", FakeChecker) + m.main() + + assert captured["cookie"] == COOKIE_SENTINEL + + +def test_cli_flag_overrides_env_var(monkeypatch, tmp_path): + import substack_link_checker as m + + monkeypatch.setenv("SUBSTACK_COOKIE", "env-value") + monkeypatch.setattr( + "sys.argv", + [ + "substack_link_checker.py", + "--base-url", + "https://example.substack.com", + "--url-file", + str(tmp_path / "empty.txt"), + "--cookie", + "cli-value", + ], + ) + (tmp_path / "empty.txt").write_text("") + + captured = {} + + class FakeChecker: + def __init__(self, **kwargs): + captured.update(kwargs) + + def run(self, **kwargs): + pass + + monkeypatch.setattr(m, "SubstackLinkChecker", FakeChecker) + m.main() + + assert captured["cookie"] == "cli-value" diff --git a/tests/test_domain_filtering.py b/tests/test_domain_filtering.py new file mode 100644 index 0000000..e1020c9 --- /dev/null +++ b/tests/test_domain_filtering.py @@ -0,0 +1,84 @@ +"""Tests for domain filtering helpers on SubstackLinkChecker.""" + +import pytest + +from substack_link_checker import SubstackLinkChecker + + +@pytest.fixture +def checker(): + return SubstackLinkChecker(base_url="https://example.substack.com") + + +class TestShouldSkipDomain: + def test_no_skip_domains_returns_false(self, checker): + assert checker.should_skip_domain("https://example.com/page") is False + + def test_exact_match(self): + c = SubstackLinkChecker( + base_url="https://example.substack.com", + skip_domains=["wikipedia.org"], + ) + assert 
c.should_skip_domain("https://wikipedia.org/wiki/Foo") is True + + def test_subdomain_match(self): + c = SubstackLinkChecker( + base_url="https://example.substack.com", + skip_domains=["wikipedia.org"], + ) + assert c.should_skip_domain("https://en.wikipedia.org/wiki/Foo") is True + + def test_non_match(self): + c = SubstackLinkChecker( + base_url="https://example.substack.com", + skip_domains=["wikipedia.org"], + ) + assert c.should_skip_domain("https://example.com/page") is False + + def test_lookalike_domain_not_matched(self): + """notwikipedia.org should NOT match wikipedia.org.""" + c = SubstackLinkChecker( + base_url="https://example.substack.com", + skip_domains=["wikipedia.org"], + ) + assert c.should_skip_domain("https://notwikipedia.org/foo") is False + + def test_case_insensitive(self): + c = SubstackLinkChecker( + base_url="https://example.substack.com", + skip_domains=["wikipedia.org"], + ) + assert c.should_skip_domain("https://Wikipedia.ORG/wiki/Foo") is True + + def test_malformed_url_returns_false(self): + c = SubstackLinkChecker( + base_url="https://example.substack.com", + skip_domains=["wikipedia.org"], + ) + assert c.should_skip_domain("not a url at all") is False + + +class TestIsBrokenDomain: + def test_no_broken_domains_returns_false(self, checker): + assert checker.is_broken_domain("https://example.com/page") is False + + def test_exact_match(self): + c = SubstackLinkChecker( + base_url="https://example.substack.com", + broken_domains=["defunct.example.com"], + ) + assert c.is_broken_domain("https://defunct.example.com/x") is True + + def test_subdomain_match(self): + c = SubstackLinkChecker( + base_url="https://example.substack.com", + broken_domains=["example.com"], + ) + assert c.is_broken_domain("https://sub.example.com/x") is True + + def test_non_match(self): + c = SubstackLinkChecker( + base_url="https://example.substack.com", + broken_domains=["defunct.example.com"], + ) + assert c.is_broken_domain("https://live.example.com/x") is 
False diff --git a/tests/test_history.py b/tests/test_history.py new file mode 100644 index 0000000..e4bac2a --- /dev/null +++ b/tests/test_history.py @@ -0,0 +1,58 @@ +"""Tests for the history-tracking persistence and filtering.""" + +import json + +from substack_link_checker import SubstackLinkChecker + + +def test_load_history_missing_file_starts_empty(tmp_path): + checker = SubstackLinkChecker(base_url="https://example.substack.com") + checker.load_history(str(tmp_path / "no-such-file.json")) + assert checker.checked_posts == {} + + +def test_save_and_reload_roundtrip(tmp_path): + history = tmp_path / "history.json" + + a = SubstackLinkChecker(base_url="https://example.substack.com") + a.load_history(str(history)) + a.mark_post_checked("https://example.substack.com/p/one") + a.mark_post_checked("https://example.substack.com/p/two") + a.save_history() + + on_disk = json.loads(history.read_text(encoding="utf-8")) + assert "last_updated" in on_disk + assert set(on_disk["checked_posts"].keys()) == { + "https://example.substack.com/p/one", + "https://example.substack.com/p/two", + } + + b = SubstackLinkChecker(base_url="https://example.substack.com") + b.load_history(str(history)) + assert set(b.checked_posts.keys()) == set(on_disk["checked_posts"].keys()) + + +def test_filter_unchecked_posts_skips_known_urls(tmp_path): + checker = SubstackLinkChecker(base_url="https://example.substack.com") + checker.checked_posts = {"https://example.substack.com/p/one": "2026-01-01T00:00:00"} + + inputs = [ + "https://example.substack.com/p/one", + "https://example.substack.com/p/two", + "https://example.substack.com/p/three", + ] + result = checker.filter_unchecked_posts(inputs) + assert result == [ + "https://example.substack.com/p/two", + "https://example.substack.com/p/three", + ] + assert checker.stats["posts_skipped"] == 1 + + +def test_load_history_corrupt_json_starts_empty(tmp_path, capsys): + history = tmp_path / "bad.json" + history.write_text("not valid json {") + 
checker = SubstackLinkChecker(base_url="https://example.substack.com") + checker.load_history(str(history)) + assert checker.checked_posts == {} + assert "Warning" in capsys.readouterr().out diff --git a/tests/test_load_domains_from_file.py b/tests/test_load_domains_from_file.py new file mode 100644 index 0000000..64a38f1 --- /dev/null +++ b/tests/test_load_domains_from_file.py @@ -0,0 +1,28 @@ +"""Tests for the load_domains_from_file helper.""" + +from substack_link_checker import load_domains_from_file + + +def test_loads_one_per_line(tmp_path): + p = tmp_path / "domains.txt" + p.write_text("example.com\nwikipedia.org\n") + assert load_domains_from_file(str(p)) == ["example.com", "wikipedia.org"] + + +def test_skips_blank_lines_and_comments(tmp_path): + p = tmp_path / "domains.txt" + p.write_text("# header comment\n\nexample.com\n \n# another comment\nwikipedia.org\n") + assert load_domains_from_file(str(p)) == ["example.com", "wikipedia.org"] + + +def test_strips_whitespace(tmp_path): + p = tmp_path / "domains.txt" + p.write_text(" example.com \n\twikipedia.org\t\n") + assert load_domains_from_file(str(p)) == ["example.com", "wikipedia.org"] + + +def test_missing_file_returns_empty_list(tmp_path, capsys): + result = load_domains_from_file(str(tmp_path / "does-not-exist.txt")) + assert result == [] + captured = capsys.readouterr() + assert "not found" in captured.out.lower() diff --git a/tests/test_report_generation.py b/tests/test_report_generation.py new file mode 100644 index 0000000..64f9991 --- /dev/null +++ b/tests/test_report_generation.py @@ -0,0 +1,61 @@ +"""Tests for CSV report generation.""" + +import csv + +from substack_link_checker import BrokenLinkRecord, SubstackLinkChecker + + +def test_empty_results_skips_file_creation(tmp_path, capsys): + checker = SubstackLinkChecker(base_url="https://example.substack.com") + output = tmp_path / "report.csv" + checker.generate_report(str(output)) + assert not output.exists(), "report file should not be created 
when there are no broken links" + assert "No broken links found" in capsys.readouterr().out + + +def test_writes_csv_with_expected_columns_and_rows(tmp_path): + checker = SubstackLinkChecker(base_url="https://example.substack.com") + checker.results = [ + BrokenLinkRecord( + post_title="My Post", + post_url="https://example.substack.com/p/my-post", + broken_link="https://defunct.example.com/x", + error_type="HTTP 404", + ), + BrokenLinkRecord( + post_title="Another, with comma", + post_url="https://example.substack.com/p/other", + broken_link="https://no-dns.invalid/y", + error_type="DNS Failure", + ), + ] + output = tmp_path / "report.csv" + checker.generate_report(str(output)) + + assert output.exists() + with output.open(newline="", encoding="utf-8") as f: + rows = list(csv.DictReader(f)) + + assert len(rows) == 2 + assert rows[0]["post_title"] == "My Post" + assert rows[0]["broken_link"] == "https://defunct.example.com/x" + assert rows[0]["error_type"] == "HTTP 404" + assert rows[1]["post_title"] == "Another, with comma" + assert rows[1]["error_type"] == "DNS Failure" + + +def test_csv_header_present(tmp_path): + checker = SubstackLinkChecker(base_url="https://example.substack.com") + checker.results = [ + BrokenLinkRecord( + post_title="t", + post_url="u", + broken_link="b", + error_type="HTTP 404", + ), + ] + output = tmp_path / "report.csv" + checker.generate_report(str(output)) + + first_line = output.read_text(encoding="utf-8").splitlines()[0] + assert first_line == "post_title,post_url,broken_link,error_type"