diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..7e0222f --- /dev/null +++ b/.flake8 @@ -0,0 +1,6 @@ +[flake8] +# Black owns formatting and line wrapping (88 cols). Long, unsplittable lines +# (inline HTML templates, URLs) are left to Black, so E501 is not enforced here; +# E203 and W503 are disabled for Black compatibility. +max-line-length = 88 +extend-ignore = E203, W503, E501 diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 0867c2b..650dcab 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -17,3 +17,12 @@ updates: labels: - "dependencies" - "docker" + + # Python (pinned runtime deps in requirements.txt and dev tools) + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "weekly" + labels: + - "dependencies" + - "python" diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 48b891b..2f06c17 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -52,14 +52,14 @@ jobs: - name: Run Flake8 (style guide) run: flake8 wanwatcher tests scripts --count --select=E9,F63,F7,F82 --show-source --statistics - - name: Run Flake8 (complexity) - run: flake8 wanwatcher tests scripts --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Run Flake8 (full, config-driven) + run: flake8 wanwatcher tests scripts --count --exit-zero --max-complexity=10 --statistics - name: Run Pylint - run: pylint wanwatcher --exit-zero --max-line-length=127 + run: pylint wanwatcher --exit-zero --max-line-length=88 - name: Run MyPy (type checking) - run: mypy wanwatcher --ignore-missing-imports --no-strict-optional + run: mypy wanwatcher --ignore-missing-imports # ============================================================================ # Security Scanning diff --git a/CHANGELOG.md b/CHANGELOG.md index 404570c..46a28f1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,33 @@ All notable changes to this project are documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [2.5.0] - 2026-06-13 + +Reliability and code-quality hardening from the comprehensive review. No +configuration changes are required. + +### Changed + +- `/healthz` now returns 503 when the monitor loop has gone stale (no successful + check within a generous multiple of `CHECK_INTERVAL`) instead of always 200, + so a wedged loop is reported unhealthy. `/api/status` gains + `seconds_since_last_check` and `check_interval`. +- Notification, DDNS, and MQTT failures during a check are now logged with stack + traces and no longer counted as check failures; only detection failures drive + outage detection and the adaptive backoff. +- The status snapshot served to the API is taken under a lock, preventing + inconsistent reads while the loop updates state. + +### Fixed + +- A failed geo lookup no longer overwrites previously known geographic data. + +### Internal + +- mypy now runs with strict optional checking (removed `--no-strict-optional`); + flake8 line length is aligned to 88 via a `.flake8` config; Dependabot now + tracks pip dependencies. + ## [2.4.1] - 2026-06-13 ### Security diff --git a/DOCKER_HUB_DESCRIPTION.md b/DOCKER_HUB_DESCRIPTION.md index d586e85..363b4d5 100644 --- a/DOCKER_HUB_DESCRIPTION.md +++ b/DOCKER_HUB_DESCRIPTION.md @@ -16,6 +16,7 @@ aimed at homelabs and small servers on connections where the ISP changes your IP ## Recent releases +- 2.5.0: reliability hardening (stuck-loop /healthz, isolated notifier failures, locked status reads). - 2.4.1: security fix - escape untrusted geo/release-note strings in notifications. - 2.3.0: AWS Route53 DDNS provider (SigV4, no AWS SDK bundled). - 2.2.0: secrets from files (`_FILE`), plus Trivy scanning, CycloneDX SBOM and Cosign keyless image signing. @@ -67,8 +68,8 @@ Operations | Architecture | Tags | Status | |--------------|------|--------| -| x86-64 (AMD64) | `latest`, `2.4.1` | Supported | -| ARM64 (aarch64) | `latest`, `2.4.1` | Supported | +| x86-64 (AMD64) | `latest`, `2.5.0` | Supported | +| ARM64 (aarch64) | `latest`, `2.5.0` | Supported | Docker pulls the correct image for your platform automatically. ARM64 covers Raspberry Pi 4 and newer, Apple Silicon, and AWS Graviton. @@ -95,7 +96,7 @@ docker run -d \ -e SERVER_NAME="My Server" \ -v ./data:/data \ -v ./logs:/logs \ - noxied/wanwatcher:2.4.1 + noxied/wanwatcher:2.5.0 ``` ### docker compose @@ -103,7 +104,7 @@ docker run -d \ ```yaml services: wanwatcher: - image: noxied/wanwatcher:2.4.1 + image: noxied/wanwatcher:2.5.0 container_name: wanwatcher restart: unless-stopped environment: @@ -345,7 +346,7 @@ docker buildx build \ | Tag | Meaning | |-----|---------| -| `2.4.1` | This exact release | +| `2.5.0` | This exact release | | `2.0` | Latest 2.0.x patch | | `2` | Latest 2.x release | | `latest` | Latest stable release | @@ -356,6 +357,7 @@ docker buildx build \ | Version | Date | Highlights | |---------|------|------------| +| 2.5.0 | 2026-06-13 | Reliability hardening (/healthz staleness, isolated side-effect failures) | | 2.4.1 | 2026-06-13 | Security: escape untrusted strings in notifications | | 2.3.0 | 2026-06-13 | AWS Route53 DDNS provider | | 2.2.0 | 2026-06-13 | Secrets from files, Trivy/SBOM/Cosign supply-chain security | diff --git a/Dockerfile b/Dockerfile index e68130b..bec6c1f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,10 +2,10 @@ FROM python:3.14-slim LABEL maintainer="noxied" LABEL description="WAN IP monitoring with notifications, DDNS updates and Home Assistant integration" -LABEL version="2.4.1" +LABEL version="2.5.0" LABEL org.opencontainers.image.title="WANwatcher" LABEL org.opencontainers.image.description="Monitor WAN IPv4/IPv6 addresses with notifications, DDNS and MQTT" -LABEL org.opencontainers.image.version="2.4.1" +LABEL org.opencontainers.image.version="2.5.0" LABEL org.opencontainers.image.authors="noxied" LABEL org.opencontainers.image.url="https://github.com/noxied/wanwatcher" LABEL org.opencontainers.image.source="https://github.com/noxied/wanwatcher" diff --git a/README.md b/README.md index 2c91a1a..9437467 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ docker run -d \ -e SERVER_NAME="My Server" \ -v ./data:/data \ -v ./logs:/logs \ - noxied/wanwatcher:2.4.1 + noxied/wanwatcher:2.5.0 ``` Or with compose: @@ -52,7 +52,7 @@ Or with compose: ```yaml services: wanwatcher: - image: noxied/wanwatcher:2.4.1 + image: noxied/wanwatcher:2.5.0 container_name: wanwatcher restart: unless-stopped environment: @@ -214,7 +214,7 @@ Supported: `DISCORD_WEBHOOK_URL_FILE`, `TELEGRAM_BOT_TOKEN_FILE`, ```yaml services: wanwatcher: - image: noxied/wanwatcher:2.4.1 + image: noxied/wanwatcher:2.5.0 environment: DISCORD_ENABLED: "true" DISCORD_WEBHOOK_URL_FILE: /run/secrets/discord_webhook @@ -316,8 +316,8 @@ ROUTE53_TTL: "300" Set `API_ENABLED=true` and publish the port (`-p 8080:8080`). Endpoints: -- `GET /healthz` returns `{"status": "ok", ...}` when the loop is healthy -- `GET /api/status` returns the full state: current IPs, last check, last change, uptime +- `GET /healthz` returns `{"status": "ok", ...}` (200) when the loop is healthy, or `{"status": "stale", ...}` (503) if no successful check has happened within a generous multiple of `CHECK_INTERVAL`, so a wedged loop is detectable +- `GET /api/status` returns the full state: current IPs, last check, `seconds_since_last_check`, `check_interval`, last change, uptime, and recent change history - `GET /metrics` returns Prometheus metrics ```bash @@ -325,6 +325,8 @@ curl http://localhost:8080/api/status curl http://localhost:8080/metrics ``` +Geographic data in `/api/status` (and over MQTT) reflects the most recent IP change, since the lookup only runs when the address changes; it is null until the first change is recorded. + Exported metrics include `wanwatcher_checks_total`, `wanwatcher_check_failures_total`, `wanwatcher_ip_changes_total`, `wanwatcher_notifications_total`, `wanwatcher_ddns_updates_total`, `wanwatcher_last_change_timestamp_seconds`, `wanwatcher_last_check_timestamp_seconds`, and `wanwatcher_up`. A Prometheus scrape job pointed at `wanwatcher:8080` works as-is; no extra exporter needed. When the API is enabled, the container healthcheck queries `/healthz`. Otherwise it verifies that the state file exists, is valid JSON, and was refreshed recently. diff --git a/SECURITY.md b/SECURITY.md index b7c83b0..dcafc7b 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -42,7 +42,7 @@ Environment variables: ```bash export DISCORD_WEBHOOK_URL="your_webhook" -docker run -e DISCORD_WEBHOOK_URL ... noxied/wanwatcher:2.4.1 +docker run -e DISCORD_WEBHOOK_URL ... noxied/wanwatcher:2.5.0 ``` A `.env` file (add it to `.gitignore`): @@ -71,7 +71,7 @@ Published images are signed with Cosign using keyless signing. You can verify that an image came from this repository's CI before running it: ```bash -cosign verify noxied/wanwatcher:2.4.1 \ +cosign verify noxied/wanwatcher:2.5.0 \ --certificate-identity-regexp "https://github.com/noxied/wanwatcher/.*" \ --certificate-oidc-issuer https://token.actions.githubusercontent.com ``` @@ -105,7 +105,7 @@ Reasonable extras for the paranoid: ```yaml services: wanwatcher: - image: noxied/wanwatcher:2.4.1 # pin a version, avoid :latest + image: noxied/wanwatcher:2.5.0 # pin a version, avoid :latest security_opt: - no-new-privileges:true deploy: diff --git a/UPGRADING.md b/UPGRADING.md index 4d4769b..fb1d749 100644 --- a/UPGRADING.md +++ b/UPGRADING.md @@ -2,6 +2,26 @@ Version-specific upgrade notes. The newest upgrade path is at the top. +## 2.4.x to 2.5.0 + +No configuration changes. Pull the new image and restart: + +```bash +docker compose pull +docker compose up -d +``` + +What changed: + +- `/healthz` now reports 503 when the monitoring loop is stale (no successful + check within a generous multiple of `CHECK_INTERVAL`). If you run with + `API_ENABLED=true`, the container healthcheck will now correctly mark a wedged + loop as unhealthy instead of healthy. `/api/status` adds + `seconds_since_last_check` and `check_interval`. +- Notifier, DDNS, and MQTT errors are no longer counted as check failures, so + `wanwatcher_check_failures_total` now reflects only IP-detection failures. +- Everything else is internal hardening; there is nothing to change. + ## 2.4.0 to 2.4.1 A security patch with no configuration changes. Pull the new image and restart: diff --git a/docker-compose.yml b/docker-compose.yml index da5ef65..cbdd7c5 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,6 +1,6 @@ services: wanwatcher: - image: noxied/wanwatcher:2.4.1 + image: noxied/wanwatcher:2.5.0 container_name: wanwatcher restart: unless-stopped diff --git a/docs/TROUBLESHOOTING.md b/docs/TROUBLESHOOTING.md index 1e0e9f1..81c82b5 100644 --- a/docs/TROUBLESHOOTING.md +++ b/docs/TROUBLESHOOTING.md @@ -293,7 +293,7 @@ creating datasets for `/data` and `/logs`. Synology: create the shared folders in File Station first, give uid 1000 write access, and use absolute paths in Container Manager. -Raspberry Pi: the published image is multi-arch; `noxied/wanwatcher:2.4.1` +Raspberry Pi: the published image is multi-arch; `noxied/wanwatcher:2.5.0` pulls the ARM64 variant automatically on a 64-bit OS. 32-bit ARM is not published; build locally if you need it. diff --git a/tests/test_api.py b/tests/test_api.py index 4dfb17f..146b4c9 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -125,3 +125,54 @@ def test_stop_after_failed_start_is_safe(): srv.stop() # must not raise finally: blocker.close() + + +# -- /healthz staleness ------------------------------------------------------- + + +def _serve(provider): + port = _free_port() + srv = StatusServer( + APIConfig(enabled=True, bind="127.0.0.1", port=port), + status_provider=provider, + metrics=Metrics(), + ) + srv.start() + time.sleep(0.3) + return srv, port + + +def test_healthz_ok_when_fresh(): + srv, port = _serve( + lambda: {"seconds_since_last_check": 12.0, "check_interval": 900} + ) + try: + status, _, body = _get(port, "/healthz") + assert status == 200 + assert json.loads(body)["status"] == "ok" + finally: + srv.stop() + + +def test_healthz_ok_before_first_check(): + srv, port = _serve( + lambda: {"seconds_since_last_check": None, "check_interval": 900} + ) + try: + status, _, _ = _get(port, "/healthz") + assert status == 200 + finally: + srv.stop() + + +def test_healthz_stale_returns_503(): + srv, port = _serve( + lambda: {"seconds_since_last_check": 999999, "check_interval": 900} + ) + try: + with pytest.raises(urllib.error.HTTPError) as exc: + _get(port, "/healthz") + assert exc.value.code == 503 + assert json.loads(exc.value.read())["status"] == "stale" + finally: + srv.stop() diff --git a/tests/test_app.py b/tests/test_app.py index 4cb6931..715b379 100644 --- a/tests/test_app.py +++ b/tests/test_app.py @@ -152,3 +152,68 @@ def test_heartbeat_emitted_when_due(config): app.maybe_heartbeat() app.notifications.notify_event.assert_called_once() assert app.notifications.notify_event.call_args.args[0] == "Heartbeat" + + +def test_status_snapshot_has_freshness_fields(app): + app.detector.get_ipv4.return_value = "1.2.3.4" + app.check_ip() + snap = app.status_snapshot() + assert snap["check_interval"] == app.config.check_interval + assert isinstance(snap["seconds_since_last_check"], float) + assert snap["seconds_since_last_check"] >= 0 + + +def test_notification_exception_is_isolated(app): + # A notifier blowing up must not count as a check failure or trigger outage. + app.detector.get_ipv4.return_value = "1.2.3.4" + app.notifications.send_to_all.side_effect = RuntimeError("notifier exploded") + result = app.check_ip() + assert result is True # detection succeeded + assert app.consecutive_failures == 0 # no false outage + counters = app.metrics._counters + failures = sum( + v for (n, _), v in counters.items() if n == "wanwatcher_check_failures_total" + ) + assert failures == 0 + + +def test_geo_not_clobbered_by_failed_lookup(app, monkeypatch): + app.detector.get_ipv4.return_value = "1.2.3.4" + monkeypatch.setattr( + "wanwatcher.app.get_geo_data", lambda *a, **k: {"city": "Lisbon"} + ) + app.check_ip() # first run, geo set + assert app.geo_data == {"city": "Lisbon"} + # A later change whose geo lookup fails must keep the previous geo. + monkeypatch.setattr("wanwatcher.app.get_geo_data", lambda *a, **k: None) + app.detector.get_ipv4.return_value = "5.6.7.8" + app.check_ip() + assert app.geo_data == {"city": "Lisbon"} + + +def test_status_snapshot_concurrent_with_check_is_consistent(app): + import threading + + app.detector.get_ipv4.return_value = "1.2.3.4" + app.check_ip() + errors = [] + + def reader(): + for _ in range(100): + try: + snap = app.status_snapshot() + assert isinstance(snap["history"], list) + except Exception as exc: # noqa: BLE001 + errors.append(exc) + + def writer(): + for i in range(100): + app.detector.get_ipv4.return_value = f"1.2.3.{i % 254 + 1}" + app.check_ip() + + t1, t2 = threading.Thread(target=reader), threading.Thread(target=writer) + t1.start() + t2.start() + t1.join(5) + t2.join(5) + assert not errors diff --git a/tests/test_notifier_escaping.py b/tests/test_notifier_escaping.py index 4385253..d478ee8 100644 --- a/tests/test_notifier_escaping.py +++ b/tests/test_notifier_escaping.py @@ -9,8 +9,6 @@ from email.message import Message from unittest.mock import Mock, patch -import pytest - from wanwatcher.notifiers import DiscordNotifier, EmailNotifier, TelegramNotifier from wanwatcher.notifiers._escape import ( discord_escape, diff --git a/wanwatcher/__init__.py b/wanwatcher/__init__.py index 1eb25c3..abb61f1 100644 --- a/wanwatcher/__init__.py +++ b/wanwatcher/__init__.py @@ -1,5 +1,5 @@ """WANwatcher - WAN IP address monitor with notifications, DDNS and integrations.""" -VERSION = "2.4.1" +VERSION = "2.5.0" __version__ = VERSION diff --git a/wanwatcher/api.py b/wanwatcher/api.py index 35b24f1..ca69d38 100644 --- a/wanwatcher/api.py +++ b/wanwatcher/api.py @@ -77,14 +77,22 @@ def _handle_healthz(self) -> None: logger.exception("Status provider failed for /healthz") self._send_json(500, {"status": "error"}) return - self._send_json( - 200, - { - "status": "ok", - "last_check": status.get("last_check"), - "uptime_seconds": status.get("uptime_seconds"), - }, - ) + + # Liveness: a stuck loop stops refreshing last_check. Once at least one + # check has run, flag the loop as stale (503) when the gap since the last + # successful check exceeds a generous multiple of the check interval. + age = status.get("seconds_since_last_check") + interval = status.get("check_interval") or 0 + threshold = max(3 * interval, 1800) + 120 + stale = age is not None and age > threshold + + payload = { + "status": "stale" if stale else "ok", + "last_check": status.get("last_check"), + "seconds_since_last_check": age, + "uptime_seconds": status.get("uptime_seconds"), + } + self._send_json(503 if stale else 200, payload) def _handle_status(self) -> None: try: diff --git a/wanwatcher/app.py b/wanwatcher/app.py index f62ac94..0efd8c7 100644 --- a/wanwatcher/app.py +++ b/wanwatcher/app.py @@ -8,7 +8,7 @@ import threading import time from datetime import datetime, timezone -from typing import Any, Dict, Optional +from typing import TYPE_CHECKING, Any, Dict, Optional from wanwatcher import VERSION from wanwatcher.config import Config, SecretFileError @@ -20,6 +20,11 @@ from wanwatcher.state import State, StateStore from wanwatcher.updates import check_for_updates +if TYPE_CHECKING: + from wanwatcher.api import StatusServer + from wanwatcher.ddns.base import DDNSClient + from wanwatcher.mqtt import MQTTPublisher + logger = logging.getLogger(__name__) LEGACY_UPDATE_NOTIFIED_FILE = "/data/update_notified.txt" @@ -49,6 +54,8 @@ def __init__(self, config: Config): self.state: State = State() self.notifications = build_manager(config) self.shutdown_event = threading.Event() + # Guards the shared state read by the API thread and written by the loop. + self._lock = threading.Lock() self.check_count = 0 self.consecutive_failures = 0 self.outage_since: Optional[float] = None @@ -56,9 +63,10 @@ def __init__(self, config: Config): self.last_update_check = time.time() self.last_heartbeat = time.time() self.last_check_at: Optional[str] = None + self.last_check_ts: Optional[float] = None self.geo_data: Optional[Dict[str, Any]] = None - self.ddns_client = None + self.ddns_client: Optional["DDNSClient"] = None if config.ddns.enabled: from wanwatcher.ddns import build_ddns_client @@ -66,19 +74,19 @@ def __init__(self, config: Config): config.ddns, timeout=config.http_timeout, metrics=self.metrics ) - self.api_server = None + self.api_server: Optional["StatusServer"] = None if config.api.enabled: - from wanwatcher.api import StatusServer + from wanwatcher import api as _api - self.api_server = StatusServer( + self.api_server = _api.StatusServer( config.api, status_provider=self.status_snapshot, metrics=self.metrics ) - self.mqtt = None + self.mqtt: Optional["MQTTPublisher"] = None if config.mqtt.enabled: - from wanwatcher.mqtt import MQTTPublisher + from wanwatcher import mqtt as _mqtt - self.mqtt = MQTTPublisher(config.mqtt, server_name=config.server_name) + self.mqtt = _mqtt.MQTTPublisher(config.mqtt, server_name=config.server_name) # -- signals ----------------------------------------------------------- @@ -96,28 +104,42 @@ def handler(signum, _frame): # -- status for the API ------------------------------------------------ def status_snapshot(self) -> Dict[str, Any]: - return { - "version": VERSION, - "server_name": self.config.server_name, - "ipv4": self.state.ipv4, - "ipv6": self.state.ipv6, - "geo": self.geo_data, - "last_check": self.last_check_at, - "last_change": self.state.last_change, - "checks_performed": self.check_count, - "uptime_seconds": round(time.time() - self.metrics.started_at), - "outage": self.outage_since is not None, - "history": self.state.history, - "notifiers": [p.__class__.__name__ for p in self.notifications.providers], - "ddns_enabled": self.ddns_client is not None, - "mqtt_enabled": self.mqtt is not None, - } + now = time.time() + with self._lock: + seconds_since_last_check: Optional[float] = ( + round(now - self.last_check_ts, 1) + if self.last_check_ts is not None + else None + ) + return { + "version": VERSION, + "server_name": self.config.server_name, + "ipv4": self.state.ipv4, + "ipv6": self.state.ipv6, + "geo": self.geo_data, + "last_check": self.last_check_at, + "seconds_since_last_check": seconds_since_last_check, + "check_interval": self.config.check_interval, + "last_change": self.state.last_change, + "checks_performed": self.check_count, + "uptime_seconds": round(now - self.metrics.started_at), + "outage": self.outage_since is not None, + "history": list(self.state.history), + "notifiers": [ + p.__class__.__name__ for p in self.notifications.providers + ], + "ddns_enabled": self.ddns_client is not None, + "mqtt_enabled": self.mqtt is not None, + } # -- core check -------------------------------------------------------- def check_ip(self) -> bool: self.check_count += 1 self.metrics.inc("wanwatcher_checks_total") + + # 1. Detection. Only a failure here counts as a check failure and feeds + # outage detection / adaptive backoff. try: current_ipv4 = ( self.detector.get_ipv4(previous=self.state.ipv4) @@ -129,67 +151,85 @@ def check_ip(self) -> bool: if self.config.monitor_ipv6 else None ) + except Exception as exc: # noqa: BLE001 - detection must not crash the loop + logger.error("Error during IP detection: %s", exc, exc_info=True) + self.metrics.inc("wanwatcher_check_failures_total") + self._handle_check_failure() + return False - expected_any = self.config.monitor_ipv4 or self.config.monitor_ipv6 - if expected_any and current_ipv4 is None and current_ipv6 is None: - self.metrics.inc("wanwatcher_check_failures_total") - self._handle_check_failure() - return False + expected_any = self.config.monitor_ipv4 or self.config.monitor_ipv6 + if expected_any and current_ipv4 is None and current_ipv6 is None: + self.metrics.inc("wanwatcher_check_failures_total") + self._handle_check_failure() + return False - self._handle_check_success() - self.last_check_at = datetime.now(timezone.utc).isoformat() - self.metrics.set_gauge( - "wanwatcher_last_check_timestamp_seconds", time.time() + self._handle_check_success() + + # 2. Decide what changed. A None from the detector while monitoring is + # enabled means "could not determine", not "address gone". + is_first_run = self.state.is_empty() + old_ipv4, old_ipv6 = self.state.ipv4, self.state.ipv6 + ipv4_changed = self.config.monitor_ipv4 and current_ipv4 != old_ipv4 + ipv6_changed = self.config.monitor_ipv6 and current_ipv6 != old_ipv6 + if self.config.monitor_ipv4 and current_ipv4 is None: + ipv4_changed = False + current_ipv4 = old_ipv4 + if self.config.monitor_ipv6 and current_ipv6 is None: + ipv6_changed = False + current_ipv6 = old_ipv6 + + current_ips = {"ipv4": current_ipv4, "ipv6": current_ipv6} + previous_ips = {"ipv4": old_ipv4, "ipv6": old_ipv6} + logger.info("Current IPv4: %s, IPv6: %s", current_ipv4, current_ipv6) + changed = is_first_run or ipv4_changed or ipv6_changed + + # 3. Geo lookup (network) happens outside the state lock, only on change. + new_geo: Optional[Dict[str, Any]] = None + if changed: + if is_first_run: + logger.info("First run detected, recording initial addresses") + else: + logger.warning("IP ADDRESS CHANGE DETECTED") + if ipv4_changed: + logger.warning(" IPv4: %s -> %s", old_ipv4, current_ipv4) + self.metrics.inc("wanwatcher_ip_changes_total", {"family": "ipv4"}) + if ipv6_changed: + logger.warning(" IPv6: %s -> %s", old_ipv6, current_ipv6) + self.metrics.inc("wanwatcher_ip_changes_total", {"family": "ipv6"}) + self.metrics.set_gauge( + "wanwatcher_last_change_timestamp_seconds", time.time() + ) + new_geo = get_geo_data( + self.config.ipinfo_token, timeout=self.config.http_timeout ) - is_first_run = self.state.is_empty() - old_ipv4, old_ipv6 = self.state.ipv4, self.state.ipv6 - ipv4_changed = self.config.monitor_ipv4 and current_ipv4 != old_ipv4 - ipv6_changed = self.config.monitor_ipv6 and current_ipv6 != old_ipv6 - - # A None from the detector while monitoring is enabled means - # "could not determine", not "address gone"; keep the stored value. - if self.config.monitor_ipv4 and current_ipv4 is None: - ipv4_changed = False - current_ipv4 = old_ipv4 - if self.config.monitor_ipv6 and current_ipv6 is None: - ipv6_changed = False - current_ipv6 = old_ipv6 - - current_ips = {"ipv4": current_ipv4, "ipv6": current_ipv6} - previous_ips = {"ipv4": old_ipv4, "ipv6": old_ipv6} - - logger.info("Current IPv4: %s, IPv6: %s", current_ipv4, current_ipv6) - - if is_first_run or ipv4_changed or ipv6_changed: - if is_first_run: - logger.info("First run detected, recording initial addresses") + # 4. Persist and update the shared state under the lock. The state file + # is rewritten every check so its mtime tells the healthcheck the + # loop is alive. + now = time.time() + try: + with self._lock: + self.last_check_at = datetime.now(timezone.utc).isoformat() + self.last_check_ts = now + self.metrics.set_gauge("wanwatcher_last_check_timestamp_seconds", now) + if changed: + # Do not clobber good geo data with a failed lookup. + if new_geo is not None: + self.geo_data = new_geo + self.state.ipv4 = current_ipv4 + self.state.ipv6 = current_ipv6 + if not is_first_run: + self.store.record_change(self.state, old_ipv4, old_ipv6) else: - logger.warning("IP ADDRESS CHANGE DETECTED") - if ipv4_changed: - logger.warning(" IPv4: %s -> %s", old_ipv4, current_ipv4) - self.metrics.inc( - "wanwatcher_ip_changes_total", {"family": "ipv4"} - ) - if ipv6_changed: - logger.warning(" IPv6: %s -> %s", old_ipv6, current_ipv6) - self.metrics.inc( - "wanwatcher_ip_changes_total", {"family": "ipv6"} - ) - self.metrics.set_gauge( - "wanwatcher_last_change_timestamp_seconds", time.time() - ) - - self.geo_data = get_geo_data( - self.config.ipinfo_token, timeout=self.config.http_timeout - ) - - self.state.ipv4 = current_ipv4 - self.state.ipv6 = current_ipv6 - if not is_first_run: - self.store.record_change(self.state, old_ipv4, old_ipv6) + logger.info("No IP address changes detected") self.store.save(self.state) + except OSError as exc: + logger.error("Failed to persist state: %s", exc, exc_info=True) + # 5. Side effects (network). Each is isolated so a failure is logged but + # never counted as a check failure nor blocks the others. + if changed: + try: results = self.notifications.send_to_all( current_ips, previous_ips, @@ -203,26 +243,24 @@ def check_ip(self) -> bool: "wanwatcher_notifications_total", {"provider": provider, "result": "ok" if ok else "error"}, ) - else: - logger.info("No IP address changes detected") - # Refresh the state file timestamp so the container healthcheck - # can tell a live loop from a stuck one. - self.store.save(self.state) + except Exception as exc: # noqa: BLE001 + logger.error("Notification dispatch failed: %s", exc, exc_info=True) - if self.ddns_client is not None: + if self.ddns_client is not None: + try: self.ddns_client.update(current_ipv4, current_ipv6) + except Exception as exc: # noqa: BLE001 + logger.error("DDNS update failed: %s", exc, exc_info=True) - if self.mqtt is not None: + if self.mqtt is not None: + try: self.mqtt.publish_state( current_ipv4, current_ipv6, self.geo_data, self.state.last_change ) + except Exception as exc: # noqa: BLE001 + logger.error("MQTT publish failed: %s", exc, exc_info=True) - return True - - except Exception as exc: # noqa: BLE001 - the loop must survive anything - logger.error("Error during IP check: %s", exc, exc_info=True) - self.metrics.inc("wanwatcher_check_failures_total") - return False + return True # -- outage tracking ---------------------------------------------------- diff --git a/wanwatcher/metrics.py b/wanwatcher/metrics.py index f8b1f96..cd96175 100644 --- a/wanwatcher/metrics.py +++ b/wanwatcher/metrics.py @@ -6,7 +6,7 @@ import threading import time -from typing import Dict, Tuple +from typing import Dict, Optional, Tuple class Metrics: @@ -61,13 +61,17 @@ def _key( ) -> Tuple[str, Tuple[Tuple[str, str], ...]]: return (name, tuple(sorted(labels.items()))) - def inc(self, name: str, labels: Dict[str, str] = None, value: float = 1) -> None: + def inc( + self, name: str, labels: Optional[Dict[str, str]] = None, value: float = 1 + ) -> None: labels = labels or {} with self._lock: key = self._key(name, labels) self._counters[key] = self._counters.get(key, 0) + value - def set_gauge(self, name: str, value: float, labels: Dict[str, str] = None) -> None: + def set_gauge( + self, name: str, value: float, labels: Optional[Dict[str, str]] = None + ) -> None: labels = labels or {} with self._lock: self._gauges[self._key(name, labels)] = value