From 72e86dca3c1ec7c18b28a18b00b0e20edb6deac3 Mon Sep 17 00:00:00 2001 From: jichao wang Date: Sat, 27 Jun 2026 17:58:43 +0100 Subject: [PATCH] Fix ISO 8601 date pattern accepting impossible month/day values The "ISO 8601 datetime" pattern in DateRecognizer used `[01]\d` for the month and `[0-3]\d` for the day. These ranges admit impossible values: month `00` and `13`-`19`, and day `00` and `32`-`39`. As a result strings such as `2024-13-15T14:30:00Z` and `2024-12-32T14:30Z` were detected as DATE_TIME. Every other date pattern in this same file already constrains the month to `01`-`12` and the day to `01`-`31`; only the ISO 8601 pattern was loose. Tighten the ISO month/day fields to match (using non-capturing groups so existing capture-group positions are unaffected). No valid ISO 8601 datetime is lost, since those values are not valid dates to begin with. Adds parametrized cases for invalid month (00, 13) and day (00, 32). --- .../predefined_recognizers/generic/date_recognizer.py | 2 +- presidio-analyzer/tests/test_date_recognizer.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/generic/date_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/generic/date_recognizer.py index 08bc4aa1de..94a609155b 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/generic/date_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/generic/date_recognizer.py @@ -16,7 +16,7 @@ class DateRecognizer(PatternRecognizer): PATTERNS = [ Pattern( "ISO 8601 datetime", - r"\b(\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d:[0-5]\d\.\d+([+-][0-2]\d:[0-5]\d|Z))|(\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d:[0-5]\d([+-][0-2]\d:[0-5]\d|Z))|(\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d([+-][0-2]\d:[0-5]\d|Z))\b", + r"\b(\d{4}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])T[0-2]\d:[0-5]\d:[0-5]\d\.\d+([+-][0-2]\d:[0-5]\d|Z))|(\d{4}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])T[0-2]\d:[0-5]\d:[0-5]\d([+-][0-2]\d:[0-5]\d|Z))|(\d{4}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])T[0-2]\d:[0-5]\d([+-][0-2]\d:[0-5]\d|Z))\b", 0.8, ), Pattern( diff --git a/presidio-analyzer/tests/test_date_recognizer.py b/presidio-analyzer/tests/test_date_recognizer.py index 62a6c700d9..b9b458ad7d 100644 --- a/presidio-analyzer/tests/test_date_recognizer.py +++ b/presidio-analyzer/tests/test_date_recognizer.py @@ -49,6 +49,11 @@ def entities(): ("Today is 2024-03-15T14:30:00Z\r or not?", 1, ((9, 29),), ((0.6, 0.81),),), ("Today is 2024-03-15T14:30Z\n or not?", 1, ((9, 26),), ((0.6, 0.81),),), ("2024-03-15T14:30Z", 1, ((0, 17),), ((0.6, 1),),), + # Invalid ISO 8601 month/day values must not be detected as a date + ("2024-13-15T14:30:00Z", 0, (), (),), + ("2024-00-15T14:30:00Z", 0, (), (),), + ("2024-12-32T14:30Z", 0, (), (),), + ("2024-12-00T14:30Z", 0, (), (),), ("Today is2024-06-05T09:15:30.500-07:00", 0, (), (),), # Word boundary tests ("Today is5/21", 0, (), (),),