Add a literal-prefix-capture fast path to the meta regex engine.

Patterns of the shape `^PREFIX([^X]+)X.*$` are answered with `memchr`
instead of the full capture engine, `Input` learns to remember that a
haystack is already valid UTF-8, and `replacen` skips `Captures::expand`
when the replacement is exactly one numeric capture reference.

diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs
index ebb876b2b8..17a525b13a 100644
--- a/regex-automata/src/meta/strategy.rs
+++ b/regex-automata/src/meta/strategy.rs
@@ -3,9 +3,9 @@ use core::{
     panic::{RefUnwindSafe, UnwindSafe},
 };
 
-use alloc::sync::Arc;
+use alloc::{boxed::Box, sync::Arc, vec, vec::Vec};
 
-use regex_syntax::hir::{literal, Hir};
+use regex_syntax::hir::{literal, Class, Hir, HirKind, Literal, Look};
 
 use crate::{
     meta::{
@@ -160,6 +160,13 @@ pub(super) fn new(
     // might give up or quit for reasons. If we had, e.g., a PikeVM that
     // supported reverse searching, then we could avoid building a full Core
     // engine for this case.
+    core = match LiteralPrefixCapture::new(core, hirs) {
+        Err(core) => core,
+        Ok(lpc) => {
+            debug!("using literal-prefix-capture strategy");
+            return Ok(Arc::new(lpc));
+        }
+    };
     core = match ReverseAnchored::new(core) {
         Err(core) => core,
         Ok(ra) => {
@@ -1903,3 +1910,442 @@ fn copy_match_to_slots(m: Match, slots: &mut [Option<NonMaxUsize>]) {
         *slot = NonMaxUsize::new(m.end());
     }
 }
+
+/// A specialized strategy for anchored, fully-bounded regexes of the form
+///
+/// ```text
+/// ^PREFIX([^X]+)X.*$
+/// ```
+///
+/// where `PREFIX` reduces to a finite set of literal byte alternatives,
+/// the capture is a greedy `[^X]+` for a single ASCII byte X, and the trailing
+/// `.*$` is the standard "rest of line, then end of haystack" tail. The
+/// motivating instance is the ClickBench Q28 pattern
+/// `^https?://(?:www\.)?([^/]+)/.*$` -> `${1}`, but the recognizer applies to
+/// any pattern of this shape (single-literal prefixes, alternation, and
+/// `?`-optional segments).
+///
+/// For inputs that match, capture 1's bounds are structurally trivial — skip
+/// the prefix, find the terminator with `memchr` — so we can avoid the full
+/// engine's capture-tracking entirely. For inputs that don't match (e.g., a
+/// newline in the tail breaks `.*$`, or no prefix matches), we report no
+/// match: that result is identical to what the full engine would compute, so
+/// no fallback is required.
+#[derive(Debug)]
+struct LiteralPrefixCapture {
+    core: Core,
+    /// Distinct literal byte prefixes in regex-priority order. Bounded to
+    /// `MAX_PREFIX_VARIANTS` at construction time.
+    prefixes: Box<[Box<[u8]>]>,
+    /// Single ASCII byte ending the capture (also the literal that must
+    /// follow the capture in the original regex).
+    terminator: u8,
+    /// Whether the capture/tail classes were Unicode classes. When true,
+    /// everything after the literal prefix must be valid UTF-8 to match.
+    requires_valid_utf8: bool,
+}
+
+/// Each `(?:...)?` doubles the count and each `(a|b|c)` multiplies it,
+/// so this caps the explosion for adversarial patterns. 32 leaves room
+/// for three more optional segments beyond Q28's 4 variants while keeping
+/// the linear scan over candidate prefixes trivially cheap.
+const MAX_PREFIX_VARIANTS: usize = 32;
+
+impl LiteralPrefixCapture {
+    fn new(core: Core, hirs: &[&Hir]) -> Result<LiteralPrefixCapture, Core> {
+        if hirs.len() != 1 {
+            return Err(core);
+        }
+        if core.info.config().get_match_kind() != MatchKind::LeftmostFirst {
+            return Err(core);
+        }
+        if !core.info.is_always_anchored_start()
+            || !core.info.is_always_anchored_end()
+        {
+            return Err(core);
+        }
+        // `.*$` excludes the line terminator; the runtime newline check
+        // hard-codes `b'\n'`, so reject non-default line terminators.
+        if core.info.config().get_line_terminator() != b'\n' {
+            return Err(core);
+        }
+        let allow_unicode_classes = core.info.config().get_utf8_empty();
+        let Some((prefixes, terminator, requires_valid_utf8)) =
+            try_recognize_prefix_capture(hirs[0], allow_unicode_classes)
+        else {
+            return Err(core);
+        };
+        Ok(LiteralPrefixCapture {
+            core,
+            prefixes,
+            terminator,
+            requires_valid_utf8,
+        })
+    }
+
+    /// Returns capture 1's byte offsets if the input matches, else `None`.
+    /// The overall match always spans `0..input.haystack().len()` because
+    /// the regex is `^...$`.
+    ///
+    /// Prefixes are tried in regex-priority order; a prefix that cannot be
+    /// extended to a full match is abandoned in favor of the next one, which
+    /// mirrors how the full engine would backtrack.
+    #[cfg_attr(feature = "perf-inline", inline(always))]
+    fn try_fast_match(&self, input: &Input<'_>) -> Option<(usize, usize)> {
+        if let Some(pid) = input.get_anchored().pattern() {
+            if pid != PatternID::ZERO {
+                return None;
+            }
+        }
+        // `^` and `$` only match at the haystack's (not the span's) edges,
+        // so any narrower span can never match.
+        if input.start() != 0 || input.end() != input.haystack().len() {
+            return None;
+        }
+        let bytes = input.haystack();
+        for prefix in self.prefixes.iter() {
+            if !bytes.starts_with(prefix) {
+                continue;
+            }
+            let cap_start = prefix.len();
+            // `[^X]+` is greedy but can never cross an X, so the capture
+            // always ends at the first terminator after the prefix.
+            let Some(off) = crate::util::memchr::memchr(
+                self.terminator,
+                &bytes[cap_start..],
+            ) else {
+                continue;
+            };
+            if off == 0 {
+                // `[^X]+` requires >= 1 byte; try the next possible prefix.
+                continue;
+            }
+            let cap_end = cap_start + off;
+            // Anything past the terminator must also be `\n`-free for
+            // `.*$` to reach end-of-haystack.
+            if crate::util::memchr::memchr(b'\n', &bytes[cap_end + 1..])
+                .is_some()
+            {
+                continue;
+            }
+            // Unicode classes only consume whole codepoints, so everything
+            // they match (the capture and the tail) must be valid UTF-8.
+            // The literal prefix is matched bytewise and is deliberately
+            // excluded: it may legitimately contain non-UTF-8 bytes.
+            if self.requires_valid_utf8 {
+                let rest_is_utf8 = if input.haystack_is_known_valid_utf8() {
+                    // The whole haystack is valid, so the remainder is
+                    // valid iff it starts on a codepoint boundary.
+                    input.is_char_boundary(cap_start)
+                } else {
+                    core::str::from_utf8(&bytes[cap_start..]).is_ok()
+                };
+                if !rest_is_utf8 {
+                    continue;
+                }
+            }
+            return Some((cap_start, cap_end));
+        }
+        None
+    }
+}
+
+impl Strategy for LiteralPrefixCapture {
+    #[cfg_attr(feature = "perf-inline", inline(always))]
+    fn group_info(&self) -> &GroupInfo {
+        self.core.group_info()
+    }
+
+    #[cfg_attr(feature = "perf-inline", inline(always))]
+    fn create_cache(&self) -> Cache {
+        self.core.create_cache()
+    }
+
+    #[cfg_attr(feature = "perf-inline", inline(always))]
+    fn reset_cache(&self, cache: &mut Cache) {
+        self.core.reset_cache(cache);
+    }
+
+    fn is_accelerated(&self) -> bool {
+        true
+    }
+
+    fn memory_usage(&self) -> usize {
+        let prefix_bytes: usize = self.prefixes.iter().map(|p| p.len()).sum();
+        self.core.memory_usage()
+            + self.prefixes.len() * core::mem::size_of::<Box<[u8]>>()
+            + prefix_bytes
+    }
+
+    #[cfg_attr(feature = "perf-inline", inline(always))]
+    fn search(&self, _cache: &mut Cache, input: &Input<'_>) -> Option<Match> {
+        self.try_fast_match(input)?;
+        Some(Match::new(PatternID::ZERO, 0..input.haystack().len()))
+    }
+
+    #[cfg_attr(feature = "perf-inline", inline(always))]
+    fn search_half(
+        &self,
+        _cache: &mut Cache,
+        input: &Input<'_>,
+    ) -> Option<HalfMatch> {
+        self.try_fast_match(input)?;
+        Some(HalfMatch::new(PatternID::ZERO, input.haystack().len()))
+    }
+
+    #[cfg_attr(feature = "perf-inline", inline(always))]
+    fn is_match(&self, _cache: &mut Cache, input: &Input<'_>) -> bool {
+        self.try_fast_match(input).is_some()
+    }
+
+    #[cfg_attr(feature = "perf-inline", inline(always))]
+    fn search_slots(
+        &self,
+        _cache: &mut Cache,
+        input: &Input<'_>,
+        slots: &mut [Option<NonMaxUsize>],
+    ) -> Option<PatternID> {
+        let (cap_start, cap_end) = self.try_fast_match(input)?;
+        let match_end = input.haystack().len();
+        if let Some(slot) = slots.get_mut(0) {
+            *slot = NonMaxUsize::new(0);
+        }
+        if let Some(slot) = slots.get_mut(1) {
+            *slot = NonMaxUsize::new(match_end);
+        }
+        if let Some(slot) = slots.get_mut(2) {
+            *slot = NonMaxUsize::new(cap_start);
+        }
+        if let Some(slot) = slots.get_mut(3) {
+            *slot = NonMaxUsize::new(cap_end);
+        }
+        Some(PatternID::ZERO)
+    }
+
+    fn which_overlapping_matches(
+        &self,
+        cache: &mut Cache,
+        input: &Input<'_>,
+        patset: &mut PatternSet,
+    ) {
+        self.core.which_overlapping_matches(cache, input, patset)
+    }
+}
+
+/// Recognizes `^PREFIX([^X]+)X.*$` (default flags) and returns the prefix
+/// variants, the terminator byte X and whether UTF-8 validation is needed.
+fn try_recognize_prefix_capture(
+    hir: &Hir,
+    allow_unicode_classes: bool,
+) -> Option<(Box<[Box<[u8]>]>, u8, bool)> {
+    let HirKind::Concat(parts) = hir.kind() else {
+        return None;
+    };
+    let mut iter = parts.iter();
+
+    // Multiline `(?m)` lowers `^` to `Look::StartLF`, which would break
+    // the byte-level fast path; require text-start specifically.
+    if !matches!(iter.next()?.kind(), HirKind::Look(Look::Start)) {
+        return None;
+    }
+
+    let mut prefixes: Vec<Vec<u8>> = vec![Vec::new()];
+    let capture = loop {
+        let part = iter.next()?;
+        if matches!(part.kind(), HirKind::Capture(_)) {
+            break part;
+        }
+        prefixes = concat_prefix_variants(prefixes, prefix_variants(part)?)?;
+    };
+
+    let HirKind::Capture(cap) = capture.kind() else { unreachable!() };
+    if cap.index != 1 {
+        return None;
+    }
+    let (terminator, capture_requires_utf8) =
+        capture_terminator_byte(&cap.sub, allow_unicode_classes)?;
+
+    let HirKind::Literal(Literal(lit)) = iter.next()?.kind() else {
+        return None;
+    };
+    if lit.as_ref() != [terminator] {
+        return None;
+    }
+
+    let dot_star_requires_utf8 =
+        dot_star_requires_valid_utf8(iter.next()?, allow_unicode_classes)?;
+    // The runtime check validates the capture and the tail as one region, so
+    // both classes must agree on whether they need valid UTF-8. A mixed
+    // Unicode/bytes pattern is left to the full engine.
+    if capture_requires_utf8 != dot_star_requires_utf8 {
+        return None;
+    }
+
+    if !matches!(iter.next()?.kind(), HirKind::Look(Look::End)) {
+        return None;
+    }
+    if iter.next().is_some() {
+        return None;
+    }
+
+    let mut deduped = Vec::with_capacity(prefixes.len());
+    for prefix in prefixes {
+        if !deduped.iter().any(|seen| seen == &prefix) {
+            deduped.push(prefix);
+        }
+    }
+    let prefixes: Vec<Box<[u8]>> =
+        deduped.into_iter().map(Vec::into_boxed_slice).collect();
+
+    Some((prefixes.into_boxed_slice(), terminator, capture_requires_utf8))
+}
+
+/// Return all literal variants for one prefix segment in regex-priority
+/// order. Returns `None` if the segment isn't a finite literal shape (empty /
+/// literal / concat / alternation / `?`-optional combination of those).
+fn prefix_variants(hir: &Hir) -> Option<Vec<Vec<u8>>> {
+    match hir.kind() {
+        HirKind::Empty => Some(vec![Vec::new()]),
+        HirKind::Literal(Literal(bytes)) => Some(vec![bytes.to_vec()]),
+        HirKind::Concat(parts) => {
+            let mut variants = vec![Vec::new()];
+            for part in parts {
+                variants =
+                    concat_prefix_variants(variants, prefix_variants(part)?)?;
+            }
+            Some(variants)
+        }
+        HirKind::Repetition(rep) if rep.min == 0 && rep.max == Some(1) => {
+            let mut variants = prefix_variants(&rep.sub)?;
+            if variants.len() + 1 > MAX_PREFIX_VARIANTS {
+                return None;
+            }
+            if rep.greedy {
+                variants.push(Vec::new());
+            } else {
+                variants.insert(0, Vec::new());
+            }
+            Some(variants)
+        }
+        HirKind::Alternation(branches) => {
+            let mut variants = Vec::new();
+            for branch in branches {
+                let local = prefix_variants(branch)?;
+                if variants.len() + local.len() > MAX_PREFIX_VARIANTS {
+                    return None;
+                }
+                variants.extend(local);
+            }
+            Some(variants)
+        }
+        _ => None,
+    }
+}
+
+/// Concatenate two already-prioritized prefix variant lists. For regex
+/// concatenation, every suffix priority is exhausted before backtracking to
+/// the next prefix priority.
+fn concat_prefix_variants(
+    prefixes: Vec<Vec<u8>>,
+    suffixes: Vec<Vec<u8>>,
+) -> Option<Vec<Vec<u8>>> {
+    if prefixes.len().checked_mul(suffixes.len())? > MAX_PREFIX_VARIANTS {
+        return None;
+    }
+    let mut variants = Vec::with_capacity(prefixes.len() * suffixes.len());
+    for prefix in prefixes {
+        for suffix in &suffixes {
+            let mut variant = Vec::with_capacity(prefix.len() + suffix.len());
+            variant.extend_from_slice(&prefix);
+            variant.extend_from_slice(suffix);
+            variants.push(variant);
+        }
+    }
+    Some(variants)
+}
+
+/// Capture must be a greedy `[^X]+` over a single ASCII byte X.
+fn capture_terminator_byte(
+    hir: &Hir,
+    allow_unicode_classes: bool,
+) -> Option<(u8, bool)> {
+    let HirKind::Repetition(rep) = hir.kind() else {
+        return None;
+    };
+    if rep.min != 1 || rep.max.is_some() || !rep.greedy {
+        return None;
+    }
+    let HirKind::Class(class) = rep.sub.kind() else {
+        return None;
+    };
+    single_excluded_ascii_byte(class, allow_unicode_classes)
+}
+
+/// `.*` for default-flag regexes: any byte except `\n`, zero or more, greedy.
+fn dot_star_requires_valid_utf8(
+    hir: &Hir,
+    allow_unicode_classes: bool,
+) -> Option<bool> {
+    let HirKind::Repetition(rep) = hir.kind() else {
+        return None;
+    };
+    if rep.min != 0 || rep.max.is_some() || !rep.greedy {
+        return None;
+    }
+    let HirKind::Class(class) = rep.sub.kind() else {
+        return None;
+    };
+    let (excluded, requires_valid_utf8) =
+        single_excluded_ascii_byte(class, allow_unicode_classes)?;
+    if excluded == b'\n' {
+        Some(requires_valid_utf8)
+    } else {
+        None
+    }
+}
+
+/// Returns `Some(b)` iff `class` matches every codepoint or byte except a
+/// single ASCII byte `b`. ASCII-only because the runtime matcher uses
+/// `memchr` over byte slices.
+fn single_excluded_ascii_byte(
+    class: &Class,
+    allow_unicode_classes: bool,
+) -> Option<(u8, bool)> {
+    match class {
+        Class::Unicode(uc) => {
+            if !allow_unicode_classes {
+                return None;
+            }
+            let ranges = uc.ranges();
+            if ranges.len() != 2 {
+                return None;
+            }
+            let (r0, r1) = (&ranges[0], &ranges[1]);
+            if (r0.start() as u32) != 0 || (r1.end() as u32) != 0x10FFFF {
+                return None;
+            }
+            let gap_start = r0.end() as u32 + 1;
+            let gap_end = r1.start() as u32 - 1;
+            if gap_start != gap_end || gap_start > 0x7F {
+                return None;
+            }
+            Some((gap_start as u8, true))
+        }
+        Class::Bytes(bc) => {
+            let ranges = bc.ranges();
+            if ranges.len() != 2 {
+                return None;
+            }
+            let (r0, r1) = (&ranges[0], &ranges[1]);
+            if r0.start() != 0 || r1.end() != 0xFF {
+                return None;
+            }
+            let gap_start = r0.end() as u16 + 1;
+            let gap_end = r1.start() as u16 - 1;
+            if gap_start != gap_end || gap_start > 0x7F {
+                return None;
+            }
+            Some((gap_start as u8, false))
+        }
+    }
+}
diff --git a/regex-automata/src/util/search.rs b/regex-automata/src/util/search.rs
index 3ece11d155..7cc41a2983 100644
--- a/regex-automata/src/util/search.rs
+++ b/regex-automata/src/util/search.rs
@@ -104,6 +104,7 @@ pub struct Input<'h> {
     span: Span,
     anchored: Anchored,
     earliest: bool,
+    haystack_known_valid_utf8: bool,
 }
 
 impl<'h> Input<'h> {
@@ -120,6 +121,24 @@ impl<'h> Input<'h> {
             span: Span { start: 0, end: haystack.len() },
             anchored: Anchored::No,
             earliest: false,
+            haystack_known_valid_utf8: false,
+        }
+    }
+
+    /// Create a new search configuration for the given UTF-8 haystack.
+    ///
+    /// This is like [`Input::new`], but records the fact that the haystack is
+    /// already known to be valid UTF-8. This lets regex engines avoid
+    /// redundant UTF-8 validation when Unicode matching semantics require it.
+    #[inline]
+    pub fn new_utf8(haystack: &'h str) -> Input<'h> {
+        let haystack = haystack.as_bytes();
+        Input {
+            haystack,
+            span: Span { start: 0, end: haystack.len() },
+            anchored: Anchored::No,
+            earliest: false,
+            haystack_known_valid_utf8: true,
         }
     }
 
@@ -767,6 +786,13 @@ impl<'h> Input<'h> {
     pub fn is_char_boundary(&self, offset: usize) -> bool {
         utf8::is_boundary(self.haystack(), offset)
     }
+
+    /// Returns true when this input was built by [`Input::new_utf8`], i.e.
+    /// the entire haystack (not just the span) is known to be valid UTF-8.
+    #[inline]
+    pub(crate) fn haystack_is_known_valid_utf8(&self) -> bool {
+        self.haystack_known_valid_utf8
+    }
 }
 
 impl<'h> core::fmt::Debug for Input<'h> {
diff --git a/src/regex/string.rs b/src/regex/string.rs
index e066d7630c..46f0d30eaf 100644
--- a/src/regex/string.rs
+++ b/src/regex/string.rs
@@ -262,7 +262,10 @@ impl Regex {
     /// ```
     #[inline]
     pub fn find_iter<'r, 'h>(&'r self, haystack: &'h str) -> Matches<'r, 'h> {
-        Matches { haystack, it: self.meta.find_iter(haystack) }
+        Matches {
+            haystack,
+            it: self.meta.find_iter(Input::new_utf8(haystack)),
+        }
     }
 
     /// This routine searches for the first match of this regex in the haystack
@@ -421,7 +424,10 @@ impl Regex {
         &'r self,
         haystack: &'h str,
     ) -> CaptureMatches<'r, 'h> {
-        CaptureMatches { haystack, it: self.meta.captures_iter(haystack) }
+        CaptureMatches {
+            haystack,
+            it: self.meta.captures_iter(Input::new_utf8(haystack)),
+        }
     }
 
     /// Returns an iterator of substrings of the haystack given, delimited by a
@@ -551,7 +557,7 @@ impl Regex {
     /// ```
     #[inline]
     pub fn split<'r, 'h>(&'r self, haystack: &'h str) -> Split<'r, 'h> {
-        Split { haystack, it: self.meta.split(haystack) }
+        Split { haystack, it: self.meta.split(Input::new_utf8(haystack)) }
     }
 
     /// Returns an iterator of at most `limit` substrings of the haystack
@@ -630,7 +636,10 @@ impl Regex {
         haystack: &'h str,
         limit: usize,
     ) -> SplitN<'r, 'h> {
-        SplitN { haystack, it: self.meta.splitn(haystack, limit) }
+        SplitN {
+            haystack,
+            it: self.meta.splitn(Input::new_utf8(haystack), limit),
+        }
     }
 
     /// Replaces the leftmost-first match in the given haystack with the
@@ -938,6 +947,59 @@ impl Regex {
             return Cow::Owned(new);
         }
 
+        // When the replacement is exactly a single capture reference
+        // (`$N` / `${N}`), each match's output is just the captured slice,
+        // so we can skip `Captures::expand`.
+        if let Some(group_idx) = rep.single_capture_ref() {
+            if limit == 1 {
+                let Some(cap) = self.captures(haystack) else {
+                    return Cow::Borrowed(haystack);
+                };
+                let m = cap.get(0).unwrap();
+                let g = cap.get(group_idx);
+                if m.start() == 0 && m.end() == haystack.len() {
+                    return match g {
+                        Some(g) => {
+                            if g.start() == 0 && g.end() == haystack.len() {
+                                Cow::Borrowed(haystack)
+                            } else {
+                                Cow::Owned(String::from(
+                                    &haystack[g.start()..g.end()],
+                                ))
+                            }
+                        }
+                        None => Cow::Owned(String::new()),
+                    };
+                }
+                let mut new = String::with_capacity(haystack.len());
+                new.push_str(&haystack[..m.start()]);
+                if let Some(g) = g {
+                    new.push_str(&haystack[g.start()..g.end()]);
+                }
+                new.push_str(&haystack[m.end()..]);
+                return Cow::Owned(new);
+            }
+            let mut it = self.captures_iter(haystack).enumerate().peekable();
+            if it.peek().is_none() {
+                return Cow::Borrowed(haystack);
+            }
+            let mut new = String::with_capacity(haystack.len());
+            let mut last_match = 0;
+            for (i, cap) in it {
+                let m = cap.get(0).unwrap();
+                new.push_str(&haystack[last_match..m.start()]);
+                if let Some(g) = cap.get(group_idx) {
+                    new.push_str(&haystack[g.start()..g.end()]);
+                }
+                last_match = m.end();
+                if limit > 0 && i + 1 >= limit {
+                    break;
+                }
+            }
+            new.push_str(&haystack[last_match..]);
+            return Cow::Owned(new);
+        }
+
         // The slower path, which we use if the replacement may need access to
         // capture groups.
         let mut it = self.captures_iter(haystack).enumerate().peekable();
@@ -1035,8 +1097,9 @@ impl Regex {
         haystack: &str,
         start: usize,
     ) -> Option<usize> {
-        let input =
-            Input::new(haystack).earliest(true).span(start..haystack.len());
+        let input = Input::new_utf8(haystack)
+            .earliest(true)
+            .span(start..haystack.len());
         self.meta.search_half(&input).map(|hm| hm.offset())
     }
 
@@ -1069,8 +1132,9 @@ impl Regex {
     /// ```
     #[inline]
     pub fn is_match_at(&self, haystack: &str, start: usize) -> bool {
-        let input =
-            Input::new(haystack).earliest(true).span(start..haystack.len());
+        let input = Input::new_utf8(haystack)
+            .earliest(true)
+            .span(start..haystack.len());
         self.meta.search_half(&input).is_some()
     }
 
@@ -1107,7 +1171,7 @@ impl Regex {
         haystack: &'h str,
         start: usize,
     ) -> Option<Match<'h>> {
-        let input = Input::new(haystack).span(start..haystack.len());
+        let input = Input::new_utf8(haystack).span(start..haystack.len());
         self.meta
             .search(&input)
             .map(|m| Match::new(haystack, m.start(), m.end()))
@@ -1146,7 +1210,7 @@ impl Regex {
         haystack: &'h str,
         start: usize,
     ) -> Option<Captures<'h>> {
-        let input = Input::new(haystack).span(start..haystack.len());
+        let input = Input::new_utf8(haystack).span(start..haystack.len());
         let mut caps = self.meta.create_captures();
         self.meta.search_captures(&input, &mut caps);
         if caps.is_match() {
@@ -1237,7 +1301,7 @@ impl Regex {
         haystack: &'h str,
         start: usize,
     ) -> Option<Match<'h>> {
-        let input = Input::new(haystack).span(start..haystack.len());
+        let input = Input::new_utf8(haystack).span(start..haystack.len());
         self.meta.search_captures(&input, &mut locs.0);
         locs.0.get_match().map(|m| Match::new(haystack, m.start(), m.end()))
     }
@@ -2470,6 +2534,15 @@ pub trait Replacer {
         None
     }
 
+    /// Returns `Some(group_index)` if this replacement is *exactly* a single
+    /// capture reference (`$N` or `${N}`) with no surrounding text.
+    ///
+    /// Replacement routines use this to skip [`Captures::expand`] entirely:
+    /// each match's output is just the captured slice.
+    fn single_capture_ref(&mut self) -> Option<usize> {
+        None
+    }
+
     /// Returns a type that implements `Replacer`, but that borrows and wraps
     /// this `Replacer`.
     ///
@@ -2505,6 +2578,10 @@ impl<'a> Replacer for &'a str {
     fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
         no_expansion(self)
     }
+
+    fn single_capture_ref(&mut self) -> Option<usize> {
+        single_capture_ref(self)
+    }
 }
 
 impl<'a> Replacer for &'a String {
@@ -2515,6 +2592,10 @@ impl<'a> Replacer for &'a String {
     fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
         no_expansion(self)
     }
+
+    fn single_capture_ref(&mut self) -> Option<usize> {
+        single_capture_ref(self)
+    }
 }
 
 impl Replacer for String {
@@ -2525,6 +2606,10 @@ impl Replacer for String {
     fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
         no_expansion(self)
     }
+
+    fn single_capture_ref(&mut self) -> Option<usize> {
+        single_capture_ref(self)
+    }
 }
 
 impl<'a> Replacer for Cow<'a, str> {
@@ -2535,6 +2620,10 @@ impl<'a> Replacer for Cow<'a, str> {
     fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
         no_expansion(self)
     }
+
+    fn single_capture_ref(&mut self) -> Option<usize> {
+        single_capture_ref(self)
+    }
 }
 
 impl<'a> Replacer for &'a Cow<'a, str> {
@@ -2545,6 +2634,10 @@ impl<'a> Replacer for &'a Cow<'a, str> {
     fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
         no_expansion(self)
     }
+
+    fn single_capture_ref(&mut self) -> Option<usize> {
+        single_capture_ref(self)
+    }
 }
 
 impl<F, T> Replacer for F
@@ -2574,6 +2667,10 @@ impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> {
     fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
         self.0.no_expansion()
     }
+
+    fn single_capture_ref(&mut self) -> Option<usize> {
+        self.0.single_capture_ref()
+    }
 }
 
 /// A helper type for forcing literal string replacement.
@@ -2623,3 +2720,26 @@ fn no_expansion<T: AsRef<str>>(replacement: &T) -> Option<Cow<'_, str>> {
         None => Some(Cow::Borrowed(replacement)),
     }
 }
+
+/// Returns `Some(N)` iff the replacement is *exactly* a single capture
+/// reference of the form `$N` or `${N}` for a numeric group index. Named
+/// refs (`${name}`) and `$$` (escaped `$`) are rejected.
+///
+/// This is meant to be used to implement the [`Replacer::single_capture_ref`]
+/// method in its various trait impls.
+fn single_capture_ref<T: AsRef<str>>(replacement: &T) -> Option<usize> {
+    let rest = replacement.as_ref().strip_prefix('$')?;
+    if rest.starts_with('$') {
+        return None;
+    }
+    let digits = match rest.strip_prefix('{') {
+        Some(inner) => inner.strip_suffix('}')?,
+        None => rest,
+    };
+    // `parse::<usize>` accepts a leading `+`, which would let `${+1}`
+    // masquerade as group 1; check explicitly.
+    if digits.is_empty() || digits.bytes().any(|b| !b.is_ascii_digit()) {
+        return None;
+    }
+    digits.parse().ok()
+}
diff --git a/src/regexset/string.rs b/src/regexset/string.rs
index 5126a4661e..c842063839 100644
--- a/src/regexset/string.rs
+++ b/src/regexset/string.rs
@@ -242,7 +242,8 @@ impl RegexSet {
     /// ```
     #[inline]
     pub fn is_match_at(&self, haystack: &str, start: usize) -> bool {
-        self.meta.is_match(Input::new(haystack).span(start..haystack.len()))
+        self.meta
+            .is_match(Input::new_utf8(haystack).span(start..haystack.len()))
     }
 
     /// Returns the set of regexes that match in the given haystack.
@@ -323,7 +324,7 @@ impl RegexSet {
     /// ```
     #[inline]
     pub fn matches_at(&self, haystack: &str, start: usize) -> SetMatches {
-        let input = Input::new(haystack).span(start..haystack.len());
+        let input = Input::new_utf8(haystack).span(start..haystack.len());
         let mut patset = PatternSet::new(self.meta.pattern_len());
         self.meta.which_overlapping_matches(&input, &mut patset);
         SetMatches(patset)
@@ -357,7 +358,7 @@ impl RegexSet {
         // is in regex-automata, not regex. So maybe we should just accept a
         // 'SetMatches', which is basically just a newtype around PatternSet.
         let mut patset = PatternSet::new(self.meta.pattern_len());
-        let mut input = Input::new(haystack);
+        let mut input = Input::new_utf8(haystack);
         input.set_start(start);
         self.meta.which_overlapping_matches(&input, &mut patset);
         for pid in patset.iter() {
diff --git a/tests/misc.rs b/tests/misc.rs
index c04c9c9fe2..46cd71f7b1 100644
--- a/tests/misc.rs
+++ b/tests/misc.rs
@@ -141,3 +141,54 @@ fn dfa_handles_pathological_case() {
     };
     assert!(re.is_match(&text));
 }
+
+#[test]
+fn literal_prefix_capture_requires_exact_plus() {
+    let re = regex!(r"^a([^/]{2,})/.*$");
+    assert!(!re.is_match("ab/x"));
+}
+
+#[test]
+fn literal_prefix_capture_respects_invalid_pattern_id() {
+    let re = regex_automata::meta::Regex::new(r"^a([^/]+)/.*$").unwrap();
+    let input = regex_automata::Input::new("abc/x").anchored(
+        regex_automata::Anchored::Pattern(regex_automata::PatternID::must(1)),
+    );
+    let mut cache = re.create_cache();
+    assert_eq!(None, re.search_with(&mut cache, &input));
+}
+
+#[test]
+fn literal_prefix_capture_meta_unicode_rejects_invalid_utf8() {
+    let re = regex_automata::meta::Regex::new(r"^a([^/]+)/.*$").unwrap();
+    assert!(!re.is_match(regex_automata::Input::new(b"a\xFF/x")));
+}
+
+#[test]
+fn literal_prefix_capture_bytes_unicode_rejects_invalid_utf8() {
+    let re = regex::bytes::Regex::new(r"^a([^/]+)/.*$").unwrap();
+    assert!(!re.is_match(b"a\xFF/x"));
+
+    let re = regex::bytes::RegexBuilder::new(r"^a([^/]+)/.*$")
+        .unicode(false)
+        .build()
+        .unwrap();
+    assert!(re.is_match(b"a\xFF/x"));
+}
+
+#[test]
+fn literal_prefix_capture_validates_only_past_the_prefix() {
+    let re = regex_automata::meta::Regex::builder()
+        .syntax(regex_automata::util::syntax::Config::new().utf8(false))
+        .build(r"(?-u)^\xFF(?u)([^/]+)/.*$")
+        .unwrap();
+    // The raw-byte prefix must not poison UTF-8 validation of the rest.
+    assert!(re.is_match(regex_automata::Input::new(b"\xFFabc/x")));
+
+    let re = regex_automata::meta::Regex::builder()
+        .syntax(regex_automata::util::syntax::Config::new().utf8(false))
+        .build(r"(?-u)^\xC3(?u)([^/]+)/.*$")
+        .unwrap();
+    // `\xC3` alone is a truncated codepoint: the capture would start mid-char.
+    assert!(!re.is_match(regex_automata::Input::new_utf8("\u{E9}/x")));
+}
diff --git a/tests/replace.rs b/tests/replace.rs
index f26ae46030..8faa5ff42e 100644
--- a/tests/replace.rs
+++ b/tests/replace.rs
@@ -181,3 +181,47 @@ fn replacen_with_captures() {
     let re = regex::Regex::new(r"([0-9])").unwrap();
     assert_eq!(re.replacen("age: 1234", 2, "${1}Z"), "age: 1Z2Z34");
 }
+
+#[test]
+fn replace_single_capture_ref_borrow_contract() {
+    let re = regex::Regex::new(r"^a([^/]+)/.*$").unwrap();
+    let got = re.replace("abc/x", "$1");
+    assert_eq!(got, "bc");
+    assert!(matches!(got, std::borrow::Cow::Owned(_)));
+}
+
+#[test]
+fn replace_literal_prefix_capture_allows_newline_in_capture() {
+    let re = regex::Regex::new(r"^a([^/]+)/.*$").unwrap();
+    assert_eq!(re.replace("a\nb/x", "$1"), "\nb");
+}
+
+#[test]
+fn replace_literal_prefix_capture_respects_prefix_priority() {
+    let re = regex::Regex::new(r"^(?:a|ab)([^/]+)/.*$").unwrap();
+    assert_eq!(re.replace("abc/x", "$1"), "bc");
+}
+
+#[test]
+fn replace_literal_prefix_capture_respects_ungreedy_optional() {
+    let re = regex::Regex::new(r"^a??([^/]+)/.*$").unwrap();
+    assert_eq!(re.replace("abc/x", "$1"), "abc");
+}
+
+#[test]
+fn replace_literal_prefix_capture_backtracks_greedy_optional() {
+    let re = regex::Regex::new(r"^a?([^/]+)/.*$").unwrap();
+    assert_eq!(re.replace("a/x", "$1"), "a");
+}
+
+#[test]
+fn replace_literal_prefix_capture_concat_optional_priority() {
+    let re = regex::Regex::new(r"^a?(?:ab)?([^/]+)/.*$").unwrap();
+    assert_eq!(re.replace("abx/y", "$1"), "bx");
+}
+
+#[test]
+fn replace_literal_prefix_capture_backtracks_after_tail_newline() {
+    let re = regex::Regex::new(r"^(?:a|ab/b\nc)([^/]+)/.*$").unwrap();
+    assert_eq!(re.replace("ab/b\ncd/e", "$1"), "d");
+}