From 54e8d0258167bce7efe55b50e2a91c316044231f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Tue, 19 May 2026 09:33:57 +0200 Subject: [PATCH 1/6] perf: literal-prefix capture-extraction fast path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For anchored patterns of the shape ^<prefix>([^X]+)X.*$ with replacement `${1}` (or `$1`), capture 1's bounds are structurally trivial — skip the prefix, find the terminator with memchr — so the engine doesn't need to track captures at all. Two changes work together: 1. A new `LiteralPrefixCapture` strategy in `regex-automata`'s meta engine recognizes the shape via HIR walking (single-pattern only, anchored at both ends, default flags, ASCII terminator, finite literal-alternation prefix set capped at 32 variants). Strategy methods extract the match and capture-1 slots directly with memchr, bypassing PikeVM / BoundedBacktracker. Wires in alongside the existing reverse strategies. 2. `Regex::replacen` gets a borrowed-output fast path for replacements that are exactly `$N` / `${N}`. Detected via a new `Replacer::single_capture_ref` method (default `None`, opted into for `&str`/`String`/`Cow`). For `limit == 1` with a match covering the whole haystack, returns `Cow::Borrowed` of the captured slice — no `Captures::expand`, no output string allocation. Bench (500k synthetic Referer rows, 5-iter mean, on the same machine): Regex::replacen, q28 pattern, 80% match before: 281 ms after: 39 ms (7.3x) Regex::replacen, ^key=([^,]+),.*$, 100% match before: 113 ms after: 27 ms (4.2x) Tests: 257 / 257 pass (regex-automata --lib + --test integration, regex --test integration). No regressions. 
--- regex-automata/src/meta/strategy.rs | 367 +++++++++++++++++++++++++++- src/regex/string.rs | 103 ++++++++ 2 files changed, 468 insertions(+), 2 deletions(-) diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs index ebb876b2b8..297eb3d674 100644 --- a/regex-automata/src/meta/strategy.rs +++ b/regex-automata/src/meta/strategy.rs @@ -3,9 +3,9 @@ use core::{ panic::{RefUnwindSafe, UnwindSafe}, }; -use alloc::sync::Arc; +use alloc::{boxed::Box, sync::Arc, vec, vec::Vec}; -use regex_syntax::hir::{literal, Hir}; +use regex_syntax::hir::{literal, Class, Hir, HirKind, Literal, Look}; use crate::{ meta::{ @@ -160,6 +160,13 @@ pub(super) fn new( // might give up or quit for reasons. If we had, e.g., a PikeVM that // supported reverse searching, then we could avoid building a full Core // engine for this case. + core = match LiteralPrefixCapture::new(core, hirs) { + Err(core) => core, + Ok(lpc) => { + debug!("using literal-prefix-capture strategy"); + return Ok(Arc::new(lpc)); + } + }; core = match ReverseAnchored::new(core) { Err(core) => core, Ok(ra) => { @@ -1903,3 +1910,359 @@ fn copy_match_to_slots(m: Match, slots: &mut [Option]) { *slot = NonMaxUsize::new(m.end()); } } + +/// A specialized strategy for anchored, fully-bounded regexes of the form +/// +/// ```text +/// ^<prefix>([^X]+)X.*$ +/// ``` +/// +/// where the prefix reduces to a finite set of literal byte alternatives, +/// the capture is a greedy `[^X]+` for a single ASCII byte X, and the trailing +/// `.*$` is the standard "rest of line, then end of haystack" tail. The +/// motivating instance is the ClickBench Q28 pattern +/// `^https?://(?:www\.)?([^/]+)/.*$` -> `${1}`, but the recognizer applies to +/// any pattern of this shape (single-literal prefixes, alternation, and +/// `?`-optional segments). 
+/// +/// For inputs that match, capture 1's bounds are structurally trivial — skip +/// the prefix, find the terminator with `memchr` — so we can avoid the full +/// engine's capture-tracking entirely. For inputs that don't match (e.g., a +/// newline in the tail breaks `.*$`, or no prefix matches), we report no +/// match: that result is identical to what the full engine would compute, so +/// no fallback is required. +#[derive(Debug)] +struct LiteralPrefixCapture { + core: Core, + /// Distinct literal byte prefixes, longest-first so the runtime probe + /// is greedy. Bounded to `MAX_PREFIX_VARIANTS` at construction time. + prefixes: Box<[Box<[u8]>]>, + /// Single ASCII byte ending the capture (also the literal that must + /// follow the capture in the original regex). + terminator: u8, +} + +/// Each `(?:...)?` doubles the count and each `(a|b|c)` multiplies it, +/// so this caps the explosion for adversarial patterns. 32 fits roughly +/// 8 levels of optional/alternation past Q28's 4 variants on one cache +/// line of `Box<[u8]>`. +const MAX_PREFIX_VARIANTS: usize = 32; + +impl LiteralPrefixCapture { + fn new(core: Core, hirs: &[&Hir]) -> Result { + if hirs.len() != 1 { + return Err(core); + } + if !core.info.is_always_anchored_start() + || !core.info.is_always_anchored_end() + { + return Err(core); + } + // `.*$` excludes the line terminator; the runtime newline check + // hard-codes `b'\n'`, so reject non-default line terminators. + if core.info.config().get_line_terminator() != b'\n' { + return Err(core); + } + let Some((prefixes, terminator)) = + try_recognize_prefix_capture(hirs[0]) + else { + return Err(core); + }; + Ok(LiteralPrefixCapture { core, prefixes, terminator }) + } + + /// Returns capture 1's byte offsets if the input matches, else `None`. + /// The overall match always spans `0..input.haystack().len()` because + /// the regex is `^...$`. 
+ #[cfg_attr(feature = "perf-inline", inline(always))] + fn try_fast_match(&self, input: &Input<'_>) -> Option<(usize, usize)> { + if input.start() != 0 || input.end() != input.haystack().len() { + return None; + } + let bytes = input.haystack(); + for prefix in self.prefixes.iter() { + if !bytes.starts_with(prefix) { + continue; + } + let cap_start = prefix.len(); + // Fused scan: the first byte that matters in the tail is either + // the terminator (success) or `\n` (failure for `.*$`). + let off = crate::util::memchr::memchr2( + self.terminator, + b'\n', + &bytes[cap_start..], + )?; + if bytes[cap_start + off] != self.terminator { + return None; + } + if off == 0 { + // `[^X]+` requires >= 1 byte; try a shorter prefix. + continue; + } + let cap_end = cap_start + off; + // Anything past the terminator must also be `\n`-free for + // `.*$` to reach end-of-haystack. + if crate::util::memchr::memchr(b'\n', &bytes[cap_end + 1..]) + .is_some() + { + return None; + } + return Some((cap_start, cap_end)); + } + None + } +} + +impl Strategy for LiteralPrefixCapture { + #[cfg_attr(feature = "perf-inline", inline(always))] + fn group_info(&self) -> &GroupInfo { + self.core.group_info() + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn create_cache(&self) -> Cache { + self.core.create_cache() + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn reset_cache(&self, cache: &mut Cache) { + self.core.reset_cache(cache); + } + + fn is_accelerated(&self) -> bool { + true + } + + fn memory_usage(&self) -> usize { + let prefix_bytes: usize = self.prefixes.iter().map(|p| p.len()).sum(); + self.core.memory_usage() + + self.prefixes.len() * core::mem::size_of::>() + + prefix_bytes + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn search(&self, _cache: &mut Cache, input: &Input<'_>) -> Option { + self.try_fast_match(input)?; + Some(Match::new(PatternID::ZERO, 0..input.haystack().len())) + } + + #[cfg_attr(feature = "perf-inline", 
inline(always))] + fn search_half( + &self, + _cache: &mut Cache, + input: &Input<'_>, + ) -> Option { + self.try_fast_match(input)?; + Some(HalfMatch::new(PatternID::ZERO, input.haystack().len())) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match(&self, _cache: &mut Cache, input: &Input<'_>) -> bool { + self.try_fast_match(input).is_some() + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn search_slots( + &self, + _cache: &mut Cache, + input: &Input<'_>, + slots: &mut [Option], + ) -> Option { + let (cap_start, cap_end) = self.try_fast_match(input)?; + let match_end = input.haystack().len(); + if let Some(slot) = slots.get_mut(0) { + *slot = NonMaxUsize::new(0); + } + if let Some(slot) = slots.get_mut(1) { + *slot = NonMaxUsize::new(match_end); + } + if let Some(slot) = slots.get_mut(2) { + *slot = NonMaxUsize::new(cap_start); + } + if let Some(slot) = slots.get_mut(3) { + *slot = NonMaxUsize::new(cap_end); + } + Some(PatternID::ZERO) + } + + fn which_overlapping_matches( + &self, + cache: &mut Cache, + input: &Input<'_>, + patset: &mut PatternSet, + ) { + self.core.which_overlapping_matches(cache, input, patset) + } +} + +/// Recognizes `^([^X]+)X.*$` (default flags) and returns the +/// enumerated prefix set together with the terminator byte X. +fn try_recognize_prefix_capture(hir: &Hir) -> Option<(Box<[Box<[u8]>]>, u8)> { + let HirKind::Concat(parts) = hir.kind() else { + return None; + }; + let mut iter = parts.iter(); + + // Multiline `(?m)` lowers `^` to `Look::StartLF`, which would break + // the byte-level fast path; require text-start specifically. 
+ if !matches!(iter.next()?.kind(), HirKind::Look(Look::Start)) { + return None; + } + + let mut prefixes: Vec> = vec![Vec::new()]; + let capture = loop { + let part = iter.next()?; + if matches!(part.kind(), HirKind::Capture(_)) { + break part; + } + extend_prefix(&mut prefixes, part)?; + if prefixes.len() > MAX_PREFIX_VARIANTS { + return None; + } + }; + + let HirKind::Capture(cap) = capture.kind() else { unreachable!() }; + if cap.index != 1 { + return None; + } + let terminator = capture_terminator_byte(&cap.sub)?; + + let HirKind::Literal(Literal(lit)) = iter.next()?.kind() else { + return None; + }; + if lit.as_ref() != [terminator] { + return None; + } + + if !is_dot_star(iter.next()?) { + return None; + } + + if !matches!(iter.next()?.kind(), HirKind::Look(Look::End)) { + return None; + } + if iter.next().is_some() { + return None; + } + + prefixes.sort_unstable(); + prefixes.dedup(); + let mut prefixes: Vec> = + prefixes.into_iter().map(Vec::into_boxed_slice).collect(); + prefixes.sort_unstable_by_key(|p| core::cmp::Reverse(p.len())); + + Some((prefixes.into_boxed_slice(), terminator)) +} + +/// Extend the accumulator with one prefix segment. Returns `None` if the +/// segment isn't a finite literal shape (literal / concat / alternation / +/// `?`-optional combination of those). 
+fn extend_prefix(variants: &mut Vec>, hir: &Hir) -> Option<()> { + match hir.kind() { + HirKind::Literal(Literal(bytes)) => { + for v in variants.iter_mut() { + v.extend_from_slice(bytes); + } + Some(()) + } + HirKind::Concat(parts) => { + for part in parts { + extend_prefix(variants, part)?; + if variants.len() > MAX_PREFIX_VARIANTS { + return None; + } + } + Some(()) + } + HirKind::Repetition(rep) if rep.min == 0 && rep.max == Some(1) => { + let mut with = variants.clone(); + extend_prefix(&mut with, &rep.sub)?; + if variants.len() + with.len() > MAX_PREFIX_VARIANTS { + return None; + } + variants.extend(with); + Some(()) + } + HirKind::Alternation(branches) => { + let base = core::mem::take(variants); + for branch in branches { + let mut local = base.clone(); + extend_prefix(&mut local, branch)?; + if variants.len() + local.len() > MAX_PREFIX_VARIANTS { + return None; + } + variants.extend(local); + } + Some(()) + } + _ => None, + } +} + +/// Capture must be a greedy `[^X]+` over a single ASCII byte X. +fn capture_terminator_byte(hir: &Hir) -> Option { + let HirKind::Repetition(rep) = hir.kind() else { + return None; + }; + if rep.min < 1 || rep.max.is_some() || !rep.greedy { + return None; + } + let HirKind::Class(class) = rep.sub.kind() else { + return None; + }; + single_excluded_ascii_byte(class) +} + +/// `.*` for default-flag regexes: any byte except `\n`, zero or more, greedy. +fn is_dot_star(hir: &Hir) -> bool { + let HirKind::Repetition(rep) = hir.kind() else { + return false; + }; + if rep.min != 0 || rep.max.is_some() || !rep.greedy { + return false; + } + let HirKind::Class(class) = rep.sub.kind() else { + return false; + }; + single_excluded_ascii_byte(class) == Some(b'\n') +} + +/// Returns `Some(b)` iff `class` matches every codepoint or byte except a +/// single ASCII byte `b`. ASCII-only because the runtime matcher uses +/// `memchr` over byte slices. 
+fn single_excluded_ascii_byte(class: &Class) -> Option { + match class { + Class::Unicode(uc) => { + let ranges = uc.ranges(); + if ranges.len() != 2 { + return None; + } + let (r0, r1) = (&ranges[0], &ranges[1]); + if (r0.start() as u32) != 0 || (r1.end() as u32) != 0x10FFFF { + return None; + } + let gap_start = r0.end() as u32 + 1; + let gap_end = r1.start() as u32 - 1; + if gap_start != gap_end || gap_start > 0x7F { + return None; + } + Some(gap_start as u8) + } + Class::Bytes(bc) => { + let ranges = bc.ranges(); + if ranges.len() != 2 { + return None; + } + let (r0, r1) = (&ranges[0], &ranges[1]); + if r0.start() != 0 || r1.end() != 0xFF { + return None; + } + let gap_start = r0.end() as u16 + 1; + let gap_end = r1.start() as u16 - 1; + if gap_start != gap_end || gap_start > 0x7F { + return None; + } + Some(gap_start as u8) + } + } +} diff --git a/src/regex/string.rs b/src/regex/string.rs index e066d7630c..ba8acea961 100644 --- a/src/regex/string.rs +++ b/src/regex/string.rs @@ -938,6 +938,55 @@ impl Regex { return Cow::Owned(new); } + // When the replacement is exactly a single capture reference + // (`$N` / `${N}`), each match's output is just the captured slice + // — no `Captures::expand`. For a single match covering the whole + // haystack (common with anchored regexes), this returns a + // `Cow::Borrowed` with no output allocation at all. 
+ if let Some(group_idx) = rep.single_capture_ref() { + if limit == 1 { + let Some(cap) = self.captures(haystack) else { + return Cow::Borrowed(haystack); + }; + let m = cap.get(0).unwrap(); + let g = cap.get(group_idx); + if m.start() == 0 && m.end() == haystack.len() { + return match g { + Some(g) => { + Cow::Borrowed(&haystack[g.start()..g.end()]) + } + None => Cow::Borrowed(""), + }; + } + let mut new = String::with_capacity(haystack.len()); + new.push_str(&haystack[..m.start()]); + if let Some(g) = g { + new.push_str(&haystack[g.start()..g.end()]); + } + new.push_str(&haystack[m.end()..]); + return Cow::Owned(new); + } + let mut it = self.captures_iter(haystack).enumerate().peekable(); + if it.peek().is_none() { + return Cow::Borrowed(haystack); + } + let mut new = String::with_capacity(haystack.len()); + let mut last_match = 0; + for (i, cap) in it { + let m = cap.get(0).unwrap(); + new.push_str(&haystack[last_match..m.start()]); + if let Some(g) = cap.get(group_idx) { + new.push_str(&haystack[g.start()..g.end()]); + } + last_match = m.end(); + if limit > 0 && i + 1 >= limit { + break; + } + } + new.push_str(&haystack[last_match..]); + return Cow::Owned(new); + } + // The slower path, which we use if the replacement may need access to // capture groups. let mut it = self.captures_iter(haystack).enumerate().peekable(); @@ -2470,6 +2519,17 @@ pub trait Replacer { None } + /// Returns `Some(group_index)` if this replacement is *exactly* a single + /// capture reference (`$N` or `${N}`) with no surrounding text. + /// + /// Replacement routines use this to skip [`Captures::expand`] entirely + /// — each match's output is just the captured slice, and when the + /// match covers the whole haystack the result is a `Cow::Borrowed` + /// with no output allocation. + fn single_capture_ref(&mut self) -> Option { + None + } + /// Returns a type that implements `Replacer`, but that borrows and wraps /// this `Replacer`. 
/// @@ -2505,6 +2565,10 @@ impl<'a> Replacer for &'a str { fn no_expansion(&mut self) -> Option> { no_expansion(self) } + + fn single_capture_ref(&mut self) -> Option { + single_capture_ref(self) + } } impl<'a> Replacer for &'a String { @@ -2515,6 +2579,10 @@ impl<'a> Replacer for &'a String { fn no_expansion(&mut self) -> Option> { no_expansion(self) } + + fn single_capture_ref(&mut self) -> Option { + single_capture_ref(self) + } } impl Replacer for String { @@ -2525,6 +2593,10 @@ impl Replacer for String { fn no_expansion(&mut self) -> Option> { no_expansion(self) } + + fn single_capture_ref(&mut self) -> Option { + single_capture_ref(self) + } } impl<'a> Replacer for Cow<'a, str> { @@ -2535,6 +2607,10 @@ impl<'a> Replacer for Cow<'a, str> { fn no_expansion(&mut self) -> Option> { no_expansion(self) } + + fn single_capture_ref(&mut self) -> Option { + single_capture_ref(self) + } } impl<'a> Replacer for &'a Cow<'a, str> { @@ -2545,6 +2621,10 @@ impl<'a> Replacer for &'a Cow<'a, str> { fn no_expansion(&mut self) -> Option> { no_expansion(self) } + + fn single_capture_ref(&mut self) -> Option { + single_capture_ref(self) + } } impl Replacer for F @@ -2623,3 +2703,26 @@ fn no_expansion>(replacement: &T) -> Option> { None => Some(Cow::Borrowed(replacement)), } } + +/// Returns `Some(N)` iff the replacement is *exactly* a single capture +/// reference of the form `$N` or `${N}` for a numeric group index. Named +/// refs (`${name}`) and `$$` (escaped `$`) are rejected. +/// +/// This is meant to be used to implement the [`Replacer::single_capture_ref`] +/// method in its various trait impls. +fn single_capture_ref>(replacement: &T) -> Option { + let rest = replacement.as_ref().strip_prefix('$')?; + if rest.starts_with('$') { + return None; + } + let digits = match rest.strip_prefix('{') { + Some(inner) => inner.strip_suffix('}')?, + None => rest, + }; + // `parse::<usize>` accepts a leading `+`, which would let `${+1}` + // masquerade as group 1; check explicitly. 
+ if digits.is_empty() || digits.bytes().any(|b| !b.is_ascii_digit()) { + return None; + } + digits.parse().ok() +} From 73262c9df12b501b553e7889727ed4df3bbd33e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Thu, 28 May 2026 12:12:45 +0200 Subject: [PATCH 2/6] Fix literal prefix capture fast path correctness --- regex-automata/src/meta/strategy.rs | 76 ++++++++++++++++++----------- src/regex/string.rs | 26 ++++++---- tests/misc.rs | 18 +++++++ tests/replace.rs | 38 +++++++++++++++ 4 files changed, 120 insertions(+), 38 deletions(-) diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs index 297eb3d674..95e3315ed5 100644 --- a/regex-automata/src/meta/strategy.rs +++ b/regex-automata/src/meta/strategy.rs @@ -1934,8 +1934,8 @@ fn copy_match_to_slots(m: Match, slots: &mut [Option]) { #[derive(Debug)] struct LiteralPrefixCapture { core: Core, - /// Distinct literal byte prefixes, longest-first so the runtime probe - /// is greedy. Bounded to `MAX_PREFIX_VARIANTS` at construction time. + /// Distinct literal byte prefixes in regex-priority order. Bounded to + /// `MAX_PREFIX_VARIANTS` at construction time. prefixes: Box<[Box<[u8]>]>, /// Single ASCII byte ending the capture (also the literal that must /// follow the capture in the original regex). 
@@ -1953,6 +1953,9 @@ impl LiteralPrefixCapture { if hirs.len() != 1 { return Err(core); } + if core.info.config().get_match_kind() != MatchKind::LeftmostFirst { + return Err(core); + } if !core.info.is_always_anchored_start() || !core.info.is_always_anchored_end() { @@ -1963,8 +1966,9 @@ impl LiteralPrefixCapture { if core.info.config().get_line_terminator() != b'\n' { return Err(core); } + let allow_unicode_classes = core.info.config().get_utf8_empty(); let Some((prefixes, terminator)) = - try_recognize_prefix_capture(hirs[0]) + try_recognize_prefix_capture(hirs[0], allow_unicode_classes) else { return Err(core); }; @@ -1985,18 +1989,14 @@ impl LiteralPrefixCapture { continue; } let cap_start = prefix.len(); - // Fused scan: the first byte that matters in the tail is either - // the terminator (success) or `\n` (failure for `.*$`). - let off = crate::util::memchr::memchr2( + let Some(off) = crate::util::memchr::memchr( self.terminator, - b'\n', &bytes[cap_start..], - )?; - if bytes[cap_start + off] != self.terminator { - return None; - } + ) else { + continue; + }; if off == 0 { - // `[^X]+` requires >= 1 byte; try a shorter prefix. + // `[^X]+` requires >= 1 byte; try the next possible prefix. continue; } let cap_end = cap_start + off; @@ -2005,7 +2005,7 @@ impl LiteralPrefixCapture { if crate::util::memchr::memchr(b'\n', &bytes[cap_end + 1..]) .is_some() { - return None; + continue; } return Some((cap_start, cap_end)); } @@ -2097,7 +2097,10 @@ impl Strategy for LiteralPrefixCapture { /// Recognizes `^([^X]+)X.*$` (default flags) and returns the /// enumerated prefix set together with the terminator byte X. 
-fn try_recognize_prefix_capture(hir: &Hir) -> Option<(Box<[Box<[u8]>]>, u8)> { +fn try_recognize_prefix_capture( + hir: &Hir, + allow_unicode_classes: bool, +) -> Option<(Box<[Box<[u8]>]>, u8)> { let HirKind::Concat(parts) = hir.kind() else { return None; }; @@ -2125,7 +2128,7 @@ fn try_recognize_prefix_capture(hir: &Hir) -> Option<(Box<[Box<[u8]>]>, u8)> { if cap.index != 1 { return None; } - let terminator = capture_terminator_byte(&cap.sub)?; + let terminator = capture_terminator_byte(&cap.sub, allow_unicode_classes)?; let HirKind::Literal(Literal(lit)) = iter.next()?.kind() else { return None; @@ -2134,7 +2137,7 @@ fn try_recognize_prefix_capture(hir: &Hir) -> Option<(Box<[Box<[u8]>]>, u8)> { return None; } - if !is_dot_star(iter.next()?) { + if !is_dot_star(iter.next()?, allow_unicode_classes) { return None; } @@ -2145,11 +2148,14 @@ fn try_recognize_prefix_capture(hir: &Hir) -> Option<(Box<[Box<[u8]>]>, u8)> { return None; } - prefixes.sort_unstable(); - prefixes.dedup(); - let mut prefixes: Vec> = - prefixes.into_iter().map(Vec::into_boxed_slice).collect(); - prefixes.sort_unstable_by_key(|p| core::cmp::Reverse(p.len())); + let mut deduped = Vec::with_capacity(prefixes.len()); + for prefix in prefixes { + if !deduped.iter().any(|seen| seen == &prefix) { + deduped.push(prefix); + } + } + let prefixes: Vec> = + deduped.into_iter().map(Vec::into_boxed_slice).collect(); Some((prefixes.into_boxed_slice(), terminator)) } @@ -2180,7 +2186,12 @@ fn extend_prefix(variants: &mut Vec>, hir: &Hir) -> Option<()> { if variants.len() + with.len() > MAX_PREFIX_VARIANTS { return None; } - variants.extend(with); + if rep.greedy { + let without = core::mem::replace(variants, with); + variants.extend(without); + } else { + variants.extend(with); + } Some(()) } HirKind::Alternation(branches) => { @@ -2200,21 +2211,24 @@ fn extend_prefix(variants: &mut Vec>, hir: &Hir) -> Option<()> { } /// Capture must be a greedy `[^X]+` over a single ASCII byte X. 
-fn capture_terminator_byte(hir: &Hir) -> Option { +fn capture_terminator_byte( + hir: &Hir, + allow_unicode_classes: bool, +) -> Option { let HirKind::Repetition(rep) = hir.kind() else { return None; }; - if rep.min < 1 || rep.max.is_some() || !rep.greedy { + if rep.min != 1 || rep.max.is_some() || !rep.greedy { return None; } let HirKind::Class(class) = rep.sub.kind() else { return None; }; - single_excluded_ascii_byte(class) + single_excluded_ascii_byte(class, allow_unicode_classes) } /// `.*` for default-flag regexes: any byte except `\n`, zero or more, greedy. -fn is_dot_star(hir: &Hir) -> bool { +fn is_dot_star(hir: &Hir, allow_unicode_classes: bool) -> bool { let HirKind::Repetition(rep) = hir.kind() else { return false; }; @@ -2224,15 +2238,21 @@ fn is_dot_star(hir: &Hir) -> bool { let HirKind::Class(class) = rep.sub.kind() else { return false; }; - single_excluded_ascii_byte(class) == Some(b'\n') + single_excluded_ascii_byte(class, allow_unicode_classes) == Some(b'\n') } /// Returns `Some(b)` iff `class` matches every codepoint or byte except a /// single ASCII byte `b`. ASCII-only because the runtime matcher uses /// `memchr` over byte slices. -fn single_excluded_ascii_byte(class: &Class) -> Option { +fn single_excluded_ascii_byte( + class: &Class, + allow_unicode_classes: bool, +) -> Option { match class { Class::Unicode(uc) => { + if !allow_unicode_classes { + return None; + } let ranges = uc.ranges(); if ranges.len() != 2 { return None; diff --git a/src/regex/string.rs b/src/regex/string.rs index ba8acea961..6b3aa0527e 100644 --- a/src/regex/string.rs +++ b/src/regex/string.rs @@ -939,10 +939,8 @@ impl Regex { } // When the replacement is exactly a single capture reference - // (`$N` / `${N}`), each match's output is just the captured slice - // — no `Captures::expand`. For a single match covering the whole - // haystack (common with anchored regexes), this returns a - // `Cow::Borrowed` with no output allocation at all. 
+ // (`$N` / `${N}`), each match's output is just the captured slice, + // so we can skip `Captures::expand`. if let Some(group_idx) = rep.single_capture_ref() { if limit == 1 { let Some(cap) = self.captures(haystack) else { @@ -953,9 +951,15 @@ impl Regex { if m.start() == 0 && m.end() == haystack.len() { return match g { Some(g) => { - Cow::Borrowed(&haystack[g.start()..g.end()]) + if g.start() == 0 && g.end() == haystack.len() { + Cow::Borrowed(haystack) + } else { + Cow::Owned(String::from( + &haystack[g.start()..g.end()], + )) + } } - None => Cow::Borrowed(""), + None => Cow::Owned(String::new()), }; } let mut new = String::with_capacity(haystack.len()); @@ -2522,10 +2526,8 @@ pub trait Replacer { /// Returns `Some(group_index)` if this replacement is *exactly* a single /// capture reference (`$N` or `${N}`) with no surrounding text. /// - /// Replacement routines use this to skip [`Captures::expand`] entirely - /// — each match's output is just the captured slice, and when the - /// match covers the whole haystack the result is a `Cow::Borrowed` - /// with no output allocation. + /// Replacement routines use this to skip [`Captures::expand`] entirely: + /// each match's output is just the captured slice. fn single_capture_ref(&mut self) -> Option { None } @@ -2654,6 +2656,10 @@ impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> { fn no_expansion(&mut self) -> Option> { self.0.no_expansion() } + + fn single_capture_ref(&mut self) -> Option { + self.0.single_capture_ref() + } } /// A helper type for forcing literal string replacement. 
diff --git a/tests/misc.rs b/tests/misc.rs index c04c9c9fe2..1b3329eee0 100644 --- a/tests/misc.rs +++ b/tests/misc.rs @@ -141,3 +141,21 @@ fn dfa_handles_pathological_case() { }; assert!(re.is_match(&text)); } + +#[test] +fn literal_prefix_capture_requires_exact_plus() { + let re = regex!(r"^a([^/]{2,})/.*$"); + assert!(!re.is_match("ab/x")); +} + +#[test] +fn literal_prefix_capture_bytes_unicode_rejects_invalid_utf8() { + let re = regex::bytes::Regex::new(r"^a([^/]+)/.*$").unwrap(); + assert!(!re.is_match(b"a\xFF/x")); + + let re = regex::bytes::RegexBuilder::new(r"^a([^/]+)/.*$") + .unicode(false) + .build() + .unwrap(); + assert!(re.is_match(b"a\xFF/x")); +} diff --git a/tests/replace.rs b/tests/replace.rs index f26ae46030..b656decf63 100644 --- a/tests/replace.rs +++ b/tests/replace.rs @@ -181,3 +181,41 @@ fn replacen_with_captures() { let re = regex::Regex::new(r"([0-9])").unwrap(); assert_eq!(re.replacen("age: 1234", 2, "${1}Z"), "age: 1Z2Z34"); } + +#[test] +fn replace_single_capture_ref_borrow_contract() { + let re = regex::Regex::new(r"^a([^/]+)/.*$").unwrap(); + let got = re.replace("abc/x", "$1"); + assert_eq!(got, "bc"); + assert!(matches!(got, std::borrow::Cow::Owned(_))); +} + +#[test] +fn replace_literal_prefix_capture_allows_newline_in_capture() { + let re = regex::Regex::new(r"^a([^/]+)/.*$").unwrap(); + assert_eq!(re.replace("a\nb/x", "$1"), "\nb"); +} + +#[test] +fn replace_literal_prefix_capture_respects_prefix_priority() { + let re = regex::Regex::new(r"^(?:a|ab)([^/]+)/.*$").unwrap(); + assert_eq!(re.replace("abc/x", "$1"), "bc"); +} + +#[test] +fn replace_literal_prefix_capture_respects_ungreedy_optional() { + let re = regex::Regex::new(r"^a??([^/]+)/.*$").unwrap(); + assert_eq!(re.replace("abc/x", "$1"), "abc"); +} + +#[test] +fn replace_literal_prefix_capture_backtracks_greedy_optional() { + let re = regex::Regex::new(r"^a?([^/]+)/.*$").unwrap(); + assert_eq!(re.replace("a/x", "$1"), "a"); +} + +#[test] +fn 
replace_literal_prefix_capture_backtracks_after_tail_newline() { + let re = regex::Regex::new(r"^(?:a|ab/b\nc)([^/]+)/.*$").unwrap(); + assert_eq!(re.replace("ab/b\ncd/e", "$1"), "d"); +} From b73bfb8ac41b09ecc0099ae97e378c68059b6ea4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Thu, 28 May 2026 12:27:18 +0200 Subject: [PATCH 3/6] Recover literal prefix capture scan speed --- regex-automata/src/meta/strategy.rs | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs index 95e3315ed5..9b130a2d76 100644 --- a/regex-automata/src/meta/strategy.rs +++ b/regex-automata/src/meta/strategy.rs @@ -1984,16 +1984,25 @@ impl LiteralPrefixCapture { return None; } let bytes = input.haystack(); - for prefix in self.prefixes.iter() { + 'prefix: for prefix in self.prefixes.iter() { if !bytes.starts_with(prefix) { continue; } let cap_start = prefix.len(); - let Some(off) = crate::util::memchr::memchr( - self.terminator, - &bytes[cap_start..], - ) else { - continue; + let mut scan_start = cap_start; + let off = loop { + let Some(next) = crate::util::memchr::memchr2( + self.terminator, + b'\n', + &bytes[scan_start..], + ) else { + continue 'prefix; + }; + let found = scan_start + next; + if bytes[found] == self.terminator { + break found - cap_start; + } + scan_start = found + 1; }; if off == 0 { // `[^X]+` requires >= 1 byte; try the next possible prefix. 
From 87dcb2ca2c68d44c6c62d9365e70bc8ab4b1c3fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Thu, 28 May 2026 12:46:36 +0200 Subject: [PATCH 4/6] Fix literal prefix capture fast path correctness --- regex-automata/src/meta/strategy.rs | 75 +++++++++++++++++------------ tests/misc.rs | 10 ++++ tests/replace.rs | 6 +++ 3 files changed, 61 insertions(+), 30 deletions(-) diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs index 9b130a2d76..66344ab389 100644 --- a/regex-automata/src/meta/strategy.rs +++ b/regex-automata/src/meta/strategy.rs @@ -1980,6 +1980,11 @@ impl LiteralPrefixCapture { /// the regex is `^...$`. #[cfg_attr(feature = "perf-inline", inline(always))] fn try_fast_match(&self, input: &Input<'_>) -> Option<(usize, usize)> { + if let Some(pid) = input.get_anchored().pattern() { + if pid != PatternID::ZERO { + return None; + } + } if input.start() != 0 || input.end() != input.haystack().len() { return None; } @@ -2127,10 +2132,7 @@ fn try_recognize_prefix_capture( if matches!(part.kind(), HirKind::Capture(_)) { break part; } - extend_prefix(&mut prefixes, part)?; - if prefixes.len() > MAX_PREFIX_VARIANTS { - return None; - } + prefixes = concat_prefix_variants(prefixes, prefix_variants(part)?)?; }; let HirKind::Capture(cap) = capture.kind() else { unreachable!() }; @@ -2169,56 +2171,69 @@ fn try_recognize_prefix_capture( Some((prefixes.into_boxed_slice(), terminator)) } -/// Extend the accumulator with one prefix segment. Returns `None` if the -/// segment isn't a finite literal shape (literal / concat / alternation / -/// `?`-optional combination of those). -fn extend_prefix(variants: &mut Vec>, hir: &Hir) -> Option<()> { +/// Return all literal variants for one prefix segment in regex-priority +/// order. Returns `None` if the segment isn't a finite literal shape +/// (literal / concat / alternation / `?`-optional combination of those). 
+fn prefix_variants(hir: &Hir) -> Option>> { match hir.kind() { - HirKind::Literal(Literal(bytes)) => { - for v in variants.iter_mut() { - v.extend_from_slice(bytes); - } - Some(()) - } + HirKind::Literal(Literal(bytes)) => Some(vec![bytes.to_vec()]), HirKind::Concat(parts) => { + let mut variants = vec![Vec::new()]; for part in parts { - extend_prefix(variants, part)?; - if variants.len() > MAX_PREFIX_VARIANTS { - return None; - } + variants = + concat_prefix_variants(variants, prefix_variants(part)?)?; } - Some(()) + Some(variants) } HirKind::Repetition(rep) if rep.min == 0 && rep.max == Some(1) => { - let mut with = variants.clone(); - extend_prefix(&mut with, &rep.sub)?; - if variants.len() + with.len() > MAX_PREFIX_VARIANTS { + let mut variants = prefix_variants(&rep.sub)?; + if variants.len() + 1 > MAX_PREFIX_VARIANTS { return None; } if rep.greedy { - let without = core::mem::replace(variants, with); - variants.extend(without); + variants.push(Vec::new()); } else { - variants.extend(with); + variants.insert(0, Vec::new()); } - Some(()) + Some(variants) } HirKind::Alternation(branches) => { - let base = core::mem::take(variants); + let mut variants = Vec::new(); for branch in branches { - let mut local = base.clone(); - extend_prefix(&mut local, branch)?; + let local = prefix_variants(branch)?; if variants.len() + local.len() > MAX_PREFIX_VARIANTS { return None; } variants.extend(local); } - Some(()) + Some(variants) } _ => None, } } +/// Concatenate two already-prioritized prefix variant lists. For regex +/// concatenation, every suffix priority is exhausted before backtracking to +/// the next prefix priority. +fn concat_prefix_variants( + prefixes: Vec>, + suffixes: Vec>, +) -> Option>> { + if prefixes.len().checked_mul(suffixes.len())? 
> MAX_PREFIX_VARIANTS { + return None; + } + let mut variants = Vec::with_capacity(prefixes.len() * suffixes.len()); + for prefix in prefixes { + for suffix in &suffixes { + let mut variant = Vec::with_capacity(prefix.len() + suffix.len()); + variant.extend_from_slice(&prefix); + variant.extend_from_slice(suffix); + variants.push(variant); + } + } + Some(variants) +} + /// Capture must be a greedy `[^X]+` over a single ASCII byte X. fn capture_terminator_byte( hir: &Hir, diff --git a/tests/misc.rs b/tests/misc.rs index 1b3329eee0..f6f46e5948 100644 --- a/tests/misc.rs +++ b/tests/misc.rs @@ -148,6 +148,16 @@ fn literal_prefix_capture_requires_exact_plus() { assert!(!re.is_match("ab/x")); } +#[test] +fn literal_prefix_capture_respects_invalid_pattern_id() { + let re = regex_automata::meta::Regex::new(r"^a([^/]+)/.*$").unwrap(); + let input = regex_automata::Input::new("abc/x").anchored( + regex_automata::Anchored::Pattern(regex_automata::PatternID::must(1)), + ); + let mut cache = re.create_cache(); + assert_eq!(None, re.search_with(&mut cache, &input)); +} + #[test] fn literal_prefix_capture_bytes_unicode_rejects_invalid_utf8() { let re = regex::bytes::Regex::new(r"^a([^/]+)/.*$").unwrap(); diff --git a/tests/replace.rs b/tests/replace.rs index b656decf63..8faa5ff42e 100644 --- a/tests/replace.rs +++ b/tests/replace.rs @@ -214,6 +214,12 @@ fn replace_literal_prefix_capture_backtracks_greedy_optional() { assert_eq!(re.replace("a/x", "$1"), "a"); } +#[test] +fn replace_literal_prefix_capture_concat_optional_priority() { + let re = regex::Regex::new(r"^a?(?:ab)?([^/]+)/.*$").unwrap(); + assert_eq!(re.replace("abx/y", "$1"), "bx"); +} + #[test] fn replace_literal_prefix_capture_backtracks_after_tail_newline() { let re = regex::Regex::new(r"^(?:a|ab/b\nc)([^/]+)/.*$").unwrap(); From 254c3d51f7800f24f073327e4df07dc58c59adda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Thu, 28 May 2026 13:01:59 +0200 Subject: [PATCH 5/6] Validate UTF-8 in literal 
prefix capture fast path --- regex-automata/src/meta/strategy.rs | 56 ++++++++++++++++++++--------- tests/misc.rs | 6 ++++ 2 files changed, 45 insertions(+), 17 deletions(-) diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs index 66344ab389..b8843f4aa5 100644 --- a/regex-automata/src/meta/strategy.rs +++ b/regex-automata/src/meta/strategy.rs @@ -1940,6 +1940,9 @@ struct LiteralPrefixCapture { /// Single ASCII byte ending the capture (also the literal that must /// follow the capture in the original regex). terminator: u8, + /// Whether capture/tail classes were Unicode classes. When true, the + /// byte fast path must still reject invalid UTF-8 haystacks. + requires_valid_utf8: bool, } /// Each `(?:...)?` doubles the count and each `(a|b|c)` multiplies it, @@ -1967,12 +1970,17 @@ impl LiteralPrefixCapture { return Err(core); } let allow_unicode_classes = core.info.config().get_utf8_empty(); - let Some((prefixes, terminator)) = + let Some((prefixes, terminator, requires_valid_utf8)) = try_recognize_prefix_capture(hirs[0], allow_unicode_classes) else { return Err(core); }; - Ok(LiteralPrefixCapture { core, prefixes, terminator }) + Ok(LiteralPrefixCapture { + core, + prefixes, + terminator, + requires_valid_utf8, + }) } /// Returns capture 1's byte offsets if the input matches, else `None`. 
@@ -2021,6 +2029,10 @@ impl LiteralPrefixCapture { { continue; } + if self.requires_valid_utf8 && core::str::from_utf8(bytes).is_err() + { + return None; + } return Some((cap_start, cap_end)); } None @@ -2114,7 +2126,7 @@ impl Strategy for LiteralPrefixCapture { fn try_recognize_prefix_capture( hir: &Hir, allow_unicode_classes: bool, -) -> Option<(Box<[Box<[u8]>]>, u8)> { +) -> Option<(Box<[Box<[u8]>]>, u8, bool)> { let HirKind::Concat(parts) = hir.kind() else { return None; }; @@ -2139,7 +2151,8 @@ fn try_recognize_prefix_capture( if cap.index != 1 { return None; } - let terminator = capture_terminator_byte(&cap.sub, allow_unicode_classes)?; + let (terminator, capture_requires_utf8) = + capture_terminator_byte(&cap.sub, allow_unicode_classes)?; let HirKind::Literal(Literal(lit)) = iter.next()?.kind() else { return None; @@ -2148,9 +2161,8 @@ fn try_recognize_prefix_capture( return None; } - if !is_dot_star(iter.next()?, allow_unicode_classes) { - return None; - } + let dot_star_requires_utf8 = + dot_star_requires_valid_utf8(iter.next()?, allow_unicode_classes)?; if !matches!(iter.next()?.kind(), HirKind::Look(Look::End)) { return None; @@ -2168,7 +2180,8 @@ fn try_recognize_prefix_capture( let prefixes: Vec> = deduped.into_iter().map(Vec::into_boxed_slice).collect(); - Some((prefixes.into_boxed_slice(), terminator)) + let requires_valid_utf8 = capture_requires_utf8 || dot_star_requires_utf8; + Some((prefixes.into_boxed_slice(), terminator, requires_valid_utf8)) } /// Return all literal variants for one prefix segment in regex-priority @@ -2238,7 +2251,7 @@ fn concat_prefix_variants( fn capture_terminator_byte( hir: &Hir, allow_unicode_classes: bool, -) -> Option { +) -> Option<(u8, bool)> { let HirKind::Repetition(rep) = hir.kind() else { return None; }; @@ -2252,17 +2265,26 @@ fn capture_terminator_byte( } /// `.*` for default-flag regexes: any byte except `\n`, zero or more, greedy. 
-fn is_dot_star(hir: &Hir, allow_unicode_classes: bool) -> bool { +fn dot_star_requires_valid_utf8( + hir: &Hir, + allow_unicode_classes: bool, +) -> Option { let HirKind::Repetition(rep) = hir.kind() else { - return false; + return None; }; if rep.min != 0 || rep.max.is_some() || !rep.greedy { - return false; + return None; } let HirKind::Class(class) = rep.sub.kind() else { - return false; + return None; }; - single_excluded_ascii_byte(class, allow_unicode_classes) == Some(b'\n') + let (excluded, requires_valid_utf8) = + single_excluded_ascii_byte(class, allow_unicode_classes)?; + if excluded == b'\n' { + Some(requires_valid_utf8) + } else { + None + } } /// Returns `Some(b)` iff `class` matches every codepoint or byte except a @@ -2271,7 +2293,7 @@ fn is_dot_star(hir: &Hir, allow_unicode_classes: bool) -> bool { fn single_excluded_ascii_byte( class: &Class, allow_unicode_classes: bool, -) -> Option { +) -> Option<(u8, bool)> { match class { Class::Unicode(uc) => { if !allow_unicode_classes { @@ -2290,7 +2312,7 @@ fn single_excluded_ascii_byte( if gap_start != gap_end || gap_start > 0x7F { return None; } - Some(gap_start as u8) + Some((gap_start as u8, true)) } Class::Bytes(bc) => { let ranges = bc.ranges(); @@ -2306,7 +2328,7 @@ fn single_excluded_ascii_byte( if gap_start != gap_end || gap_start > 0x7F { return None; } - Some(gap_start as u8) + Some((gap_start as u8, false)) } } } diff --git a/tests/misc.rs b/tests/misc.rs index f6f46e5948..46cd71f7b1 100644 --- a/tests/misc.rs +++ b/tests/misc.rs @@ -158,6 +158,12 @@ fn literal_prefix_capture_respects_invalid_pattern_id() { assert_eq!(None, re.search_with(&mut cache, &input)); } +#[test] +fn literal_prefix_capture_meta_unicode_rejects_invalid_utf8() { + let re = regex_automata::meta::Regex::new(r"^a([^/]+)/.*$").unwrap(); + assert!(!re.is_match(regex_automata::Input::new(b"a\xFF/x"))); +} + #[test] fn literal_prefix_capture_bytes_unicode_rejects_invalid_utf8() { let re = 
regex::bytes::Regex::new(r"^a([^/]+)/.*$").unwrap(); From 48018870207a456aa8518a5d0684659f396b68d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Thu, 28 May 2026 13:19:26 +0200 Subject: [PATCH 6/6] Recover UTF-8 string fast path --- regex-automata/src/meta/strategy.rs | 5 +++-- regex-automata/src/util/search.rs | 24 +++++++++++++++++++++ src/regex/string.rs | 33 +++++++++++++++++++---------- src/regexset/string.rs | 7 +++--- 4 files changed, 53 insertions(+), 16 deletions(-) diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs index b8843f4aa5..17a525b13a 100644 --- a/regex-automata/src/meta/strategy.rs +++ b/regex-automata/src/meta/strategy.rs @@ -1997,6 +1997,8 @@ impl LiteralPrefixCapture { return None; } let bytes = input.haystack(); + let must_validate_utf8 = + self.requires_valid_utf8 && !input.haystack_is_known_valid_utf8(); 'prefix: for prefix in self.prefixes.iter() { if !bytes.starts_with(prefix) { continue; @@ -2029,8 +2031,7 @@ impl LiteralPrefixCapture { { continue; } - if self.requires_valid_utf8 && core::str::from_utf8(bytes).is_err() - { + if must_validate_utf8 && core::str::from_utf8(bytes).is_err() { return None; } return Some((cap_start, cap_end)); diff --git a/regex-automata/src/util/search.rs b/regex-automata/src/util/search.rs index 3ece11d155..7cc41a2983 100644 --- a/regex-automata/src/util/search.rs +++ b/regex-automata/src/util/search.rs @@ -104,6 +104,7 @@ pub struct Input<'h> { span: Span, anchored: Anchored, earliest: bool, + haystack_known_valid_utf8: bool, } impl<'h> Input<'h> { @@ -120,6 +121,24 @@ impl<'h> Input<'h> { span: Span { start: 0, end: haystack.len() }, anchored: Anchored::No, earliest: false, + haystack_known_valid_utf8: false, + } + } + + /// Create a new search configuration for the given UTF-8 haystack. + /// + /// This is like [`Input::new`], but records the fact that the haystack is + /// already known to be valid UTF-8. 
This lets regex engines avoid + /// redundant UTF-8 validation when Unicode matching semantics require it. + #[inline] + pub fn new_utf8(haystack: &'h str) -> Input<'h> { + let haystack = haystack.as_bytes(); + Input { + haystack, + span: Span { start: 0, end: haystack.len() }, + anchored: Anchored::No, + earliest: false, + haystack_known_valid_utf8: true, } } @@ -767,6 +786,11 @@ impl<'h> Input<'h> { pub fn is_char_boundary(&self, offset: usize) -> bool { utf8::is_boundary(self.haystack(), offset) } + + #[inline] + pub(crate) fn haystack_is_known_valid_utf8(&self) -> bool { + self.haystack_known_valid_utf8 + } } impl<'h> core::fmt::Debug for Input<'h> { diff --git a/src/regex/string.rs b/src/regex/string.rs index 6b3aa0527e..46f0d30eaf 100644 --- a/src/regex/string.rs +++ b/src/regex/string.rs @@ -262,7 +262,10 @@ impl Regex { /// ``` #[inline] pub fn find_iter<'r, 'h>(&'r self, haystack: &'h str) -> Matches<'r, 'h> { - Matches { haystack, it: self.meta.find_iter(haystack) } + Matches { + haystack, + it: self.meta.find_iter(Input::new_utf8(haystack)), + } } /// This routine searches for the first match of this regex in the haystack @@ -421,7 +424,10 @@ impl Regex { &'r self, haystack: &'h str, ) -> CaptureMatches<'r, 'h> { - CaptureMatches { haystack, it: self.meta.captures_iter(haystack) } + CaptureMatches { + haystack, + it: self.meta.captures_iter(Input::new_utf8(haystack)), + } } /// Returns an iterator of substrings of the haystack given, delimited by a @@ -551,7 +557,7 @@ impl Regex { /// ``` #[inline] pub fn split<'r, 'h>(&'r self, haystack: &'h str) -> Split<'r, 'h> { - Split { haystack, it: self.meta.split(haystack) } + Split { haystack, it: self.meta.split(Input::new_utf8(haystack)) } } /// Returns an iterator of at most `limit` substrings of the haystack @@ -630,7 +636,10 @@ impl Regex { haystack: &'h str, limit: usize, ) -> SplitN<'r, 'h> { - SplitN { haystack, it: self.meta.splitn(haystack, limit) } + SplitN { + haystack, + it: 
self.meta.splitn(Input::new_utf8(haystack), limit), + } } /// Replaces the leftmost-first match in the given haystack with the @@ -1088,8 +1097,9 @@ impl Regex { haystack: &str, start: usize, ) -> Option { - let input = - Input::new(haystack).earliest(true).span(start..haystack.len()); + let input = Input::new_utf8(haystack) + .earliest(true) + .span(start..haystack.len()); self.meta.search_half(&input).map(|hm| hm.offset()) } @@ -1122,8 +1132,9 @@ impl Regex { /// ``` #[inline] pub fn is_match_at(&self, haystack: &str, start: usize) -> bool { - let input = - Input::new(haystack).earliest(true).span(start..haystack.len()); + let input = Input::new_utf8(haystack) + .earliest(true) + .span(start..haystack.len()); self.meta.search_half(&input).is_some() } @@ -1160,7 +1171,7 @@ impl Regex { haystack: &'h str, start: usize, ) -> Option> { - let input = Input::new(haystack).span(start..haystack.len()); + let input = Input::new_utf8(haystack).span(start..haystack.len()); self.meta .search(&input) .map(|m| Match::new(haystack, m.start(), m.end())) @@ -1199,7 +1210,7 @@ impl Regex { haystack: &'h str, start: usize, ) -> Option> { - let input = Input::new(haystack).span(start..haystack.len()); + let input = Input::new_utf8(haystack).span(start..haystack.len()); let mut caps = self.meta.create_captures(); self.meta.search_captures(&input, &mut caps); if caps.is_match() { @@ -1290,7 +1301,7 @@ impl Regex { haystack: &'h str, start: usize, ) -> Option> { - let input = Input::new(haystack).span(start..haystack.len()); + let input = Input::new_utf8(haystack).span(start..haystack.len()); self.meta.search_captures(&input, &mut locs.0); locs.0.get_match().map(|m| Match::new(haystack, m.start(), m.end())) } diff --git a/src/regexset/string.rs b/src/regexset/string.rs index 5126a4661e..c842063839 100644 --- a/src/regexset/string.rs +++ b/src/regexset/string.rs @@ -242,7 +242,8 @@ impl RegexSet { /// ``` #[inline] pub fn is_match_at(&self, haystack: &str, start: usize) -> bool { - 
self.meta.is_match(Input::new(haystack).span(start..haystack.len())) + self.meta + .is_match(Input::new_utf8(haystack).span(start..haystack.len())) } /// Returns the set of regexes that match in the given haystack. @@ -323,7 +324,7 @@ impl RegexSet { /// ``` #[inline] pub fn matches_at(&self, haystack: &str, start: usize) -> SetMatches { - let input = Input::new(haystack).span(start..haystack.len()); + let input = Input::new_utf8(haystack).span(start..haystack.len()); let mut patset = PatternSet::new(self.meta.pattern_len()); self.meta.which_overlapping_matches(&input, &mut patset); SetMatches(patset) @@ -357,7 +358,7 @@ impl RegexSet { // is in regex-automata, not regex. So maybe we should just accept a // 'SetMatches', which is basically just a newtype around PatternSet. let mut patset = PatternSet::new(self.meta.pattern_len()); - let mut input = Input::new(haystack); + let mut input = Input::new_utf8(haystack); input.set_start(start); self.meta.which_overlapping_matches(&input, &mut patset); for pid in patset.iter() {