ct_regex_internal/expr/regex.rs
1use std::fmt::Debug;
2use std::ops::Range;
3
4use super::{CaptureFromRanges, IndexedCaptures};
5use crate::expr::{FindAllCaptures, RangeOfAllMatches, SliceAllMatches};
6use crate::haystack::{
7 HaystackItem, HaystackIter, HaystackOf, HaystackSlice, IntoHaystack, OwnedHaystackable,
8};
9use crate::matcher::Matcher;
10
11/// A trait that is automatically implemented for types produced by the `regex!` macro. Various
12/// function are included that test this pattern against a provided
13/// [`Haystack`](crate::haystack::Haystack).
14///
15/// Most methods will take an [`IntoHaystack`] or [`OwnedHaystackable`] parameter to save the
16/// user from creating their own `Haystack`. This allows values with types like `&str` and
17/// `&mut String` to be passed to these methods.
18///
19/// Altough rarely encountered, this trait's generic parameter, `I` refers to the item that can be
20/// matched individually from the provided `Haystack`. This is used so that the same expression can
21/// be used to match various haystack types, including `&str` (`I = char`) and `&[u8]` (`I = u8`).
22/// Implementations for both of these slice/item pairs will be implemented by the macro.
23///
24/// # Function Coverage
25///
26#[doc = include_str!("coverage.md")]
27///
28/// \* Note that these function runs through the Regex first and then the haystack. This means that
29/// substring involved is the one that matches the Regex first, not necessarily the first match in
30/// the haystack. In many cases, this makes no difference.
31pub trait Regex<I: HaystackItem, const N: usize>: Debug {
32 /// This type is a macro generated combination of ZSTs responsible for doing all of the heavy
33 /// lifting involved in actually matching or capturing against a `Haystack`. For realistic
34 /// expressions, this type will be very long an unpleasant to type, hence implementing it as an
35 /// associated type.
36 type Pattern: Matcher<I>;
37
38 /// A macro generated type holding all `N` capture groups in this expression, producing ranges
39 /// or slices of the haystack with aliases for named groups. The generated type also understands
40 /// which groups will always exist in a match and which are optional.
41 type Capture<'a, S: HaystackSlice<'a>>: CaptureFromRanges<'a, S, N> where I: 'a;
42
43 /// Returns `true` if this Regex matches the **entire** haystack provided. This should probably
44 /// be the default _matching_ function to use.
45 ///
46 /// A similar behavior can be achieved by using start and end anchors in an expression and then
47 /// calling [`contains_match`](Self::contains_match). This function should be prefered however,
48 /// because it fails fast if the first character doesn't match.
49 ///
50 /// To check if this Regex matches and perform capturing, use [`do_capture`](Self::do_capture)
51 /// instead.
52 fn is_match<'a, H: HaystackOf<'a, I>>(hay: impl IntoHaystack<'a, H>) -> bool {
53 let mut hay = hay.into_haystack();
54
55 Self::Pattern::all_matches(&mut hay)
56 .any(|state| hay.rollback(state).is_end())
57 }
58
59 /// Returns `true` if this Regex matches any substring of the haystack provided. To retrieve the
60 /// actual substring itself, use [`slice_match`](Self::slice_match) or
61 /// [`find_capture`](Self::find_capture).
62 ///
63 /// Anchors can be used as a part of this Regex to perform more complex behaviors, but if you're
64 /// just wrapping an expression with `^` and `$`, see [`is_match`](Self::is_match) instead.
65 fn contains_match<'a, H: HaystackOf<'a, I>>(hay: impl IntoHaystack<'a, H>) -> bool {
66 let mut hay = hay.into_haystack();
67
68 while hay.item().is_some() {
69 let start = hay.index();
70
71 if Self::Pattern::all_matches(&mut hay).next().is_some() {
72 return true;
73 }
74
75 hay.rollback(start).progress();
76 }
77 false
78 }
79
80 /// Returns the number of matches present in the haystack provided, optionally including
81 /// `overlapping` matches.
82 ///
83 /// If using this in conjunction with the actual matches themselves, you might be better of
84 /// collecting the other output and checking the length.
85 fn count_matches<'a, H: HaystackOf<'a, I>>(
86 hay: impl IntoHaystack<'a, H>,
87 overlapping: bool,
88 ) -> usize {
89 let mut hay = hay.into_haystack();
90 let mut count = 0;
91
92 while hay.item().is_some() {
93 let start = hay.index();
94
95 if let Some(state_fork) = Self::Pattern::all_matches(&mut hay).next() {
96 count += 1;
97
98 if overlapping {
99 hay.rollback(start).progress();
100 } else {
101 hay.rollback(state_fork);
102
103 debug_assert_ne!(start, state_fork)
104 }
105 } else {
106 hay.rollback(start).progress();
107 }
108 }
109
110 count
111 }
112
113 /// Returns the range that matches this Regex first. This is the range variant of
114 /// [`contains_match`](Self::contains_match). For the actual substring itself, see
115 /// [`slice_match`](Self::slice_match).
116 ///
117 /// Note that there is no range equivalent of [`is_match`](Self::is_match), because any match
118 /// has to be the entire haystack.
119 fn range_of_match<'a, H: HaystackOf<'a, I>>(
120 hay: impl IntoHaystack<'a, H>,
121 ) -> Option<Range<usize>> {
122 let mut hay = hay.into_haystack();
123 range_of_match::<Self, _, _>(&mut hay)
124 }
125
126 /// Returns an iterator over the ranges of all substrings in the provided haystack that match
127 /// this Regex, optionally `overlapping`. For the actual substrins themself, see
128 /// [`slice_all_matches`](Self::slice_all_matches).
129 ///
130 /// Note that each match is still made greedily. Even with `overlapping = true`, if two possible
131 /// matches start at the same index in the haystack, only the first to match the regex will be
132 /// included.
133 fn range_of_all_matches<'a, H: HaystackOf<'a, I>>(
134 hay: impl IntoHaystack<'a, H>,
135 overlapping: bool,
136 ) -> RangeOfAllMatches<'a, I, H, Self::Pattern> {
137 RangeOfAllMatches::new(hay.into_haystack(), overlapping)
138 }
139
140 /// Returns the slice that matches this Regex first. This is the slicing variant of
141 /// [`range_of_match`](Self::range_of_match).
142 ///
143 /// Note that there is no slicing equivalent of [`is_match`](Self::is_match), because any match
144 /// has to be the entire haystack.
145 fn slice_match<'a, H: HaystackOf<'a, I>>(hay: impl IntoHaystack<'a, H>) -> Option<H::Slice> {
146 let mut hay = hay.into_haystack();
147 let range = range_of_match::<Self, _, _>(&mut hay)?;
148 Some(hay.slice_with(range))
149 }
150
151 /// Returns an iterator over all slices of the provided haystack that match this Regex,
152 /// optionally `overlapping`.
153 ///
154 /// Note that each match is still made greedily. Even with `overlapping = true`, if two possible
155 /// matches start at the same index in the haystack, only the first to match the regex will be
156 /// included.
157 ///
158 /// The returned iterator doesn't implement [`ExactSizeIterator`] because it is lazy, if you
159 /// need to know how many matches are included, [`collect`](Iterator::collect) it first.
160 fn slice_all_matches<'a, H: HaystackOf<'a, I>>(
161 hay: impl IntoHaystack<'a, H>,
162 overlapping: bool,
163 ) -> SliceAllMatches<'a, I, H, Self::Pattern> {
164 SliceAllMatches {
165 inner: RangeOfAllMatches::new(hay.into_haystack(), overlapping),
166 }
167 }
168
169 /// Returns a [`Self::Capture`] representing the provided haystack matched against this Regex.
170 /// This includes any named or numbered capturing groups in the expression. As with
171 /// [`is_match`](Self::is_match), this function acts on the entire haystack, and needs to match
172 /// every character from start to end.
173 ///
174 /// Provides the same result as [`find_capture`](Self::find_capture) with start and end anchors,
175 /// although without needing to check any non-starting substring.
176 fn do_capture<'a, H: HaystackOf<'a, I>>(
177 hay: impl IntoHaystack<'a, H>,
178 ) -> Option<Self::Capture<'a, H::Slice>> {
179 let mut hay = hay.into_haystack();
180 let mut caps = IndexedCaptures::default();
181 let start = hay.index();
182
183 let all_captures = Self::Pattern::all_captures(&mut hay, &mut caps);
184
185 for (state_fork, mut caps_fork) in all_captures {
186 if hay.rollback(state_fork).is_end() {
187 caps_fork.push(0, start..state_fork);
188
189 return Some(
190 Self::Capture::from_ranges(caps_fork.into_array(), hay.whole_slice())
191 .expect("failed to convert captures despite matching correctly")
192 );
193 }
194 }
195 None
196 }
197
198 /// Returns the [`Self::Capture`] that matches this Regex first, similar to
199 /// [`slice_match`](Self::slice_match) but with any named or numbered groups included.
200 ///
201 /// Anchors should be used for complex behavior, beyond unconditional start and end matches. See
202 /// [`do_capture`](Self::do_capture) instead to capture a full haystack.
203 fn find_capture<'a, H: HaystackOf<'a, I>>(
204 hay: impl IntoHaystack<'a, H>,
205 ) -> Option<Self::Capture<'a, H::Slice>> {
206 let mut hay = hay.into_haystack();
207
208 while hay.item().is_some() {
209 let start = hay.index();
210
211 let mut caps = IndexedCaptures::default();
212
213 let first = Self::Pattern::all_captures(&mut hay, &mut caps).next();
214
215 hay.rollback(start);
216
217 if let Some((state_fork, mut caps_fork)) = first {
218 caps_fork.push(0, start..state_fork);
219
220 return Some(
221 Self::Capture::from_ranges(caps_fork.into_array(), hay.inner_slice())
222 .expect("failed to convert captures despite matching correctly")
223 );
224 }
225 hay.progress()
226 }
227 None
228 }
229
230 /// Returns an iterator over [`Self::Capture`]s representing every full match of this Regex in
231 /// the provided haystack, similar to [`slice_all_matches`](Self::slice_all_matches). This can
232 /// optionally include `overlapping` matches.
233 ///
234 /// Note that each match is still made greedily. Even with `overlapping = true`, if two possible
235 /// matches start at the same index in the haystack, only the first to match the regex will be
236 /// included.
237 fn find_all_captures<'a, H: HaystackOf<'a, I>>(
238 hay: impl IntoHaystack<'a, H>,
239 overlapping: bool,
240 ) -> FindAllCaptures<'a, Self, I, H, N> {
241 FindAllCaptures::new(hay.into_haystack(), overlapping)
242 }
243
244 /// Replaces the first match of this Regex in the provided haystack with the provided slice. The
245 /// slice type required is the one associated with the provided haystack. The return value is a
246 /// boolean indicating whether a match was found and replaced.
247 fn replace<'a, M: OwnedHaystackable<I>>(
248 hay_mut: &mut M,
249 with: <M::Hay<'a> as HaystackIter<'a>>::Slice
250 ) -> bool {
251 let Some(range) = ({
252 let mut hay = hay_mut.as_haystack();
253 range_of_match::<Self, _, _>(&mut hay)
254 }) else {
255 return false;
256 };
257 hay_mut.replace_range(range, with);
258 true
259 }
260
261 /// Replaces every matching substring in the provided haystack with a copy of the provided
262 /// slice. The slice type required is the one associated with the provided haystack. The return
263 /// value is an integer representing the number of matches and replacements that occured.
264 fn replace_all<'a, M: OwnedHaystackable<I>>(
265 hay_mut: &mut M,
266 with: <M::Hay<'a> as HaystackIter<'a>>::Slice
267 ) -> usize {
268 // Avoids redirecting to replace_all_using to avoid unnessecary clones.
269 let ranges = RangeOfAllMatches::<I, M::Hay<'_>, Self::Pattern>::new(
270 hay_mut.as_haystack(),
271 false
272 ).collect::<Vec<_>>();
273
274 let count = ranges.len();
275 let mut delta = Delta::new();
276
277 for mut range in ranges {
278 delta.apply_to(&mut range);
279
280 let initial_len = hay_mut.len();
281 hay_mut.replace_range(range, with.clone());
282 delta.add_diff(hay_mut.len(), initial_len);
283 }
284
285 count
286 }
287
288 /// Replaces every matching substring in the provided haystack with the return value of the
289 /// provided function. The return type of this functions needs to match the provided haystack.
290 /// The return value is an integer representing the number of matches and replacements that
291 /// occured.
292 ///
293 /// Because of the use of [`FnMut`] for the parameter, this can be used to replace all matches
294 /// using an iterator by passing in `|| iter.next().unwrap_or_default()`.
295 fn replace_all_using<M: OwnedHaystackable<I>>(
296 hay_mut: &mut M,
297 mut using: impl FnMut() -> M,
298 ) -> usize {
299 let ranges = RangeOfAllMatches::<I, M::Hay<'_>, Self::Pattern>::new(
300 hay_mut.as_haystack(),
301 false
302 ).collect::<Vec<_>>();
303
304 let count = ranges.len();
305 let mut delta = Delta::new();
306
307 for mut range in ranges {
308 delta.apply_to(&mut range);
309
310 let initial_len = hay_mut.len();
311 hay_mut.replace_range(range, using().as_slice());
312 delta.add_diff(hay_mut.len(), initial_len);
313 }
314
315 count
316 }
317
318 // The closure returns M because it can't continue to reference the source, given that we need
319 // to overwrite it.
320
321 /// Replaces the first captured substring in the provided haystack with a computed value. The
322 /// return value is a boolean indicating whether a match was found and replaced.
323 ///
324 /// The provided function is used to create a replacement value when given the capture. The
325 /// replacement value shares a type with the provided haystack. Its simplified signature would
326 /// be `F: FnOnce(Self::Capture<'_, <M::Hay>::Slice>) -> M`. Because of limitations with higher
327 /// ranked trait bounds surrounding closure, it may be necessary to implement this as function
328 /// with lifetime annotations like so:
329 /// ```ignore
330 /// regex!(PhoneNum = r"(0|(?<country_code>\+[0-9]+))(?<number>[0-9]{9})");
331 ///
332 /// fn remove_country_code<'a>(value: PhoneNumCapture<'a, &'a str>) -> String {
333 /// format!("0{}", value.number())
334 /// }
335 ///
336 /// fn main() {
337 /// let mut hay = String::from("+1234567890");
338 /// PhoneNum::replace_captured(hay, remove_country_code);
339 /// assert_eq!(hay, "0234567890");
340 /// }
341 /// ```
342 fn replace_captured<M, F>(hay_mut: &mut M, replacer: F) -> bool
343 where
344 M: OwnedHaystackable<I>,
345 F: for<'a> FnOnce(Self::Capture<'a, <M::Hay<'a> as HaystackIter<'a>>::Slice>) -> M,
346 {
347 let (range, replacement) = {
348 let Some(caps) = Self::find_capture(hay_mut.as_haystack()) else {
349 return false;
350 };
351 let first = caps.whole_match_range().clone();
352
353 (first, replacer(caps))
354 };
355 hay_mut.replace_range(range, replacement.as_slice());
356 true
357 }
358
359 /// Replaces all captured substring in the provided haystack with a computed value. The return
360 /// value is an integer indicating the number of matches found and replaced.
361 ///
362 /// The provided function is used to create a replacement value when given a capture. The
363 /// replacement value shares a type with the provided haystack. Its simplified signature would
364 /// be `F: FnMut(Self::Capture<'_, <M::Hay>::Slice>) -> M`. Because of limitations with higher
365 /// ranked trait bounds surrounding closure, it may be necessary to implement this as function
366 /// with lifetime annotations as mentioned in the documentation for
367 /// [`replace_captured`](Self::replace_captured).
368 fn replace_all_captured<M, F>(hay_mut: &mut M, mut replacer: F) -> usize
369 where
370 M: OwnedHaystackable<I>,
371 F: for<'a> FnMut(Self::Capture<'a, <M::Hay<'a> as HaystackIter<'a>>::Slice>) -> M,
372 {
373 let replacements: Vec<_> = {
374 let caps = Self::find_all_captures(hay_mut.as_haystack(), false);
375 caps.into_iter()
376 .map(|c| (c.whole_match_range().clone(), replacer(c)))
377 .collect()
378 };
379
380 let count = replacements.len();
381 let mut delta = Delta::new();
382
383 for (mut range, replacement) in replacements {
384 delta.apply_to(&mut range);
385
386 let initial_len = hay_mut.len();
387 hay_mut.replace_range(range, replacement.as_slice());
388 delta.add_diff(hay_mut.len(), initial_len);
389 }
390
391 count
392 }
393}
394
395fn range_of_match<'a, R: Regex<I, N> + ?Sized, I: HaystackItem, const N: usize>(
396 hay: &mut impl HaystackOf<'a, I>,
397) -> Option<Range<usize>> {
398 while hay.item().is_some() {
399 let start = hay.index();
400
401 if let Some(state_fork) = R::Pattern::all_matches(hay).next() {
402 return Some(start..state_fork);
403 }
404
405 hay.rollback(start).progress()
406 }
407 None
408}
409
410#[derive(Debug, Clone)]
411struct Delta(isize);
412
413impl Delta {
414 fn new() -> Delta {
415 Delta(0)
416 }
417
418 fn add_diff(&mut self, from: usize, to: usize) {
419 self.0 = self.0.strict_add(
420 from.checked_signed_diff(to)
421 .expect("difference between usizes doesn't fit in an isize")
422 )
423 }
424
425 fn apply_to(&self, range: &mut Range<usize>) {
426 range.start = range.start.strict_add_signed(self.0);
427 range.end = range.end.strict_add_signed(self.0);
428 }
429}