Skip to content

Commit

Permalink
automata: add internal HalfMatch APIs for NFA engines
Browse files Browse the repository at this point in the history
Welp, okay, turns out we do need to know at least the end offset of a
match even when the NFA has no capture states. This is necessary for
correctly handling the case where a regex can match the empty string but
the caller has asked that matches not split a codepoint. If we don't
know the end offset of a match, then we can't correctly determine
whether a match exists or not and are forced to return no match even
when a match exists. I think we can get away with that limitation for
`find`-style APIs, where the caller has specifically requested match offsets
while simultaneously configuring the NFA to not track offsets, but
`is_match`-style APIs really should be able to handle it correctly.

We should eventually just expose the `HalfMatch` APIs on `PikeVM` and
`BoundedBacktracker`, but for now we keep them private.
  • Loading branch information
BurntSushi committed Aug 5, 2023
1 parent e003cae commit d93ddbe
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 70 deletions.
59 changes: 23 additions & 36 deletions regex-automata/src/nfa/thompson/backtrack.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ use crate::{
empty, iter,
prefilter::Prefilter,
primitives::{NonMaxUsize, PatternID, SmallIndex, StateID},
search::{Anchored, Input, Match, MatchError, Span},
search::{Anchored, HalfMatch, Input, Match, MatchError, Span},
},
};

Expand Down Expand Up @@ -1295,27 +1295,29 @@ impl BoundedBacktracker {
) -> Result<Option<PatternID>, MatchError> {
let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
if !utf8empty {
return self.try_search_slots_imp(cache, input, slots);
let maybe_hm = self.try_search_slots_imp(cache, input, slots)?;
return Ok(maybe_hm.map(|hm| hm.pattern()));
}
// See PikeVM::try_search_slots for why we do this.
let min = self.get_nfa().group_info().implicit_slot_len();
if slots.len() >= min {
return self.try_search_slots_imp(cache, input, slots);
let maybe_hm = self.try_search_slots_imp(cache, input, slots)?;
return Ok(maybe_hm.map(|hm| hm.pattern()));
}
if self.get_nfa().pattern_len() == 1 {
let mut enough = [None, None];
let got = self.try_search_slots_imp(cache, input, &mut enough)?;
// This is OK because we know `enough` is strictly bigger than
// `slots`, otherwise this special case isn't reached.
slots.copy_from_slice(&enough[..slots.len()]);
return Ok(got);
return Ok(got.map(|hm| hm.pattern()));
}
let mut enough = vec![None; min];
let got = self.try_search_slots_imp(cache, input, &mut enough)?;
// This is OK because we know `enough` is strictly bigger than `slots`,
// otherwise this special case isn't reached.
slots.copy_from_slice(&enough[..slots.len()]);
Ok(got)
Ok(got.map(|hm| hm.pattern()))
}

/// This is the actual implementation of `try_search_slots_imp` that
Expand All @@ -1328,30 +1330,17 @@ impl BoundedBacktracker {
cache: &mut Cache,
input: &Input<'_>,
slots: &mut [Option<NonMaxUsize>],
) -> Result<Option<PatternID>, MatchError> {
) -> Result<Option<HalfMatch>, MatchError> {
let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
let (pid, end) = match self.search_imp(cache, input, slots)? {
let hm = match self.search_imp(cache, input, slots)? {
None => return Ok(None),
Some(pid) if !utf8empty => return Ok(Some(pid)),
Some(pid) => {
let slot_start = pid.as_usize() * 2;
let slot_end = slot_start + 1;
// OK because we know we have a match and we know our caller
// provided slots are big enough (which we make true above if
// the caller didn't). Namely, we're only here when 'utf8empty'
// is true, and when that's true, we require slots for every
// pattern.
(pid, slots[slot_end].unwrap().get())
}
Some(hm) if !utf8empty => return Ok(Some(hm)),
Some(hm) => hm,
};
empty::skip_splits_fwd(input, pid, end, |input| {
let pid = match self.search_imp(cache, input, slots)? {
None => return Ok(None),
Some(pid) => pid,
};
let slot_start = pid.as_usize() * 2;
let slot_end = slot_start + 1;
Ok(Some((pid, slots[slot_end].unwrap().get())))
empty::skip_splits_fwd(input, hm, hm.offset(), |input| {
Ok(self
.search_imp(cache, input, slots)?
.map(|hm| (hm, hm.offset())))
})
}

Expand All @@ -1367,7 +1356,7 @@ impl BoundedBacktracker {
cache: &mut Cache,
input: &Input<'_>,
slots: &mut [Option<NonMaxUsize>],
) -> Result<Option<PatternID>, MatchError> {
) -> Result<Option<HalfMatch>, MatchError> {
// Unlike in the PikeVM, we write our capturing group spans directly
// into the caller's captures groups. So we have to make sure we're
// starting with a blank slate first. In the PikeVM, we avoid this
Expand Down Expand Up @@ -1414,10 +1403,9 @@ impl BoundedBacktracker {
Some(ref span) => at = span.start,
}
}
if let Some(pid) =
self.backtrack(cache, input, at, start_id, slots)
if let Some(hm) = self.backtrack(cache, input, at, start_id, slots)
{
return Ok(Some(pid));
return Ok(Some(hm));
}
at += 1;
}
Expand All @@ -1438,14 +1426,13 @@ impl BoundedBacktracker {
at: usize,
start_id: StateID,
slots: &mut [Option<NonMaxUsize>],
) -> Option<PatternID> {
) -> Option<HalfMatch> {
cache.stack.push(Frame::Step { sid: start_id, at });
while let Some(frame) = cache.stack.pop() {
match frame {
Frame::Step { sid, at } => {
if let Some(pid) = self.step(cache, input, sid, at, slots)
{
return Some(pid);
if let Some(hm) = self.step(cache, input, sid, at, slots) {
return Some(hm);
}
}
Frame::RestoreCapture { slot, offset } => {
Expand Down Expand Up @@ -1475,7 +1462,7 @@ impl BoundedBacktracker {
mut sid: StateID,
mut at: usize,
slots: &mut [Option<NonMaxUsize>],
) -> Option<PatternID> {
) -> Option<HalfMatch> {
loop {
if !cache.visited.insert(sid, at - input.start()) {
return None;
Expand Down Expand Up @@ -1558,7 +1545,7 @@ impl BoundedBacktracker {
}
State::Fail => return None,
State::Match { pattern_id } => {
return Some(pattern_id);
return Some(HalfMatch::new(pattern_id, at));
}
}
}
Expand Down
60 changes: 26 additions & 34 deletions regex-automata/src/nfa/thompson/pikevm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@ use crate::{
empty, iter,
prefilter::Prefilter,
primitives::{NonMaxUsize, PatternID, SmallIndex, StateID},
search::{Anchored, Input, Match, MatchKind, PatternSet, Span},
search::{
Anchored, HalfMatch, Input, Match, MatchKind, PatternSet, Span,
},
sparse_set::SparseSet,
},
};
Expand Down Expand Up @@ -1094,7 +1096,8 @@ impl PikeVM {
) -> Option<PatternID> {
let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
if !utf8empty {
return self.search_slots_imp(cache, input, slots);
let hm = self.search_slots_imp(cache, input, slots)?;
return Some(hm.pattern());
}
// There is an unfortunate special case where if the regex can
// match the empty string and UTF-8 mode is enabled, the search
Expand All @@ -1109,22 +1112,23 @@ impl PikeVM {
// this case.
let min = self.get_nfa().group_info().implicit_slot_len();
if slots.len() >= min {
return self.search_slots_imp(cache, input, slots);
let hm = self.search_slots_imp(cache, input, slots)?;
return Some(hm.pattern());
}
if self.get_nfa().pattern_len() == 1 {
let mut enough = [None, None];
let got = self.search_slots_imp(cache, input, &mut enough);
// This is OK because we know `enough` is strictly bigger than
// `slots`, otherwise this special case isn't reached.
slots.copy_from_slice(&enough[..slots.len()]);
return got;
return got.map(|hm| hm.pattern());
}
let mut enough = vec![None; min];
let got = self.search_slots_imp(cache, input, &mut enough);
// This is OK because we know `enough` is strictly bigger than `slots`,
// otherwise this special case isn't reached.
slots.copy_from_slice(&enough[..slots.len()]);
got
got.map(|hm| hm.pattern())
}

/// This is the actual implementation of `search_slots_imp` that
Expand All @@ -1137,30 +1141,17 @@ impl PikeVM {
cache: &mut Cache,
input: &Input<'_>,
slots: &mut [Option<NonMaxUsize>],
) -> Option<PatternID> {
) -> Option<HalfMatch> {
let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
let (pid, end) = match self.search_imp(cache, input, slots) {
let hm = match self.search_imp(cache, input, slots) {
None => return None,
Some(pid) if !utf8empty => return Some(pid),
Some(pid) => {
let slot_start = pid.as_usize() * 2;
let slot_end = slot_start + 1;
// OK because we know we have a match and we know our caller
// provided slots are big enough (which we make true above if
// the caller didn't). Namely, we're only here when 'utf8empty'
// is true, and when that's true, we require slots for every
// pattern.
(pid, slots[slot_end].unwrap().get())
}
Some(hm) if !utf8empty => return Some(hm),
Some(hm) => hm,
};
empty::skip_splits_fwd(input, pid, end, |input| {
let pid = match self.search_imp(cache, input, slots) {
None => return Ok(None),
Some(pid) => pid,
};
let slot_start = pid.as_usize() * 2;
let slot_end = slot_start + 1;
Ok(Some((pid, slots[slot_end].unwrap().get())))
empty::skip_splits_fwd(input, hm, hm.offset(), |input| {
Ok(self
.search_imp(cache, input, slots)
.map(|hm| (hm, hm.offset())))
})
// OK because the PikeVM never errors.
.unwrap()
Expand Down Expand Up @@ -1235,7 +1226,7 @@ impl PikeVM {
cache: &mut Cache,
input: &Input<'_>,
slots: &mut [Option<NonMaxUsize>],
) -> Option<PatternID> {
) -> Option<HalfMatch> {
cache.setup_search(slots.len());
if input.is_done() {
return None;
Expand Down Expand Up @@ -1264,7 +1255,7 @@ impl PikeVM {
let pre =
if anchored { None } else { self.get_config().get_prefilter() };
let Cache { ref mut stack, ref mut curr, ref mut next } = cache;
let mut pid = None;
let mut hm = None;
// Yes, our search doesn't end at input.end(), but includes it. This
// is necessary because matches are delayed by one byte, just like
// how the DFA engines work. The delay is used to handle look-behind
Expand All @@ -1283,7 +1274,7 @@ impl PikeVM {
if curr.set.is_empty() {
// We have a match and we haven't been instructed to continue
// on even after finding a match, so we can quit.
if pid.is_some() && !allmatches {
if hm.is_some() && !allmatches {
break;
}
// If we're running an anchored search and we've advanced
Expand Down Expand Up @@ -1353,7 +1344,7 @@ impl PikeVM {
// search. If we re-computed it at every position, we would be
// simulating an unanchored search when we were tasked to perform
// an anchored search.
if (!pid.is_some() || allmatches)
if (!hm.is_some() || allmatches)
&& (!anchored || at == input.start())
{
// Since we are adding to the 'curr' active states and since
Expand All @@ -1372,22 +1363,23 @@ impl PikeVM {
let slots = next.slot_table.all_absent();
self.epsilon_closure(stack, slots, curr, input, at, start_id);
}
if let Some(x) = self.nexts(stack, curr, next, input, at, slots) {
pid = Some(x);
if let Some(pid) = self.nexts(stack, curr, next, input, at, slots)
{
hm = Some(HalfMatch::new(pid, at));
}
// Unless the caller asked us to return early, we need to mush on
// to see if we can extend our match. (But note that 'nexts' will
// quit right after seeing a match when match_kind==LeftmostFirst,
// as is consistent with leftmost-first match priority.)
if input.get_earliest() && pid.is_some() {
if input.get_earliest() && hm.is_some() {
break;
}
core::mem::swap(curr, next);
next.set.clear();
at += 1;
}
instrument!(|c| c.eprint(&self.nfa));
pid
hm
}

/// The implementation for the 'which_overlapping_matches' API. Basically,
Expand Down

0 comments on commit d93ddbe

Please sign in to comment.