1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171
|
use std::error::Error;
use regex_automata::{
hybrid::dfa::{OverlappingState, DFA},
nfa::thompson,
HalfMatch, Input, MatchError,
};
// Tests that too many cache resets cause the lazy DFA to quit.
//
// We only test this on 64-bit because the test is gingerly crafted based on
// implementation details of cache sizes. It's not a great test because of
// that, but it does check some interesting properties around how positions are
// reported when a search "gives up."
//
// NOTE: If you change something in lazy DFA implementation that causes this
// test to fail by reporting different "gave up" positions, then it's generally
// okay to update the positions in the test below as long as you're sure your
// changes are correct. Namely, it is expected that if there are changes in the
// cache size (or changes in how big things are inside the cache), then its
// utilization may change slightly and thus impact where a search gives up.
// Precisely where a search gives up is not an API guarantee, so changing the
// offsets here is OK.
#[test]
#[cfg(target_pointer_width = "64")]
#[cfg(not(miri))]
fn too_many_cache_resets_cause_quit() -> Result<(), Box<dyn Error>> {
// This is a carefully chosen regex. The idea is to pick one that requires
// some decent number of states (hence the bounded repetition). But we
// specifically choose to create a class with an ASCII letter and a
// non-ASCII letter so that we can check that no new states are created
// once the cache is full. Namely, if we fill up the cache on a haystack
// of 'a's, then in order to match one 'β', a new state will need to be
// created since a 'β' is encoded with multiple bytes.
//
// So we proceed by "filling" up the cache by searching a haystack of just
// 'a's. The cache won't have enough room to add enough states to find the
// match (because of the bounded repetition), which should result in it
// giving up before it finds a match.
//
// Since there's now no more room to create states, we search a haystack
// of 'β' and confirm that it gives up immediately.
let pattern = r"[aβ]{99}";
let dfa = DFA::builder()
.configure(
// Configure it so that we have the minimum cache capacity
// possible. And that if any resets occur, the search quits.
DFA::config()
.skip_cache_capacity_check(true)
.cache_capacity(0)
.minimum_cache_clear_count(Some(0)),
)
.thompson(thompson::NFA::config())
.build(pattern)?;
let mut cache = dfa.create_cache();
let haystack = "a".repeat(101).into_bytes();
let err = MatchError::gave_up(24);
// Notice that we make the same amount of progress in each search! That's
// because the cache is reused and already has states to handle the first
// N bytes.
assert_eq!(
Err(err.clone()),
dfa.try_search_fwd(&mut cache, &Input::new(&haystack))
);
assert_eq!(
Err(err.clone()),
dfa.try_search_overlapping_fwd(
&mut cache,
&Input::new(&haystack),
&mut OverlappingState::start()
),
);
let haystack = "β".repeat(101).into_bytes();
let err = MatchError::gave_up(2);
assert_eq!(
Err(err),
dfa.try_search_fwd(&mut cache, &Input::new(&haystack))
);
// no need to test that other find routines quit, since we did that above
// OK, if we reset the cache, then we should be able to create more states
// and make more progress with searching for betas.
cache.reset(&dfa);
let err = MatchError::gave_up(26);
assert_eq!(
Err(err),
dfa.try_search_fwd(&mut cache, &Input::new(&haystack))
);
// ... switching back to ASCII still makes progress since it just needs to
// set transitions on existing states!
let haystack = "a".repeat(101).into_bytes();
let err = MatchError::gave_up(13);
assert_eq!(
Err(err),
dfa.try_search_fwd(&mut cache, &Input::new(&haystack))
);
Ok(())
}
// Tests that quit bytes in the forward direction work correctly.
#[test]
fn quit_fwd() -> Result<(), Box<dyn Error>> {
let dfa = DFA::builder()
.configure(DFA::config().quit(b'x', true))
.build("[[:word:]]+$")?;
let mut cache = dfa.create_cache();
assert_eq!(
dfa.try_search_fwd(&mut cache, &Input::new("abcxyz")),
Err(MatchError::quit(b'x', 3)),
);
assert_eq!(
dfa.try_search_overlapping_fwd(
&mut cache,
&Input::new(b"abcxyz"),
&mut OverlappingState::start()
),
Err(MatchError::quit(b'x', 3)),
);
Ok(())
}
// Tests that quit bytes in the reverse direction work correctly.
#[test]
fn quit_rev() -> Result<(), Box<dyn Error>> {
let dfa = DFA::builder()
.configure(DFA::config().quit(b'x', true))
.thompson(thompson::Config::new().reverse(true))
.build("^[[:word:]]+")?;
let mut cache = dfa.create_cache();
assert_eq!(
dfa.try_search_rev(&mut cache, &Input::new("abcxyz")),
Err(MatchError::quit(b'x', 3)),
);
Ok(())
}
// Tests that if we heuristically enable Unicode word boundaries but then
// instruct that a non-ASCII byte should NOT be a quit byte, then the builder
// will panic.
#[test]
#[should_panic]
fn quit_panics() {
DFA::config().unicode_word_boundary(true).quit(b'\xFF', false);
}
// This tests an intesting case where even if the Unicode word boundary option
// is disabled, setting all non-ASCII bytes to be quit bytes will cause Unicode
// word boundaries to be enabled.
#[test]
fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> {
let mut config = DFA::config();
for b in 0x80..=0xFF {
config = config.quit(b, true);
}
let dfa = DFA::builder().configure(config).build(r"\b")?;
let mut cache = dfa.create_cache();
let expected = HalfMatch::must(0, 1);
assert_eq!(
Ok(Some(expected)),
dfa.try_search_fwd(&mut cache, &Input::new(" a")),
);
Ok(())
}
|