1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195
|
use std::error::Error;
use regex_automata::{
hybrid::{
dfa::{self, DFA},
regex::Regex,
OverlappingState,
},
nfa::thompson,
HalfMatch, MatchError, MatchKind, MultiMatch,
};
use crate::util::{BunkPrefilter, SubstringPrefilter};
// Tests that too many cache resets cause the lazy DFA to quit.
//
// We only test this on 64-bit because the test is gingerly crafted based on
// implementation details of cache sizes. It's not a great test because of
// that, but it does check some interesting properties around how positions are
// reported when a search "gives up."
#[test]
#[cfg(target_pointer_width = "64")]
fn too_many_cache_resets_cause_quit() -> Result<(), Box<dyn Error>> {
// This is a carefully chosen regex. The idea is to pick one that requires
// some decent number of states (hence the bounded repetition). But we
// specifically choose to create a class with an ASCII letter and a
// non-ASCII letter so that we can check that no new states are created
// once the cache is full. Namely, if we fill up the cache on a haystack
// of 'a's, then in order to match one 'β', a new state will need to be
// created since a 'β' is encoded with multiple bytes. Since there's no
// room for this state, the search should quit at the very first position.
let pattern = r"[aβ]{100}";
let dfa = DFA::builder()
.configure(
// Configure it so that we have the minimum cache capacity
// possible. And that if any resets occur, the search quits.
DFA::config()
.skip_cache_capacity_check(true)
.cache_capacity(0)
.minimum_cache_clear_count(Some(0)),
)
.build(pattern)?;
let mut cache = dfa.create_cache();
let haystack = "a".repeat(101).into_bytes();
let err = MatchError::GaveUp { offset: 25 };
assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err.clone()));
assert_eq!(dfa.find_leftmost_fwd(&mut cache, &haystack), Err(err.clone()));
assert_eq!(
dfa.find_overlapping_fwd(
&mut cache,
&haystack,
&mut OverlappingState::start()
),
Err(err.clone())
);
let haystack = "β".repeat(101).into_bytes();
let err = MatchError::GaveUp { offset: 0 };
assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));
// no need to test that other find routines quit, since we did that above
// OK, if we reset the cache, then we should be able to create more states
// and make more progress with searching for betas.
cache.reset(&dfa);
let err = MatchError::GaveUp { offset: 26 };
assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));
// ... switching back to ASCII still makes progress since it just needs to
// set transitions on existing states!
let haystack = "a".repeat(101).into_bytes();
let err = MatchError::GaveUp { offset: 13 };
assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));
Ok(())
}
// Tests that quit bytes in the forward direction work correctly.
#[test]
fn quit_fwd() -> Result<(), Box<dyn Error>> {
let dfa = DFA::builder()
.configure(DFA::config().quit(b'x', true))
.build("[[:word:]]+$")?;
let mut cache = dfa.create_cache();
assert_eq!(
dfa.find_earliest_fwd(&mut cache, b"abcxyz"),
Err(MatchError::Quit { byte: b'x', offset: 3 })
);
assert_eq!(
dfa.find_leftmost_fwd(&mut cache, b"abcxyz"),
Err(MatchError::Quit { byte: b'x', offset: 3 })
);
assert_eq!(
dfa.find_overlapping_fwd(
&mut cache,
b"abcxyz",
&mut OverlappingState::start()
),
Err(MatchError::Quit { byte: b'x', offset: 3 })
);
Ok(())
}
// Tests that quit bytes in the reverse direction work correctly.
#[test]
fn quit_rev() -> Result<(), Box<dyn Error>> {
let dfa = DFA::builder()
.configure(DFA::config().quit(b'x', true))
.thompson(thompson::Config::new().reverse(true))
.build("^[[:word:]]+")?;
let mut cache = dfa.create_cache();
assert_eq!(
dfa.find_earliest_rev(&mut cache, b"abcxyz"),
Err(MatchError::Quit { byte: b'x', offset: 3 })
);
assert_eq!(
dfa.find_leftmost_rev(&mut cache, b"abcxyz"),
Err(MatchError::Quit { byte: b'x', offset: 3 })
);
Ok(())
}
// Tests that if we heuristically enable Unicode word boundaries but then
// instruct that a non-ASCII byte should NOT be a quit byte, then the builder
// will panic.
#[test]
#[should_panic]
fn quit_panics() {
DFA::config().unicode_word_boundary(true).quit(b'\xFF', false);
}
// This tests an intesting case where even if the Unicode word boundary option
// is disabled, setting all non-ASCII bytes to be quit bytes will cause Unicode
// word boundaries to be enabled.
#[test]
fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> {
let mut config = DFA::config();
for b in 0x80..=0xFF {
config = config.quit(b, true);
}
let dfa = DFA::builder().configure(config).build(r"\b")?;
let mut cache = dfa.create_cache();
let expected = HalfMatch::must(0, 1);
assert_eq!(dfa.find_leftmost_fwd(&mut cache, b" a"), Ok(Some(expected)));
Ok(())
}
// Tests that we can provide a prefilter to a Regex, and the search reports
// correct results.
#[test]
fn prefilter_works() -> Result<(), Box<dyn Error>> {
let mut re = Regex::new(r"a[0-9]+").unwrap();
re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a"))));
let mut cache = re.create_cache();
let text = b"foo abc foo a1a2a3 foo a123 bar aa456";
let matches: Vec<(usize, usize)> = re
.find_leftmost_iter(&mut cache, text)
.map(|m| (m.start(), m.end()))
.collect();
assert_eq!(
matches,
vec![(12, 14), (14, 16), (16, 18), (23, 27), (33, 37),]
);
Ok(())
}
// This test confirms that a prefilter is active by using a prefilter that
// reports false negatives.
#[test]
fn prefilter_is_active() -> Result<(), Box<dyn Error>> {
let text = b"za123";
let mut re = Regex::new(r"a[0-9]+").unwrap();
let mut cache = re.create_cache();
re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a"))));
assert_eq!(
re.find_leftmost(&mut cache, b"za123"),
Some(MultiMatch::must(0, 1, 5))
);
assert_eq!(
re.find_leftmost(&mut cache, b"a123"),
Some(MultiMatch::must(0, 0, 4))
);
re.set_prefilter(Some(Box::new(BunkPrefilter::new())));
assert_eq!(re.find_leftmost(&mut cache, b"za123"), None);
// This checks that the prefilter is used when first starting the search,
// instead of waiting until at least one transition has occurred.
assert_eq!(re.find_leftmost(&mut cache, b"a123"), None);
Ok(())
}
|