Mercurial > lbo > hg > rex
changeset 89:ea0c08e43479
Fix bad exponential performance issue with pathological regexes.
This should finally bring matching complexity up to par with libraries such as
RE2. The problem used to be exponential duplication of matching states, which
we can avoid by using maps instead of vectors. By always using the same maps,
we avoid expensive duplications.
author | Lewin Bormann <lbo@spheniscida.de> |
---|---|
date | Wed, 09 Dec 2020 22:46:13 +0100 |
parents | 437eceef26e0 |
children | f0d09f06e373 |
files | benches/e2e.rs coverage.sh src/matching.rs src/tests.rs |
diffstat | 4 files changed, 28 insertions(+), 13 deletions(-) [+] |
line wrap: on
line diff
--- a/benches/e2e.rs Wed Dec 09 22:01:30 2020 +0100 +++ b/benches/e2e.rs Wed Dec 09 22:46:13 2020 +0100 @@ -47,7 +47,7 @@ fn bench_notorious_regex_crate(b: &mut Bencher) { let re = regex::Regex::new("(x+x+)+y").unwrap(); b.iter(|| { - assert!(re.is_match("xxxxxxxxxxy")); + assert!(re.is_match("xxxxxxxxxxxxxxxxxxxxxxy")); }); }
--- a/coverage.sh Wed Dec 09 22:01:30 2020 +0100 +++ b/coverage.sh Wed Dec 09 22:46:13 2020 +0100 @@ -2,7 +2,7 @@ #!/bin/bash KCOV=kcov -KCOV_OPTS="--verify --exclude-pattern=/.cargo,/glibc,/usr/lib,/usr/include,/usr/src" +KCOV_OPTS="--verify --exclude-pattern=/.cargo,/glibc,/usr/lib,/usr/include,/usr/src,.rustup" KCOV_OUT="./kcov-out/" export RUSTFLAGS="-C link-dead-code"
--- a/src/matching.rs Wed Dec 09 22:01:30 2020 +0100 +++ b/src/matching.rs Wed Dec 09 22:46:13 2020 +0100 @@ -4,6 +4,7 @@ #![allow(dead_code)] use std::cell::RefCell; +use std::collections::{HashSet, HashMap}; use std::mem; use std::ops::Deref; use std::rc::Rc; @@ -123,28 +124,38 @@ (false, vec![]) } +/// Create a HashMap key from a matchstate, in order to deduplicate states. +fn state_key(m: &MatchState) -> (usize, StateRef) { + (m.matchee.pos(), m.node) +} + /// start_match takes an initialized MatchState and starts matching. It returns true if the input /// string matches, otherwise false; the index in the input string to which the match was /// successful (in case a match fails, but matches some characters at the beginning); and a vector /// of submatches; if the entry at index I contains Some(J), then that means that there is a /// submatch starting at I extending to (J-1). pub fn start_match(sg: &StateGraph, m: MatchState) -> (bool, usize, Vec<Option<usize>>) { - let mut states = Vec::with_capacity(4); - let mut states_next = Vec::with_capacity(4); - states.push(m); + // State map keyed by (matchee position, node index). These two keep the current set of states + // within the string and the set of states for the next iteration. They are keyed by string + // position and node index to avoid duplication, which makes certain pathological regular + // expressions have exponential complexity (as a function of input size). + let mut states_map = HashMap::new(); + let mut states_map_next = HashMap::new(); + + states_map.insert(state_key(&m), m); let (mut ismatch, mut matches) = (false, vec![]); let mut longestmatch = 0; let mut longest_partial_match = 0; loop { - if states.is_empty() { + if states_map.is_empty() { break; } // Iterate over all current states, see which match, and add the successors of matching // states to the states_next list. - for mut matchst in states.drain(..) { + for (_, mut matchst) in states_map.drain() { let (next1, next2) = sg[matchst.node].next_states(); // Check if this node is a submatch start or end. If it is, the list of pending @@ -193,20 +204,21 @@ if next1.is_some() && next2.is_some() { // If the current state matched, or it didn't have a matcher, push next states into // list of next states. - states_next.push(matchst.fork(next1.unwrap(), advance_by)); + let nextst = matchst.fork(next1.unwrap(), advance_by); + states_map_next.entry(state_key(&nextst)).or_insert(nextst); matchst.update(next2.unwrap(), advance_by); - states_next.push(matchst); + states_map_next.entry(state_key(&matchst)).or_insert(matchst); } else if let Some(n1) = next1 { // Reuse current state if only one successor (common case). matchst.update(n1, advance_by); - states_next.push(matchst) + states_map_next.entry(state_key(&matchst)).or_insert(matchst); } else if let Some(n2) = next2 { matchst.update(n2, advance_by); - states_next.push(matchst); + states_map_next.entry(state_key(&matchst)).or_insert(matchst); } } // Swap state lists, leaving states_next empty. - mem::swap(&mut states, &mut states_next); + mem::swap(&mut states_map, &mut states_map_next); } return (ismatch, longest_partial_match, matches);
--- a/src/tests.rs Wed Dec 09 22:01:30 2020 +0100 +++ b/src/tests.rs Wed Dec 09 22:46:13 2020 +0100 @@ -8,7 +8,10 @@ #[test] fn test_notorious_graph() { - println!("{}", crate::render_graph("(x+x+)+y")); + let re = "(x+x+)+y"; + println!("{:?}", crate::parse(re)); + println!("{}", crate::render_graph(re)); + match_re(re, "xxxxy"); } #[test]