Mercurial > lbo > hg > rex
changeset 62:d761adfe9cdf
Big: Move everything to use a Vec-based graph instead of pointer-chasing
author | Lewin Bormann <lbo@spheniscida.de> |
---|---|
date | Fri, 30 Aug 2019 22:20:51 +0200 |
parents | 987e10c9e6fe |
children | 0704050ff42a |
files | benches/e2e.rs src/compile.rs src/lib.rs src/matching.rs src/parse.rs src/repr.rs src/state.rs src/tests.rs |
diffstat | 8 files changed, 156 insertions(+), 119 deletions(-) [+] |
line wrap: on
line diff
--- a/benches/e2e.rs Fri Aug 30 16:36:51 2019 +0200 +++ b/benches/e2e.rs Fri Aug 30 22:20:51 2019 +0200 @@ -31,7 +31,7 @@ fn bench_notorious(b: &mut Bencher) { let re = rex::compile("(x+x+)+y").unwrap(); b.iter(|| { - assert!(rex::match_re(&re, "xxxxxxxxxxxy").0); + assert!(rex::match_re(&re, "xxxxxxxxxxy").0); }); }
--- a/src/compile.rs Fri Aug 30 16:36:51 2019 +0200 +++ b/src/compile.rs Fri Aug 30 22:20:51 2019 +0200 @@ -1,49 +1,56 @@ use matcher::{self, wrap_matcher}; use repr::{AnchorLocation, Pattern, Repetition}; -use state::{wrap_state, State, Submatch, WrappedState}; +use state::{State, Submatch, StateGraph, StateRef}; /// Types implementing Compile can be compiled into a state graph. pub trait Compile { /// to_state returns the start node of a subgraph, and a list of pointers that need to be /// connected to the following subgraph. The list can contain the first tuple element. - fn to_state(&self) -> (WrappedState, Vec<WrappedState>); + fn to_state(&self, sg: &mut StateGraph) -> (StateRef, Vec<StateRef>); } /// start_compile takes a parsed regex as RETree and returns the first node of a directed graph /// representing the regex. -pub fn start_compile(re: &Pattern) -> WrappedState { - let (s, sp) = re.to_state(); +pub fn start_compile(re: &Pattern) -> StateGraph { + let mut state_graph = Vec::with_capacity(64); let mut before = State::default(); before.sub = Some(Submatch::Start); - before.out = Some(s); + // First element in graph vector. + let beforeref = 0; + state_graph.push(before); + + let (s, sp) = re.to_state(&mut state_graph); + state_graph[beforeref].out = Some(s); + let mut end = State::default(); end.sub = Some(Submatch::End); + let endref = state_graph.len(); + state_graph.push(end); - let endw = wrap_state(end); // Connect all loose ends with the final node. for p in sp { - p.borrow_mut().patch(endw.clone()); + state_graph[p].patch(endref); } - wrap_state(before) + state_graph } impl Compile for Pattern { - fn to_state(&self) -> (WrappedState, Vec<WrappedState>) { + fn to_state(&self, sg: &mut StateGraph) -> (StateRef, Vec<StateRef>) { match *self { Pattern::Concat(ref ps) => { if ps.is_empty() { panic!("invalid Concat len: 0") } else if ps.len() == 1 { - return ps[0].to_state(); + return ps[0].to_state(sg); } - let (init, mut lastp) = ps[0].to_state(); + let (init, mut lastp) = ps[0].to_state(sg); for i in 1..ps.len() { - let (next, nextp) = ps[i].to_state(); + let (next, nextp) = ps[i].to_state(sg); // Connect all loose ends with the new node. for p in lastp { - p.borrow_mut().patch(next.clone()); + sg[p].patch(next); } // Remember the loose ends of this one. lastp = nextp; @@ -51,95 +58,111 @@ (init, lastp) } Pattern::Any => { - let s = wrap_state(State { + let s = State { out: None, out1: None, matcher: wrap_matcher(Box::new(matcher::AnyMatcher)), sub: None, - }); - (s.clone(), vec![s]) + }; + let sref = sg.len(); + sg.push(s); + (sref, vec![sref]) } Pattern::Char(c) => { - let s = wrap_state(State { + let s = State { out: None, out1: None, matcher: wrap_matcher(Box::new(matcher::CharMatcher(c))), sub: None, - }); - (s.clone(), vec![s]) + }; + let sref = sg.len(); + sg.push(s); + (sref, vec![sref]) } Pattern::Str(ref s) => { - let s = wrap_state(State { + let s = State { out: None, out1: None, matcher: wrap_matcher(Box::new(matcher::StringMatcher::new(s))), sub: None, - }); - (s.clone(), vec![s]) + }; + let sref = sg.len(); + sg.push(s); + (sref, vec![sref]) } Pattern::CharRange(from, to) => { - let s = wrap_state(State { + let s = State { out: None, out1: None, matcher: wrap_matcher(Box::new(matcher::CharRangeMatcher(from, to))), sub: None, - }); - (s.clone(), vec![s]) + }; + let sref = sg.len(); + sg.push(s); + (sref, vec![sref]) } Pattern::CharSet(ref set) => { - let s = wrap_state(State { + let s = State { out: None, out1: None, matcher: wrap_matcher(Box::new(matcher::CharSetMatcher(set.clone()))), sub: None, - }); - (s.clone(), vec![s]) + }; + let sref = sg.len(); + sg.push(s); + (sref, vec![sref]) } - Pattern::Alternate(ref r) => alternate(&r, &vec![]), + Pattern::Alternate(ref r) => alternate(sg, &r, &vec![]), Pattern::Submatch(ref p) => { - let (s, sp) = p.to_state(); - let before = wrap_state(State { + let (s, sp) = p.to_state(sg); + let before = State { out: Some(s), out1: None, matcher: None, sub: Some(Submatch::Start), - }); - let after = wrap_state(State { + }; + let after = State { out: None, out1: None, matcher: None, sub: Some(Submatch::End), - }); + }; + let beforeref = sg.len(); + sg.push(before); + let afterref = sg.len(); + sg.push(after); for p in sp { - p.borrow_mut().patch(after.clone()); + sg[p].patch(afterref); } - (before, vec![after]) + (beforeref, vec![afterref]) } - Pattern::Repeated(ref p) => p.to_state(), + Pattern::Repeated(ref p) => p.to_state(sg), Pattern::Anchor(ref loc) => { let mut m = matcher::AnchorMatcher::Begin; match loc { &AnchorLocation::End => m = matcher::AnchorMatcher::End, _ => (), }; - let s = wrap_state(State { + let s = State { out: None, out1: None, matcher: wrap_matcher(Box::new(m)), sub: None, - }); - (s.clone(), vec![s]) + }; + let sref = sg.len(); + sg.push(s); + (sref, vec![sref]) } } } } // alternate compiles a list of patterns into a graph that accepts any one of the patterns. -fn alternate(ps: &[Pattern], to_patch: &[WrappedState]) -> (WrappedState, Vec<WrappedState>) { +fn alternate(sg: &mut StateGraph, ps: &[Pattern], to_patch: &[StateRef]) -> (StateRef, Vec<StateRef>) { if ps.len() == 1 { - let (s, sp) = ps[0].to_state(); + let (s, sp) = ps[0].to_state(sg); for e in to_patch { - e.borrow_mut().patch(s.clone()); + sg[*e].patch(s); } (s, sp) } else { @@ -150,69 +173,81 @@ sub: None, }; let mid = ps.len() / 2; - let (left, mut leftpatch) = alternate(&ps[..mid], &vec![]); - let (right, mut rightpatch) = alternate(&ps[mid..], &vec![]); + let (left, mut leftpatch) = alternate(sg, &ps[..mid], &vec![]); + let (right, mut rightpatch) = alternate(sg, &ps[mid..], &vec![]); init.patch(left); init.patch(right); leftpatch.append(&mut rightpatch); - (wrap_state(init), leftpatch) + let initref = sg.len(); + sg.push(init); + (initref, leftpatch) } } impl Compile for Repetition { - fn to_state(&self) -> (WrappedState, Vec<WrappedState>) { + fn to_state(&self, sg: &mut StateGraph) -> (StateRef, Vec<StateRef>) { match *self { Repetition::ZeroOrOnce(ref p) => { - let (s, to_patch) = p.to_state(); - let after = wrap_state(State { + let (s, to_patch) = p.to_state(sg); + let after = State { out: None, out1: None, matcher: None, sub: None, - }); - let before = wrap_state(State { - out: Some(s.clone()), - out1: Some(after.clone()), + }; + let afterref = sg.len(); + sg.push(after); + let before = State { + out: Some(s), + out1: Some(afterref), matcher: None, sub: None, - }); + }; + let beforeref = sg.len(); + sg.push(before); for p in to_patch { - p.borrow_mut().patch(after.clone()); + sg[p].patch(afterref); } - (before, vec![after]) + (beforeref, vec![afterref]) } Repetition::ZeroOrMore(ref p) => { - let (s, to_patch) = p.to_state(); - let before = wrap_state(State { + let (s, to_patch) = p.to_state(sg); + let before = State { out: Some(s.clone()), out1: None, matcher: None, sub: None, - }); - let after = wrap_state(State { + }; + let beforeref = sg.len(); + sg.push(before); + let after = State { out: Some(s.clone()), out1: None, matcher: None, sub: None, - }); - before.borrow_mut().patch(after.clone()); + }; + let afterref = sg.len(); + sg.push(after); + sg[beforeref].patch(afterref); for p in to_patch { - p.borrow_mut().patch(after.clone()); + sg[p].patch(afterref); } - (before, vec![after]) + (beforeref, vec![afterref]) } Repetition::OnceOrMore(ref p) => { - let (s, to_patch) = p.to_state(); - let after = wrap_state(State { + let (s, to_patch) = p.to_state(sg); + let after = State { out: Some(s.clone()), out1: None, matcher: None, sub: None, - }); + }; + let afterref = sg.len(); + sg.push(after); for p in to_patch { - p.borrow_mut().patch(after.clone()); + sg[p].patch(afterref); } - (s, vec![after]) + (s, vec![afterref]) } // Specific is 'min' concatenations of a simple state and 'max - min' concatenations of // a ZeroOrOnce state. @@ -241,7 +276,7 @@ p.clone(), )))); } - Pattern::Concat(repetition).to_state() + Pattern::Concat(repetition).to_state(sg) } } }
--- a/src/lib.rs Fri Aug 30 16:36:51 2019 +0200 +++ b/src/lib.rs Fri Aug 30 22:20:51 2019 +0200 @@ -19,7 +19,8 @@ /// entire matched string. A submatch is a tuple of (start, end), where end is the index of the /// first character that isn't part of the submatch anymore (i.e. [start, end)). fn compile_and_match(re: &repr::Pattern, s: &str) -> (bool, Vec<(usize, usize)>) { - matching::do_match(compile::start_compile(re), s) + let compiled = compile::start_compile(re); + matching::do_match(&compiled, s) } /// Parse, compile, and match a regular expression. Not recommended for repeated use, as the @@ -40,6 +41,5 @@ /// tuples for all submatches, where the first element describes the match by the whole regular /// expression. pub fn match_re(re: &state::CompiledRE, s: &str) -> (bool, Vec<(usize, usize)>) { - let re = re.clone(); matching::do_match(re, s) }
--- a/src/matching.rs Fri Aug 30 16:36:51 2019 +0200 +++ b/src/matching.rs Fri Aug 30 22:20:51 2019 +0200 @@ -9,11 +9,11 @@ use std::rc::Rc; use matcher::Matchee; -use state::{Submatch, WrappedState}; +use state::{Submatch, StateGraph, StateRef}; #[derive(Clone, Debug)] pub struct MatchState { - node: WrappedState, + node: StateRef, matchee: Matchee, // The set of submatches encountered, indexed by the start of a submatch. If submatches // (with (start,end)) (1,3),(5,10) have been encountered, then submatches[1] = Some(3) and @@ -27,7 +27,7 @@ } impl MatchState { - fn new(s: &str, ws: WrappedState) -> MatchState { + fn new(s: &str, ws: StateRef) -> MatchState { MatchState { node: ws, matchee: Matchee::from_string(s), @@ -35,13 +35,13 @@ submatches_todo: Cow::Owned(Vec::new()), } } - fn fork(&self, next: WrappedState, advance: usize) -> MatchState { + fn fork(&self, next: StateRef, advance: usize) -> MatchState { let mut n = self.clone(); n.matchee.advance(advance); n.node = next; n } - fn update(&mut self, next: WrappedState, advance: usize) { + fn update(&mut self, next: StateRef, advance: usize) { self.matchee.advance(advance); self.node = next; } @@ -68,15 +68,15 @@ /// /// The boolean component is true if the match succeeded. The Vec contains tuples of (start, /// one-past-end) for each submatch, starting with the implicit whole match. -pub fn do_match(ws: WrappedState, s: &str) -> (bool, Vec<(usize, usize)>) { - let mut ms = MatchState::new(s, ws); +pub fn do_match(sg: &StateGraph, s: &str) -> (bool, Vec<(usize, usize)>) { + let mut ms = MatchState::new(s, 0); let (mut i, len) = (0, s.len()); // TODO: Find out if a failed match is definitive; an anchored regex can't match anywhere later // in the text. while i < len || i == 0 { ms.reset(i); - let m = start_match(ms.clone()); + let m = start_match(sg, ms.clone()); match m { // If the match fails, we skip as many characters as were matched at first. (false, skip, _) => i = skip + 1, @@ -99,7 +99,7 @@ /// successful (in case a match fails, but matches some characters at the beginning); and a vector /// of submatches; if the entry at index I contains Some(J), then that means that there is a /// submatch starting at I extending to (J-1). -pub fn start_match(m: MatchState) -> (bool, usize, Vec<Option<usize>>) { +pub fn start_match(sg: &StateGraph, m: MatchState) -> (bool, usize, Vec<Option<usize>>) { let mut states = Vec::with_capacity(4); let mut states_next = Vec::with_capacity(4); states.push(m); @@ -116,15 +116,15 @@ // Iterate over all current states, see which match, and add the successors of matching // states to the states_next list. for mut matchst in states.drain(..) { - let (next1, next2) = matchst.node.borrow().next_states(); + let (next1, next2) = sg[matchst.node].next_states(); // Check if this node is a submatch start or end. If it is, the list of pending // submatches is cloned and the current position pushed (Start) or the most recent // submatch start popped and stored in the overall submatch list (End). - let sub = matchst.node.borrow().sub.clone(); + let sub = sg[matchst.node].sub.as_ref(); match sub { - Some(Submatch::Start) => matchst.start_submatch(), - Some(Submatch::End) => matchst.stop_submatch(), + Some(&Submatch::Start) => matchst.start_submatch(), + Some(&Submatch::End) => matchst.stop_submatch(), None => {} } @@ -144,7 +144,7 @@ let mut advance_by = 0; // Check if the current state matches. - if let Some((matched, howmany)) = matchst.node.borrow().matches(&matchst.matchee) { + if let Some((matched, howmany)) = sg[matchst.node].matches(&matchst.matchee) { // Current state didn't match, throw away. if !matched { continue; @@ -211,8 +211,8 @@ fn test_match_simple() { let re = simple_re0(); println!("{:?}", re); - println!("{:?}", do_match(start_compile(&re), "aaab")); - let dot = dot(start_compile(&re)); + println!("{:?}", do_match(&start_compile(&re), "aaab")); + let dot = dot(&start_compile(&re)); println!("digraph st {{ {} }}", dot); } }
--- a/src/parse.rs Fri Aug 30 16:36:51 2019 +0200 +++ b/src/parse.rs Fri Aug 30 22:20:51 2019 +0200 @@ -484,7 +484,7 @@ fn test_parse_repetition_manual() { println!( "digraph st {{ {} }}", - dot(start_compile(&parse("[abc]{1,5}").unwrap())) + dot(&start_compile(&parse("[abc]{1,5}").unwrap())) ); } #[test] @@ -492,7 +492,7 @@ let rep = parse("a|[bed]|(c|d|e)|f").unwrap(); println!("{:?}", rep.clone()); - let dot = dot(start_compile(&rep)); + let dot = dot(&start_compile(&rep)); println!("digraph st {{ {} }}", dot); }
--- a/src/repr.rs Fri Aug 30 16:36:51 2019 +0200 +++ b/src/repr.rs Fri Aug 30 22:20:51 2019 +0200 @@ -258,7 +258,7 @@ #[test] fn test_re1() { // println!("{:?}", start_compile(simple_re0())); - let dot = dot(start_compile(&simple_re1())); + let dot = dot(&start_compile(&simple_re1())); println!("digraph st {{ {} }}", dot); } }
--- a/src/state.rs Fri Aug 30 16:36:51 2019 +0200 +++ b/src/state.rs Fri Aug 30 22:20:51 2019 +0200 @@ -3,22 +3,33 @@ //! the repr module. #![allow(dead_code)] -use std::cell::RefCell; use std::collections::HashSet; use std::collections::LinkedList; use std::fmt::Write; use std::iter::FromIterator; use std::rc::Rc; +use std::vec::Vec; use matcher::{Matchee, Matcher}; +/// StateGraph is the graph of states that the interpreter traverses while matching a regular +/// expression. It is represented as flat vector. The first element is the State node to start the +/// evaluation with. +pub type StateGraph = Vec<State>; + +/// StateRef is a reference to a state in a StateGraph. +pub type StateRef = usize; + +/// CompiledRE is a compiled regular expression that can be used for matching. +pub type CompiledRE = StateGraph; + /// State is a single state that the evaluation can be in. It contains several output states as /// well as a matcher. #[derive(Debug, Default, Clone)] pub struct State { // Possible following state(s). - pub out: Option<WrappedState>, - pub out1: Option<WrappedState>, + pub out: Option<StateRef>, + pub out1: Option<StateRef>, // If matcher is none, this is an "empty" state. pub matcher: Option<Rc<Box<dyn Matcher>>>, // Tells the matching logic to record the start or end of a submatch. @@ -31,18 +42,8 @@ End, } -/// WrappedState is a shared pointer to a state node. -pub type WrappedState = Rc<RefCell<State>>; - -/// CompiledRE is a compiled regular expression that can be used for matching. -pub type CompiledRE = WrappedState; - -pub fn wrap_state(s: State) -> WrappedState { - Rc::new(RefCell::new(s)) -} - impl State { - pub fn patch(&mut self, next: WrappedState) { + pub fn patch(&mut self, next: StateRef) { if self.out.is_none() { self.out = Some(next); } else if self.out1.is_none() { @@ -66,7 +67,7 @@ } /// Returns the following states, if present. Returns (None, None) if it's the final node. - pub fn next_states(&self) -> (Option<WrappedState>, Option<WrappedState>) { + pub fn next_states(&self) -> (Option<StateRef>, Option<StateRef>) { (self.out.clone(), self.out1.clone()) } @@ -88,38 +89,39 @@ } /// dot converts a graph starting with s into a Dot graph. -pub fn dot(s: WrappedState) -> String { +pub fn dot(stateg: &StateGraph) -> String { let mut result = String::new(); let mut visited = HashSet::new(); - let mut todo = LinkedList::from_iter(vec![s]); + let mut todo = LinkedList::from_iter(vec![0 as StateRef]); + let mut id = 0; loop { + id += 1; if todo.is_empty() { break; } - let node = todo.pop_front().unwrap(); - let id = format!("{:?}", node.as_ptr()); + let current = todo.pop_front().unwrap(); if visited.contains(&id) { continue; } visited.insert(id.clone()); - for next in [node.borrow().out.clone(), node.borrow().out1.clone()].into_iter() { - if let &Some(ref o) = next { - let nextid = format!("{:p}", o.as_ptr()); + for next in [stateg[current].out.clone(), stateg[current].out1.clone()].into_iter() { + if let &Some(nextid) = next { + let o = &stateg[nextid]; write!( &mut result, "\"{} {}\" -> \"{} {}\";\n", id, - node.borrow().to_string(), + stateg[current].to_string(), nextid, - o.borrow().to_string() + o.to_string(), ) .unwrap(); if !visited.contains(&nextid) { - todo.push_front(o.clone()); + todo.push_front(nextid); } } }
--- a/src/tests.rs Fri Aug 30 16:36:51 2019 +0200 +++ b/src/tests.rs Fri Aug 30 22:20:51 2019 +0200 @@ -8,11 +8,11 @@ let parsed = parse::parse(re).unwrap(); let optimized = repr::optimize::optimize(parsed); let ready = compile::start_compile(&optimized); - matching::do_match(ready, s) + matching::do_match(&ready, s) } fn render_graph(re: &str) { - println!("digraph st {{ {} }}", state::dot(compile::start_compile(parse::parse(re).as_ref().unwrap()))); + println!("digraph st {{ {} }}", state::dot(&compile::start_compile(parse::parse(re).as_ref().unwrap()))); } #[test]