changeset 62:d761adfe9cdf

Big: Move everything to use a Vec-based graph instead of pointer-chasing
author Lewin Bormann <lbo@spheniscida.de>
date Fri, 30 Aug 2019 22:20:51 +0200
parents 987e10c9e6fe
children 0704050ff42a
files benches/e2e.rs src/compile.rs src/lib.rs src/matching.rs src/parse.rs src/repr.rs src/state.rs src/tests.rs
diffstat 8 files changed, 156 insertions(+), 119 deletions(-) [+]
line wrap: on
line diff
--- a/benches/e2e.rs	Fri Aug 30 16:36:51 2019 +0200
+++ b/benches/e2e.rs	Fri Aug 30 22:20:51 2019 +0200
@@ -31,7 +31,7 @@
 fn bench_notorious(b: &mut Bencher) {
     let re = rex::compile("(x+x+)+y").unwrap();
     b.iter(|| {
-        assert!(rex::match_re(&re, "xxxxxxxxxxxy").0);
+        assert!(rex::match_re(&re, "xxxxxxxxxxy").0);
     });
 }
 
--- a/src/compile.rs	Fri Aug 30 16:36:51 2019 +0200
+++ b/src/compile.rs	Fri Aug 30 22:20:51 2019 +0200
@@ -1,49 +1,56 @@
 use matcher::{self, wrap_matcher};
 use repr::{AnchorLocation, Pattern, Repetition};
-use state::{wrap_state, State, Submatch, WrappedState};
+use state::{State, Submatch, StateGraph, StateRef};
 
 /// Types implementing Compile can be compiled into a state graph.
 pub trait Compile {
     /// to_state returns the start node of a subgraph, and a list of pointers that need to be
     /// connected to the following subgraph. The list can contain the first tuple element.
-    fn to_state(&self) -> (WrappedState, Vec<WrappedState>);
+    fn to_state(&self, sg: &mut StateGraph) -> (StateRef, Vec<StateRef>);
 }
 
 /// start_compile takes a parsed regex as RETree and returns the first node of a directed graph
 /// representing the regex.
-pub fn start_compile(re: &Pattern) -> WrappedState {
-    let (s, sp) = re.to_state();
+pub fn start_compile(re: &Pattern) -> StateGraph {
+    let mut state_graph = Vec::with_capacity(64);
 
     let mut before = State::default();
     before.sub = Some(Submatch::Start);
-    before.out = Some(s);
+    // First element in graph vector.
+    let beforeref = 0;
+    state_graph.push(before);
+
+    let (s, sp) = re.to_state(&mut state_graph);
+    state_graph[beforeref].out = Some(s);
+
     let mut end = State::default();
     end.sub = Some(Submatch::End);
+    let endref = state_graph.len();
+    state_graph.push(end);
 
-    let endw = wrap_state(end);
     // Connect all loose ends with the final node.
     for p in sp {
-        p.borrow_mut().patch(endw.clone());
+        state_graph[p].patch(endref);
     }
-    wrap_state(before)
+    state_graph
 }
 
 impl Compile for Pattern {
-    fn to_state(&self) -> (WrappedState, Vec<WrappedState>) {
+    fn to_state(&self, sg: &mut StateGraph) -> (StateRef, Vec<StateRef>) {
         match *self {
             Pattern::Concat(ref ps) => {
                 if ps.is_empty() {
                     panic!("invalid Concat len: 0")
                 } else if ps.len() == 1 {
-                    return ps[0].to_state();
+                    return ps[0].to_state(sg);
                 }
 
-                let (init, mut lastp) = ps[0].to_state();
+                let (init, mut lastp) = ps[0].to_state(sg);
                 for i in 1..ps.len() {
-                    let (next, nextp) = ps[i].to_state();
+                    let (next, nextp) = ps[i].to_state(sg);
                     // Connect all loose ends with the new node.
                     for p in lastp {
-                        p.borrow_mut().patch(next.clone());
+                        sg[p].patch(next);
                     }
                     // Remember the loose ends of this one.
                     lastp = nextp;
@@ -51,95 +58,111 @@
                 (init, lastp)
             }
             Pattern::Any => {
-                let s = wrap_state(State {
+                let s = State {
                     out: None,
                     out1: None,
                     matcher: wrap_matcher(Box::new(matcher::AnyMatcher)),
                     sub: None,
-                });
-                (s.clone(), vec![s])
+                };
+                let sref = sg.len();
+                sg.push(s);
+                (sref, vec![sref])
             }
             Pattern::Char(c) => {
-                let s = wrap_state(State {
+                let s = State {
                     out: None,
                     out1: None,
                     matcher: wrap_matcher(Box::new(matcher::CharMatcher(c))),
                     sub: None,
-                });
-                (s.clone(), vec![s])
+                };
+                let sref = sg.len();
+                sg.push(s);
+                (sref, vec![sref])
             }
             Pattern::Str(ref s) => {
-                let s = wrap_state(State {
+                let s = State {
                     out: None,
                     out1: None,
                     matcher: wrap_matcher(Box::new(matcher::StringMatcher::new(s))),
                     sub: None,
-                });
-                (s.clone(), vec![s])
+                };
+                let sref = sg.len();
+                sg.push(s);
+                (sref, vec![sref])
             }
             Pattern::CharRange(from, to) => {
-                let s = wrap_state(State {
+                let s = State {
                     out: None,
                     out1: None,
                     matcher: wrap_matcher(Box::new(matcher::CharRangeMatcher(from, to))),
                     sub: None,
-                });
-                (s.clone(), vec![s])
+                };
+                let sref = sg.len();
+                sg.push(s);
+                (sref, vec![sref])
             }
             Pattern::CharSet(ref set) => {
-                let s = wrap_state(State {
+                let s = State {
                     out: None,
                     out1: None,
                     matcher: wrap_matcher(Box::new(matcher::CharSetMatcher(set.clone()))),
                     sub: None,
-                });
-                (s.clone(), vec![s])
+                };
+                let sref = sg.len();
+                sg.push(s);
+                (sref, vec![sref])
             }
-            Pattern::Alternate(ref r) => alternate(&r, &vec![]),
+            Pattern::Alternate(ref r) => alternate(sg, &r, &vec![]),
             Pattern::Submatch(ref p) => {
-                let (s, sp) = p.to_state();
-                let before = wrap_state(State {
+                let (s, sp) = p.to_state(sg);
+                let before = State {
                     out: Some(s),
                     out1: None,
                     matcher: None,
                     sub: Some(Submatch::Start),
-                });
-                let after = wrap_state(State {
+                };
+                let after = State {
                     out: None,
                     out1: None,
                     matcher: None,
                     sub: Some(Submatch::End),
-                });
+                };
+                let beforeref = sg.len();
+                sg.push(before);
+                let afterref = sg.len();
+                sg.push(after);
                 for p in sp {
-                    p.borrow_mut().patch(after.clone());
+                    sg[p].patch(afterref);
                 }
-                (before, vec![after])
+                (beforeref, vec![afterref])
             }
-            Pattern::Repeated(ref p) => p.to_state(),
+            Pattern::Repeated(ref p) => p.to_state(sg),
             Pattern::Anchor(ref loc) => {
                 let mut m = matcher::AnchorMatcher::Begin;
                 match loc {
                     &AnchorLocation::End => m = matcher::AnchorMatcher::End,
                     _ => (),
                 };
-                let s = wrap_state(State {
+                let s = State {
                     out: None,
                     out1: None,
                     matcher: wrap_matcher(Box::new(m)),
                     sub: None,
-                });
-                (s.clone(), vec![s])
+                };
+                let sref = sg.len();
+                sg.push(s);
+                (sref, vec![sref])
             }
         }
     }
 }
 
 // alternate compiles a list of patterns into a graph that accepts any one of the patterns.
-fn alternate(ps: &[Pattern], to_patch: &[WrappedState]) -> (WrappedState, Vec<WrappedState>) {
+fn alternate(sg: &mut StateGraph, ps: &[Pattern], to_patch: &[StateRef]) -> (StateRef, Vec<StateRef>) {
     if ps.len() == 1 {
-        let (s, sp) = ps[0].to_state();
+        let (s, sp) = ps[0].to_state(sg);
         for e in to_patch {
-            e.borrow_mut().patch(s.clone());
+            sg[*e].patch(s);
         }
         (s, sp)
     } else {
@@ -150,69 +173,81 @@
             sub: None,
         };
         let mid = ps.len() / 2;
-        let (left, mut leftpatch) = alternate(&ps[..mid], &vec![]);
-        let (right, mut rightpatch) = alternate(&ps[mid..], &vec![]);
+        let (left, mut leftpatch) = alternate(sg, &ps[..mid], &vec![]);
+        let (right, mut rightpatch) = alternate(sg, &ps[mid..], &vec![]);
         init.patch(left);
         init.patch(right);
         leftpatch.append(&mut rightpatch);
-        (wrap_state(init), leftpatch)
+        let initref = sg.len();
+        sg.push(init);
+        (initref, leftpatch)
     }
 }
 
 impl Compile for Repetition {
-    fn to_state(&self) -> (WrappedState, Vec<WrappedState>) {
+    fn to_state(&self, sg: &mut StateGraph) -> (StateRef, Vec<StateRef>) {
         match *self {
             Repetition::ZeroOrOnce(ref p) => {
-                let (s, to_patch) = p.to_state();
-                let after = wrap_state(State {
+                let (s, to_patch) = p.to_state(sg);
+                let after = State {
                     out: None,
                     out1: None,
                     matcher: None,
                     sub: None,
-                });
-                let before = wrap_state(State {
-                    out: Some(s.clone()),
-                    out1: Some(after.clone()),
+                };
+                let afterref = sg.len();
+                sg.push(after);
+                let before = State {
+                    out: Some(s),
+                    out1: Some(afterref),
                     matcher: None,
                     sub: None,
-                });
+                };
+                let beforeref = sg.len();
+                sg.push(before);
                 for p in to_patch {
-                    p.borrow_mut().patch(after.clone());
+                    sg[p].patch(afterref);
                 }
-                (before, vec![after])
+                (beforeref, vec![afterref])
             }
             Repetition::ZeroOrMore(ref p) => {
-                let (s, to_patch) = p.to_state();
-                let before = wrap_state(State {
+                let (s, to_patch) = p.to_state(sg);
+                let before = State {
                     out: Some(s.clone()),
                     out1: None,
                     matcher: None,
                     sub: None,
-                });
-                let after = wrap_state(State {
+                };
+                let beforeref = sg.len();
+                sg.push(before);
+                let after = State {
                     out: Some(s.clone()),
                     out1: None,
                     matcher: None,
                     sub: None,
-                });
-                before.borrow_mut().patch(after.clone());
+                };
+                let afterref = sg.len();
+                sg.push(after);
+                sg[beforeref].patch(afterref);
                 for p in to_patch {
-                    p.borrow_mut().patch(after.clone());
+                    sg[p].patch(afterref);
                 }
-                (before, vec![after])
+                (beforeref, vec![afterref])
             }
             Repetition::OnceOrMore(ref p) => {
-                let (s, to_patch) = p.to_state();
-                let after = wrap_state(State {
+                let (s, to_patch) = p.to_state(sg);
+                let after = State {
                     out: Some(s.clone()),
                     out1: None,
                     matcher: None,
                     sub: None,
-                });
+                };
+                let afterref = sg.len();
+                sg.push(after);
                 for p in to_patch {
-                    p.borrow_mut().patch(after.clone());
+                    sg[p].patch(afterref);
                 }
-                (s, vec![after])
+                (s, vec![afterref])
             }
             // Specific is 'min' concatenations of a simple state and 'max - min' concatenations of
             // a ZeroOrOnce state.
@@ -241,7 +276,7 @@
                         p.clone(),
                     ))));
                 }
-                Pattern::Concat(repetition).to_state()
+                Pattern::Concat(repetition).to_state(sg)
             }
         }
     }
--- a/src/lib.rs	Fri Aug 30 16:36:51 2019 +0200
+++ b/src/lib.rs	Fri Aug 30 22:20:51 2019 +0200
@@ -19,7 +19,8 @@
 /// entire matched string. A submatch is a tuple of (start, end), where end is the index of the
 /// first character that isn't part of the submatch anymore (i.e. [start, end)).
 fn compile_and_match(re: &repr::Pattern, s: &str) -> (bool, Vec<(usize, usize)>) {
-    matching::do_match(compile::start_compile(re), s)
+    let compiled = compile::start_compile(re);
+    matching::do_match(&compiled, s)
 }
 
 /// Parse, compile, and match a regular expression. Not recommended for repeated use, as the
@@ -40,6 +41,5 @@
 /// tuples for all submatches, where the first element describes the match by the whole regular
 /// expression.
 pub fn match_re(re: &state::CompiledRE, s: &str) -> (bool, Vec<(usize, usize)>) {
-    let re = re.clone();
     matching::do_match(re, s)
 }
--- a/src/matching.rs	Fri Aug 30 16:36:51 2019 +0200
+++ b/src/matching.rs	Fri Aug 30 22:20:51 2019 +0200
@@ -9,11 +9,11 @@
 use std::rc::Rc;
 
 use matcher::Matchee;
-use state::{Submatch, WrappedState};
+use state::{Submatch, StateGraph, StateRef};
 
 #[derive(Clone, Debug)]
 pub struct MatchState {
-    node: WrappedState,
+    node: StateRef,
     matchee: Matchee,
     // The set of submatches encountered, indexed by the start of a submatch. If submatches
     // (with (start,end)) (1,3),(5,10) have been encountered, then submatches[1] = Some(3) and
@@ -27,7 +27,7 @@
 }
 
 impl MatchState {
-    fn new(s: &str, ws: WrappedState) -> MatchState {
+    fn new(s: &str, ws: StateRef) -> MatchState {
         MatchState {
             node: ws,
             matchee: Matchee::from_string(s),
@@ -35,13 +35,13 @@
             submatches_todo: Cow::Owned(Vec::new()),
         }
     }
-    fn fork(&self, next: WrappedState, advance: usize) -> MatchState {
+    fn fork(&self, next: StateRef, advance: usize) -> MatchState {
         let mut n = self.clone();
         n.matchee.advance(advance);
         n.node = next;
         n
     }
-    fn update(&mut self, next: WrappedState, advance: usize) {
+    fn update(&mut self, next: StateRef, advance: usize) {
         self.matchee.advance(advance);
         self.node = next;
     }
@@ -68,15 +68,15 @@
 ///
 /// The boolean component is true if the match succeeded. The Vec contains tuples of (start,
 /// one-past-end) for each submatch, starting with the implicit whole match.
-pub fn do_match(ws: WrappedState, s: &str) -> (bool, Vec<(usize, usize)>) {
-    let mut ms = MatchState::new(s, ws);
+pub fn do_match(sg: &StateGraph, s: &str) -> (bool, Vec<(usize, usize)>) {
+    let mut ms = MatchState::new(s, 0);
     let (mut i, len) = (0, s.len());
 
     // TODO: Find out if a failed match is definitive; an anchored regex can't match anywhere later
     // in the text.
     while i < len || i == 0 {
         ms.reset(i);
-        let m = start_match(ms.clone());
+        let m = start_match(sg, ms.clone());
         match m {
             // If the match fails, we skip as many characters as were matched at first.
             (false, skip, _) => i = skip + 1,
@@ -99,7 +99,7 @@
 /// successful (in case a match fails, but matches some characters at the beginning); and a vector
 /// of submatches; if the entry at index I contains Some(J), then that means that there is a
 /// submatch starting at I extending to (J-1).
-pub fn start_match(m: MatchState) -> (bool, usize, Vec<Option<usize>>) {
+pub fn start_match(sg: &StateGraph, m: MatchState) -> (bool, usize, Vec<Option<usize>>) {
     let mut states = Vec::with_capacity(4);
     let mut states_next = Vec::with_capacity(4);
     states.push(m);
@@ -116,15 +116,15 @@
         // Iterate over all current states, see which match, and add the successors of matching
         // states to the states_next list.
         for mut matchst in states.drain(..) {
-            let (next1, next2) = matchst.node.borrow().next_states();
+            let (next1, next2) = sg[matchst.node].next_states();
 
             // Check if this node is a submatch start or end. If it is, the list of pending
             // submatches is cloned and the current position pushed (Start) or the most recent
             // submatch start popped and stored in the overall submatch list (End).
-            let sub = matchst.node.borrow().sub.clone();
+            let sub = sg[matchst.node].sub.as_ref();
             match sub {
-                Some(Submatch::Start) => matchst.start_submatch(),
-                Some(Submatch::End) => matchst.stop_submatch(),
+                Some(&Submatch::Start) => matchst.start_submatch(),
+                Some(&Submatch::End) => matchst.stop_submatch(),
                 None => {}
             }
 
@@ -144,7 +144,7 @@
 
             let mut advance_by = 0;
             // Check if the current state matches.
-            if let Some((matched, howmany)) = matchst.node.borrow().matches(&matchst.matchee) {
+            if let Some((matched, howmany)) = sg[matchst.node].matches(&matchst.matchee) {
                 // Current state didn't match, throw away.
                 if !matched {
                     continue;
@@ -211,8 +211,8 @@
     fn test_match_simple() {
         let re = simple_re0();
         println!("{:?}", re);
-        println!("{:?}", do_match(start_compile(&re), "aaab"));
-        let dot = dot(start_compile(&re));
+        println!("{:?}", do_match(&start_compile(&re), "aaab"));
+        let dot = dot(&start_compile(&re));
         println!("digraph st {{ {} }}", dot);
     }
 }
--- a/src/parse.rs	Fri Aug 30 16:36:51 2019 +0200
+++ b/src/parse.rs	Fri Aug 30 22:20:51 2019 +0200
@@ -484,7 +484,7 @@
     fn test_parse_repetition_manual() {
         println!(
             "digraph st {{ {} }}",
-            dot(start_compile(&parse("[abc]{1,5}").unwrap()))
+            dot(&start_compile(&parse("[abc]{1,5}").unwrap()))
         );
     }
     #[test]
@@ -492,7 +492,7 @@
         let rep = parse("a|[bed]|(c|d|e)|f").unwrap();
         println!("{:?}", rep.clone());
 
-        let dot = dot(start_compile(&rep));
+        let dot = dot(&start_compile(&rep));
         println!("digraph st {{ {} }}", dot);
     }
 
--- a/src/repr.rs	Fri Aug 30 16:36:51 2019 +0200
+++ b/src/repr.rs	Fri Aug 30 22:20:51 2019 +0200
@@ -258,7 +258,7 @@
     #[test]
     fn test_re1() {
         // println!("{:?}", start_compile(simple_re0()));
-        let dot = dot(start_compile(&simple_re1()));
+        let dot = dot(&start_compile(&simple_re1()));
         println!("digraph st {{ {} }}", dot);
     }
 }
--- a/src/state.rs	Fri Aug 30 16:36:51 2019 +0200
+++ b/src/state.rs	Fri Aug 30 22:20:51 2019 +0200
@@ -3,22 +3,33 @@
 //! the repr module.
 #![allow(dead_code)]
 
-use std::cell::RefCell;
 use std::collections::HashSet;
 use std::collections::LinkedList;
 use std::fmt::Write;
 use std::iter::FromIterator;
 use std::rc::Rc;
+use std::vec::Vec;
 
 use matcher::{Matchee, Matcher};
 
+/// StateGraph is the graph of states that the interpreter traverses while matching a regular
+/// expression. It is represented as flat vector. The first element is the State node to start the
+/// evaluation with.
+pub type StateGraph = Vec<State>;
+
+/// StateRef is a reference to a state in a StateGraph.
+pub type StateRef = usize;
+
+/// CompiledRE is a compiled regular expression that can be used for matching.
+pub type CompiledRE = StateGraph;
+
 /// State is a single state that the evaluation can be in. It contains several output states as
 /// well as a matcher.
 #[derive(Debug, Default, Clone)]
 pub struct State {
     // Possible following state(s).
-    pub out: Option<WrappedState>,
-    pub out1: Option<WrappedState>,
+    pub out: Option<StateRef>,
+    pub out1: Option<StateRef>,
     // If matcher is none, this is an "empty" state.
     pub matcher: Option<Rc<Box<dyn Matcher>>>,
     // Tells the matching logic to record the start or end of a submatch.
@@ -31,18 +42,8 @@
     End,
 }
 
-/// WrappedState is a shared pointer to a state node.
-pub type WrappedState = Rc<RefCell<State>>;
-
-/// CompiledRE is a compiled regular expression that can be used for matching.
-pub type CompiledRE = WrappedState;
-
-pub fn wrap_state(s: State) -> WrappedState {
-    Rc::new(RefCell::new(s))
-}
-
 impl State {
-    pub fn patch(&mut self, next: WrappedState) {
+    pub fn patch(&mut self, next: StateRef) {
         if self.out.is_none() {
             self.out = Some(next);
         } else if self.out1.is_none() {
@@ -66,7 +67,7 @@
     }
 
     /// Returns the following states, if present. Returns (None, None) if it's the final node.
-    pub fn next_states(&self) -> (Option<WrappedState>, Option<WrappedState>) {
+    pub fn next_states(&self) -> (Option<StateRef>, Option<StateRef>) {
         (self.out.clone(), self.out1.clone())
     }
 
@@ -88,38 +89,39 @@
 }
 
 /// dot converts a graph starting with s into a Dot graph.
-pub fn dot(s: WrappedState) -> String {
+pub fn dot(stateg: &StateGraph) -> String {
     let mut result = String::new();
 
     let mut visited = HashSet::new();
-    let mut todo = LinkedList::from_iter(vec![s]);
+    let mut todo = LinkedList::from_iter(vec![0 as StateRef]);
+    let mut id = 0;
 
     loop {
+        id += 1;
         if todo.is_empty() {
             break;
         }
-        let node = todo.pop_front().unwrap();
-        let id = format!("{:?}", node.as_ptr());
+        let current = todo.pop_front().unwrap();
         if visited.contains(&id) {
             continue;
         }
         visited.insert(id.clone());
 
-        for next in [node.borrow().out.clone(), node.borrow().out1.clone()].into_iter() {
-            if let &Some(ref o) = next {
-                let nextid = format!("{:p}", o.as_ptr());
+        for next in [stateg[current].out.clone(), stateg[current].out1.clone()].into_iter() {
+            if let &Some(nextid) = next {
+                let o = &stateg[nextid];
                 write!(
                     &mut result,
                     "\"{} {}\" -> \"{} {}\";\n",
                     id,
-                    node.borrow().to_string(),
+                    stateg[current].to_string(),
                     nextid,
-                    o.borrow().to_string()
+                    o.to_string(),
                 )
                 .unwrap();
 
                 if !visited.contains(&nextid) {
-                    todo.push_front(o.clone());
+                    todo.push_front(nextid);
                 }
             }
         }
--- a/src/tests.rs	Fri Aug 30 16:36:51 2019 +0200
+++ b/src/tests.rs	Fri Aug 30 22:20:51 2019 +0200
@@ -8,11 +8,11 @@
     let parsed = parse::parse(re).unwrap();
     let optimized = repr::optimize::optimize(parsed);
     let ready = compile::start_compile(&optimized);
-    matching::do_match(ready, s)
+    matching::do_match(&ready, s)
 }
 
 fn render_graph(re: &str) {
-    println!("digraph st {{ {} }}", state::dot(compile::start_compile(parse::parse(re).as_ref().unwrap())));
+    println!("digraph st {{ {} }}", state::dot(&compile::start_compile(parse::parse(re).as_ref().unwrap())));
 }
 
 #[test]