Mercurial > lbo > hg > rex
changeset 42:8932c5b37a89
Move compilation functionality into dedicated module.
author | Lewin Bormann <lbo@spheniscida.de> |
---|---|
date | Wed, 08 Nov 2017 19:17:19 +0100 |
parents | 89dbd4e17f9e |
children | 0a19bc59e7d6 |
files | src/compile.rs src/lib.rs src/matching.rs src/parse.rs src/repr.rs src/state.rs |
diffstat | 6 files changed, 255 insertions(+), 249 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/compile.rs Wed Nov 08 19:17:19 2017 +0100 @@ -0,0 +1,247 @@ + +use matcher::{self, wrap_matcher}; +use repr::{AnchorLocation, Pattern, Repetition}; +use state::{wrap_state, State, Submatch, WrappedState}; + +/// Types implementing Compile can be compiled into a state graph. +pub trait Compile { + /// to_state returns the start node of a subgraph, and a list of pointers that need to be + /// connected to the following subgraph. The list can contain the first tuple element. + fn to_state(&self) -> (WrappedState, Vec<WrappedState>); +} + +/// start_compile takes a parsed regex as RETree and returns the first node of a directed graph +/// representing the regex. +pub fn start_compile(re: &Pattern) -> WrappedState { + let (s, sp) = re.to_state(); + + let mut before = State::default(); + before.sub = Some(Submatch::Start); + before.out = Some(s); + let mut end = State::default(); + end.sub = Some(Submatch::End); + + let endw = wrap_state(end); + // Connect all loose ends with the final node. + for p in sp { + p.borrow_mut().patch(endw.clone()); + } + wrap_state(before) +} + +impl Compile for Pattern { + fn to_state(&self) -> (WrappedState, Vec<WrappedState>) { + match *self { + Pattern::Concat(ref ps) => { + if ps.is_empty() { + panic!("invalid Concat len: 0") + } else if ps.len() == 1 { + return ps[0].to_state(); + } + + let (init, mut lastp) = ps[0].to_state(); + for i in 1..ps.len() { + let (next, nextp) = ps[i].to_state(); + // Connect all loose ends with the new node. + for p in lastp { + p.borrow_mut().patch(next.clone()); + } + // Remember the loose ends of this one. + lastp = nextp; + } + (init, lastp) + } + Pattern::Any => { + let s = wrap_state(State { + out: None, + out1: None, + matcher: wrap_matcher(Box::new(matcher::AnyMatcher)), + sub: None, + }); + (s.clone(), vec![s]) + } + Pattern::Char(c) => { + let s = wrap_state(State { + out: None, + out1: None, + matcher: wrap_matcher(Box::new(matcher::CharMatcher(c))), + sub: None, + }); + (s.clone(), vec![s]) + } + Pattern::Str(ref s) => { + let s = wrap_state(State { + out: None, + out1: None, + matcher: wrap_matcher(Box::new(matcher::StringMatcher::new(s))), + sub: None, + }); + (s.clone(), vec![s]) + } + Pattern::CharRange(from, to) => { + let s = wrap_state(State { + out: None, + out1: None, + matcher: wrap_matcher(Box::new(matcher::CharRangeMatcher(from, to))), + sub: None, + }); + (s.clone(), vec![s]) + } + Pattern::CharSet(ref set) => { + let s = wrap_state(State { + out: None, + out1: None, + matcher: wrap_matcher(Box::new(matcher::CharSetMatcher(set.clone()))), + sub: None, + }); + (s.clone(), vec![s]) + } + Pattern::Alternate(ref r) => alternate(&r, &vec![]), + Pattern::Submatch(ref p) => { + let (s, sp) = p.to_state(); + let before = wrap_state(State { + out: Some(s), + out1: None, + matcher: None, + sub: Some(Submatch::Start), + }); + let after = wrap_state(State { + out: None, + out1: None, + matcher: None, + sub: Some(Submatch::End), + }); + for p in sp { + p.borrow_mut().patch(after.clone()); + } + (before, vec![after]) + } + Pattern::Repeated(ref p) => p.to_state(), + Pattern::Anchor(ref loc) => { + let mut m = matcher::AnchorMatcher::Begin; + match loc { + &AnchorLocation::End => m = matcher::AnchorMatcher::End, + _ => (), + }; + let s = wrap_state(State { + out: None, + out1: None, + matcher: wrap_matcher(Box::new(m)), + sub: None, + }); + (s.clone(), vec![s]) + } + } + } +} + +// alternate compiles a list of patterns into a graph that accepts any one of the patterns. +fn alternate(ps: &[Pattern], to_patch: &[WrappedState]) -> (WrappedState, Vec<WrappedState>) { + if ps.len() == 1 { + let (s, sp) = ps[0].to_state(); + for e in to_patch { + e.borrow_mut().patch(s.clone()); + } + (s, sp) + } else { + let mut init = State { + out: None, + out1: None, + matcher: None, + sub: None, + }; + let mid = ps.len() / 2; + let (left, mut leftpatch) = alternate(&ps[..mid], &vec![]); + let (right, mut rightpatch) = alternate(&ps[mid..], &vec![]); + init.patch(left); + init.patch(right); + leftpatch.append(&mut rightpatch); + (wrap_state(init), leftpatch) + } +} + +impl Compile for Repetition { + fn to_state(&self) -> (WrappedState, Vec<WrappedState>) { + match *self { + Repetition::ZeroOrOnce(ref p) => { + let (s, to_patch) = p.to_state(); + let after = wrap_state(State { + out: None, + out1: None, + matcher: None, + sub: None, + }); + let before = wrap_state(State { + out: Some(s.clone()), + out1: Some(after.clone()), + matcher: None, + sub: None, + }); + for p in to_patch { + p.borrow_mut().patch(after.clone()); + } + (before, vec![after]) + } + Repetition::ZeroOrMore(ref p) => { + let (s, to_patch) = p.to_state(); + let before = wrap_state(State { + out: Some(s.clone()), + out1: None, + matcher: None, + sub: None, + }); + let after = wrap_state(State { + out: Some(s.clone()), + out1: None, + matcher: None, + sub: None, + }); + before.borrow_mut().patch(after.clone()); + for p in to_patch { + p.borrow_mut().patch(after.clone()); + } + (before, vec![after]) + } + Repetition::OnceOrMore(ref p) => { + let (s, to_patch) = p.to_state(); + let after = wrap_state(State { + out: Some(s.clone()), + out1: None, + matcher: None, + sub: None, + }); + for p in to_patch { + p.borrow_mut().patch(after.clone()); + } + (s, vec![after]) + } + // Specific is 'min' concatenations of a simple state and 'max - min' concatenations of + // a ZeroOrOnce state. + Repetition::Specific(ref p, min, max_) => { + let cap = max_.unwrap_or(min) as usize; + assert!(cap >= min as usize); + let mut repetition = Vec::with_capacity(cap); + + // Append the minimum required number of occurrences. + for _ in 0..min { + repetition.push(p.clone()); + } + + // If an upper limit is set, append max-min repetitions of ZeroOrOnce states for + // the repeated pattern. + if let Some(max) = max_ { + for _ in 0..(max - min) { + repetition.push( + Pattern::Repeated( + Box::new(Repetition::ZeroOrOnce(p.clone())))); + } + } else { + // If no upper limit is set, append a ZeroOrMore state for the repeated + // pattern. + repetition.push(Pattern::Repeated(Box::new(Repetition::ZeroOrMore(p.clone())))); + } + Pattern::Concat(repetition).to_state() + } + } + } +}
--- a/src/lib.rs Mon Aug 21 20:34:59 2017 +0200 +++ b/src/lib.rs Wed Nov 08 19:17:19 2017 +0100 @@ -1,6 +1,7 @@ #![allow(dead_code)] +mod compile; mod matcher; mod matching; mod parse; @@ -12,5 +13,5 @@ /// entire matched string. A submatch is a tuple of (start, end), where end is the index of the /// first character that isn't part of the submatch anymore (i.e. [start, end)). fn compile_and_match(re: &repr::Pattern, s: &str) -> (bool, Vec<(usize, usize)>) { - matching::do_match(repr::start_compile(re), s) + matching::do_match(compile::start_compile(re), s) }
--- a/src/matching.rs Mon Aug 21 20:34:59 2017 +0200 +++ b/src/matching.rs Wed Nov 08 19:17:19 2017 +0100 @@ -179,6 +179,7 @@ use super::*; use repr::*; use state::*; + use compile::*; use parse; fn simple_re0() -> Pattern {
--- a/src/parse.rs Mon Aug 21 20:34:59 2017 +0200 +++ b/src/parse.rs Wed Nov 08 19:17:19 2017 +0100 @@ -283,6 +283,7 @@ #[cfg(test)] mod tests { use super::*; + use compile::*; use repr::*; use state::dot;
--- a/src/repr.rs Mon Aug 21 20:34:59 2017 +0200 +++ b/src/repr.rs Wed Nov 08 19:17:19 2017 +0100 @@ -1,29 +1,7 @@ -//! The repr module is concerned with the representation of parsed regular expressions and their -//! compilation into a state graph. The state graph itself is defined in the state module. +//! The repr module is concerned with the representation of parsed regular expressions. A Pattern +//! is compiled by the `compile` module into a state graph defined in `state`. #![allow(dead_code)] -use matcher::{self, wrap_matcher}; -use state::{wrap_state, Compile, State, Submatch, WrappedState}; - -/// start_compile takes a parsed regex as RETree and returns the first node of a directed graph -/// representing the regex. -pub fn start_compile(re: &Pattern) -> WrappedState { - let (s, sp) = re.to_state(); - - let mut before = State::default(); - before.sub = Some(Submatch::Start); - before.out = Some(s); - let mut end = State::default(); - end.sub = Some(Submatch::End); - - let endw = wrap_state(end); - // Connect all loose ends with the final node. - for p in sp { - p.borrow_mut().patch(endw.clone()); - } - wrap_state(before) -} - /// A Pattern is either a repeated pattern, a stored submatch, an alternation between two patterns, /// two patterns following each other, or a character range or set. #[derive(Clone, Debug, PartialEq)] @@ -55,137 +33,6 @@ End, } -impl Compile for Pattern { - fn to_state(&self) -> (WrappedState, Vec<WrappedState>) { - match *self { - Pattern::Concat(ref ps) => { - if ps.is_empty() { - panic!("invalid Concat len: 0") - } else if ps.len() == 1 { - return ps[0].to_state(); - } - - let (init, mut lastp) = ps[0].to_state(); - for i in 1..ps.len() { - let (next, nextp) = ps[i].to_state(); - // Connect all loose ends with the new node. - for p in lastp { - p.borrow_mut().patch(next.clone()); - } - // Remember the loose ends of this one. - lastp = nextp; - } - (init, lastp) - } - Pattern::Any => { - let s = wrap_state(State { - out: None, - out1: None, - matcher: wrap_matcher(Box::new(matcher::AnyMatcher)), - sub: None, - }); - (s.clone(), vec![s]) - } - Pattern::Char(c) => { - let s = wrap_state(State { - out: None, - out1: None, - matcher: wrap_matcher(Box::new(matcher::CharMatcher(c))), - sub: None, - }); - (s.clone(), vec![s]) - } - Pattern::Str(ref s) => { - let s = wrap_state(State { - out: None, - out1: None, - matcher: wrap_matcher(Box::new(matcher::StringMatcher::new(s))), - sub: None, - }); - (s.clone(), vec![s]) - } - Pattern::CharRange(from, to) => { - let s = wrap_state(State { - out: None, - out1: None, - matcher: wrap_matcher(Box::new(matcher::CharRangeMatcher(from, to))), - sub: None, - }); - (s.clone(), vec![s]) - } - Pattern::CharSet(ref set) => { - let s = wrap_state(State { - out: None, - out1: None, - matcher: wrap_matcher(Box::new(matcher::CharSetMatcher(set.clone()))), - sub: None, - }); - (s.clone(), vec![s]) - } - Pattern::Alternate(ref r) => alternate(&r, &vec![]), - Pattern::Submatch(ref p) => { - let (s, sp) = p.to_state(); - let before = wrap_state(State { - out: Some(s), - out1: None, - matcher: None, - sub: Some(Submatch::Start), - }); - let after = wrap_state(State { - out: None, - out1: None, - matcher: None, - sub: Some(Submatch::End), - }); - for p in sp { - p.borrow_mut().patch(after.clone()); - } - (before, vec![after]) - } - Pattern::Repeated(ref p) => p.to_state(), - Pattern::Anchor(ref loc) => { - let mut m = matcher::AnchorMatcher::Begin; - match loc { - &AnchorLocation::End => m = matcher::AnchorMatcher::End, - _ => (), - }; - let s = wrap_state(State { - out: None, - out1: None, - matcher: wrap_matcher(Box::new(m)), - sub: None, - }); - (s.clone(), vec![s]) - } - } - } -} - -// alternate compiles a list of patterns into a graph that accepts any one of the patterns. -fn alternate(ps: &[Pattern], to_patch: &[WrappedState]) -> (WrappedState, Vec<WrappedState>) { - if ps.len() == 1 { - let (s, sp) = ps[0].to_state(); - for e in to_patch { - e.borrow_mut().patch(s.clone()); - } - (s, sp) - } else { - let mut init = State { - out: None, - out1: None, - matcher: None, - sub: None, - }; - let mid = ps.len() / 2; - let (left, mut leftpatch) = alternate(&ps[..mid], &vec![]); - let (right, mut rightpatch) = alternate(&ps[mid..], &vec![]); - init.patch(left); - init.patch(right); - leftpatch.append(&mut rightpatch); - (wrap_state(init), leftpatch) - } -} - /// A pattern can be repeated in various manners, which is represented by the pattern being wrapped /// in a Repetition. /// The inner type is a pattern, because a repetition is either concerned with only one pattern @@ -202,92 +49,6 @@ Specific(Pattern, u32, Option<u32>), } -impl Repetition { - fn to_state(&self) -> (WrappedState, Vec<WrappedState>) { - match *self { - Repetition::ZeroOrOnce(ref p) => { - let (s, to_patch) = p.to_state(); - let after = wrap_state(State { - out: None, - out1: None, - matcher: None, - sub: None, - }); - let before = wrap_state(State { - out: Some(s.clone()), - out1: Some(after.clone()), - matcher: None, - sub: None, - }); - for p in to_patch { - p.borrow_mut().patch(after.clone()); - } - (before, vec![after]) - } - Repetition::ZeroOrMore(ref p) => { - let (s, to_patch) = p.to_state(); - let before = wrap_state(State { - out: Some(s.clone()), - out1: None, - matcher: None, - sub: None, - }); - let after = wrap_state(State { - out: Some(s.clone()), - out1: None, - matcher: None, - sub: None, - }); - before.borrow_mut().patch(after.clone()); - for p in to_patch { - p.borrow_mut().patch(after.clone()); - } - (before, vec![after]) - } - Repetition::OnceOrMore(ref p) => { - let (s, to_patch) = p.to_state(); - let after = wrap_state(State { - out: Some(s.clone()), - out1: None, - matcher: None, - sub: None, - }); - for p in to_patch { - p.borrow_mut().patch(after.clone()); - } - (s, vec![after]) - } - // Specific is 'min' concatenations of a simple state and 'max - min' concatenations of - // a ZeroOrOnce state. - Repetition::Specific(ref p, min, max_) => { - let cap = max_.unwrap_or(min) as usize; - assert!(cap >= min as usize); - let mut repetition = Vec::with_capacity(cap); - - // Append the minimum required number of occurrences. - for _ in 0..min { - repetition.push(p.clone()); - } - - // If an upper limit is set, append max-min repetitions of ZeroOrOnce states for - // the repeated pattern. - if let Some(max) = max_ { - for _ in 0..(max - min) { - repetition.push( - Pattern::Repeated( - Box::new(Repetition::ZeroOrOnce(p.clone())))); - } - } else { - // If no upper limit is set, append a ZeroOrMore state for the repeated - // pattern. - repetition.push(Pattern::Repeated(Box::new(Repetition::ZeroOrMore(p.clone())))); - } - Pattern::Concat(repetition).to_state() - } - } - } -} - /// optimize contains functionality for optimizing the representation of regular expressions for /// more efficient matching. pub mod optimize { @@ -475,6 +236,8 @@ )) } + use compile::start_compile; + #[test] fn test_re1() { // println!("{:?}", start_compile(simple_re0()));
--- a/src/state.rs Mon Aug 21 20:34:59 2017 +0200 +++ b/src/state.rs Wed Nov 08 19:17:19 2017 +0100 @@ -12,13 +12,6 @@ use matcher::{Matchee, Matcher}; -/// Types implementing Compile can be compiled into a state graph. -pub trait Compile { - /// to_state returns the start node of a subgraph, and a list of pointers that need to be - /// connected to the following subgraph. The list can contain the first tuple element. - fn to_state(&self) -> (WrappedState, Vec<WrappedState>); -} - /// State is a single state that the evaluation can be in. It contains several output states as /// well as a matcher. #[derive(Debug, Default, Clone)]