Mercurial > lbo > hg > rex
changeset 88:437eceef26e0
Move optimize code into separate crate
author | Lewin Bormann <lbo@spheniscida.de> |
---|---|
date | Wed, 09 Dec 2020 22:01:30 +0100 |
parents | d111720bc9ef |
children | ea0c08e43479 |
files | src/lib.rs src/optimize.rs src/repr.rs |
diffstat | 3 files changed, 157 insertions(+), 157 deletions(-) [+] |
line wrap: on
line diff
--- a/src/lib.rs Wed Dec 09 17:56:41 2020 +0100 +++ b/src/lib.rs Wed Dec 09 22:01:30 2020 +0100 @@ -3,6 +3,7 @@ mod compile; mod matcher; mod matching; +mod optimize; mod parse; mod repr; mod state; @@ -44,17 +45,14 @@ /// regular expression will be compiled every time. Use `compile()` and `match_re()` to make this /// more efficient (about 3x faster). pub fn match_re_str(re: &str, s: &str) -> Result<(bool, Vec<(usize, usize)>), String> { - return Ok(compile_and_match( - &repr::optimize::optimize(parse::parse(re)?), - s, - )); + return Ok(compile_and_match(&optimize::optimize(parse::parse(re)?), s)); } /// Optimize and compile a regular expression into a representation that can be directly used for /// matching with `match_re()`. pub fn compile(re: &str) -> Result<state::CompiledRE, String> { Ok(state::CompiledRE(compile::start_compile( - &repr::optimize::optimize(parse(re)?), + &optimize::optimize(parse(re)?), ))) }
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/optimize.rs Wed Dec 09 22:01:30 2020 +0100 @@ -0,0 +1,154 @@ +/// optimize contains functionality for optimizing the representation of regular expressions for +/// more efficient matching. +use std::iter::{FromIterator, Iterator}; +use std::ops::Deref; + +use crate::repr::{Pattern, Repetition}; + +pub fn optimize(mut p: Pattern) -> Pattern { + p = concat_chars_to_str(p); + p = flatten_alternate(p); + p = optimize_recursively(p); + p +} + +/// optimize_recursively applies optimize() to the inner Patterns of a Pattern. +fn optimize_recursively(p: Pattern) -> Pattern { + match p { + Pattern::Concat(ps) => Pattern::Concat(ps.into_iter().map(optimize).collect()), + Pattern::Submatch(bp) => { + let sub = optimize(bp.deref().clone()); + Pattern::Submatch(Box::new(sub)) + } + Pattern::Alternate(ps) => Pattern::Alternate(ps.into_iter().map(optimize).collect()), + Pattern::Repeated(r) => { + let rep = r.deref().clone(); + Pattern::Repeated(Box::new(match rep { + Repetition::ZeroOrOnce(rp) => Repetition::ZeroOrOnce(optimize(rp)), + Repetition::ZeroOrMore(rp) => Repetition::ZeroOrMore(optimize(rp)), + Repetition::OnceOrMore(rp) => Repetition::OnceOrMore(optimize(rp)), + Repetition::Specific(rp, min, max) => Repetition::Specific(optimize(rp), min, max), + })) + } + + p => p, + } +} + +/// concat_chars_to_str collapses successive single-character patterns into a single string +/// pattern. +fn concat_chars_to_str(p: Pattern) -> Pattern { + match p { + Pattern::Concat(mut v) => { + let mut drain = v.drain(..); + let mut new_elems = vec![]; + + // Find runs of adjacent chars/strings and convert them to single strings. + // Once a run is broken, append non-char/string patterns and continue afterwards. + let mut chars = vec![]; + for cp in &mut drain { + match cp { + Pattern::Char(c) => chars.push(c), + Pattern::Str(mut s) => { + chars.extend(s.drain(..)); + } + e => { + // Once a run of chars/strings is broken, merge the run, push the + // non-char/string pattern and continue with the next one. + if chars.len() == 1 { + new_elems.push(Pattern::Char(chars.pop().unwrap())) + } else if chars.len() > 1 { + let newp = Pattern::Str(String::from_iter(chars.drain(..))); + new_elems.push(newp); + } + new_elems.push(e); + assert!(chars.is_empty()); + } + } + } + + if chars.len() == 1 { + new_elems.push(Pattern::Char(chars[0])); + } else if chars.len() > 1 { + let newp = Pattern::Str(String::from_iter(chars.drain(..))); + new_elems.push(newp); + } + + if new_elems.len() == 1 { + new_elems.pop().unwrap() + } else { + Pattern::Concat(new_elems) + } + } + _ => p, + } +} + +/// flatten_alternate takes the alternatives in a Pattern::Alternate and reduces the nesting +/// recursively. +fn flatten_alternate(p: Pattern) -> Pattern { + fn _flatten_alternate(p: Pattern) -> Vec<Pattern> { + match p { + Pattern::Alternate(a) => { + let mut alternatives = vec![]; + for alt in a.into_iter() { + alternatives.append(&mut _flatten_alternate(alt)); + } + alternatives + } + p_ => vec![p_], + } + } + + let mut fa = _flatten_alternate(p); + if fa.len() == 1 { + fa.pop().unwrap() + } else { + Pattern::Alternate(fa) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::repr::AnchorLocation; + + #[test] + fn test_repr_optimize() { + // case = (want, input) + let case1 = ( + Pattern::Str("abc".to_string()), + Pattern::Concat(vec![ + Pattern::Char('a'), + Pattern::Char('b'), + Pattern::Char('c'), + ]), + ); + let case2 = ( + Pattern::Str("abcd".to_string()), + Pattern::Concat(vec![ + Pattern::Str("a".to_string()), + Pattern::Char('b'), + Pattern::Str("cd".to_string()), + ]), + ); + let case3 = ( + Pattern::Concat(vec![ + Pattern::Str("abc".to_string()), + Pattern::Anchor(AnchorLocation::End), + Pattern::Char('d'), + ]), + Pattern::Concat(vec![ + Pattern::Char('a'), + Pattern::Char('b'), + Pattern::Char('c'), + Pattern::Anchor(AnchorLocation::End), + Pattern::Char('d'), + ]), + ); + + for c in vec![case1, case2, case3].into_iter() { + assert_eq!(c.0, optimize(c.1)); + } + } +}
--- a/src/repr.rs Wed Dec 09 17:56:41 2020 +0100 +++ b/src/repr.rs Wed Dec 09 22:01:30 2020 +0100 @@ -51,163 +51,11 @@ Specific(Pattern, u32, Option<u32>), } -/// optimize contains functionality for optimizing the representation of regular expressions for -/// more efficient matching. -pub mod optimize { - use super::*; - use std::iter::{FromIterator, Iterator}; - use std::ops::Deref; - - pub fn optimize(mut p: Pattern) -> Pattern { - p = concat_chars_to_str(p); - p = flatten_alternate(p); - p = optimize_recursively(p); - p - } - - /// optimize_recursively applies optimize() to the inner Patterns of a Pattern. - fn optimize_recursively(p: Pattern) -> Pattern { - match p { - Pattern::Concat(ps) => Pattern::Concat(ps.into_iter().map(optimize).collect()), - Pattern::Submatch(bp) => { - let sub = optimize(bp.deref().clone()); - Pattern::Submatch(Box::new(sub)) - } - Pattern::Alternate(ps) => Pattern::Alternate(ps.into_iter().map(optimize).collect()), - Pattern::Repeated(r) => { - let rep = r.deref().clone(); - Pattern::Repeated(Box::new(match rep { - Repetition::ZeroOrOnce(rp) => Repetition::ZeroOrOnce(optimize(rp)), - Repetition::ZeroOrMore(rp) => Repetition::ZeroOrMore(optimize(rp)), - Repetition::OnceOrMore(rp) => Repetition::OnceOrMore(optimize(rp)), - Repetition::Specific(rp, min, max) => { - Repetition::Specific(optimize(rp), min, max) - } - })) - } - - p => p, - } - } - - /// concat_chars_to_str collapses successive single-character patterns into a single string - /// pattern. - fn concat_chars_to_str(p: Pattern) -> Pattern { - match p { - Pattern::Concat(mut v) => { - let mut drain = v.drain(..); - let mut new_elems = vec![]; - - // Find runs of adjacent chars/strings and convert them to single strings. - // Once a run is broken, append non-char/string patterns and continue afterwards. - let mut chars = vec![]; - for cp in &mut drain { - match cp { - Pattern::Char(c) => chars.push(c), - Pattern::Str(mut s) => { - chars.extend(s.drain(..)); - } - e => { - // Once a run of chars/strings is broken, merge the run, push the - // non-char/string pattern and continue with the next one. - if chars.len() == 1 { - new_elems.push(Pattern::Char(chars.pop().unwrap())) - } else if chars.len() > 1 { - let newp = Pattern::Str(String::from_iter(chars.drain(..))); - new_elems.push(newp); - } - new_elems.push(e); - assert!(chars.is_empty()); - } - } - } - - if chars.len() == 1 { - new_elems.push(Pattern::Char(chars[0])); - } else if chars.len() > 1 { - let newp = Pattern::Str(String::from_iter(chars.drain(..))); - new_elems.push(newp); - } - - if new_elems.len() == 1 { - new_elems.pop().unwrap() - } else { - Pattern::Concat(new_elems) - } - } - _ => p, - } - } - - /// flatten_alternate takes the alternatives in a Pattern::Alternate and reduces the nesting - /// recursively. - fn flatten_alternate(p: Pattern) -> Pattern { - fn _flatten_alternate(p: Pattern) -> Vec<Pattern> { - match p { - Pattern::Alternate(a) => { - let mut alternatives = vec![]; - for alt in a.into_iter() { - alternatives.append(&mut _flatten_alternate(alt)); - } - alternatives - } - p_ => vec![p_], - } - } - - let mut fa = _flatten_alternate(p); - if fa.len() == 1 { - fa.pop().unwrap() - } else { - Pattern::Alternate(fa) - } - } -} - #[cfg(test)] mod tests { use super::*; use crate::state::*; - #[test] - fn test_repr_optimize() { - // case = (want, input) - let case1 = ( - Pattern::Str("abc".to_string()), - Pattern::Concat(vec![ - Pattern::Char('a'), - Pattern::Char('b'), - Pattern::Char('c'), - ]), - ); - let case2 = ( - Pattern::Str("abcd".to_string()), - Pattern::Concat(vec![ - Pattern::Str("a".to_string()), - Pattern::Char('b'), - Pattern::Str("cd".to_string()), - ]), - ); - let case3 = ( - Pattern::Concat(vec![ - Pattern::Str("abc".to_string()), - Pattern::Anchor(AnchorLocation::End), - Pattern::Char('d'), - ]), - Pattern::Concat(vec![ - Pattern::Char('a'), - Pattern::Char('b'), - Pattern::Char('c'), - Pattern::Anchor(AnchorLocation::End), - Pattern::Char('d'), - ]), - ); - - for c in vec![case1, case2, case3].into_iter() { - assert_eq!(c.0, optimize::optimize(c.1)); - } - } - // /a(b|c)/ fn simple_re0() -> Pattern { Pattern::Concat(vec![