Mercurial > lbo > hg > rex
changeset 73:0ca6749edbb1
Add more documentation.
author | Lewin Bormann <lbo@spheniscida.de> |
---|---|
date | Sun, 01 Sep 2019 22:15:03 +0200 |
parents | 6ce648912de8 |
children | 15032c4aa33a |
files | src/compile.rs src/lib.rs src/matching.rs src/parse.rs src/repr.rs src/state.rs |
diffstat | 6 files changed, 58 insertions(+), 26 deletions(-) [+] |
line wrap: on
line diff
--- a/src/compile.rs Sun Sep 01 21:44:27 2019 +0200 +++ b/src/compile.rs Sun Sep 01 22:15:03 2019 +0200 @@ -1,3 +1,13 @@ +//! The compile module is responsible for taking a `Pattern` and compiling it into a `StateGraph` +//! (as defined in `state`). It is recommended to optimize Patterns before compiling them. +//! +//! The output of `to_state()`, which is implemented for Pattern and some subtypes, is a +//! `StateGraph`, which itself is a vector of `State` objects, each of which in turn is a state of +//! the generated state machine. The states reference each other by their indices in the +//! `StateGraph`. +//! +//! `start_compile()` is the entry point and public API of this module. + use matcher::{self, wrap_matcher}; use repr::{AnchorLocation, Pattern, Repetition}; use state::{State, StateGraph, StateRef, Submatch}; @@ -157,7 +167,7 @@ } } -// alternate compiles a list of patterns into a graph that accepts any one of the patterns. +/// alternate compiles a list of patterns into a graph that accepts any one of the patterns. fn alternate( sg: &mut StateGraph, ps: &[Pattern],
--- a/src/lib.rs Sun Sep 01 21:44:27 2019 +0200 +++ b/src/lib.rs Sun Sep 01 22:15:03 2019 +0200 @@ -9,6 +9,8 @@ mod tests; +/// Render the state machine generated from `re` as graphviz `dot` input. The result can be pasted +/// into `visualize.sh`, which renders a PNG image from it. pub fn render_graph(re: &str) -> String { return format!( "digraph st {{ {} }}", @@ -16,6 +18,8 @@ ); } +/// Translate a regular expression string into an unoptimized `Pattern`. This is useful for +/// inspecting (Pattern implements `Debug`) the parser output if there are unexpected effects. fn parse(re: &str) -> Result<repr::Pattern, String> { return parse::parse(re); } @@ -39,8 +43,8 @@ )); } -/// Compile a regular expression into a representation that can be directly used for matching with -/// `match_re()`. +/// Optimize and compile a regular expression into a representation that can be directly used for +/// matching with `match_re()`. pub fn compile(re: &str) -> Result<state::CompiledRE, String> { Ok(compile::start_compile(&repr::optimize::optimize(parse( re,
--- a/src/matching.rs Sun Sep 01 21:44:27 2019 +0200 +++ b/src/matching.rs Sun Sep 01 22:15:03 2019 +0200 @@ -11,19 +11,27 @@ use matcher::Matchee; use state::{StateGraph, StateRef, Submatch}; +/// MatchState stores a state in the overall algorithm while matching a string ("matchee") against +/// a regular expression. Every time there is more than one forward state (e.g. optional +/// repetitions), the state can be "forked", meaning a new state is produced, that can be used +/// later if some part of the regular expression doesn't match and we need to go back. #[derive(Clone, Debug)] pub struct MatchState { + /// The current node in the StateGraph. node: StateRef, + /// String that we are working on and position in it. matchee: Matchee, - // The set of submatches encountered, indexed by the start of a submatch. If submatches - // (with (start,end)) (1,3),(5,10) have been encountered, then submatches[1] = Some(3) and - // submatches[5] = Some(10). If the contents is None, then the end has not yet been - // encountered. - // BUG: This doesn't work for several submatches starting at the same position. For that, we'd - // need a Rc<RefCell<Vec<Vec<usize>>>> :-) + /// The set of submatches encountered, indexed by the start of a submatch. If submatches + /// (with (start,end)) (1,3),(5,10) have been encountered, then submatches[1] = Some(3) and + /// submatches[5] = Some(10). If the contents is None, then the end has not yet been + /// encountered. + /// + /// BUG: This doesn't work for several submatches starting at the same position. For that, we'd + /// need a Rc<RefCell<Vec<Vec<usize>>>> :-) submatches: Rc<RefCell<Vec<Option<usize>>>>, - // We need to clone the submatches queue only rarely (when a submatch starts or ends). + /// We need to clone the submatches queue only rarely (when a submatch starts or ends). submatches_todo: Rc<Vec<usize>>, + /// Currently unused debug: bool, }
--- a/src/parse.rs Sun Sep 01 21:44:27 2019 +0200 +++ b/src/parse.rs Sun Sep 01 22:15:03 2019 +0200 @@ -10,13 +10,16 @@ use repr::{AnchorLocation, Pattern, Repetition}; +/// The entry point for this module: Parse a string into a `Pattern` that can be optimized and/or +/// compiled. pub fn parse(s: &str) -> Result<Pattern, String> { let src: Vec<char> = s.chars().collect(); parse_re(ParseState::new(&src)).map(|t| t.0) } -/// ParseStack contains already parsed elements of a regular expression. It can be converted to an -/// RETree. +/// ParseStack contains already parsed elements of a regular expression, and is used for parsing +/// textual regular expressions (as the parsing algorithm is stack-based). It can be converted to +/// an Pattern. struct ParseStack { s: Vec<Pattern>, } @@ -36,7 +39,7 @@ fn empty(&self) -> bool { self.s.is_empty() } - fn to_retree(mut self) -> Pattern { + fn to_pattern(mut self) -> Pattern { if self.s.len() > 1 { Pattern::Concat(self.s) } else if self.s.len() == 1 { @@ -119,8 +122,8 @@ } } -// parse_re is the parser entry point; like all parser functions, it returns either a pair of -// (parsed pattern, new ParseState) or an error string. +/// parse_re is the parser entry point; like all parser functions, it returns either a pair of +/// (parsed pattern, new ParseState) or an error string. fn parse_re<'a>(mut s: ParseState<'a>) -> Result<(Pattern, ParseState<'a>), String> { // The stack assists us in parsing the linear parts of a regular expression, e.g. non-pattern // characters, or character sets. @@ -169,7 +172,7 @@ // alternation between what we've already seen and the stuff on the right. '|' => { let (rest, newst) = parse_re(s.from(1))?; - let left = stack.to_retree(); + let left = stack.to_pattern(); stack = ParseStack::new(); stack.push(Pattern::Alternate(vec![left, rest])); s = newst; @@ -217,11 +220,11 @@ } } } - Ok((stack.to_retree(), s)) + Ok((stack.to_pattern(), s)) } -// parse_char_set parses the character set at the start of the input state. -// Valid states are [a], [ab], [a-z], [-a-z], [a-z-] and [a-fh-kl]. +/// parse_char_set parses the character set at the start of the input state. +/// Valid states are [a], [ab], [a-z], [-a-z], [a-z-] and [a-fh-kl]. fn parse_char_set<'a>(s: ParseState<'a>) -> Result<(Pattern, ParseState<'a>), String> { if let Some((cs, rest)) = split_in_parens(s.clone(), SQUARE_BRACKETS) { let mut chars: Vec<char> = vec![]; @@ -261,7 +264,7 @@ } } -// Parse a repetition spec inside curly braces: {1} | {1,} | {,1} | {1,2} +/// Parse a repetition spec inside curly braces: {1} | {1,} | {,1} | {1,2} fn parse_specific_repetition<'a>(rep: ParseState<'a>, p: Pattern) -> Result<Pattern, String> { let mut nparts = 0; let mut parts: [Option<&[char]>; 2] = Default::default(); @@ -330,12 +333,15 @@ Err(format!("invalid repetition pattern {:?}", &rep[..])) } +/// Constants for generalizing parsing of parentheses. const ROUND_PARENS: (char, char) = ('(', ')'); +/// Constants for generalizing parsing of parentheses. const SQUARE_BRACKETS: (char, char) = ('[', ']'); +/// Constants for generalizing parsing of parentheses. const CURLY_BRACKETS: (char, char) = ('{', '}'); -// split_in_parens returns two new ParseStates; the first one containing the contents of the -// parenthesized clause starting at s[0], the second one containing the rest. +/// split_in_parens returns two new ParseStates; the first one containing the contents of the +/// parenthesized clause starting at s[0], the second one containing the rest. fn split_in_parens<'a>( s: ParseState<'a>, parens: (char, char), @@ -347,8 +353,8 @@ } } -// find_closing_paren returns the index of the parenthesis closing the opening parenthesis at the -// beginning of the state's string. +/// find_closing_paren returns the index of the parenthesis closing the opening parenthesis at the +/// beginning of the state's string. fn find_closing_paren<'a>(s: ParseState<'a>, parens: (char, char)) -> Option<usize> { if s[0] != parens.0 { return None;
--- a/src/repr.rs Sun Sep 01 21:44:27 2019 +0200 +++ b/src/repr.rs Sun Sep 01 22:15:03 2019 +0200 @@ -27,6 +27,7 @@ Anchor(AnchorLocation), } +/// `AnchorLocation` encodes `^` and `$` anchors, respectively. #[derive(Clone, Debug, PartialEq)] pub enum AnchorLocation { Begin, @@ -35,8 +36,9 @@ /// A pattern can be repeated in various manners, which is represented by the pattern being wrapped /// in a Repetition. +/// /// The inner type is a pattern, because a repetition is either concerned with only one pattern -/// (/.?/), or a submatch (/(abc)?/). +/// (`/.?/`), or a submatch (`/(abc)?/`). #[derive(Clone, Debug, PartialEq)] pub enum Repetition { /// /P+/
--- a/src/state.rs Sun Sep 01 21:44:27 2019 +0200 +++ b/src/state.rs Sun Sep 01 22:15:03 2019 +0200 @@ -36,6 +36,8 @@ pub sub: Option<Submatch>, } +/// A `State` can be marked to start or end a submatch (usually denoted by parentheses in a regular +/// expression). #[derive(Clone, Debug)] pub enum Submatch { Start, @@ -88,7 +90,7 @@ } } -/// dot converts a graph starting with s into a Dot graph. +/// dot converts a graph into a graphviz dot representation. pub fn dot(stateg: &StateGraph) -> String { let mut result = String::new();