changeset 73:0ca6749edbb1

Add more documentation.
author Lewin Bormann <lbo@spheniscida.de>
date Sun, 01 Sep 2019 22:15:03 +0200
parents 6ce648912de8
children 15032c4aa33a
files src/compile.rs src/lib.rs src/matching.rs src/parse.rs src/repr.rs src/state.rs
diffstat 6 files changed, 58 insertions(+), 26 deletions(-) [+]
line wrap: on
line diff
--- a/src/compile.rs	Sun Sep 01 21:44:27 2019 +0200
+++ b/src/compile.rs	Sun Sep 01 22:15:03 2019 +0200
@@ -1,3 +1,13 @@
+//! The compile module is responsible for taking a `Pattern` and compiling it into a `StateGraph`
+//! (as defined in `state`). It is recommended to optimize Patterns before compiling them.
+//!
+//! The output of `to_state()`, which is implemented for Pattern and some subtypes, is a
+//! `StateGraph`, which itself is a vector of `State` objects, each of which in turn is a state of
+//! the generated state machine. The states reference each other by their indices in the
+//! `StateGraph`.
+//!
+//! `start_compile()` is the entry point and public API of this module.
+
 use matcher::{self, wrap_matcher};
 use repr::{AnchorLocation, Pattern, Repetition};
 use state::{State, StateGraph, StateRef, Submatch};
@@ -157,7 +167,7 @@
     }
 }
 
-// alternate compiles a list of patterns into a graph that accepts any one of the patterns.
+/// alternate compiles a list of patterns into a graph that accepts any one of the patterns.
 fn alternate(
     sg: &mut StateGraph,
     ps: &[Pattern],
--- a/src/lib.rs	Sun Sep 01 21:44:27 2019 +0200
+++ b/src/lib.rs	Sun Sep 01 22:15:03 2019 +0200
@@ -9,6 +9,8 @@
 
 mod tests;
 
+/// Render the state machine generated from `re` as graphviz `dot` input. The result can be pasted
+/// into `visualize.sh`, which renders a PNG image from it.
 pub fn render_graph(re: &str) -> String {
     return format!(
         "digraph st {{ {} }}",
@@ -16,6 +18,8 @@
     );
 }
 
+/// Translate a regular expression string into an unoptimized `Pattern`. This is useful for
+/// inspecting (Pattern implements `Debug`) the parser output if there are unexpected effects.
 fn parse(re: &str) -> Result<repr::Pattern, String> {
     return parse::parse(re);
 }
@@ -39,8 +43,8 @@
     ));
 }
 
-/// Compile a regular expression into a representation that can be directly used for matching with
-/// `match_re()`.
+/// Optimize and compile a regular expression into a representation that can be directly used for
+/// matching with `match_re()`.
 pub fn compile(re: &str) -> Result<state::CompiledRE, String> {
     Ok(compile::start_compile(&repr::optimize::optimize(parse(
         re,
--- a/src/matching.rs	Sun Sep 01 21:44:27 2019 +0200
+++ b/src/matching.rs	Sun Sep 01 22:15:03 2019 +0200
@@ -11,19 +11,27 @@
 use matcher::Matchee;
 use state::{StateGraph, StateRef, Submatch};
 
+/// MatchState stores a state in the overall algorithm while matching a string ("matchee") against
+/// a regular expression. Every time there is more than one forward state (e.g. optional
+/// repetitions), the state can be "forked", meaning a new state is produced, that can be used
+/// later if some part of the regular expression doesn't match and we need to go back.
 #[derive(Clone, Debug)]
 pub struct MatchState {
+    /// The current node in the StateGraph.
     node: StateRef,
+    /// String that we are working on and position in it.
     matchee: Matchee,
-    // The set of submatches encountered, indexed by the start of a submatch. If submatches
-    // (with (start,end)) (1,3),(5,10) have been encountered, then submatches[1] = Some(3) and
-    // submatches[5] = Some(10). If the contents is None, then the end has not yet been
-    // encountered.
-    // BUG: This doesn't work for several submatches starting at the same position. For that, we'd
-    // need a Rc<RefCell<Vec<Vec<usize>>>> :-)
+    /// The set of submatches encountered, indexed by the start of a submatch. If submatches
+    /// (with (start,end)) (1,3),(5,10) have been encountered, then submatches[1] = Some(3) and
+    /// submatches[5] = Some(10). If the contents is None, then the end has not yet been
+    /// encountered.
+    ///
+    /// BUG: This doesn't work for several submatches starting at the same position. For that, we'd
+    /// need a Rc<RefCell<Vec<Vec<usize>>>> :-)
     submatches: Rc<RefCell<Vec<Option<usize>>>>,
-    // We need to clone the submatches queue only rarely (when a submatch starts or ends).
+    /// We need to clone the submatches queue only rarely (when a submatch starts or ends).
     submatches_todo: Rc<Vec<usize>>,
+    /// Currently unused
     debug: bool,
 }
 
--- a/src/parse.rs	Sun Sep 01 21:44:27 2019 +0200
+++ b/src/parse.rs	Sun Sep 01 22:15:03 2019 +0200
@@ -10,13 +10,16 @@
 
 use repr::{AnchorLocation, Pattern, Repetition};
 
+/// The entry point for this module: Parse a string into a `Pattern` that can be optimized and/or
+/// compiled.
 pub fn parse(s: &str) -> Result<Pattern, String> {
     let src: Vec<char> = s.chars().collect();
     parse_re(ParseState::new(&src)).map(|t| t.0)
 }
 
-/// ParseStack contains already parsed elements of a regular expression. It can be converted to an
-/// RETree.
+/// ParseStack contains already parsed elements of a regular expression, and is used for parsing
+/// textual regular expressions (as the parsing algorithm is stack-based). It can be converted to
+/// an Pattern.
 struct ParseStack {
     s: Vec<Pattern>,
 }
@@ -36,7 +39,7 @@
     fn empty(&self) -> bool {
         self.s.is_empty()
     }
-    fn to_retree(mut self) -> Pattern {
+    fn to_pattern(mut self) -> Pattern {
         if self.s.len() > 1 {
             Pattern::Concat(self.s)
         } else if self.s.len() == 1 {
@@ -119,8 +122,8 @@
     }
 }
 
-// parse_re is the parser entry point; like all parser functions, it returns either a pair of
-// (parsed pattern, new ParseState) or an error string.
+/// parse_re is the parser entry point; like all parser functions, it returns either a pair of
+/// (parsed pattern, new ParseState) or an error string.
 fn parse_re<'a>(mut s: ParseState<'a>) -> Result<(Pattern, ParseState<'a>), String> {
     // The stack assists us in parsing the linear parts of a regular expression, e.g. non-pattern
     // characters, or character sets.
@@ -169,7 +172,7 @@
             // alternation between what we've already seen and the stuff on the right.
             '|' => {
                 let (rest, newst) = parse_re(s.from(1))?;
-                let left = stack.to_retree();
+                let left = stack.to_pattern();
                 stack = ParseStack::new();
                 stack.push(Pattern::Alternate(vec![left, rest]));
                 s = newst;
@@ -217,11 +220,11 @@
             }
         }
     }
-    Ok((stack.to_retree(), s))
+    Ok((stack.to_pattern(), s))
 }
 
-// parse_char_set parses the character set at the start of the input state.
-// Valid states are [a], [ab], [a-z], [-a-z], [a-z-] and [a-fh-kl].
+/// parse_char_set parses the character set at the start of the input state.
+/// Valid states are [a], [ab], [a-z], [-a-z], [a-z-] and [a-fh-kl].
 fn parse_char_set<'a>(s: ParseState<'a>) -> Result<(Pattern, ParseState<'a>), String> {
     if let Some((cs, rest)) = split_in_parens(s.clone(), SQUARE_BRACKETS) {
         let mut chars: Vec<char> = vec![];
@@ -261,7 +264,7 @@
     }
 }
 
-// Parse a repetition spec inside curly braces: {1} | {1,} | {,1} | {1,2}
+/// Parse a repetition spec inside curly braces: {1} | {1,} | {,1} | {1,2}
 fn parse_specific_repetition<'a>(rep: ParseState<'a>, p: Pattern) -> Result<Pattern, String> {
     let mut nparts = 0;
     let mut parts: [Option<&[char]>; 2] = Default::default();
@@ -330,12 +333,15 @@
     Err(format!("invalid repetition pattern {:?}", &rep[..]))
 }
 
+/// Constants for generalizing parsing of parentheses.
 const ROUND_PARENS: (char, char) = ('(', ')');
+/// Constants for generalizing parsing of parentheses.
 const SQUARE_BRACKETS: (char, char) = ('[', ']');
+/// Constants for generalizing parsing of parentheses.
 const CURLY_BRACKETS: (char, char) = ('{', '}');
 
-// split_in_parens returns two new ParseStates; the first one containing the contents of the
-// parenthesized clause starting at s[0], the second one containing the rest.
+/// split_in_parens returns two new ParseStates; the first one containing the contents of the
+/// parenthesized clause starting at s[0], the second one containing the rest.
 fn split_in_parens<'a>(
     s: ParseState<'a>,
     parens: (char, char),
@@ -347,8 +353,8 @@
     }
 }
 
-// find_closing_paren returns the index of the parenthesis closing the opening parenthesis at the
-// beginning of the state's string.
+/// find_closing_paren returns the index of the parenthesis closing the opening parenthesis at the
+/// beginning of the state's string.
 fn find_closing_paren<'a>(s: ParseState<'a>, parens: (char, char)) -> Option<usize> {
     if s[0] != parens.0 {
         return None;
--- a/src/repr.rs	Sun Sep 01 21:44:27 2019 +0200
+++ b/src/repr.rs	Sun Sep 01 22:15:03 2019 +0200
@@ -27,6 +27,7 @@
     Anchor(AnchorLocation),
 }
 
+/// `AnchorLocation` encodes `^` and `$` anchors, respectively.
 #[derive(Clone, Debug, PartialEq)]
 pub enum AnchorLocation {
     Begin,
@@ -35,8 +36,9 @@
 
 /// A pattern can be repeated in various manners, which is represented by the pattern being wrapped
 /// in a Repetition.
+///
 /// The inner type is a pattern, because a repetition is either concerned with only one pattern
-/// (/.?/), or a submatch (/(abc)?/).
+/// (`/.?/`), or a submatch (`/(abc)?/`).
 #[derive(Clone, Debug, PartialEq)]
 pub enum Repetition {
     /// /P+/
--- a/src/state.rs	Sun Sep 01 21:44:27 2019 +0200
+++ b/src/state.rs	Sun Sep 01 22:15:03 2019 +0200
@@ -36,6 +36,8 @@
     pub sub: Option<Submatch>,
 }
 
+/// A `State` can be marked to start or end a submatch (usually denoted by parentheses in a regular
+/// expression).
 #[derive(Clone, Debug)]
 pub enum Submatch {
     Start,
@@ -88,7 +90,7 @@
     }
 }
 
-/// dot converts a graph starting with s into a Dot graph.
+/// dot converts a graph into a graphviz dot representation.
 pub fn dot(stateg: &StateGraph) -> String {
     let mut result = String::new();