Mercurial > lbo > hg > rcombinators
changeset 13:5eb364ffbdcb
Improve Integer parsing and extend it to all integer types
author | Lewin Bormann <lewin@lewin-bormann.info> |
---|---|
date | Sun, 02 Jun 2019 23:11:00 +0200 |
parents | a37d7c2aa256 |
children | 4122b5f7562b |
files | src/combinators.rs src/primitives.rs |
diffstat | 2 files changed, 137 insertions(+), 28 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/src/combinators.rs Sun Jun 02 15:19:00 2019 +0200
+++ b/src/combinators.rs Sun Jun 02 23:11:00 2019 +0200
@@ -1,5 +1,3 @@
-use std::ops::Shr;
-
 use crate::parser::{ParseError, ParseResult, Parser};
 use crate::state::ParseState;
 
@@ -312,7 +310,7 @@
 
     #[test]
     fn test_pair() {
-        let mut p = Sequence::new((Int, StringParser::new(" aba".to_string())));
+        let mut p = Sequence::new((Int64::new(), StringParser::new(" aba".to_string())));
         let mut ps = ParseState::new("123 aba");
         assert_eq!(Ok((123, " aba".to_string())), p.parse(&mut ps));
     }
@@ -345,7 +343,7 @@
             StringParser::new("ab"),
             StringParser::new("de"),
             StringParser::new(" "),
-            Transform::new(Int, |i| Ok(i.to_string())),
+            Transform::new(Int64::new(), |i| Ok(i.to_string())),
         ));
         let mut ps = ParseState::new("de 34");
         assert_eq!(Ok("de".to_string()), p.parse(&mut ps));
@@ -355,7 +353,7 @@
 
     #[test]
     fn test_partial_sequence() {
-        let mut p = PartialSequence::new((StringParser::new("a"), StringParser::new("c"), Int));
+        let mut p = PartialSequence::new((StringParser::new("a"), StringParser::new("c"), Int64::new()));
         let mut ps = ParseState::new("acde");
         assert_eq!(
             Ok((Some("a".to_string()), Some("c".to_string()), None)),
@@ -363,7 +361,7 @@
         );
 
         let mut p = PartialSequence::new((
-            Sequence::new((Int, StringParser::new(" "), Int)),
+            Sequence::new((Int64::new(), StringParser::new(" "), Int64::new())),
             StringParser::new("x"),
         ));
         let mut ps = ParseState::new("12 -12 nothing else");
```
```diff
--- a/src/primitives.rs Sun Jun 02 15:19:00 2019 +0200
+++ b/src/primitives.rs Sun Jun 02 23:11:00 2019 +0200
@@ -3,7 +3,9 @@
 use crate::state::ParseState;
 
 use std::collections::HashSet;
+use std::error::Error;
 use std::iter::FromIterator;
+use std::str;
 
 pub struct StringParser(String);
 
@@ -44,29 +46,75 @@
     }
 }
 
-pub struct Int;
+pub struct Int<IType: Default + str::FromStr>(IType);
 
-impl Parser for Int {
-    type Result = i64;
+pub type Int128 = Int<i128>;
+pub type Int64 = Int<i64>;
+pub type Int32 = Int<i32>;
+pub type Int16 = Int<i16>;
+pub type Int8 = Int<i8>;
+pub type Uint128 = Int<u128>;
+pub type Uint64 = Int<u64>;
+pub type Uint32 = Int<u32>;
+pub type Uint16 = Int<u16>;
+pub type Uint8 = Int<u8>;
+
+impl<IType: Default + str::FromStr> Int<IType> {
+    pub fn new() -> Int<IType> {
+        Int(IType::default())
+    }
+}
+
+impl<IType: Default + str::FromStr<Err = std::num::ParseIntError> + std::convert::TryFrom<i8>>
+    Parser for Int<IType>
+{
+    type Result = IType;
     fn parse(
         &mut self,
         st: &mut ParseState<impl Iterator<Item = char>>,
     ) -> ParseResult<Self::Result> {
-        let mut negative: i64 = 1;
-        let mut result: i64 = 0;
+        // Optimization for most ints.
+        const BUFSIZE: usize = 8;
+        let mut buf: [char; BUFSIZE] = [' '; BUFSIZE];
+        let mut widebuf: Option<Vec<char>> = None;
+        let mut i = 0;
 
-        match st.peek() {
-            None => return Err(ParseError::EOF),
-            Some('-') => negative = -1,
-            Some(c) if c.is_digit(10) => result = result * 10 + ((c as i64) - ('0' as i64)),
-            Some(_) => return Err(ParseError::Fail("not an int", st.index())),
+        if IType::try_from(-1 as i8).is_ok() {
+            // Check for negative sign, only if integer can be signed.
+            match st.peek() {
+                None => return Err(ParseError::EOF),
+                Some('-') => {
+                    buf[i] = '-';
+                    i += 1;
+                }
+                Some(c) if c.is_digit(10) => {
+                    buf[i] = c;
+                    i += 1;
+                }
+                Some(_) => return Err(ParseError::Fail("not start of integer", st.index())),
+            }
         }
+
         let hold = st.hold();
-        st.next();
+        if i > 0 {
+            st.next();
+        }
 
+        // Consume digits
         loop {
             match st.next() {
-                Some(c) if c.is_digit(10) => result = result * 10 + ((c as i64) - ('0' as i64)),
+                Some(c) if c.is_digit(10) => {
+                    if widebuf.is_none() {
+                        buf[i] = c;
+                        i += 1;
+                        if i >= BUFSIZE {
+                            widebuf = Some(buf.to_vec());
+                        }
+                    } else {
+                        widebuf.as_mut().unwrap().push(c);
+                        i += 1;
+                    }
+                }
                 Some(_) => {
                     st.undo_next();
                     break;
@@ -74,16 +122,39 @@
                 None => break,
             }
         }
-        st.release(hold);
-        return Ok(result * negative);
+        if i == 0 {
+            st.reset(hold);
+            return Err(ParseError::Fail("no appropriate integer found", st.index()));
+        }
+        let intstr: String;
+        if widebuf.is_none() {
+            intstr = buf[..i].iter().collect();
+        } else {
+            intstr = widebuf.unwrap().iter().collect();
+        }
+        match IType::from_str(&intstr) {
+            Ok(i) => {
+                st.release(hold);
+                Ok(i)
+            }
+            Err(e) => {
+                st.reset(hold);
+                Err(ParseError::ExecFail(e.description().to_string()))
+            }
+        }
     }
 }
 
-pub struct OneOf(HashSet<char>);
+/// OneOf matches any character that is in its specification.
+pub struct OneOf(HashSet<char>, bool);
 
 impl OneOf {
     pub fn new<S: AsRef<str>>(chars: S) -> OneOf {
-        OneOf(chars.as_ref().chars().collect())
+        OneOf(chars.as_ref().chars().collect(), false)
+    }
+    /// Create a OneOf parser that parses all characters *not* in the given set.
+    pub fn new_inverse<S: AsRef<str>>(chars: S) -> OneOf {
+        OneOf(chars.as_ref().chars().collect(), true)
     }
 }
 
@@ -94,11 +165,24 @@
         st: &mut ParseState<impl Iterator<Item = char>>,
     ) -> ParseResult<Self::Result> {
         match st.peek() {
-            Some(c) if self.0.contains(&c) => {
-                st.next();
-                Ok(c)
+            Some(c) => {
+                let present = self.0.contains(&c);
+                if self.1 {
+                    // Inverse mode
+                    if present {
+                        return Err(ParseError::Fail("char (inverse) not matched", st.index()));
+                    }
+                    st.next();
+                    return Ok(c);
+                } else {
+                    if present {
+                        st.next();
+                        return Ok(c);
+                    }
+                    return Err(ParseError::Fail("char not matched", st.index()));
+                }
             }
-            _ => Err(ParseError::Fail("char not matched", st.index())),
+            _ => Err(ParseError::EOF),
         }
     }
 }
@@ -111,6 +195,14 @@
     rp.apply(make_string)
 }
 
+/// A parser that parses a string consisting of any characters not in the set.
+fn string_none_of<S: AsRef<str>>(chars: S, rp: RepeatSpec) -> impl Parser<Result = String> {
+    let oo = OneOf::new_inverse(chars);
+    let rp = Repeat::new(oo, rp);
+    let make_string = |charvec: Vec<char>| Ok(String::from_iter(charvec.into_iter()));
+    rp.apply(make_string)
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -125,12 +217,24 @@
 
     #[test]
     fn test_parse_int() {
-        let mut s = ParseState::new("-1252 353");
-        let mut ip = Int;
+        let mut s = ParseState::new("-1252 353 354 -1253");
+        let mut ip = Int64::new();
+        let mut up = Uint64::new();
         let mut sp = StringParser::new(" ".to_string());
         assert_eq!(Ok(-1252), ip.parse(&mut s));
         assert_eq!(Ok(" ".to_string()), sp.parse(&mut s));
         assert_eq!(Ok(353), ip.parse(&mut s));
+        assert_eq!(Ok(" ".to_string()), sp.parse(&mut s));
+        assert_eq!(Ok(354), up.parse(&mut s));
+        assert_eq!(Ok(" ".to_string()), sp.parse(&mut s));
+        assert!(up.parse(&mut s).is_err());
+    }
+
+    #[test]
+    fn test_parse_long_int() {
+        let mut s = ParseState::new("123456789");
+        let mut up = Uint128::new();
+        assert_eq!(Ok(123456789 as u128), up.parse(&mut s));
     }
 
     #[test]
@@ -139,4 +243,11 @@
         let mut p = string_of("abcd", RepeatSpec::Min(1));
         assert_eq!(Ok("aaabc".to_string()), p.parse(&mut st));
     }
+
+    #[test]
+    fn test_string_none_of() {
+        let mut st = ParseState::new("aaabcxxzy");
+        let mut p = string_none_of("xyz", RepeatSpec::Min(1));
+        assert_eq!(Ok("aaabc".to_string()), p.parse(&mut st));
+    }
 }
```