changeset 36:d65f832b5e2e

Implement buffer garbage collection. This makes it more memory efficient to parse streams when the parser regularly advances and drops references to earlier parts of the input (e.g., in CSV files). This currently has a small performance penalty for inputs where garbage is collected.
author Lewin Bormann <lewin@lewin-bormann.info>
date Fri, 07 Jun 2019 18:15:13 +0200
parents 2e935e681f9b
children 1d67d854b5d7
files src/combinators.rs src/lib.rs src/state.rs
diffstat 3 files changed, 44 insertions(+), 10 deletions(-) [+]
line wrap: on
line diff
--- a/src/combinators.rs	Fri Jun 07 18:10:11 2019 +0200
+++ b/src/combinators.rs	Fri Jun 07 18:15:13 2019 +0200
@@ -574,14 +574,11 @@
         let mut i = 0;
         let mut ps = ParseState::new("123 124");
         let lzy = || {
-                assert_eq!(0, i);
-                i += 1;
-                string_of("0123456789", RepeatSpec::Min(1))
-            };
-        let mut p = Alternative::new((
-            string_of("a", RepeatSpec::Min(1)),
-            Lazy::new(lzy),
-        ));
+            assert_eq!(0, i);
+            i += 1;
+            string_of("0123456789", RepeatSpec::Min(1))
+        };
+        let mut p = Alternative::new((string_of("a", RepeatSpec::Min(1)), Lazy::new(lzy)));
         assert_eq!(Ok("123".to_string()), p.parse(&mut ps));
         assert!(whitespace().parse(&mut ps).is_ok());
         assert_eq!(Ok("124".to_string()), p.parse(&mut ps));
--- a/src/lib.rs	Fri Jun 07 18:10:11 2019 +0200
+++ b/src/lib.rs	Fri Jun 07 18:15:13 2019 +0200
@@ -56,6 +56,6 @@
 mod state;
 
 pub use combinators::{Alternative, Maybe, PartialSequence, Repeat, Sequence, Then, Transform};
-pub use parser::{execerr, Parser, ParseResult};
+pub use parser::{execerr, ParseResult, Parser};
 pub use primitives::{float, string_none_of, string_of, whitespace, Int, StringParser};
 pub use state::ParseState;
--- a/src/state.rs	Fri Jun 07 18:10:11 2019 +0200
+++ b/src/state.rs	Fri Jun 07 18:15:13 2019 +0200
@@ -85,6 +85,9 @@
 
 impl<Iter: Iterator<Item = char>> ParseState<Iter> {
     const PREFILL_DEFAULT: usize = 1024;
+    /// Only collect buffer garbage when collectable number of bytes in buffer is larger than this
+    /// threshold.
+    const GARBAGE_COLLECT_THRESHOLD: usize = 1024 * 4;
 
     /// Return current index in input.
     pub fn index(&mut self) -> usize {
@@ -112,7 +115,7 @@
             }
             Some((ix, count)) if ix == h.ix && count == 1 => {
                 self.oldest_hold_count = None;
-                // TODO: trigger garbage collection
+                self.maybe_gc();
             }
             _ => {}
         }
@@ -136,6 +139,40 @@
         h.defuse();
     }
 
+    /// Remove data from buffer that is not hold by any parser anymore.
+    fn maybe_gc(&mut self) -> bool {
+        match self.oldest_hold_count {
+            None if self.current > 0 => {
+                if self.current < Self::GARBAGE_COLLECT_THRESHOLD {
+                    return false;
+                }
+                //println!("GC: Collecting {} elements between {} and {}", self.current-1, self.global, self.global+self.current-1);
+                // We can collect up to self.current-1.
+                self.buf.rotate_left(self.current - 1);
+                self.buf
+                    .resize(self.buf.len() - self.current + 1, 0 as char);
+                self.current = 1;
+                // self.global remains untouched.
+                true
+            }
+            Some((ix, _)) => {
+                // We can collect up to ix-1.
+
+                // Calculate local offset (in buf)
+                let ix = ix - self.global;
+                if ix < Self::GARBAGE_COLLECT_THRESHOLD {
+                    return false;
+                }
+                //println!("GC: Collecting {} elements between {} and {}", ix-1, self.global-self.buf.len()+1, ix-1);
+                self.buf.rotate_left(ix - 1);
+                self.buf.resize(self.buf.len() - ix + 1, 0 as char);
+                self.current = 1;
+                true
+            }
+            _ => false,
+        }
+    }
+
     /// Returns true if no input is left.
     pub fn finished(&self) -> bool {
         self.next.is_none() && self.current == self.buf.len()