changeset 37:423f7851fe6d draft

Genericize ParseState and implement first prototype of stream parser.
author Lewin Bormann <lbo@spheniscida.de>
date Wed, 22 May 2019 22:48:28 +0200
parents 859c9eaa90c2
children 74801fe3d0dc
files pcombinators/combinators.py pcombinators/state.py
diffstat 2 files changed, 121 insertions(+), 10 deletions(-) [+]
line wrap: on
line diff
--- a/pcombinators/combinators.py	Wed May 22 22:01:10 2019 +0200
+++ b/pcombinators/combinators.py	Wed May 22 22:48:28 2019 +0200
@@ -7,8 +7,6 @@
 by all Parser's parse() method.
 """
 
-from pcombinators.state import ParseState
-
 class Parser:
     """Super class for all parsers. Implements operator overloading for easier
     chaining of parsers."""
--- a/pcombinators/state.py	Wed May 22 22:01:10 2019 +0200
+++ b/pcombinators/state.py	Wed May 22 22:48:28 2019 +0200
@@ -6,11 +6,127 @@
 @author: lbo
 """
 
+import io
+
 def ps(s):
     return ParseState(s)
 
-class ParseState:
-    """Encapsulates state as the parser goes through input."""
+class _State:
+    """Generic parsing state: a cursor over the input plus backtracking holds."""
+
+    _holds = [] # Ascending indices still needed. Class-level default; hold() rebinds it.
+
+    def next(self):
+        pass # Subclasses return the element at the cursor and advance past it.
+
+    def peek(self):
+        raise NotImplementedError() # Element at the cursor, without advancing.
+
+    def index(self):
+        raise NotImplementedError() # Position of the cursor in the whole input.
+
+    def len(self):
+        raise NotImplementedError() # Length of the input.
+
+    # Holds are a simple garbage collection mechanism by which parsers should
+    # indicate which parts of state they may still backtrack to.
+    class ParserHold:
+        def __init__(self, i):
+            self.total_index = i
+        total_index = 0
+
+    def _maybe_collect(self):
+        pass # Hook run after every release(); subclasses may free input here.
+
+    def hold(self):
+        self._holds = sorted(self._holds + [self.index()]) # New per-instance list.
+        return self.ParserHold(self.index())
+
+    def release(self, hold):
+        self._holds.remove(hold.total_index) # By value; it is not a list position.
+        self._maybe_collect()
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        return self.next()
+
+    def finished(self):
+        return self.index() == self.len()
+
+    def remaining(self):
+        raise NotImplementedError() # Input from the cursor onwards.
+
+    class ParseException(Exception):
+        pass
+
+    def error(self, msg):
+        raise self.ParseException(msg) # Nested class: must be reached via self.
+
+    def reset(self, ix=None):
+        raise NotImplementedError('use holds!')
+
+class ParseFileState(_State):
+    """A lazy parsing state implementation, reading from stream."""
+    _fobj = None # Stream being parsed; closed again in __del__.
+    _buf = [] # List of characters. Class-level default; replaced per instance.
+
+    _index = 0 # Index in current _buf
+    _total_offset = 0 # Index of first _buf entry in stream since start
+
+    def __init__(self, f):
+        # Fresh lists per instance: the class-level defaults would be shared.
+        self._buf, self._holds = [], []
+        if not isinstance(f, (str, io.IOBase)):
+            raise NotImplementedError('unknown input source {}'.format(f))
+        # A str is a path that is opened for reading; a stream is used as is.
+        self._fobj = open(f, 'r') if isinstance(f, str) else f
+
+    def __del__(self):
+        if self._fobj:
+            self._fobj.close()
+
+    def _maybe_collect(self):
+        # Forget buffered input nobody can backtrack to any more: everything
+        # before the oldest hold, or before the cursor if no hold is left.
+        oldest = min(self._holds, default=self.index())
+        # Never drop unread input, even if a hold lies ahead of the cursor.
+        to_clean = min(oldest - self._total_offset, self._index)
+        if to_clean > 0:
+            self._buf = self._buf[to_clean:]
+            self._total_offset += to_clean # Keeps index() unchanged.
+            self._index -= to_clean
+
+    def index(self):
+        return self._total_offset + self._index
+
+    PREFILL = 256
+
+    def fill_buffer(self, min=0):
+        if len(self._buf)-self._index <= min: # At most `min` unread entries left.
+            self._buf.extend(self._fobj.read(self.PREFILL))
+
+    def peek(self):
+        self.fill_buffer()
+        return self._buf[self._index] # IndexError once the stream is exhausted.
+
+    def next(self):
+        self.fill_buffer()
+        self._index += 1
+        return self._buf[self._index-1]
+
+    def remaining(self):
+        print('warning: remaining() on ParseFileState is only accurate to up to {} characters lookahead and expensive'.format(self.PREFILL))
+        self.fill_buffer(self.PREFILL)
+        return self._buf[self._index:]
+
+    def len(self):
+        print('warning: len() is inaccurate on ParseFileState, returning only past and present state')
+        return self._total_offset + len(self._buf)
+
+class ParseState(_State):
+    """Encapsulates state as the parser goes through input supplied as string."""
 
     _input = ''
     _index = 0
@@ -37,6 +153,9 @@
     def index(self):
         return self._index
 
+    def len(self):
+        return len(self._input) # Builtin len(): total length of the input string.
+
     def reset(self, ix):
         self._index = ix
 
@@ -53,9 +172,3 @@
         if self.finished():
             return ''
         return self._input[self._index:]
-
-    class ParseException(Exception):
-        pass
-
-    def error(self, msg):
-        raise ParseException(msg)