Refactor Parser/pgen and add documentation and explanations

To improve the readability and maintainability of the parser generator, perform the following transformations: * Separate the metagrammar parser into its own class to simplify the parser generator logic. * Create separate classes for DFAs and NFAs and move methods that act exclusively on them from the parser generator to these classes. * Add docstrings and comments documenting the process to go from the grammar file into NFAs and then DFAs. Detail some of the algorithms and give some background explanations of some concepts that will help readers not familiar with the parser generation process. * Select more descriptive names for some variables. * PEP8 formatting and quote-style homogenization. The output of the parser generator remains the same (Include/graminit.h and Python/graminit.c remain untouched by running the new parser generator).
python · Aug 22, 2019 · 7d7106e · 7d7106e
1 parent e1c638d
commit 7d7106e
Show file tree

Hide file tree

Showing 7 changed files with 745 additions and 332 deletions.
diff --git a/Parser/pgen/__main__.py b/Parser/pgen/__main__.py
@@ -8,17 +8,15 @@ def main():
     parser.add_argument(
         "grammar", type=str, help="The file with the grammar definition in EBNF format"
     )
-    parser.add_argument(
-        "tokens", type=str, help="The file with the token definitions"
-    )
+    parser.add_argument("tokens", type=str, help="The file with the token definitions")
     parser.add_argument(
         "graminit_h",
-        type=argparse.FileType('w'),
+        type=argparse.FileType("w"),
         help="The path to write the grammar's non-terminals as #defines",
     )
     parser.add_argument(
         "graminit_c",
-        type=argparse.FileType('w'),
+        type=argparse.FileType("w"),
         help="The path to write the grammar as initialized data",
     )
 

diff --git a/Parser/pgen/automata.py b/Parser/pgen/automata.py
@@ -0,0 +1,365 @@
+"""Classes representing different state-machine concepts"""
+
+class NFA:
+    """A non-deterministic finite automaton.
+
+    A non-deterministic automaton is a finite state machine that is
+    not required to follow the rules that make a state machine
+    deterministic:
+
+       * Each of the transitions is uniquely determined by
+         the source state and input symbol
+       * Reading an input symbol is required for each state
+         transition (no epsilon transitions).
+
+    The class assumes that there is only one starting state and one
+    accepting (ending) state.
+
+    Attributes:
+        name (str): The name of the rule the NFA is representing
+            (taken from the starting state's ``rule_name``).
+        start (NFAState): The starting state.
+        end (NFAState): The ending state.
+    """
+
+    def __init__(self, start, end):
+        self.name = start.rule_name
+        self.start = start
+        self.end = end
+
+    def __repr__(self):
+        return "NFA(start={}, end={})".format(self.start, self.end)
+
+    def dump(self, writer=print):
+        """Dump a textual representation of the NFA.
+
+        States are numbered in the order in which they are discovered
+        while walking the arcs from the starting state.
+
+        Parameters:
+            writer (Callable): A print-like callable that receives the
+                pieces of every output line as positional arguments.
+        """
+        # *todo* doubles as the work queue and as the numbering table:
+        # the index of a state in the list is the number that is printed.
+        todo = [self.start]
+        for i, state in enumerate(todo):  # NB todo grows while we iterate
+            writer("  State", i, "(final)" if state is self.end else "")
+            for arc in state.arcs:
+                target = arc.target
+                if target in todo:
+                    j = todo.index(target)
+                else:
+                    j = len(todo)
+                    todo.append(target)
+                # The label comes from the arc itself; an epsilon
+                # transition (label None) is printed without a label.
+                if arc.label is None:
+                    writer("    -> %d" % j)
+                else:
+                    writer("    %s -> %d" % (arc.label, j))
+
+
+class NFAArc:
+    """A directed transition from one NFA state to another.
+
+    Two kinds of transition exist between NFA states:
+
+        * A label transition: an input equal to the label must
+          be consumed to perform the transition.
+        * An epsilon transition: the transition can be taken without
+          consuming any input.
+
+    Attributes:
+        target (NFAState): The state the arc points to.
+        label (Optional[str]): The label that must be consumed to follow
+            the arc. An epsilon transition is represented using `None`.
+    """
+
+    def __init__(self, target, label):
+        self.label = label
+        self.target = target
+
+    def __repr__(self):
+        return "<{}: {}>".format(type(self).__name__, self.label)
+
+
+class NFAState:
+    """A state of a NFA.
+
+    Attributes:
+        rule_name: The name of the rule the NFA containing this
+            state is representing.
+        arcs (List[NFAArc]): The outgoing arcs of this state, in the
+            order in which they were added. Each arc records its target
+            state and its (optional) label.
+    """
+
+    def __init__(self, rule_name):
+        self.arcs = []
+        self.rule_name = rule_name
+
+    def add_arc(self, target, label=None):
+        """Connect this NFA state to *target* with a new outgoing arc.
+
+        The arc is appended to the list of arcs of this state.
+
+        Parameters:
+            target (NFAState): The end of the transition that the arc represents.
+            label (Optional[str]): The label that must be consumed for making
+                the transition. If the label is not provided the transition is
+                assumed to be an epsilon-transition.
+        """
+        assert label is None or isinstance(label, str)
+        assert isinstance(target, NFAState)
+        new_arc = NFAArc(target, label)
+        self.arcs.append(new_arc)
+
+    def __repr__(self):
+        return "<{}: from {}>".format(type(self).__name__, self.rule_name)
+
+
+class DFA:
+    """A deterministic finite automaton
+
+    A deterministic finite automaton is a form of a finite state machine
+    that obeys the following rules:
+
+       * Each of the transitions is uniquely determined by
+         the source state and input symbol
+       * Reading an input symbol is required for each state
+         transition (no epsilon transitions).
+
+    The finite-state machine accepts or rejects strings of symbols
+    and only produces a unique computation of the automaton for each input
+    string. The DFA must have a unique starting state (represented as the first
+    element in the list of states) but can have multiple accepting states.
+
+    Attributes:
+        name (str): The name of the rule the DFA is representing.
+        states (List[DFAState]): A collection of DFA states.
+    """
+
+    def __init__(self, name, states):
+        self.name = name
+        self.states = states
+
+    @classmethod
+    def from_nfa(cls, nfa):
+        """Constructs a DFA from a NFA using the Rabin–Scott construction algorithm.
+
+        To simulate the operation of a DFA on a given input string, one needs to keep
+        track of a single state at any time: the state that the automaton will reach after
+        seeing a prefix of the input. In contrast, to simulate an NFA, one needs to keep
+        track of a set of states: all of the states that the automaton could reach after
+        seeing the same prefix of the input, according to the nondeterministic choices made
+        by the automaton. There are two possible sources of non-determinism:
+
+        1) Multiple (two or more) transitions with the same label
+
+                         'A'     +-------+
+                    +----------->+ State +----------->+
+                    |            |   2   |
+            +-------+            +-------+
+            | State |
+            |   1   |            +-------+
+            +-------+            | State |
+                    +----------->+   3   +----------->+
+                         'A'     +-------+
+
+        2) Epsilon transitions (transitions that can be taken without consuming any input)
+
+            +-------+            +-------+
+            | State |     ε      | State |
+            |   1   +----------->+   2   +----------->+
+            +-------+            +-------+
+
+
+        In the first case the problem is that given the input 'A' we don't know which
+        transition we should follow, while in the second case the problem is that we can
+        choose either to follow the transition or not. To solve this problem we can
+        imagine that we follow all possibilities at the same time and we construct new
+        states from the set of all possible reachable states. For every case in the previous
+        example:
+
+
+        1) For multiple transitions with the same label we collapse all of the final
+           states under the same one
+
+            +-------+            +-------+
+            | State |     'A'    | State |
+            |   1   +----------->+  2-3  +----------->+
+            +-------+            +-------+
+
+        2) For epsilon transitions we collapse all epsilon-reachable states into the
+           same one
+
+            +-------+
+            | State |
+            |  1-2  +----------->
+            +-------+
+
+        Because the DFA states consist of sets of NFA states, an n-state NFA may be converted
+        to a DFA with at most 2**n states. Notice that the constructed DFA is not minimal and
+        can be simplified afterwards.
+
+        Parameters:
+            nfa (NFA): The NFA to transform to DFA.
+
+        Returns:
+            A new instance of *cls* named after the NFA, whose first state is
+            built from the epsilon-closure of the NFA starting state.
+        """
+        assert isinstance(nfa, NFA)
+
+        def add_closure(nfa_state, base_nfa_set):
+            """Calculate the epsilon-closure of a given state
+
+            Add to the *base_nfa_set* the state *nfa_state* itself and all
+            the states that are reachable from it via epsilon-transitions.
+            The set is modified in place; states already in the set are
+            not visited again, which also stops the recursion on cycles.
+            """
+            assert isinstance(nfa_state, NFAState)
+            if nfa_state in base_nfa_set:
+                return
+            base_nfa_set.add(nfa_state)
+            for nfa_arc in nfa_state.arcs:
+                if nfa_arc.label is None:
+                    add_closure(nfa_arc.target, base_nfa_set)
+
+        # Calculate the epsilon-closure of the starting state
+        base_nfa_set = set()
+        add_closure(nfa.start, base_nfa_set)
+
+        # Start by visiting the NFA starting state (there is only one).
+        states = [DFAState(nfa.name, base_nfa_set, nfa.end)]
+
+        for state in states:  # NB states grows while we're iterating
+
+            # Find transitions from the current state to other reachable states
+            # and store them in a mapping that correlates the label to all the
+            # possible reachable states that can be obtained by consuming a
+            # token equal to the label. Each set of all the states that can
+            # be reached after following a label will be a DFA state.
+            arcs = {}
+            for nfa_state in state.nfa_set:
+                for nfa_arc in nfa_state.arcs:
+                    if nfa_arc.label is not None:
+                        nfa_set = arcs.setdefault(nfa_arc.label, set())
+                        # All states that can be reached by epsilon-transitions
+                        # are also included in the set of reachable states.
+                        add_closure(nfa_arc.target, nfa_set)
+
+            # Now create new DFA states by visiting all possible transitions
+            # between the current DFA state and the new power-set states (each
+            # nfa_set) via the different labels. As new states are appended to
+            # the end of *states* and visited in that order, this performs a
+            # breadth-first traversal over the reachable part of the power-set
+            # of the states of the original NFA. The labels are sorted so the
+            # order in which states are created is deterministic.
+            for label, nfa_set in sorted(arcs.items()):
+                for exisisting_state in states:
+                    if exisisting_state.nfa_set == nfa_set:
+                        # The DFA state already exists for this rule.
+                        next_state = exisisting_state
+                        break
+                else:
+                    # No existing DFA state was built from this set of NFA
+                    # states: create it and queue it to be visited later.
+                    next_state = DFAState(nfa.name, nfa_set, nfa.end)
+                    states.append(next_state)
+
+                # Add a transition between the current DFA state and the new
+                # DFA state (the power-set state) via the current label.
+                state.add_arc(next_state, label)
+
+        return cls(nfa.name, states)
+
+    def __iter__(self):
+        return iter(self.states)
+
+    def simplify(self):
+        """Attempt to reduce the number of states of the DFA
+
+        Transform the DFA into an equivalent DFA that has fewer states. There are two
+        classes of states that can be removed or merged from the original DFA without
+        affecting the language it accepts to minimize it.
+
+            * Unreachable states are the states that are not reachable from the initial
+              state of the DFA, for any input string.
+            * Nondistinguishable states are those that cannot be distinguished from one
+              another for any input string.
+
+        This algorithm does not achieve the optimal solution, but works well enough for
+        the particularities of the Python grammar. The algorithm consists of repeatedly
+        looking for two states that have the same set of arcs (same labels pointing to the
+        same nodes) and unifying them, until things stop changing.
+
+        The list of states is modified in place.
+        """
+        changes = True
+        while changes:
+            changes = False
+            for i, state_i in enumerate(self.states):
+                for j in range(i + 1, len(self.states)):
+                    state_j = self.states[j]
+                    if state_i == state_j:
+                        # state_j duplicates state_i: drop it and redirect
+                        # every arc that pointed at it to state_i.
+                        del self.states[j]
+                        for state in self.states:
+                            state.unifystate(state_j, state_i)
+                        changes = True
+                        # The list just shrank, so the remaining values of j
+                        # are stale: stop this scan. Setting *changes* makes
+                        # the while loop run at least one more full pass.
+                        break
+
+    def dump(self, writer=print):
+        """Dump a textual representation of the DFA
+
+        Parameters:
+            writer (Callable): A print-like callable that receives the
+                pieces of every output line as positional arguments.
+        """
+        for i, state in enumerate(self.states):
+            writer("  State", i, state.is_final and "(final)" or "")
+            for label, next in sorted(state.arcs.items()):
+                writer("    %s -> %d" % (label, self.states.index(next)))
+
+
+class DFAState:
+    """A state of a DFA
+
+    Attributes:
+        rule_name: The name of the rule the DFA containing this
+            state is representing.
+        nfa_set (Set[NFAState]): The set of NFA states this state was created from.
+        is_final (bool): True if the state represents an accepting state of the
+            DFA containing this state, i.e. if the accepting NFA state is
+            part of *nfa_set*.
+        arcs (Dict[str, DFAState]): A mapping representing transitions between
+            the current DFA state and another DFA state via following a label.
+    """
+
+    def __init__(self, rule_name, nfa_set, final):
+        """Create a DFA state from a set of NFA states.
+
+        Parameters:
+            rule_name: The name of the rule being represented.
+            nfa_set (Set[NFAState]): The NFA states collapsed into this state.
+            final (NFAState): The accepting state of the originating NFA.
+        """
+        assert isinstance(nfa_set, set)
+        assert isinstance(next(iter(nfa_set)), NFAState)
+        assert isinstance(final, NFAState)
+        self.rule_name = rule_name
+        self.nfa_set = nfa_set
+        self.arcs = {}  # map from terminals/nonterminals to DFAState
+        self.is_final = final in nfa_set
+
+    def add_arc(self, target, label):
+        """Add a new arc to the current state.
+
+        Parameters:
+            target (DFAState): The DFA state at the end of the arc.
+            label (str): The label representing the token that must be consumed
+                to perform this transition.
+        """
+        assert isinstance(label, str)
+        assert label not in self.arcs
+        assert isinstance(target, DFAState)
+        self.arcs[label] = target
+
+    def unifystate(self, old, new):
+        """Replace all arcs from the current node to *old* with *new*.
+
+        Parameters:
+            old (DFAState): The DFA state to remove from all existing arcs.
+            new (DFAState): The DFA state to replace in all existing arcs.
+        """
+        # Only values of existing keys are reassigned, so the size of the
+        # dictionary does not change while it is being iterated.
+        for label, next_ in self.arcs.items():
+            if next_ is old:
+                self.arcs[label] = new
+
+    def __eq__(self, other):
+        """Return True if both states are interchangeable in the DFA.
+
+        Two states are considered equal when both are (or both are not)
+        final and they have the same labels pointing to the very same
+        (identical) target states. The nfa_set does not matter for equality.
+        """
+        # Follow the comparison protocol for foreign types instead of
+        # asserting: asserts are stripped when running with -O.
+        if not isinstance(other, DFAState):
+            return NotImplemented
+        if self.is_final != other.is_final:
+            return False
+        # We cannot just return self.arcs == other.arcs because that
+        # would invoke this method recursively if there are any cycles.
+        if len(self.arcs) != len(other.arcs):
+            return False
+        for label, next_ in self.arcs.items():
+            if next_ is not other.arcs.get(label):
+                return False
+        return True
+
+    __hash__ = None  # For Py3 compatibility.
+
+    def __repr__(self):
+        return "<%s: %s is_final=%s>" % (
+            self.__class__.__name__,
+            self.rule_name,
+            self.is_final,
+        )