First Commit
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,340 @@
|
||||
"""This module implements a CYK parser."""
|
||||
|
||||
# Author: https://github.com/ehudt (2018)
|
||||
#
|
||||
# Adapted by Erez
|
||||
|
||||
|
||||
from collections import defaultdict
|
||||
import itertools
|
||||
|
||||
from ..exceptions import ParseError
|
||||
from ..lexer import Token
|
||||
from ..tree import Tree
|
||||
from ..grammar import Terminal as T, NonTerminal as NT, Symbol
|
||||
|
||||
def match(t, s):
|
||||
assert isinstance(t, T)
|
||||
return t.name == s.type
|
||||
|
||||
|
||||
class Rule:
|
||||
"""Context-free grammar rule."""
|
||||
|
||||
def __init__(self, lhs, rhs, weight, alias):
|
||||
super(Rule, self).__init__()
|
||||
assert isinstance(lhs, NT), lhs
|
||||
assert all(isinstance(x, NT) or isinstance(x, T) for x in rhs), rhs
|
||||
self.lhs = lhs
|
||||
self.rhs = rhs
|
||||
self.weight = weight
|
||||
self.alias = alias
|
||||
|
||||
def __str__(self):
|
||||
return '%s -> %s' % (str(self.lhs), ' '.join(str(x) for x in self.rhs))
|
||||
|
||||
def __repr__(self):
|
||||
return str(self)
|
||||
|
||||
def __hash__(self):
|
||||
return hash((self.lhs, tuple(self.rhs)))
|
||||
|
||||
def __eq__(self, other):
|
||||
return self.lhs == other.lhs and self.rhs == other.rhs
|
||||
|
||||
def __ne__(self, other):
|
||||
return not (self == other)
|
||||
|
||||
|
||||
class Grammar:
|
||||
"""Context-free grammar."""
|
||||
|
||||
def __init__(self, rules):
|
||||
self.rules = frozenset(rules)
|
||||
|
||||
def __eq__(self, other):
|
||||
return self.rules == other.rules
|
||||
|
||||
def __str__(self):
|
||||
return '\n' + '\n'.join(sorted(repr(x) for x in self.rules)) + '\n'
|
||||
|
||||
def __repr__(self):
|
||||
return str(self)
|
||||
|
||||
|
||||
# Parse tree data structures
|
||||
class RuleNode:
|
||||
"""A node in the parse tree, which also contains the full rhs rule."""
|
||||
|
||||
def __init__(self, rule, children, weight=0):
|
||||
self.rule = rule
|
||||
self.children = children
|
||||
self.weight = weight
|
||||
|
||||
def __repr__(self):
|
||||
return 'RuleNode(%s, [%s])' % (repr(self.rule.lhs), ', '.join(str(x) for x in self.children))
|
||||
|
||||
|
||||
|
||||
class Parser:
|
||||
"""Parser wrapper."""
|
||||
|
||||
def __init__(self, rules):
|
||||
super(Parser, self).__init__()
|
||||
self.orig_rules = {rule: rule for rule in rules}
|
||||
rules = [self._to_rule(rule) for rule in rules]
|
||||
self.grammar = to_cnf(Grammar(rules))
|
||||
|
||||
def _to_rule(self, lark_rule):
|
||||
"""Converts a lark rule, (lhs, rhs, callback, options), to a Rule."""
|
||||
assert isinstance(lark_rule.origin, NT)
|
||||
assert all(isinstance(x, Symbol) for x in lark_rule.expansion)
|
||||
return Rule(
|
||||
lark_rule.origin, lark_rule.expansion,
|
||||
weight=lark_rule.options.priority if lark_rule.options.priority else 0,
|
||||
alias=lark_rule)
|
||||
|
||||
def parse(self, tokenized, start): # pylint: disable=invalid-name
|
||||
"""Parses input, which is a list of tokens."""
|
||||
assert start
|
||||
start = NT(start)
|
||||
|
||||
table, trees = _parse(tokenized, self.grammar)
|
||||
# Check if the parse succeeded.
|
||||
if all(r.lhs != start for r in table[(0, len(tokenized) - 1)]):
|
||||
raise ParseError('Parsing failed.')
|
||||
parse = trees[(0, len(tokenized) - 1)][start]
|
||||
return self._to_tree(revert_cnf(parse))
|
||||
|
||||
def _to_tree(self, rule_node):
|
||||
"""Converts a RuleNode parse tree to a lark Tree."""
|
||||
orig_rule = self.orig_rules[rule_node.rule.alias]
|
||||
children = []
|
||||
for child in rule_node.children:
|
||||
if isinstance(child, RuleNode):
|
||||
children.append(self._to_tree(child))
|
||||
else:
|
||||
assert isinstance(child.name, Token)
|
||||
children.append(child.name)
|
||||
t = Tree(orig_rule.origin, children)
|
||||
t.rule=orig_rule
|
||||
return t
|
||||
|
||||
|
||||
def print_parse(node, indent=0):
|
||||
if isinstance(node, RuleNode):
|
||||
print(' ' * (indent * 2) + str(node.rule.lhs))
|
||||
for child in node.children:
|
||||
print_parse(child, indent + 1)
|
||||
else:
|
||||
print(' ' * (indent * 2) + str(node.s))
|
||||
|
||||
|
||||
def _parse(s, g):
|
||||
"""Parses sentence 's' using CNF grammar 'g'."""
|
||||
# The CYK table. Indexed with a 2-tuple: (start pos, end pos)
|
||||
table = defaultdict(set)
|
||||
# Top-level structure is similar to the CYK table. Each cell is a dict from
|
||||
# rule name to the best (lightest) tree for that rule.
|
||||
trees = defaultdict(dict)
|
||||
# Populate base case with existing terminal production rules
|
||||
for i, w in enumerate(s):
|
||||
for terminal, rules in g.terminal_rules.items():
|
||||
if match(terminal, w):
|
||||
for rule in rules:
|
||||
table[(i, i)].add(rule)
|
||||
if (rule.lhs not in trees[(i, i)] or
|
||||
rule.weight < trees[(i, i)][rule.lhs].weight):
|
||||
trees[(i, i)][rule.lhs] = RuleNode(rule, [T(w)], weight=rule.weight)
|
||||
|
||||
# Iterate over lengths of sub-sentences
|
||||
for l in range(2, len(s) + 1):
|
||||
# Iterate over sub-sentences with the given length
|
||||
for i in range(len(s) - l + 1):
|
||||
# Choose partition of the sub-sentence in [1, l)
|
||||
for p in range(i + 1, i + l):
|
||||
span1 = (i, p - 1)
|
||||
span2 = (p, i + l - 1)
|
||||
for r1, r2 in itertools.product(table[span1], table[span2]):
|
||||
for rule in g.nonterminal_rules.get((r1.lhs, r2.lhs), []):
|
||||
table[(i, i + l - 1)].add(rule)
|
||||
r1_tree = trees[span1][r1.lhs]
|
||||
r2_tree = trees[span2][r2.lhs]
|
||||
rule_total_weight = rule.weight + r1_tree.weight + r2_tree.weight
|
||||
if (rule.lhs not in trees[(i, i + l - 1)]
|
||||
or rule_total_weight < trees[(i, i + l - 1)][rule.lhs].weight):
|
||||
trees[(i, i + l - 1)][rule.lhs] = RuleNode(rule, [r1_tree, r2_tree], weight=rule_total_weight)
|
||||
return table, trees
|
||||
|
||||
|
||||
# This section implements context-free grammar converter to Chomsky normal form.
|
||||
# It also implements a conversion of parse trees from its CNF to the original
|
||||
# grammar.
|
||||
# Overview:
|
||||
# Applies the following operations in this order:
|
||||
# * TERM: Eliminates non-solitary terminals from all rules
|
||||
# * BIN: Eliminates rules with more than 2 symbols on their right-hand-side.
|
||||
# * UNIT: Eliminates non-terminal unit rules
|
||||
#
|
||||
# The following grammar characteristics aren't featured:
|
||||
# * Start symbol appears on RHS
|
||||
# * Empty rules (epsilon rules)
|
||||
|
||||
|
||||
class CnfWrapper:
|
||||
"""CNF wrapper for grammar.
|
||||
|
||||
Validates that the input grammar is CNF and provides helper data structures.
|
||||
"""
|
||||
|
||||
def __init__(self, grammar):
|
||||
super(CnfWrapper, self).__init__()
|
||||
self.grammar = grammar
|
||||
self.rules = grammar.rules
|
||||
self.terminal_rules = defaultdict(list)
|
||||
self.nonterminal_rules = defaultdict(list)
|
||||
for r in self.rules:
|
||||
# Validate that the grammar is CNF and populate auxiliary data structures.
|
||||
assert isinstance(r.lhs, NT), r
|
||||
if len(r.rhs) not in [1, 2]:
|
||||
raise ParseError("CYK doesn't support empty rules")
|
||||
if len(r.rhs) == 1 and isinstance(r.rhs[0], T):
|
||||
self.terminal_rules[r.rhs[0]].append(r)
|
||||
elif len(r.rhs) == 2 and all(isinstance(x, NT) for x in r.rhs):
|
||||
self.nonterminal_rules[tuple(r.rhs)].append(r)
|
||||
else:
|
||||
assert False, r
|
||||
|
||||
def __eq__(self, other):
|
||||
return self.grammar == other.grammar
|
||||
|
||||
def __repr__(self):
|
||||
return repr(self.grammar)
|
||||
|
||||
|
||||
class UnitSkipRule(Rule):
|
||||
"""A rule that records NTs that were skipped during transformation."""
|
||||
|
||||
def __init__(self, lhs, rhs, skipped_rules, weight, alias):
|
||||
super(UnitSkipRule, self).__init__(lhs, rhs, weight, alias)
|
||||
self.skipped_rules = skipped_rules
|
||||
|
||||
def __eq__(self, other):
|
||||
return isinstance(other, type(self)) and self.skipped_rules == other.skipped_rules
|
||||
|
||||
__hash__ = Rule.__hash__
|
||||
|
||||
|
||||
def build_unit_skiprule(unit_rule, target_rule):
|
||||
skipped_rules = []
|
||||
if isinstance(unit_rule, UnitSkipRule):
|
||||
skipped_rules += unit_rule.skipped_rules
|
||||
skipped_rules.append(target_rule)
|
||||
if isinstance(target_rule, UnitSkipRule):
|
||||
skipped_rules += target_rule.skipped_rules
|
||||
return UnitSkipRule(unit_rule.lhs, target_rule.rhs, skipped_rules,
|
||||
weight=unit_rule.weight + target_rule.weight, alias=unit_rule.alias)
|
||||
|
||||
|
||||
def get_any_nt_unit_rule(g):
|
||||
"""Returns a non-terminal unit rule from 'g', or None if there is none."""
|
||||
for rule in g.rules:
|
||||
if len(rule.rhs) == 1 and isinstance(rule.rhs[0], NT):
|
||||
return rule
|
||||
return None
|
||||
|
||||
|
||||
def _remove_unit_rule(g, rule):
|
||||
"""Removes 'rule' from 'g' without changing the language produced by 'g'."""
|
||||
new_rules = [x for x in g.rules if x != rule]
|
||||
refs = [x for x in g.rules if x.lhs == rule.rhs[0]]
|
||||
new_rules += [build_unit_skiprule(rule, ref) for ref in refs]
|
||||
return Grammar(new_rules)
|
||||
|
||||
|
||||
def _split(rule):
|
||||
"""Splits a rule whose len(rhs) > 2 into shorter rules."""
|
||||
rule_str = str(rule.lhs) + '__' + '_'.join(str(x) for x in rule.rhs)
|
||||
rule_name = '__SP_%s' % (rule_str) + '_%d'
|
||||
yield Rule(rule.lhs, [rule.rhs[0], NT(rule_name % 1)], weight=rule.weight, alias=rule.alias)
|
||||
for i in range(1, len(rule.rhs) - 2):
|
||||
yield Rule(NT(rule_name % i), [rule.rhs[i], NT(rule_name % (i + 1))], weight=0, alias='Split')
|
||||
yield Rule(NT(rule_name % (len(rule.rhs) - 2)), rule.rhs[-2:], weight=0, alias='Split')
|
||||
|
||||
|
||||
def _term(g):
|
||||
"""Applies the TERM rule on 'g' (see top comment)."""
|
||||
all_t = {x for rule in g.rules for x in rule.rhs if isinstance(x, T)}
|
||||
t_rules = {t: Rule(NT('__T_%s' % str(t)), [t], weight=0, alias='Term') for t in all_t}
|
||||
new_rules = []
|
||||
for rule in g.rules:
|
||||
if len(rule.rhs) > 1 and any(isinstance(x, T) for x in rule.rhs):
|
||||
new_rhs = [t_rules[x].lhs if isinstance(x, T) else x for x in rule.rhs]
|
||||
new_rules.append(Rule(rule.lhs, new_rhs, weight=rule.weight, alias=rule.alias))
|
||||
new_rules.extend(v for k, v in t_rules.items() if k in rule.rhs)
|
||||
else:
|
||||
new_rules.append(rule)
|
||||
return Grammar(new_rules)
|
||||
|
||||
|
||||
def _bin(g):
|
||||
"""Applies the BIN rule to 'g' (see top comment)."""
|
||||
new_rules = []
|
||||
for rule in g.rules:
|
||||
if len(rule.rhs) > 2:
|
||||
new_rules += _split(rule)
|
||||
else:
|
||||
new_rules.append(rule)
|
||||
return Grammar(new_rules)
|
||||
|
||||
|
||||
def _unit(g):
|
||||
"""Applies the UNIT rule to 'g' (see top comment)."""
|
||||
nt_unit_rule = get_any_nt_unit_rule(g)
|
||||
while nt_unit_rule:
|
||||
g = _remove_unit_rule(g, nt_unit_rule)
|
||||
nt_unit_rule = get_any_nt_unit_rule(g)
|
||||
return g
|
||||
|
||||
|
||||
def to_cnf(g):
|
||||
"""Creates a CNF grammar from a general context-free grammar 'g'."""
|
||||
g = _unit(_bin(_term(g)))
|
||||
return CnfWrapper(g)
|
||||
|
||||
|
||||
def unroll_unit_skiprule(lhs, orig_rhs, skipped_rules, children, weight, alias):
|
||||
if not skipped_rules:
|
||||
return RuleNode(Rule(lhs, orig_rhs, weight=weight, alias=alias), children, weight=weight)
|
||||
else:
|
||||
weight = weight - skipped_rules[0].weight
|
||||
return RuleNode(
|
||||
Rule(lhs, [skipped_rules[0].lhs], weight=weight, alias=alias), [
|
||||
unroll_unit_skiprule(skipped_rules[0].lhs, orig_rhs,
|
||||
skipped_rules[1:], children,
|
||||
skipped_rules[0].weight, skipped_rules[0].alias)
|
||||
], weight=weight)
|
||||
|
||||
|
||||
def revert_cnf(node):
|
||||
"""Reverts a parse tree (RuleNode) to its original non-CNF form (Node)."""
|
||||
if isinstance(node, T):
|
||||
return node
|
||||
# Reverts TERM rule.
|
||||
if node.rule.lhs.name.startswith('__T_'):
|
||||
return node.children[0]
|
||||
else:
|
||||
children = []
|
||||
for child in map(revert_cnf, node.children):
|
||||
# Reverts BIN rule.
|
||||
if isinstance(child, RuleNode) and child.rule.lhs.name.startswith('__SP_'):
|
||||
children += child.children
|
||||
else:
|
||||
children.append(child)
|
||||
# Reverts UNIT rule.
|
||||
if isinstance(node.rule, UnitSkipRule):
|
||||
return unroll_unit_skiprule(node.rule.lhs, node.rule.rhs,
|
||||
node.rule.skipped_rules, children,
|
||||
node.rule.weight, node.rule.alias)
|
||||
else:
|
||||
return RuleNode(node.rule, children)
|
||||
@@ -0,0 +1,312 @@
|
||||
"""This module implements an Earley parser.
|
||||
|
||||
The core Earley algorithm used here is based on Elizabeth Scott's implementation, here:
|
||||
https://www.sciencedirect.com/science/article/pii/S1571066108001497
|
||||
|
||||
That is probably the best reference for understanding the algorithm here.
|
||||
|
||||
The Earley parser outputs an SPPF-tree as per that document. The SPPF tree format
|
||||
is explained here: https://lark-parser.readthedocs.io/en/latest/_static/sppf/sppf.html
|
||||
"""
|
||||
|
||||
from typing import TYPE_CHECKING, Callable, Optional, List, Any
|
||||
from collections import deque
|
||||
|
||||
from ..lexer import Token
|
||||
from ..tree import Tree
|
||||
from ..exceptions import UnexpectedEOF, UnexpectedToken
|
||||
from ..utils import logger, OrderedSet, dedup_list
|
||||
from .grammar_analysis import GrammarAnalyzer
|
||||
from ..grammar import NonTerminal
|
||||
from .earley_common import Item
|
||||
from .earley_forest import ForestSumVisitor, SymbolNode, StableSymbolNode, TokenNode, ForestToParseTree
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from ..common import LexerConf, ParserConf
|
||||
|
||||
class Parser:
|
||||
lexer_conf: 'LexerConf'
|
||||
parser_conf: 'ParserConf'
|
||||
debug: bool
|
||||
|
||||
def __init__(self, lexer_conf: 'LexerConf', parser_conf: 'ParserConf', term_matcher: Callable,
|
||||
resolve_ambiguity: bool=True, debug: bool=False,
|
||||
tree_class: Optional[Callable[[str, List], Any]]=Tree, ordered_sets: bool=True):
|
||||
analysis = GrammarAnalyzer(parser_conf)
|
||||
self.lexer_conf = lexer_conf
|
||||
self.parser_conf = parser_conf
|
||||
self.resolve_ambiguity = resolve_ambiguity
|
||||
self.debug = debug
|
||||
self.Tree = tree_class
|
||||
self.Set = OrderedSet if ordered_sets else set
|
||||
self.SymbolNode = StableSymbolNode if ordered_sets else SymbolNode
|
||||
|
||||
self.FIRST = analysis.FIRST
|
||||
self.NULLABLE = analysis.NULLABLE
|
||||
self.callbacks = parser_conf.callbacks
|
||||
# TODO add typing info
|
||||
self.predictions = {} # type: ignore[var-annotated]
|
||||
|
||||
## These could be moved to the grammar analyzer. Pre-computing these is *much* faster than
|
||||
# the slow 'isupper' in is_terminal.
|
||||
self.TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if sym.is_term }
|
||||
self.NON_TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if not sym.is_term }
|
||||
|
||||
self.forest_sum_visitor = None
|
||||
for rule in parser_conf.rules:
|
||||
if rule.origin not in self.predictions:
|
||||
self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)]
|
||||
|
||||
## Detect if any rules/terminals have priorities set. If the user specified priority = None, then
|
||||
# the priorities will be stripped from all rules/terminals before they reach us, allowing us to
|
||||
# skip the extra tree walk. We'll also skip this if the user just didn't specify priorities
|
||||
# on any rules/terminals.
|
||||
if self.forest_sum_visitor is None and rule.options.priority is not None:
|
||||
self.forest_sum_visitor = ForestSumVisitor
|
||||
|
||||
# Check terminals for priorities
|
||||
# Ignore terminal priorities if the basic lexer is used
|
||||
if self.lexer_conf.lexer_type != 'basic' and self.forest_sum_visitor is None:
|
||||
for term in self.lexer_conf.terminals:
|
||||
if term.priority:
|
||||
self.forest_sum_visitor = ForestSumVisitor
|
||||
break
|
||||
|
||||
self.term_matcher = term_matcher
|
||||
|
||||
|
||||
def predict_and_complete(self, i, to_scan, columns, transitives, node_cache):
|
||||
"""The core Earley Predictor and Completer.
|
||||
|
||||
At each stage of the input, we handling any completed items (things
|
||||
that matched on the last cycle) and use those to predict what should
|
||||
come next in the input stream. The completions and any predicted
|
||||
non-terminals are recursively processed until we reach a set of,
|
||||
which can be added to the scan list for the next scanner cycle."""
|
||||
# Held Completions (H in E.Scotts paper).
|
||||
held_completions = {}
|
||||
|
||||
column = columns[i]
|
||||
# R (items) = Ei (column.items)
|
||||
items = deque(column)
|
||||
while items:
|
||||
item = items.pop() # remove an element, A say, from R
|
||||
|
||||
### The Earley completer
|
||||
if item.is_complete: ### (item.s == string)
|
||||
if item.node is None:
|
||||
label = (item.s, item.start, i)
|
||||
item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
|
||||
item.node.add_family(item.s, item.rule, item.start, None, None)
|
||||
|
||||
# create_leo_transitives(item.rule.origin, item.start)
|
||||
|
||||
###R Joop Leo right recursion Completer
|
||||
if item.rule.origin in transitives[item.start]:
|
||||
transitive = transitives[item.start][item.s]
|
||||
if transitive.previous in transitives[transitive.column]:
|
||||
root_transitive = transitives[transitive.column][transitive.previous]
|
||||
else:
|
||||
root_transitive = transitive
|
||||
|
||||
new_item = Item(transitive.rule, transitive.ptr, transitive.start)
|
||||
label = (root_transitive.s, root_transitive.start, i)
|
||||
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
|
||||
new_item.node.add_path(root_transitive, item.node)
|
||||
if new_item.expect in self.TERMINALS:
|
||||
# Add (B :: aC.B, h, y) to Q
|
||||
to_scan.add(new_item)
|
||||
elif new_item not in column:
|
||||
# Add (B :: aC.B, h, y) to Ei and R
|
||||
column.add(new_item)
|
||||
items.append(new_item)
|
||||
###R Regular Earley completer
|
||||
else:
|
||||
# Empty has 0 length. If we complete an empty symbol in a particular
|
||||
# parse step, we need to be able to use that same empty symbol to complete
|
||||
# any predictions that result, that themselves require empty. Avoids
|
||||
# infinite recursion on empty symbols.
|
||||
# held_completions is 'H' in E.Scott's paper.
|
||||
is_empty_item = item.start == i
|
||||
if is_empty_item:
|
||||
held_completions[item.rule.origin] = item.node
|
||||
|
||||
originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s]
|
||||
for originator in originators:
|
||||
new_item = originator.advance()
|
||||
label = (new_item.s, originator.start, i)
|
||||
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
|
||||
new_item.node.add_family(new_item.s, new_item.rule, i, originator.node, item.node)
|
||||
if new_item.expect in self.TERMINALS:
|
||||
# Add (B :: aC.B, h, y) to Q
|
||||
to_scan.add(new_item)
|
||||
elif new_item not in column:
|
||||
# Add (B :: aC.B, h, y) to Ei and R
|
||||
column.add(new_item)
|
||||
items.append(new_item)
|
||||
|
||||
### The Earley predictor
|
||||
elif item.expect in self.NON_TERMINALS: ### (item.s == lr0)
|
||||
new_items = []
|
||||
for rule in self.predictions[item.expect]:
|
||||
new_item = Item(rule, 0, i)
|
||||
new_items.append(new_item)
|
||||
|
||||
# Process any held completions (H).
|
||||
if item.expect in held_completions:
|
||||
new_item = item.advance()
|
||||
label = (new_item.s, item.start, i)
|
||||
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
|
||||
new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect])
|
||||
new_items.append(new_item)
|
||||
|
||||
for new_item in new_items:
|
||||
if new_item.expect in self.TERMINALS:
|
||||
to_scan.add(new_item)
|
||||
elif new_item not in column:
|
||||
column.add(new_item)
|
||||
items.append(new_item)
|
||||
|
||||
def _parse(self, lexer, columns, to_scan, start_symbol=None):
|
||||
|
||||
def is_quasi_complete(item):
|
||||
if item.is_complete:
|
||||
return True
|
||||
|
||||
quasi = item.advance()
|
||||
while not quasi.is_complete:
|
||||
if quasi.expect not in self.NULLABLE:
|
||||
return False
|
||||
if quasi.rule.origin == start_symbol and quasi.expect == start_symbol:
|
||||
return False
|
||||
quasi = quasi.advance()
|
||||
return True
|
||||
|
||||
# def create_leo_transitives(origin, start):
|
||||
# ... # removed at commit 4c1cfb2faf24e8f8bff7112627a00b94d261b420
|
||||
|
||||
def scan(i, token, to_scan):
|
||||
"""The core Earley Scanner.
|
||||
|
||||
This is a custom implementation of the scanner that uses the
|
||||
Lark lexer to match tokens. The scan list is built by the
|
||||
Earley predictor, based on the previously completed tokens.
|
||||
This ensures that at each phase of the parse we have a custom
|
||||
lexer context, allowing for more complex ambiguities."""
|
||||
next_to_scan = self.Set()
|
||||
next_set = self.Set()
|
||||
columns.append(next_set)
|
||||
transitives.append({})
|
||||
node_cache = {}
|
||||
|
||||
for item in self.Set(to_scan):
|
||||
if match(item.expect, token):
|
||||
new_item = item.advance()
|
||||
label = (new_item.s, new_item.start, i + 1)
|
||||
# 'terminals' may not contain token.type when using %declare
|
||||
# Additionally, token is not always a Token
|
||||
# For example, it can be a Tree when using TreeMatcher
|
||||
term = terminals.get(token.type) if isinstance(token, Token) else None
|
||||
# Set the priority of the token node to 0 so that the
|
||||
# terminal priorities do not affect the Tree chosen by
|
||||
# ForestSumVisitor after the basic lexer has already
|
||||
# "used up" the terminal priorities
|
||||
token_node = TokenNode(token, term, priority=0)
|
||||
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
|
||||
new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token_node)
|
||||
|
||||
if new_item.expect in self.TERMINALS:
|
||||
# add (B ::= Aai+1.B, h, y) to Q'
|
||||
next_to_scan.add(new_item)
|
||||
else:
|
||||
# add (B ::= Aa+1.B, h, y) to Ei+1
|
||||
next_set.add(new_item)
|
||||
|
||||
if not next_set and not next_to_scan:
|
||||
expect = {i.expect.name for i in to_scan}
|
||||
raise UnexpectedToken(token, expect, considered_rules=set(to_scan), state=frozenset(i.s for i in to_scan))
|
||||
|
||||
return next_to_scan, node_cache
|
||||
|
||||
|
||||
# Define parser functions
|
||||
match = self.term_matcher
|
||||
|
||||
terminals = self.lexer_conf.terminals_by_name
|
||||
|
||||
# Cache for nodes & tokens created in a particular parse step.
|
||||
transitives = [{}]
|
||||
|
||||
## The main Earley loop.
|
||||
# Run the Prediction/Completion cycle for any Items in the current Earley set.
|
||||
# Completions will be added to the SPPF tree, and predictions will be recursively
|
||||
# processed down to terminals/empty nodes to be added to the scanner for the next
|
||||
# step.
|
||||
expects = {i.expect for i in to_scan}
|
||||
i = 0
|
||||
node_cache = {}
|
||||
for token in lexer.lex(expects):
|
||||
self.predict_and_complete(i, to_scan, columns, transitives, node_cache)
|
||||
|
||||
to_scan, node_cache = scan(i, token, to_scan)
|
||||
i += 1
|
||||
|
||||
expects.clear()
|
||||
expects |= {i.expect for i in to_scan}
|
||||
|
||||
self.predict_and_complete(i, to_scan, columns, transitives, node_cache)
|
||||
|
||||
## Column is now the final column in the parse.
|
||||
assert i == len(columns)-1
|
||||
return to_scan
|
||||
|
||||
def parse(self, lexer, start):
|
||||
assert start, start
|
||||
start_symbol = NonTerminal(start)
|
||||
|
||||
columns = [self.Set()]
|
||||
to_scan = self.Set() # The scan buffer. 'Q' in E.Scott's paper.
|
||||
|
||||
## Predict for the start_symbol.
|
||||
# Add predicted items to the first Earley set (for the predictor) if they
|
||||
# result in a non-terminal, or the scanner if they result in a terminal.
|
||||
for rule in self.predictions[start_symbol]:
|
||||
item = Item(rule, 0, 0)
|
||||
if item.expect in self.TERMINALS:
|
||||
to_scan.add(item)
|
||||
else:
|
||||
columns[0].add(item)
|
||||
|
||||
to_scan = self._parse(lexer, columns, to_scan, start_symbol)
|
||||
|
||||
# If the parse was successful, the start
|
||||
# symbol should have been completed in the last step of the Earley cycle, and will be in
|
||||
# this column. Find the item for the start_symbol, which is the root of the SPPF tree.
|
||||
solutions = dedup_list(n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0)
|
||||
if not solutions:
|
||||
expected_terminals = [t.expect.name for t in to_scan]
|
||||
raise UnexpectedEOF(expected_terminals, state=frozenset(i.s for i in to_scan))
|
||||
if len(solutions) > 1:
|
||||
raise RuntimeError('Earley should not generate multiple start symbol items! Please report this bug.')
|
||||
solution ,= solutions
|
||||
|
||||
if self.debug:
|
||||
from .earley_forest import ForestToPyDotVisitor
|
||||
try:
|
||||
debug_walker = ForestToPyDotVisitor()
|
||||
except ImportError:
|
||||
logger.warning("Cannot find dependency 'pydot', will not generate sppf debug image")
|
||||
else:
|
||||
debug_walker.visit(solution, "sppf.png")
|
||||
|
||||
|
||||
if self.Tree is not None:
|
||||
# Perform our SPPF -> AST conversion
|
||||
# Disable the ForestToParseTree cache when ambiguity='resolve'
|
||||
# to prevent a tree construction bug. See issue #1283
|
||||
use_cache = not self.resolve_ambiguity
|
||||
transformer = ForestToParseTree(self.Tree, self.callbacks, self.forest_sum_visitor and self.forest_sum_visitor(), self.resolve_ambiguity, use_cache)
|
||||
return transformer.transform(solution)
|
||||
|
||||
# return the root of the SPPF
|
||||
return solution
|
||||
@@ -0,0 +1,42 @@
|
||||
"""This module implements useful building blocks for the Earley parser
|
||||
"""
|
||||
|
||||
|
||||
class Item:
|
||||
"An Earley Item, the atom of the algorithm."
|
||||
|
||||
__slots__ = ('s', 'rule', 'ptr', 'start', 'is_complete', 'expect', 'previous', 'node', '_hash')
|
||||
def __init__(self, rule, ptr, start):
|
||||
self.is_complete = len(rule.expansion) == ptr
|
||||
self.rule = rule # rule
|
||||
self.ptr = ptr # ptr
|
||||
self.start = start # j
|
||||
self.node = None # w
|
||||
if self.is_complete:
|
||||
self.s = rule.origin
|
||||
self.expect = None
|
||||
self.previous = rule.expansion[ptr - 1] if ptr > 0 and len(rule.expansion) else None
|
||||
else:
|
||||
self.s = (rule, ptr)
|
||||
self.expect = rule.expansion[ptr]
|
||||
self.previous = rule.expansion[ptr - 1] if ptr > 0 and len(rule.expansion) else None
|
||||
self._hash = hash((self.s, self.start, self.rule))
|
||||
|
||||
def advance(self):
|
||||
return Item(self.rule, self.ptr + 1, self.start)
|
||||
|
||||
def __eq__(self, other):
|
||||
return self is other or (self.s == other.s and self.start == other.start and self.rule == other.rule)
|
||||
|
||||
def __hash__(self):
|
||||
return self._hash
|
||||
|
||||
def __repr__(self):
|
||||
before = ( expansion.name for expansion in self.rule.expansion[:self.ptr] )
|
||||
after = ( expansion.name for expansion in self.rule.expansion[self.ptr:] )
|
||||
symbol = "{} ::= {}* {}".format(self.rule.origin.name, ' '.join(before), ' '.join(after))
|
||||
return '%s (%d)' % (symbol, self.start)
|
||||
|
||||
|
||||
# class TransitiveItem(Item):
|
||||
# ... # removed at commit 4c1cfb2faf24e8f8bff7112627a00b94d261b420
|
||||
@@ -0,0 +1,802 @@
|
||||
""""This module implements an SPPF implementation
|
||||
|
||||
This is used as the primary output mechanism for the Earley parser
|
||||
in order to store complex ambiguities.
|
||||
|
||||
Full reference and more details is here:
|
||||
https://web.archive.org/web/20190616123959/http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/
|
||||
"""
|
||||
|
||||
from typing import Type, AbstractSet
|
||||
from random import randint
|
||||
from collections import deque
|
||||
from operator import attrgetter
|
||||
from importlib import import_module
|
||||
from functools import partial
|
||||
|
||||
from ..parse_tree_builder import AmbiguousIntermediateExpander
|
||||
from ..visitors import Discard
|
||||
from ..utils import logger, OrderedSet
|
||||
from ..tree import Tree
|
||||
|
||||
class ForestNode:
|
||||
pass
|
||||
|
||||
class SymbolNode(ForestNode):
|
||||
"""
|
||||
A Symbol Node represents a symbol (or Intermediate LR0).
|
||||
|
||||
Symbol nodes are keyed by the symbol (s). For intermediate nodes
|
||||
s will be an LR0, stored as a tuple of (rule, ptr). For completed symbol
|
||||
nodes, s will be a string representing the non-terminal origin (i.e.
|
||||
the left hand side of the rule).
|
||||
|
||||
The children of a Symbol or Intermediate Node will always be Packed Nodes;
|
||||
with each Packed Node child representing a single derivation of a production.
|
||||
|
||||
Hence a Symbol Node with a single child is unambiguous.
|
||||
|
||||
Parameters:
|
||||
s: A Symbol, or a tuple of (rule, ptr) for an intermediate node.
|
||||
start: For dynamic lexers, the index of the start of the substring matched by this symbol (inclusive).
|
||||
end: For dynamic lexers, the index of the end of the substring matched by this symbol (exclusive).
|
||||
|
||||
Properties:
|
||||
is_intermediate: True if this node is an intermediate node.
|
||||
priority: The priority of the node's symbol.
|
||||
"""
|
||||
Set: Type[AbstractSet] = set # Overridden by StableSymbolNode
|
||||
__slots__ = ('s', 'start', 'end', '_children', 'paths', 'paths_loaded', 'priority', 'is_intermediate')
|
||||
def __init__(self, s, start, end):
|
||||
self.s = s
|
||||
self.start = start
|
||||
self.end = end
|
||||
self._children = self.Set()
|
||||
self.paths = self.Set()
|
||||
self.paths_loaded = False
|
||||
|
||||
### We use inf here as it can be safely negated without resorting to conditionals,
|
||||
# unlike None or float('NaN'), and sorts appropriately.
|
||||
self.priority = float('-inf')
|
||||
self.is_intermediate = isinstance(s, tuple)
|
||||
|
||||
def add_family(self, lr0, rule, start, left, right):
|
||||
self._children.add(PackedNode(self, lr0, rule, start, left, right))
|
||||
|
||||
def add_path(self, transitive, node):
|
||||
self.paths.add((transitive, node))
|
||||
|
||||
def load_paths(self):
|
||||
for transitive, node in self.paths:
|
||||
if transitive.next_titem is not None:
|
||||
vn = type(self)(transitive.next_titem.s, transitive.next_titem.start, self.end)
|
||||
vn.add_path(transitive.next_titem, node)
|
||||
self.add_family(transitive.reduction.rule.origin, transitive.reduction.rule, transitive.reduction.start, transitive.reduction.node, vn)
|
||||
else:
|
||||
self.add_family(transitive.reduction.rule.origin, transitive.reduction.rule, transitive.reduction.start, transitive.reduction.node, node)
|
||||
self.paths_loaded = True
|
||||
|
||||
@property
|
||||
def is_ambiguous(self):
|
||||
"""Returns True if this node is ambiguous."""
|
||||
return len(self.children) > 1
|
||||
|
||||
@property
|
||||
def children(self):
|
||||
"""Returns a list of this node's children sorted from greatest to
|
||||
least priority."""
|
||||
if not self.paths_loaded:
|
||||
self.load_paths()
|
||||
return sorted(self._children, key=attrgetter('sort_key'))
|
||||
|
||||
def __iter__(self):
|
||||
return iter(self._children)
|
||||
|
||||
def __repr__(self):
|
||||
if self.is_intermediate:
|
||||
rule = self.s[0]
|
||||
ptr = self.s[1]
|
||||
before = ( expansion.name for expansion in rule.expansion[:ptr] )
|
||||
after = ( expansion.name for expansion in rule.expansion[ptr:] )
|
||||
symbol = "{} ::= {}* {}".format(rule.origin.name, ' '.join(before), ' '.join(after))
|
||||
else:
|
||||
symbol = self.s.name
|
||||
return "({}, {}, {}, {})".format(symbol, self.start, self.end, self.priority)
|
||||
|
||||
class StableSymbolNode(SymbolNode):
|
||||
"A version of SymbolNode that uses OrderedSet for output stability"
|
||||
Set = OrderedSet
|
||||
|
||||
class PackedNode(ForestNode):
|
||||
"""
|
||||
A Packed Node represents a single derivation in a symbol node.
|
||||
|
||||
Parameters:
|
||||
rule: The rule associated with this node.
|
||||
parent: The parent of this node.
|
||||
left: The left child of this node. ``None`` if one does not exist.
|
||||
right: The right child of this node. ``None`` if one does not exist.
|
||||
priority: The priority of this node.
|
||||
"""
|
||||
__slots__ = ('parent', 's', 'rule', 'start', 'left', 'right', 'priority', '_hash')
|
||||
def __init__(self, parent, s, rule, start, left, right):
|
||||
self.parent = parent
|
||||
self.s = s
|
||||
self.start = start
|
||||
self.rule = rule
|
||||
self.left = left
|
||||
self.right = right
|
||||
self.priority = float('-inf')
|
||||
self._hash = hash((self.left, self.right))
|
||||
|
||||
@property
|
||||
def is_empty(self):
|
||||
return self.left is None and self.right is None
|
||||
|
||||
@property
|
||||
def sort_key(self):
|
||||
"""
|
||||
Used to sort PackedNode children of SymbolNodes.
|
||||
A SymbolNode has multiple PackedNodes if it matched
|
||||
ambiguously. Hence, we use the sort order to identify
|
||||
the order in which ambiguous children should be considered.
|
||||
"""
|
||||
return self.is_empty, -self.priority, self.rule.order
|
||||
|
||||
@property
|
||||
def children(self):
|
||||
"""Returns a list of this node's children."""
|
||||
return [x for x in [self.left, self.right] if x is not None]
|
||||
|
||||
def __iter__(self):
|
||||
yield self.left
|
||||
yield self.right
|
||||
|
||||
def __eq__(self, other):
|
||||
if not isinstance(other, PackedNode):
|
||||
return False
|
||||
return self is other or (self.left == other.left and self.right == other.right)
|
||||
|
||||
def __hash__(self):
|
||||
return self._hash
|
||||
|
||||
def __repr__(self):
|
||||
if isinstance(self.s, tuple):
|
||||
rule = self.s[0]
|
||||
ptr = self.s[1]
|
||||
before = ( expansion.name for expansion in rule.expansion[:ptr] )
|
||||
after = ( expansion.name for expansion in rule.expansion[ptr:] )
|
||||
symbol = "{} ::= {}* {}".format(rule.origin.name, ' '.join(before), ' '.join(after))
|
||||
else:
|
||||
symbol = self.s.name
|
||||
return "({}, {}, {}, {})".format(symbol, self.start, self.priority, self.rule.order)
|
||||
|
||||
class TokenNode(ForestNode):
|
||||
"""
|
||||
A Token Node represents a matched terminal and is always a leaf node.
|
||||
|
||||
Parameters:
|
||||
token: The Token associated with this node.
|
||||
term: The TerminalDef matched by the token.
|
||||
priority: The priority of this node.
|
||||
"""
|
||||
__slots__ = ('token', 'term', 'priority', '_hash')
|
||||
def __init__(self, token, term, priority=None):
|
||||
self.token = token
|
||||
self.term = term
|
||||
if priority is not None:
|
||||
self.priority = priority
|
||||
else:
|
||||
self.priority = term.priority if term is not None else 0
|
||||
self._hash = hash(token)
|
||||
|
||||
def __eq__(self, other):
|
||||
if not isinstance(other, TokenNode):
|
||||
return False
|
||||
return self is other or (self.token == other.token)
|
||||
|
||||
def __hash__(self):
|
||||
return self._hash
|
||||
|
||||
def __repr__(self):
|
||||
return repr(self.token)
|
||||
|
||||
class ForestVisitor:
|
||||
"""
|
||||
An abstract base class for building forest visitors.
|
||||
|
||||
This class performs a controllable depth-first walk of an SPPF.
|
||||
The visitor will not enter cycles and will backtrack if one is encountered.
|
||||
Subclasses are notified of cycles through the ``on_cycle`` method.
|
||||
|
||||
Behavior for visit events is defined by overriding the
|
||||
``visit*node*`` functions.
|
||||
|
||||
The walk is controlled by the return values of the ``visit*node_in``
|
||||
methods. Returning a node(s) will schedule them to be visited. The visitor
|
||||
will begin to backtrack if no nodes are returned.
|
||||
|
||||
Parameters:
|
||||
single_visit: If ``True``, non-Token nodes will only be visited once.
|
||||
"""
|
||||
|
||||
def __init__(self, single_visit=False):
|
||||
self.single_visit = single_visit
|
||||
|
||||
def visit_token_node(self, node):
|
||||
"""Called when a ``Token`` is visited. ``Token`` nodes are always leaves."""
|
||||
pass
|
||||
|
||||
def visit_symbol_node_in(self, node):
|
||||
"""Called when a symbol node is visited. Nodes that are returned
|
||||
will be scheduled to be visited. If ``visit_intermediate_node_in``
|
||||
is not implemented, this function will be called for intermediate
|
||||
nodes as well."""
|
||||
pass
|
||||
|
||||
def visit_symbol_node_out(self, node):
|
||||
"""Called after all nodes returned from a corresponding ``visit_symbol_node_in``
|
||||
call have been visited. If ``visit_intermediate_node_out``
|
||||
is not implemented, this function will be called for intermediate
|
||||
nodes as well."""
|
||||
pass
|
||||
|
||||
def visit_packed_node_in(self, node):
|
||||
"""Called when a packed node is visited. Nodes that are returned
|
||||
will be scheduled to be visited. """
|
||||
pass
|
||||
|
||||
def visit_packed_node_out(self, node):
|
||||
"""Called after all nodes returned from a corresponding ``visit_packed_node_in``
|
||||
call have been visited."""
|
||||
pass
|
||||
|
||||
def on_cycle(self, node, path):
|
||||
"""Called when a cycle is encountered.
|
||||
|
||||
Parameters:
|
||||
node: The node that causes a cycle.
|
||||
path: The list of nodes being visited: nodes that have been
|
||||
entered but not exited. The first element is the root in a forest
|
||||
visit, and the last element is the node visited most recently.
|
||||
``path`` should be treated as read-only.
|
||||
"""
|
||||
pass
|
||||
|
||||
def get_cycle_in_path(self, node, path):
|
||||
"""A utility function for use in ``on_cycle`` to obtain a slice of
|
||||
``path`` that only contains the nodes that make up the cycle."""
|
||||
index = len(path) - 1
|
||||
while id(path[index]) != id(node):
|
||||
index -= 1
|
||||
return path[index:]
|
||||
|
||||
def visit(self, root):
|
||||
# Visiting is a list of IDs of all symbol/intermediate nodes currently in
|
||||
# the stack. It serves two purposes: to detect when we 'recurse' in and out
|
||||
# of a symbol/intermediate so that we can process both up and down. Also,
|
||||
# since the SPPF can have cycles it allows us to detect if we're trying
|
||||
# to recurse into a node that's already on the stack (infinite recursion).
|
||||
visiting = set()
|
||||
|
||||
# set of all nodes that have been visited
|
||||
visited = set()
|
||||
|
||||
# a list of nodes that are currently being visited
|
||||
# used for the `on_cycle` callback
|
||||
path = []
|
||||
|
||||
# We do not use recursion here to walk the Forest due to the limited
|
||||
# stack size in python. Therefore input_stack is essentially our stack.
|
||||
input_stack = deque([root])
|
||||
|
||||
# It is much faster to cache these as locals since they are called
|
||||
# many times in large parses.
|
||||
vpno = getattr(self, 'visit_packed_node_out')
|
||||
vpni = getattr(self, 'visit_packed_node_in')
|
||||
vsno = getattr(self, 'visit_symbol_node_out')
|
||||
vsni = getattr(self, 'visit_symbol_node_in')
|
||||
vino = getattr(self, 'visit_intermediate_node_out', vsno)
|
||||
vini = getattr(self, 'visit_intermediate_node_in', vsni)
|
||||
vtn = getattr(self, 'visit_token_node')
|
||||
oc = getattr(self, 'on_cycle')
|
||||
|
||||
while input_stack:
|
||||
current = next(reversed(input_stack))
|
||||
try:
|
||||
next_node = next(current)
|
||||
except StopIteration:
|
||||
input_stack.pop()
|
||||
continue
|
||||
except TypeError:
|
||||
### If the current object is not an iterator, pass through to Token/SymbolNode
|
||||
pass
|
||||
else:
|
||||
if next_node is None:
|
||||
continue
|
||||
|
||||
if id(next_node) in visiting:
|
||||
oc(next_node, path)
|
||||
continue
|
||||
|
||||
input_stack.append(next_node)
|
||||
continue
|
||||
|
||||
if isinstance(current, TokenNode):
|
||||
vtn(current.token)
|
||||
input_stack.pop()
|
||||
continue
|
||||
|
||||
current_id = id(current)
|
||||
if current_id in visiting:
|
||||
if isinstance(current, PackedNode):
|
||||
vpno(current)
|
||||
elif current.is_intermediate:
|
||||
vino(current)
|
||||
else:
|
||||
vsno(current)
|
||||
input_stack.pop()
|
||||
path.pop()
|
||||
visiting.remove(current_id)
|
||||
visited.add(current_id)
|
||||
elif self.single_visit and current_id in visited:
|
||||
input_stack.pop()
|
||||
else:
|
||||
visiting.add(current_id)
|
||||
path.append(current)
|
||||
if isinstance(current, PackedNode):
|
||||
next_node = vpni(current)
|
||||
elif current.is_intermediate:
|
||||
next_node = vini(current)
|
||||
else:
|
||||
next_node = vsni(current)
|
||||
if next_node is None:
|
||||
continue
|
||||
|
||||
if not isinstance(next_node, ForestNode):
|
||||
next_node = iter(next_node)
|
||||
elif id(next_node) in visiting:
|
||||
oc(next_node, path)
|
||||
continue
|
||||
|
||||
input_stack.append(next_node)
|
||||
|
||||
class ForestTransformer(ForestVisitor):
|
||||
"""The base class for a bottom-up forest transformation. Most users will
|
||||
want to use ``TreeForestTransformer`` instead as it has a friendlier
|
||||
interface and covers most use cases.
|
||||
|
||||
Transformations are applied via inheritance and overriding of the
|
||||
``transform*node`` methods.
|
||||
|
||||
``transform_token_node`` receives a ``Token`` as an argument.
|
||||
All other methods receive the node that is being transformed and
|
||||
a list of the results of the transformations of that node's children.
|
||||
The return value of these methods are the resulting transformations.
|
||||
|
||||
If ``Discard`` is raised in a node's transformation, no data from that node
|
||||
will be passed to its parent's transformation.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super(ForestTransformer, self).__init__()
|
||||
# results of transformations
|
||||
self.data = dict()
|
||||
# used to track parent nodes
|
||||
self.node_stack = deque()
|
||||
|
||||
def transform(self, root):
|
||||
"""Perform a transformation on an SPPF."""
|
||||
self.node_stack.append('result')
|
||||
self.data['result'] = []
|
||||
self.visit(root)
|
||||
assert len(self.data['result']) <= 1
|
||||
if self.data['result']:
|
||||
return self.data['result'][0]
|
||||
|
||||
def transform_symbol_node(self, node, data):
|
||||
"""Transform a symbol node."""
|
||||
return node
|
||||
|
||||
def transform_intermediate_node(self, node, data):
|
||||
"""Transform an intermediate node."""
|
||||
return node
|
||||
|
||||
def transform_packed_node(self, node, data):
|
||||
"""Transform a packed node."""
|
||||
return node
|
||||
|
||||
def transform_token_node(self, node):
|
||||
"""Transform a ``Token``."""
|
||||
return node
|
||||
|
||||
def visit_symbol_node_in(self, node):
|
||||
self.node_stack.append(id(node))
|
||||
self.data[id(node)] = []
|
||||
return node.children
|
||||
|
||||
def visit_packed_node_in(self, node):
|
||||
self.node_stack.append(id(node))
|
||||
self.data[id(node)] = []
|
||||
return node.children
|
||||
|
||||
def visit_token_node(self, node):
|
||||
transformed = self.transform_token_node(node)
|
||||
if transformed is not Discard:
|
||||
self.data[self.node_stack[-1]].append(transformed)
|
||||
|
||||
def _visit_node_out_helper(self, node, method):
|
||||
self.node_stack.pop()
|
||||
transformed = method(node, self.data[id(node)])
|
||||
if transformed is not Discard:
|
||||
self.data[self.node_stack[-1]].append(transformed)
|
||||
del self.data[id(node)]
|
||||
|
||||
def visit_symbol_node_out(self, node):
|
||||
self._visit_node_out_helper(node, self.transform_symbol_node)
|
||||
|
||||
def visit_intermediate_node_out(self, node):
|
||||
self._visit_node_out_helper(node, self.transform_intermediate_node)
|
||||
|
||||
def visit_packed_node_out(self, node):
|
||||
self._visit_node_out_helper(node, self.transform_packed_node)
|
||||
|
||||
|
||||
class ForestSumVisitor(ForestVisitor):
|
||||
"""
|
||||
A visitor for prioritizing ambiguous parts of the Forest.
|
||||
|
||||
This visitor is used when support for explicit priorities on
|
||||
rules is requested (whether normal, or invert). It walks the
|
||||
forest (or subsets thereof) and cascades properties upwards
|
||||
from the leaves.
|
||||
|
||||
It would be ideal to do this during parsing, however this would
|
||||
require processing each Earley item multiple times. That's
|
||||
a big performance drawback; so running a forest walk is the
|
||||
lesser of two evils: there can be significantly more Earley
|
||||
items created during parsing than there are SPPF nodes in the
|
||||
final tree.
|
||||
"""
|
||||
def __init__(self):
|
||||
super(ForestSumVisitor, self).__init__(single_visit=True)
|
||||
|
||||
def visit_packed_node_in(self, node):
|
||||
yield node.left
|
||||
yield node.right
|
||||
|
||||
def visit_symbol_node_in(self, node):
|
||||
return iter(node.children)
|
||||
|
||||
def visit_packed_node_out(self, node):
|
||||
priority = node.rule.options.priority if not node.parent.is_intermediate and node.rule.options.priority else 0
|
||||
priority += getattr(node.right, 'priority', 0)
|
||||
priority += getattr(node.left, 'priority', 0)
|
||||
node.priority = priority
|
||||
|
||||
def visit_symbol_node_out(self, node):
|
||||
node.priority = max(child.priority for child in node.children)
|
||||
|
||||
class PackedData():
|
||||
"""Used in transformationss of packed nodes to distinguish the data
|
||||
that comes from the left child and the right child.
|
||||
"""
|
||||
|
||||
class _NoData():
|
||||
pass
|
||||
|
||||
NO_DATA = _NoData()
|
||||
|
||||
def __init__(self, node, data):
|
||||
self.left = self.NO_DATA
|
||||
self.right = self.NO_DATA
|
||||
if data:
|
||||
if node.left is not None:
|
||||
self.left = data[0]
|
||||
if len(data) > 1:
|
||||
self.right = data[1]
|
||||
else:
|
||||
self.right = data[0]
|
||||
|
||||
class ForestToParseTree(ForestTransformer):
|
||||
"""Used by the earley parser when ambiguity equals 'resolve' or
|
||||
'explicit'. Transforms an SPPF into an (ambiguous) parse tree.
|
||||
|
||||
Parameters:
|
||||
tree_class: The tree class to use for construction
|
||||
callbacks: A dictionary of rules to functions that output a tree
|
||||
prioritizer: A ``ForestVisitor`` that manipulates the priorities of ForestNodes
|
||||
resolve_ambiguity: If True, ambiguities will be resolved based on
|
||||
priorities. Otherwise, `_ambig` nodes will be in the resulting tree.
|
||||
use_cache: If True, the results of packed node transformations will be cached.
|
||||
"""
|
||||
|
||||
def __init__(self, tree_class=Tree, callbacks=dict(), prioritizer=ForestSumVisitor(), resolve_ambiguity=True, use_cache=True):
|
||||
super(ForestToParseTree, self).__init__()
|
||||
self.tree_class = tree_class
|
||||
self.callbacks = callbacks
|
||||
self.prioritizer = prioritizer
|
||||
self.resolve_ambiguity = resolve_ambiguity
|
||||
self._use_cache = use_cache
|
||||
self._cache = {}
|
||||
self._on_cycle_retreat = False
|
||||
self._cycle_node = None
|
||||
self._successful_visits = set()
|
||||
|
||||
def visit(self, root):
|
||||
if self.prioritizer:
|
||||
self.prioritizer.visit(root)
|
||||
super(ForestToParseTree, self).visit(root)
|
||||
self._cache = {}
|
||||
|
||||
def on_cycle(self, node, path):
|
||||
logger.debug("Cycle encountered in the SPPF at node: %s. "
|
||||
"As infinite ambiguities cannot be represented in a tree, "
|
||||
"this family of derivations will be discarded.", node)
|
||||
self._cycle_node = node
|
||||
self._on_cycle_retreat = True
|
||||
|
||||
def _check_cycle(self, node):
|
||||
if self._on_cycle_retreat:
|
||||
if id(node) == id(self._cycle_node) or id(node) in self._successful_visits:
|
||||
self._cycle_node = None
|
||||
self._on_cycle_retreat = False
|
||||
else:
|
||||
return Discard
|
||||
|
||||
def _collapse_ambig(self, children):
|
||||
new_children = []
|
||||
for child in children:
|
||||
if hasattr(child, 'data') and child.data == '_ambig':
|
||||
new_children += child.children
|
||||
else:
|
||||
new_children.append(child)
|
||||
return new_children
|
||||
|
||||
def _call_rule_func(self, node, data):
|
||||
# called when transforming children of symbol nodes
|
||||
# data is a list of trees or tokens that correspond to the
|
||||
# symbol's rule expansion
|
||||
return self.callbacks[node.rule](data)
|
||||
|
||||
def _call_ambig_func(self, node, data):
|
||||
# called when transforming a symbol node
|
||||
# data is a list of trees where each tree's data is
|
||||
# equal to the name of the symbol or one of its aliases.
|
||||
if len(data) > 1:
|
||||
return self.tree_class('_ambig', data)
|
||||
elif data:
|
||||
return data[0]
|
||||
return Discard
|
||||
|
||||
def transform_symbol_node(self, node, data):
|
||||
if id(node) not in self._successful_visits:
|
||||
return Discard
|
||||
r = self._check_cycle(node)
|
||||
if r is Discard:
|
||||
return r
|
||||
self._successful_visits.remove(id(node))
|
||||
data = self._collapse_ambig(data)
|
||||
return self._call_ambig_func(node, data)
|
||||
|
||||
def transform_intermediate_node(self, node, data):
|
||||
if id(node) not in self._successful_visits:
|
||||
return Discard
|
||||
r = self._check_cycle(node)
|
||||
if r is Discard:
|
||||
return r
|
||||
self._successful_visits.remove(id(node))
|
||||
if len(data) > 1:
|
||||
children = [self.tree_class('_inter', c) for c in data]
|
||||
return self.tree_class('_iambig', children)
|
||||
return data[0]
|
||||
|
||||
def transform_packed_node(self, node, data):
|
||||
r = self._check_cycle(node)
|
||||
if r is Discard:
|
||||
return r
|
||||
if self.resolve_ambiguity and id(node.parent) in self._successful_visits:
|
||||
return Discard
|
||||
if self._use_cache and id(node) in self._cache:
|
||||
return self._cache[id(node)]
|
||||
children = []
|
||||
assert len(data) <= 2
|
||||
data = PackedData(node, data)
|
||||
if data.left is not PackedData.NO_DATA:
|
||||
if node.left.is_intermediate and isinstance(data.left, list):
|
||||
children += data.left
|
||||
else:
|
||||
children.append(data.left)
|
||||
if data.right is not PackedData.NO_DATA:
|
||||
children.append(data.right)
|
||||
transformed = children if node.parent.is_intermediate else self._call_rule_func(node, children)
|
||||
if self._use_cache:
|
||||
self._cache[id(node)] = transformed
|
||||
return transformed
|
||||
|
||||
def visit_symbol_node_in(self, node):
|
||||
super(ForestToParseTree, self).visit_symbol_node_in(node)
|
||||
if self._on_cycle_retreat:
|
||||
return
|
||||
return node.children
|
||||
|
||||
def visit_packed_node_in(self, node):
|
||||
self._on_cycle_retreat = False
|
||||
to_visit = super(ForestToParseTree, self).visit_packed_node_in(node)
|
||||
if not self.resolve_ambiguity or id(node.parent) not in self._successful_visits:
|
||||
if not self._use_cache or id(node) not in self._cache:
|
||||
return to_visit
|
||||
|
||||
def visit_packed_node_out(self, node):
|
||||
super(ForestToParseTree, self).visit_packed_node_out(node)
|
||||
if not self._on_cycle_retreat:
|
||||
self._successful_visits.add(id(node.parent))
|
||||
|
||||
def handles_ambiguity(func):
|
||||
"""Decorator for methods of subclasses of ``TreeForestTransformer``.
|
||||
Denotes that the method should receive a list of transformed derivations."""
|
||||
func.handles_ambiguity = True
|
||||
return func
|
||||
|
||||
class TreeForestTransformer(ForestToParseTree):
|
||||
"""A ``ForestTransformer`` with a tree ``Transformer``-like interface.
|
||||
By default, it will construct a tree.
|
||||
|
||||
Methods provided via inheritance are called based on the rule/symbol
|
||||
names of nodes in the forest.
|
||||
|
||||
Methods that act on rules will receive a list of the results of the
|
||||
transformations of the rule's children. By default, trees and tokens.
|
||||
|
||||
Methods that act on tokens will receive a token.
|
||||
|
||||
Alternatively, methods that act on rules may be annotated with
|
||||
``handles_ambiguity``. In this case, the function will receive a list
|
||||
of all the transformations of all the derivations of the rule.
|
||||
By default, a list of trees where each tree.data is equal to the
|
||||
rule name or one of its aliases.
|
||||
|
||||
Non-tree transformations are made possible by override of
|
||||
``__default__``, ``__default_token__``, and ``__default_ambig__``.
|
||||
|
||||
Note:
|
||||
Tree shaping features such as inlined rules and token filtering are
|
||||
not built into the transformation. Positions are also not propagated.
|
||||
|
||||
Parameters:
|
||||
tree_class: The tree class to use for construction
|
||||
prioritizer: A ``ForestVisitor`` that manipulates the priorities of nodes in the SPPF.
|
||||
resolve_ambiguity: If True, ambiguities will be resolved based on priorities.
|
||||
use_cache (bool): If True, caches the results of some transformations,
|
||||
potentially improving performance when ``resolve_ambiguity==False``.
|
||||
Only use if you know what you are doing: i.e. All transformation
|
||||
functions are pure and referentially transparent.
|
||||
"""
|
||||
|
||||
def __init__(self, tree_class=Tree, prioritizer=ForestSumVisitor(), resolve_ambiguity=True, use_cache=False):
|
||||
super(TreeForestTransformer, self).__init__(tree_class, dict(), prioritizer, resolve_ambiguity, use_cache)
|
||||
|
||||
def __default__(self, name, data):
|
||||
"""Default operation on tree (for override).
|
||||
|
||||
Returns a tree with name with data as children.
|
||||
"""
|
||||
return self.tree_class(name, data)
|
||||
|
||||
def __default_ambig__(self, name, data):
|
||||
"""Default operation on ambiguous rule (for override).
|
||||
|
||||
Wraps data in an '_ambig_' node if it contains more than
|
||||
one element.
|
||||
"""
|
||||
if len(data) > 1:
|
||||
return self.tree_class('_ambig', data)
|
||||
elif data:
|
||||
return data[0]
|
||||
return Discard
|
||||
|
||||
def __default_token__(self, node):
|
||||
"""Default operation on ``Token`` (for override).
|
||||
|
||||
Returns ``node``.
|
||||
"""
|
||||
return node
|
||||
|
||||
def transform_token_node(self, node):
|
||||
return getattr(self, node.type, self.__default_token__)(node)
|
||||
|
||||
def _call_rule_func(self, node, data):
|
||||
name = node.rule.alias or node.rule.options.template_source or node.rule.origin.name
|
||||
user_func = getattr(self, name, self.__default__)
|
||||
if user_func == self.__default__ or hasattr(user_func, 'handles_ambiguity'):
|
||||
user_func = partial(self.__default__, name)
|
||||
if not self.resolve_ambiguity:
|
||||
wrapper = partial(AmbiguousIntermediateExpander, self.tree_class)
|
||||
user_func = wrapper(user_func)
|
||||
return user_func(data)
|
||||
|
||||
def _call_ambig_func(self, node, data):
|
||||
name = node.s.name
|
||||
user_func = getattr(self, name, self.__default_ambig__)
|
||||
if user_func == self.__default_ambig__ or not hasattr(user_func, 'handles_ambiguity'):
|
||||
user_func = partial(self.__default_ambig__, name)
|
||||
return user_func(data)
|
||||
|
||||
class ForestToPyDotVisitor(ForestVisitor):
|
||||
"""
|
||||
A Forest visitor which writes the SPPF to a PNG.
|
||||
|
||||
The SPPF can get really large, really quickly because
|
||||
of the amount of meta-data it stores, so this is probably
|
||||
only useful for trivial trees and learning how the SPPF
|
||||
is structured.
|
||||
"""
|
||||
def __init__(self, rankdir="TB"):
|
||||
super(ForestToPyDotVisitor, self).__init__(single_visit=True)
|
||||
self.pydot = import_module('pydot')
|
||||
self.graph = self.pydot.Dot(graph_type='digraph', rankdir=rankdir)
|
||||
|
||||
def visit(self, root, filename):
|
||||
super(ForestToPyDotVisitor, self).visit(root)
|
||||
try:
|
||||
self.graph.write_png(filename)
|
||||
except FileNotFoundError as e:
|
||||
logger.error("Could not write png: ", e)
|
||||
|
||||
def visit_token_node(self, node):
|
||||
graph_node_id = str(id(node))
|
||||
graph_node_label = "\"{}\"".format(node.value.replace('"', '\\"'))
|
||||
graph_node_color = 0x808080
|
||||
graph_node_style = "\"filled,rounded\""
|
||||
graph_node_shape = "diamond"
|
||||
graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label)
|
||||
self.graph.add_node(graph_node)
|
||||
|
||||
def visit_packed_node_in(self, node):
|
||||
graph_node_id = str(id(node))
|
||||
graph_node_label = repr(node)
|
||||
graph_node_color = 0x808080
|
||||
graph_node_style = "filled"
|
||||
graph_node_shape = "diamond"
|
||||
graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label)
|
||||
self.graph.add_node(graph_node)
|
||||
yield node.left
|
||||
yield node.right
|
||||
|
||||
def visit_packed_node_out(self, node):
|
||||
graph_node_id = str(id(node))
|
||||
graph_node = self.graph.get_node(graph_node_id)[0]
|
||||
for child in [node.left, node.right]:
|
||||
if child is not None:
|
||||
child_graph_node_id = str(id(child.token if isinstance(child, TokenNode) else child))
|
||||
child_graph_node = self.graph.get_node(child_graph_node_id)[0]
|
||||
self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node))
|
||||
else:
|
||||
#### Try and be above the Python object ID range; probably impl. specific, but maybe this is okay.
|
||||
child_graph_node_id = str(randint(100000000000000000000000000000,123456789012345678901234567890))
|
||||
child_graph_node_style = "invis"
|
||||
child_graph_node = self.pydot.Node(child_graph_node_id, style=child_graph_node_style, label="None")
|
||||
child_edge_style = "invis"
|
||||
self.graph.add_node(child_graph_node)
|
||||
self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node, style=child_edge_style))
|
||||
|
||||
def visit_symbol_node_in(self, node):
|
||||
graph_node_id = str(id(node))
|
||||
graph_node_label = repr(node)
|
||||
graph_node_color = 0x808080
|
||||
graph_node_style = "\"filled\""
|
||||
if node.is_intermediate:
|
||||
graph_node_shape = "ellipse"
|
||||
else:
|
||||
graph_node_shape = "rectangle"
|
||||
graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label)
|
||||
self.graph.add_node(graph_node)
|
||||
return iter(node.children)
|
||||
|
||||
def visit_symbol_node_out(self, node):
|
||||
graph_node_id = str(id(node))
|
||||
graph_node = self.graph.get_node(graph_node_id)[0]
|
||||
for child in node.children:
|
||||
child_graph_node_id = str(id(child))
|
||||
child_graph_node = self.graph.get_node(child_graph_node_id)[0]
|
||||
self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node))
|
||||
@@ -0,0 +1,203 @@
|
||||
"Provides for superficial grammar analysis."
|
||||
|
||||
from collections import Counter, defaultdict
|
||||
from typing import List, Dict, Iterator, FrozenSet, Set
|
||||
|
||||
from ..utils import bfs, fzset, classify, OrderedSet
|
||||
from ..exceptions import GrammarError
|
||||
from ..grammar import Rule, Terminal, NonTerminal, Symbol
|
||||
from ..common import ParserConf
|
||||
|
||||
|
||||
class RulePtr:
|
||||
__slots__ = ('rule', 'index')
|
||||
rule: Rule
|
||||
index: int
|
||||
|
||||
def __init__(self, rule: Rule, index: int):
|
||||
assert isinstance(rule, Rule)
|
||||
assert index <= len(rule.expansion)
|
||||
self.rule = rule
|
||||
self.index = index
|
||||
|
||||
def __repr__(self):
|
||||
before = [x.name for x in self.rule.expansion[:self.index]]
|
||||
after = [x.name for x in self.rule.expansion[self.index:]]
|
||||
return '<%s : %s * %s>' % (self.rule.origin.name, ' '.join(before), ' '.join(after))
|
||||
|
||||
@property
|
||||
def next(self) -> Symbol:
|
||||
return self.rule.expansion[self.index]
|
||||
|
||||
def advance(self, sym: Symbol) -> 'RulePtr':
|
||||
assert self.next == sym
|
||||
return RulePtr(self.rule, self.index+1)
|
||||
|
||||
@property
|
||||
def is_satisfied(self) -> bool:
|
||||
return self.index == len(self.rule.expansion)
|
||||
|
||||
def __eq__(self, other) -> bool:
|
||||
if not isinstance(other, RulePtr):
|
||||
return NotImplemented
|
||||
return self.rule == other.rule and self.index == other.index
|
||||
|
||||
def __hash__(self) -> int:
|
||||
return hash((self.rule, self.index))
|
||||
|
||||
|
||||
State = FrozenSet[RulePtr]
|
||||
|
||||
# state generation ensures no duplicate LR0ItemSets
|
||||
class LR0ItemSet:
|
||||
__slots__ = ('kernel', 'closure', 'transitions', 'lookaheads')
|
||||
|
||||
kernel: State
|
||||
closure: State
|
||||
transitions: Dict[Symbol, 'LR0ItemSet']
|
||||
lookaheads: Dict[Symbol, Set[Rule]]
|
||||
|
||||
def __init__(self, kernel, closure):
|
||||
self.kernel = fzset(kernel)
|
||||
self.closure = fzset(closure)
|
||||
self.transitions = {}
|
||||
self.lookaheads = defaultdict(set)
|
||||
|
||||
def __repr__(self):
|
||||
return '{%s | %s}' % (', '.join([repr(r) for r in self.kernel]), ', '.join([repr(r) for r in self.closure]))
|
||||
|
||||
|
||||
def update_set(set1, set2):
|
||||
if not set2 or set1 > set2:
|
||||
return False
|
||||
|
||||
copy = set(set1)
|
||||
set1 |= set2
|
||||
return set1 != copy
|
||||
|
||||
def calculate_sets(rules):
|
||||
"""Calculate FOLLOW sets.
|
||||
|
||||
Adapted from: http://lara.epfl.ch/w/cc09:algorithm_for_first_and_follow_sets"""
|
||||
symbols = {sym for rule in rules for sym in rule.expansion} | {rule.origin for rule in rules}
|
||||
|
||||
# foreach grammar rule X ::= Y(1) ... Y(k)
|
||||
# if k=0 or {Y(1),...,Y(k)} subset of NULLABLE then
|
||||
# NULLABLE = NULLABLE union {X}
|
||||
# for i = 1 to k
|
||||
# if i=1 or {Y(1),...,Y(i-1)} subset of NULLABLE then
|
||||
# FIRST(X) = FIRST(X) union FIRST(Y(i))
|
||||
# for j = i+1 to k
|
||||
# if i=k or {Y(i+1),...Y(k)} subset of NULLABLE then
|
||||
# FOLLOW(Y(i)) = FOLLOW(Y(i)) union FOLLOW(X)
|
||||
# if i+1=j or {Y(i+1),...,Y(j-1)} subset of NULLABLE then
|
||||
# FOLLOW(Y(i)) = FOLLOW(Y(i)) union FIRST(Y(j))
|
||||
# until none of NULLABLE,FIRST,FOLLOW changed in last iteration
|
||||
|
||||
NULLABLE = set()
|
||||
FIRST = {}
|
||||
FOLLOW = {}
|
||||
for sym in symbols:
|
||||
FIRST[sym]={sym} if sym.is_term else set()
|
||||
FOLLOW[sym]=set()
|
||||
|
||||
# Calculate NULLABLE and FIRST
|
||||
changed = True
|
||||
while changed:
|
||||
changed = False
|
||||
|
||||
for rule in rules:
|
||||
if set(rule.expansion) <= NULLABLE:
|
||||
if update_set(NULLABLE, {rule.origin}):
|
||||
changed = True
|
||||
|
||||
for i, sym in enumerate(rule.expansion):
|
||||
if set(rule.expansion[:i]) <= NULLABLE:
|
||||
if update_set(FIRST[rule.origin], FIRST[sym]):
|
||||
changed = True
|
||||
else:
|
||||
break
|
||||
|
||||
# Calculate FOLLOW
|
||||
changed = True
|
||||
while changed:
|
||||
changed = False
|
||||
|
||||
for rule in rules:
|
||||
for i, sym in enumerate(rule.expansion):
|
||||
if i==len(rule.expansion)-1 or set(rule.expansion[i+1:]) <= NULLABLE:
|
||||
if update_set(FOLLOW[sym], FOLLOW[rule.origin]):
|
||||
changed = True
|
||||
|
||||
for j in range(i+1, len(rule.expansion)):
|
||||
if set(rule.expansion[i+1:j]) <= NULLABLE:
|
||||
if update_set(FOLLOW[sym], FIRST[rule.expansion[j]]):
|
||||
changed = True
|
||||
|
||||
return FIRST, FOLLOW, NULLABLE
|
||||
|
||||
|
||||
class GrammarAnalyzer:
|
||||
def __init__(self, parser_conf: ParserConf, debug: bool=False, strict: bool=False):
|
||||
self.debug = debug
|
||||
self.strict = strict
|
||||
|
||||
root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal('$END')])
|
||||
for start in parser_conf.start}
|
||||
|
||||
rules = parser_conf.rules + list(root_rules.values())
|
||||
self.rules_by_origin: Dict[NonTerminal, List[Rule]] = classify(rules, lambda r: r.origin)
|
||||
|
||||
if len(rules) != len(set(rules)):
|
||||
duplicates = [item for item, count in Counter(rules).items() if count > 1]
|
||||
raise GrammarError("Rules defined twice: %s" % ', '.join(str(i) for i in duplicates))
|
||||
|
||||
for r in rules:
|
||||
for sym in r.expansion:
|
||||
if not (sym.is_term or sym in self.rules_by_origin):
|
||||
raise GrammarError("Using an undefined rule: %s" % sym)
|
||||
|
||||
self.start_states = {start: self.expand_rule(root_rule.origin)
|
||||
for start, root_rule in root_rules.items()}
|
||||
|
||||
self.end_states = {start: fzset({RulePtr(root_rule, len(root_rule.expansion))})
|
||||
for start, root_rule in root_rules.items()}
|
||||
|
||||
lr0_root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start)])
|
||||
for start in parser_conf.start}
|
||||
|
||||
lr0_rules = parser_conf.rules + list(lr0_root_rules.values())
|
||||
assert(len(lr0_rules) == len(set(lr0_rules)))
|
||||
|
||||
self.lr0_rules_by_origin = classify(lr0_rules, lambda r: r.origin)
|
||||
|
||||
# cache RulePtr(r, 0) in r (no duplicate RulePtr objects)
|
||||
self.lr0_start_states = {start: LR0ItemSet([RulePtr(root_rule, 0)], self.expand_rule(root_rule.origin, self.lr0_rules_by_origin))
|
||||
for start, root_rule in lr0_root_rules.items()}
|
||||
|
||||
self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules)
|
||||
|
||||
def expand_rule(self, source_rule: NonTerminal, rules_by_origin=None) -> OrderedSet[RulePtr]:
|
||||
"Returns all init_ptrs accessible by rule (recursive)"
|
||||
|
||||
if rules_by_origin is None:
|
||||
rules_by_origin = self.rules_by_origin
|
||||
|
||||
init_ptrs = OrderedSet[RulePtr]()
|
||||
def _expand_rule(rule: NonTerminal) -> Iterator[NonTerminal]:
|
||||
assert not rule.is_term, rule
|
||||
|
||||
for r in rules_by_origin[rule]:
|
||||
init_ptr = RulePtr(r, 0)
|
||||
init_ptrs.add(init_ptr)
|
||||
|
||||
if r.expansion: # if not empty rule
|
||||
new_r = init_ptr.next
|
||||
if not new_r.is_term:
|
||||
assert isinstance(new_r, NonTerminal)
|
||||
yield new_r
|
||||
|
||||
for _ in bfs([source_rule], _expand_rule):
|
||||
pass
|
||||
|
||||
return init_ptrs
|
||||
@@ -0,0 +1,334 @@
|
||||
"""This module builds a LALR(1) transition-table for lalr_parser.py
|
||||
|
||||
For now, shift/reduce conflicts are automatically resolved as shifts.
|
||||
"""
|
||||
|
||||
# Author: Erez Shinan (2017)
|
||||
# Email : erezshin@gmail.com
|
||||
|
||||
from typing import Dict, Set, Iterator, Tuple, List, TypeVar, Generic
|
||||
from collections import defaultdict
|
||||
|
||||
from ..utils import classify, classify_bool, bfs, fzset, Enumerator, logger
|
||||
from ..exceptions import GrammarError
|
||||
|
||||
from .grammar_analysis import GrammarAnalyzer, Terminal, LR0ItemSet, RulePtr, State
|
||||
from ..grammar import Rule, Symbol
|
||||
from ..common import ParserConf
|
||||
|
||||
###{standalone
|
||||
|
||||
class Action:
|
||||
def __init__(self, name):
|
||||
self.name = name
|
||||
def __str__(self):
|
||||
return self.name
|
||||
def __repr__(self):
|
||||
return str(self)
|
||||
|
||||
Shift = Action('Shift')
|
||||
Reduce = Action('Reduce')
|
||||
|
||||
StateT = TypeVar("StateT")
|
||||
|
||||
class ParseTableBase(Generic[StateT]):
|
||||
states: Dict[StateT, Dict[str, Tuple]]
|
||||
start_states: Dict[str, StateT]
|
||||
end_states: Dict[str, StateT]
|
||||
|
||||
def __init__(self, states, start_states, end_states):
|
||||
self.states = states
|
||||
self.start_states = start_states
|
||||
self.end_states = end_states
|
||||
|
||||
def serialize(self, memo):
|
||||
tokens = Enumerator()
|
||||
|
||||
states = {
|
||||
state: {tokens.get(token): ((1, arg.serialize(memo)) if action is Reduce else (0, arg))
|
||||
for token, (action, arg) in actions.items()}
|
||||
for state, actions in self.states.items()
|
||||
}
|
||||
|
||||
return {
|
||||
'tokens': tokens.reversed(),
|
||||
'states': states,
|
||||
'start_states': self.start_states,
|
||||
'end_states': self.end_states,
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def deserialize(cls, data, memo):
|
||||
tokens = data['tokens']
|
||||
states = {
|
||||
state: {tokens[token]: ((Reduce, Rule.deserialize(arg, memo)) if action==1 else (Shift, arg))
|
||||
for token, (action, arg) in actions.items()}
|
||||
for state, actions in data['states'].items()
|
||||
}
|
||||
return cls(states, data['start_states'], data['end_states'])
|
||||
|
||||
class ParseTable(ParseTableBase['State']):
|
||||
"""Parse-table whose key is State, i.e. set[RulePtr]
|
||||
|
||||
Slower than IntParseTable, but useful for debugging
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
class IntParseTable(ParseTableBase[int]):
|
||||
"""Parse-table whose key is int. Best for performance."""
|
||||
|
||||
@classmethod
|
||||
def from_ParseTable(cls, parse_table: ParseTable):
|
||||
enum = list(parse_table.states)
|
||||
state_to_idx: Dict['State', int] = {s:i for i,s in enumerate(enum)}
|
||||
int_states = {}
|
||||
|
||||
for s, la in parse_table.states.items():
|
||||
la = {k:(v[0], state_to_idx[v[1]]) if v[0] is Shift else v
|
||||
for k,v in la.items()}
|
||||
int_states[ state_to_idx[s] ] = la
|
||||
|
||||
|
||||
start_states = {start:state_to_idx[s] for start, s in parse_table.start_states.items()}
|
||||
end_states = {start:state_to_idx[s] for start, s in parse_table.end_states.items()}
|
||||
return cls(int_states, start_states, end_states)
|
||||
|
||||
###}
|
||||
|
||||
|
||||
# digraph and traverse, see The Theory and Practice of Compiler Writing
|
||||
|
||||
# computes F(x) = G(x) union (union { G(y) | x R y })
|
||||
# X: nodes
|
||||
# R: relation (function mapping node -> list of nodes that satisfy the relation)
|
||||
# G: set valued function
|
||||
def digraph(X, R, G):
|
||||
F = {}
|
||||
S = []
|
||||
N = dict.fromkeys(X, 0)
|
||||
for x in X:
|
||||
# this is always true for the first iteration, but N[x] may be updated in traverse below
|
||||
if N[x] == 0:
|
||||
traverse(x, S, N, X, R, G, F)
|
||||
return F
|
||||
|
||||
# x: single node
|
||||
# S: stack
|
||||
# N: weights
|
||||
# X: nodes
|
||||
# R: relation (see above)
|
||||
# G: set valued function
|
||||
# F: set valued function we are computing (map of input -> output)
|
||||
def traverse(x, S, N, X, R, G, F):
|
||||
S.append(x)
|
||||
d = len(S)
|
||||
N[x] = d
|
||||
F[x] = G[x]
|
||||
for y in R[x]:
|
||||
if N[y] == 0:
|
||||
traverse(y, S, N, X, R, G, F)
|
||||
n_x = N[x]
|
||||
assert(n_x > 0)
|
||||
n_y = N[y]
|
||||
assert(n_y != 0)
|
||||
if (n_y > 0) and (n_y < n_x):
|
||||
N[x] = n_y
|
||||
F[x].update(F[y])
|
||||
if N[x] == d:
|
||||
f_x = F[x]
|
||||
while True:
|
||||
z = S.pop()
|
||||
N[z] = -1
|
||||
F[z] = f_x
|
||||
if z == x:
|
||||
break
|
||||
|
||||
|
||||
class LALR_Analyzer(GrammarAnalyzer):
|
||||
lr0_itemsets: Set[LR0ItemSet]
|
||||
nonterminal_transitions: List[Tuple[LR0ItemSet, Symbol]]
|
||||
lookback: Dict[Tuple[LR0ItemSet, Symbol], Set[Tuple[LR0ItemSet, Rule]]]
|
||||
includes: Dict[Tuple[LR0ItemSet, Symbol], Set[Tuple[LR0ItemSet, Symbol]]]
|
||||
reads: Dict[Tuple[LR0ItemSet, Symbol], Set[Tuple[LR0ItemSet, Symbol]]]
|
||||
directly_reads: Dict[Tuple[LR0ItemSet, Symbol], Set[Symbol]]
|
||||
|
||||
|
||||
def __init__(self, parser_conf: ParserConf, debug: bool=False, strict: bool=False):
|
||||
GrammarAnalyzer.__init__(self, parser_conf, debug, strict)
|
||||
self.nonterminal_transitions = []
|
||||
self.directly_reads = defaultdict(set)
|
||||
self.reads = defaultdict(set)
|
||||
self.includes = defaultdict(set)
|
||||
self.lookback = defaultdict(set)
|
||||
|
||||
|
||||
def compute_lr0_states(self) -> None:
|
||||
self.lr0_itemsets = set()
|
||||
# map of kernels to LR0ItemSets
|
||||
cache: Dict['State', LR0ItemSet] = {}
|
||||
|
||||
def step(state: LR0ItemSet) -> Iterator[LR0ItemSet]:
|
||||
_, unsat = classify_bool(state.closure, lambda rp: rp.is_satisfied)
|
||||
|
||||
d = classify(unsat, lambda rp: rp.next)
|
||||
for sym, rps in d.items():
|
||||
kernel = fzset({rp.advance(sym) for rp in rps})
|
||||
new_state = cache.get(kernel, None)
|
||||
if new_state is None:
|
||||
closure = set(kernel)
|
||||
for rp in kernel:
|
||||
if not rp.is_satisfied and not rp.next.is_term:
|
||||
closure |= self.expand_rule(rp.next, self.lr0_rules_by_origin)
|
||||
new_state = LR0ItemSet(kernel, closure)
|
||||
cache[kernel] = new_state
|
||||
|
||||
state.transitions[sym] = new_state
|
||||
yield new_state
|
||||
|
||||
self.lr0_itemsets.add(state)
|
||||
|
||||
for _ in bfs(self.lr0_start_states.values(), step):
|
||||
pass
|
||||
|
||||
def compute_reads_relations(self):
|
||||
# handle start state
|
||||
for root in self.lr0_start_states.values():
|
||||
assert(len(root.kernel) == 1)
|
||||
for rp in root.kernel:
|
||||
assert(rp.index == 0)
|
||||
self.directly_reads[(root, rp.next)] = set([ Terminal('$END') ])
|
||||
|
||||
for state in self.lr0_itemsets:
|
||||
seen = set()
|
||||
for rp in state.closure:
|
||||
if rp.is_satisfied:
|
||||
continue
|
||||
s = rp.next
|
||||
# if s is a not a nonterminal
|
||||
if s not in self.lr0_rules_by_origin:
|
||||
continue
|
||||
if s in seen:
|
||||
continue
|
||||
seen.add(s)
|
||||
nt = (state, s)
|
||||
self.nonterminal_transitions.append(nt)
|
||||
dr = self.directly_reads[nt]
|
||||
r = self.reads[nt]
|
||||
next_state = state.transitions[s]
|
||||
for rp2 in next_state.closure:
|
||||
if rp2.is_satisfied:
|
||||
continue
|
||||
s2 = rp2.next
|
||||
# if s2 is a terminal
|
||||
if s2 not in self.lr0_rules_by_origin:
|
||||
dr.add(s2)
|
||||
if s2 in self.NULLABLE:
|
||||
r.add((next_state, s2))
|
||||
|
||||
def compute_includes_lookback(self):
|
||||
for nt in self.nonterminal_transitions:
|
||||
state, nonterminal = nt
|
||||
includes = []
|
||||
lookback = self.lookback[nt]
|
||||
for rp in state.closure:
|
||||
if rp.rule.origin != nonterminal:
|
||||
continue
|
||||
# traverse the states for rp(.rule)
|
||||
state2 = state
|
||||
for i in range(rp.index, len(rp.rule.expansion)):
|
||||
s = rp.rule.expansion[i]
|
||||
nt2 = (state2, s)
|
||||
state2 = state2.transitions[s]
|
||||
if nt2 not in self.reads:
|
||||
continue
|
||||
for j in range(i + 1, len(rp.rule.expansion)):
|
||||
if rp.rule.expansion[j] not in self.NULLABLE:
|
||||
break
|
||||
else:
|
||||
includes.append(nt2)
|
||||
# state2 is at the final state for rp.rule
|
||||
if rp.index == 0:
|
||||
for rp2 in state2.closure:
|
||||
if (rp2.rule == rp.rule) and rp2.is_satisfied:
|
||||
lookback.add((state2, rp2.rule))
|
||||
for nt2 in includes:
|
||||
self.includes[nt2].add(nt)
|
||||
|
||||
def compute_lookaheads(self):
|
||||
read_sets = digraph(self.nonterminal_transitions, self.reads, self.directly_reads)
|
||||
follow_sets = digraph(self.nonterminal_transitions, self.includes, read_sets)
|
||||
|
||||
for nt, lookbacks in self.lookback.items():
|
||||
for state, rule in lookbacks:
|
||||
for s in follow_sets[nt]:
|
||||
state.lookaheads[s].add(rule)
|
||||
|
||||
def compute_lalr1_states(self) -> None:
|
||||
m: Dict[LR0ItemSet, Dict[str, Tuple]] = {}
|
||||
reduce_reduce = []
|
||||
for itemset in self.lr0_itemsets:
|
||||
actions: Dict[Symbol, Tuple] = {la: (Shift, next_state.closure)
|
||||
for la, next_state in itemset.transitions.items()}
|
||||
for la, rules in itemset.lookaheads.items():
|
||||
if len(rules) > 1:
|
||||
# Try to resolve conflict based on priority
|
||||
p = [(r.options.priority or 0, r) for r in rules]
|
||||
p.sort(key=lambda r: r[0], reverse=True)
|
||||
best, second_best = p[:2]
|
||||
if best[0] > second_best[0]:
|
||||
rules = {best[1]}
|
||||
else:
|
||||
reduce_reduce.append((itemset, la, rules))
|
||||
continue
|
||||
|
||||
rule ,= rules
|
||||
if la in actions:
|
||||
if self.strict:
|
||||
msg = f'Shift/Reduce conflict for terminal {la.name}. [strict-mode]\n' \
|
||||
f' * {rule}\n'
|
||||
raise GrammarError(msg)
|
||||
elif self.debug:
|
||||
logger.warning('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name)
|
||||
logger.warning(' * %s', rule)
|
||||
else:
|
||||
logger.debug('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name)
|
||||
logger.debug(' * %s', rule)
|
||||
else:
|
||||
actions[la] = (Reduce, rule)
|
||||
m[itemset] = { k.name: v for k, v in actions.items() }
|
||||
|
||||
if reduce_reduce:
|
||||
msgs = []
|
||||
for itemset, la, rules in reduce_reduce:
|
||||
msg = 'Reduce/Reduce collision in %s between the following rules: %s' % (la, ''.join([ '\n\t- ' + str(r) for r in rules ]))
|
||||
if self.debug:
|
||||
msg += '\n collision occurred in state: {%s\n }' % ''.join(['\n\t' + str(x) for x in itemset.closure])
|
||||
msgs.append(msg)
|
||||
raise GrammarError('\n\n'.join(msgs))
|
||||
|
||||
states = { k.closure: v for k, v in m.items() }
|
||||
|
||||
# compute end states
|
||||
end_states: Dict[str, 'State'] = {}
|
||||
for state in states:
|
||||
for rp in state:
|
||||
for start in self.lr0_start_states:
|
||||
if rp.rule.origin.name == ('$root_' + start) and rp.is_satisfied:
|
||||
assert start not in end_states
|
||||
end_states[start] = state
|
||||
|
||||
start_states = { start: state.closure for start, state in self.lr0_start_states.items() }
|
||||
_parse_table = ParseTable(states, start_states, end_states)
|
||||
|
||||
if self.debug:
|
||||
self.parse_table = _parse_table
|
||||
else:
|
||||
self.parse_table = IntParseTable.from_ParseTable(_parse_table)
|
||||
|
||||
def compute_lalr(self):
|
||||
self.compute_lr0_states()
|
||||
self.compute_reads_relations()
|
||||
self.compute_includes_lookback()
|
||||
self.compute_lookaheads()
|
||||
self.compute_lalr1_states()
|
||||
@@ -0,0 +1,158 @@
|
||||
# This module provides a LALR interactive parser, which is used for debugging and error handling
|
||||
|
||||
from typing import Iterator, List
|
||||
from copy import copy
|
||||
import warnings
|
||||
|
||||
from lark.exceptions import UnexpectedToken
|
||||
from lark.lexer import Token, LexerThread
|
||||
from .lalr_parser_state import ParserState
|
||||
|
||||
###{standalone
|
||||
|
||||
class InteractiveParser:
|
||||
"""InteractiveParser gives you advanced control over parsing and error handling when parsing with LALR.
|
||||
|
||||
For a simpler interface, see the ``on_error`` argument to ``Lark.parse()``.
|
||||
"""
|
||||
def __init__(self, parser, parser_state: ParserState, lexer_thread: LexerThread):
|
||||
self.parser = parser
|
||||
self.parser_state = parser_state
|
||||
self.lexer_thread = lexer_thread
|
||||
self.result = None
|
||||
|
||||
@property
|
||||
def lexer_state(self) -> LexerThread:
|
||||
warnings.warn("lexer_state will be removed in subsequent releases. Use lexer_thread instead.", DeprecationWarning)
|
||||
return self.lexer_thread
|
||||
|
||||
def feed_token(self, token: Token):
|
||||
"""Feed the parser with a token, and advance it to the next state, as if it received it from the lexer.
|
||||
|
||||
Note that ``token`` has to be an instance of ``Token``.
|
||||
"""
|
||||
return self.parser_state.feed_token(token, token.type == '$END')
|
||||
|
||||
def iter_parse(self) -> Iterator[Token]:
|
||||
"""Step through the different stages of the parse, by reading tokens from the lexer
|
||||
and feeding them to the parser, one per iteration.
|
||||
|
||||
Returns an iterator of the tokens it encounters.
|
||||
|
||||
When the parse is over, the resulting tree can be found in ``InteractiveParser.result``.
|
||||
"""
|
||||
for token in self.lexer_thread.lex(self.parser_state):
|
||||
yield token
|
||||
self.result = self.feed_token(token)
|
||||
|
||||
def exhaust_lexer(self) -> List[Token]:
|
||||
"""Try to feed the rest of the lexer state into the interactive parser.
|
||||
|
||||
Note that this modifies the instance in place and does not feed an '$END' Token
|
||||
"""
|
||||
return list(self.iter_parse())
|
||||
|
||||
|
||||
def feed_eof(self, last_token=None):
|
||||
"""Feed a '$END' Token. Borrows from 'last_token' if given."""
|
||||
eof = Token.new_borrow_pos('$END', '', last_token) if last_token is not None else self.lexer_thread._Token('$END', '', 0, 1, 1)
|
||||
return self.feed_token(eof)
|
||||
|
||||
|
||||
def __copy__(self):
|
||||
"""Create a new interactive parser with a separate state.
|
||||
|
||||
Calls to feed_token() won't affect the old instance, and vice-versa.
|
||||
"""
|
||||
return self.copy()
|
||||
|
||||
def copy(self, deepcopy_values=True):
|
||||
return type(self)(
|
||||
self.parser,
|
||||
self.parser_state.copy(deepcopy_values=deepcopy_values),
|
||||
copy(self.lexer_thread),
|
||||
)
|
||||
|
||||
def __eq__(self, other):
|
||||
if not isinstance(other, InteractiveParser):
|
||||
return False
|
||||
|
||||
return self.parser_state == other.parser_state and self.lexer_thread == other.lexer_thread
|
||||
|
||||
def as_immutable(self):
|
||||
"""Convert to an ``ImmutableInteractiveParser``."""
|
||||
p = copy(self)
|
||||
return ImmutableInteractiveParser(p.parser, p.parser_state, p.lexer_thread)
|
||||
|
||||
def pretty(self):
|
||||
"""Print the output of ``choices()`` in a way that's easier to read."""
|
||||
out = ["Parser choices:"]
|
||||
for k, v in self.choices().items():
|
||||
out.append('\t- %s -> %r' % (k, v))
|
||||
out.append('stack size: %s' % len(self.parser_state.state_stack))
|
||||
return '\n'.join(out)
|
||||
|
||||
def choices(self):
|
||||
"""Returns a dictionary of token types, matched to their action in the parser.
|
||||
|
||||
Only returns token types that are accepted by the current state.
|
||||
|
||||
Updated by ``feed_token()``.
|
||||
"""
|
||||
return self.parser_state.parse_conf.parse_table.states[self.parser_state.position]
|
||||
|
||||
def accepts(self):
|
||||
"""Returns the set of possible tokens that will advance the parser into a new valid state."""
|
||||
accepts = set()
|
||||
conf_no_callbacks = copy(self.parser_state.parse_conf)
|
||||
# We don't want to call callbacks here since those might have arbitrary side effects
|
||||
# and are unnecessarily slow.
|
||||
conf_no_callbacks.callbacks = {}
|
||||
for t in self.choices():
|
||||
if t.isupper(): # is terminal?
|
||||
new_cursor = self.copy(deepcopy_values=False)
|
||||
new_cursor.parser_state.parse_conf = conf_no_callbacks
|
||||
try:
|
||||
new_cursor.feed_token(self.lexer_thread._Token(t, ''))
|
||||
except UnexpectedToken:
|
||||
pass
|
||||
else:
|
||||
accepts.add(t)
|
||||
return accepts
|
||||
|
||||
def resume_parse(self):
|
||||
"""Resume automated parsing from the current state.
|
||||
"""
|
||||
return self.parser.parse_from_state(self.parser_state, last_token=self.lexer_thread.state.last_token)
|
||||
|
||||
|
||||
|
||||
class ImmutableInteractiveParser(InteractiveParser):
|
||||
"""Same as ``InteractiveParser``, but operations create a new instance instead
|
||||
of changing it in-place.
|
||||
"""
|
||||
|
||||
result = None
|
||||
|
||||
def __hash__(self):
|
||||
return hash((self.parser_state, self.lexer_thread))
|
||||
|
||||
def feed_token(self, token):
|
||||
c = copy(self)
|
||||
c.result = InteractiveParser.feed_token(c, token)
|
||||
return c
|
||||
|
||||
def exhaust_lexer(self):
|
||||
"""Try to feed the rest of the lexer state into the parser.
|
||||
|
||||
Note that this returns a new ImmutableInteractiveParser and does not feed an '$END' Token"""
|
||||
cursor = self.as_mutable()
|
||||
cursor.exhaust_lexer()
|
||||
return cursor.as_immutable()
|
||||
|
||||
def as_mutable(self):
|
||||
"""Convert to an ``InteractiveParser``."""
|
||||
p = copy(self)
|
||||
return InteractiveParser(p.parser, p.parser_state, p.lexer_thread)
|
||||
|
||||
###}
|
||||
@@ -0,0 +1,122 @@
|
||||
"""This module implements a LALR(1) Parser
|
||||
"""
|
||||
# Author: Erez Shinan (2017)
|
||||
# Email : erezshin@gmail.com
|
||||
from typing import Dict, Any, Optional
|
||||
from ..lexer import Token, LexerThread
|
||||
from ..utils import Serialize
|
||||
from ..common import ParserConf, ParserCallbacks
|
||||
|
||||
from .lalr_analysis import LALR_Analyzer, IntParseTable, ParseTableBase
|
||||
from .lalr_interactive_parser import InteractiveParser
|
||||
from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken
|
||||
from .lalr_parser_state import ParserState, ParseConf
|
||||
|
||||
###{standalone
|
||||
|
||||
class LALR_Parser(Serialize):
|
||||
def __init__(self, parser_conf: ParserConf, debug: bool=False, strict: bool=False):
|
||||
analysis = LALR_Analyzer(parser_conf, debug=debug, strict=strict)
|
||||
analysis.compute_lalr()
|
||||
callbacks = parser_conf.callbacks
|
||||
|
||||
self._parse_table = analysis.parse_table
|
||||
self.parser_conf = parser_conf
|
||||
self.parser = _Parser(analysis.parse_table, callbacks, debug)
|
||||
|
||||
@classmethod
|
||||
def deserialize(cls, data, memo, callbacks, debug=False):
|
||||
inst = cls.__new__(cls)
|
||||
inst._parse_table = IntParseTable.deserialize(data, memo)
|
||||
inst.parser = _Parser(inst._parse_table, callbacks, debug)
|
||||
return inst
|
||||
|
||||
def serialize(self, memo: Any = None) -> Dict[str, Any]:
|
||||
return self._parse_table.serialize(memo)
|
||||
|
||||
def parse_interactive(self, lexer: LexerThread, start: str):
|
||||
return self.parser.parse(lexer, start, start_interactive=True)
|
||||
|
||||
def parse(self, lexer, start, on_error=None):
|
||||
try:
|
||||
return self.parser.parse(lexer, start)
|
||||
except UnexpectedInput as e:
|
||||
if on_error is None:
|
||||
raise
|
||||
|
||||
while True:
|
||||
if isinstance(e, UnexpectedCharacters):
|
||||
s = e.interactive_parser.lexer_thread.state
|
||||
p = s.line_ctr.char_pos
|
||||
|
||||
if not on_error(e):
|
||||
raise e
|
||||
|
||||
if isinstance(e, UnexpectedCharacters):
|
||||
# If user didn't change the character position, then we should
|
||||
if p == s.line_ctr.char_pos:
|
||||
s.line_ctr.feed(s.text.text[p:p+1])
|
||||
|
||||
try:
|
||||
return e.interactive_parser.resume_parse()
|
||||
except UnexpectedToken as e2:
|
||||
if (isinstance(e, UnexpectedToken)
|
||||
and e.token.type == e2.token.type == '$END'
|
||||
and e.interactive_parser == e2.interactive_parser):
|
||||
# Prevent infinite loop
|
||||
raise e2
|
||||
e = e2
|
||||
except UnexpectedCharacters as e2:
|
||||
e = e2
|
||||
|
||||
|
||||
class _Parser:
|
||||
parse_table: ParseTableBase
|
||||
callbacks: ParserCallbacks
|
||||
debug: bool
|
||||
|
||||
def __init__(self, parse_table: ParseTableBase, callbacks: ParserCallbacks, debug: bool=False):
|
||||
self.parse_table = parse_table
|
||||
self.callbacks = callbacks
|
||||
self.debug = debug
|
||||
|
||||
def parse(self, lexer: LexerThread, start: str, value_stack=None, state_stack=None, start_interactive=False):
|
||||
parse_conf = ParseConf(self.parse_table, self.callbacks, start)
|
||||
parser_state = ParserState(parse_conf, lexer, state_stack, value_stack)
|
||||
if start_interactive:
|
||||
return InteractiveParser(self, parser_state, parser_state.lexer)
|
||||
return self.parse_from_state(parser_state)
|
||||
|
||||
|
||||
def parse_from_state(self, state: ParserState, last_token: Optional[Token]=None):
|
||||
"""Run the main LALR parser loop
|
||||
|
||||
Parameters:
|
||||
state - the initial state. Changed in-place.
|
||||
last_token - Used only for line information in case of an empty lexer.
|
||||
"""
|
||||
try:
|
||||
token = last_token
|
||||
for token in state.lexer.lex(state):
|
||||
assert token is not None
|
||||
state.feed_token(token)
|
||||
|
||||
end_token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
|
||||
return state.feed_token(end_token, True)
|
||||
except UnexpectedInput as e:
|
||||
try:
|
||||
e.interactive_parser = InteractiveParser(self, state, state.lexer)
|
||||
except NameError:
|
||||
pass
|
||||
raise e
|
||||
except Exception as e:
|
||||
if self.debug:
|
||||
print("")
|
||||
print("STATE STACK DUMP")
|
||||
print("----------------")
|
||||
for i, s in enumerate(state.state_stack):
|
||||
print('%d)' % i , s)
|
||||
print("")
|
||||
|
||||
raise
|
||||
###}
|
||||
@@ -0,0 +1,110 @@
|
||||
from copy import deepcopy, copy
|
||||
from typing import Dict, Any, Generic, List
|
||||
from ..lexer import Token, LexerThread
|
||||
from ..common import ParserCallbacks
|
||||
|
||||
from .lalr_analysis import Shift, ParseTableBase, StateT
|
||||
from lark.exceptions import UnexpectedToken
|
||||
|
||||
###{standalone
|
||||
|
||||
class ParseConf(Generic[StateT]):
|
||||
__slots__ = 'parse_table', 'callbacks', 'start', 'start_state', 'end_state', 'states'
|
||||
|
||||
parse_table: ParseTableBase[StateT]
|
||||
callbacks: ParserCallbacks
|
||||
start: str
|
||||
|
||||
start_state: StateT
|
||||
end_state: StateT
|
||||
states: Dict[StateT, Dict[str, tuple]]
|
||||
|
||||
def __init__(self, parse_table: ParseTableBase[StateT], callbacks: ParserCallbacks, start: str):
|
||||
self.parse_table = parse_table
|
||||
|
||||
self.start_state = self.parse_table.start_states[start]
|
||||
self.end_state = self.parse_table.end_states[start]
|
||||
self.states = self.parse_table.states
|
||||
|
||||
self.callbacks = callbacks
|
||||
self.start = start
|
||||
|
||||
class ParserState(Generic[StateT]):
|
||||
__slots__ = 'parse_conf', 'lexer', 'state_stack', 'value_stack'
|
||||
|
||||
parse_conf: ParseConf[StateT]
|
||||
lexer: LexerThread
|
||||
state_stack: List[StateT]
|
||||
value_stack: list
|
||||
|
||||
def __init__(self, parse_conf: ParseConf[StateT], lexer: LexerThread, state_stack=None, value_stack=None):
|
||||
self.parse_conf = parse_conf
|
||||
self.lexer = lexer
|
||||
self.state_stack = state_stack or [self.parse_conf.start_state]
|
||||
self.value_stack = value_stack or []
|
||||
|
||||
@property
|
||||
def position(self) -> StateT:
|
||||
return self.state_stack[-1]
|
||||
|
||||
# Necessary for match_examples() to work
|
||||
def __eq__(self, other) -> bool:
|
||||
if not isinstance(other, ParserState):
|
||||
return NotImplemented
|
||||
return len(self.state_stack) == len(other.state_stack) and self.position == other.position
|
||||
|
||||
def __copy__(self):
|
||||
return self.copy()
|
||||
|
||||
def copy(self, deepcopy_values=True) -> 'ParserState[StateT]':
|
||||
return type(self)(
|
||||
self.parse_conf,
|
||||
self.lexer, # XXX copy
|
||||
copy(self.state_stack),
|
||||
deepcopy(self.value_stack) if deepcopy_values else copy(self.value_stack),
|
||||
)
|
||||
|
||||
def feed_token(self, token: Token, is_end=False) -> Any:
|
||||
state_stack = self.state_stack
|
||||
value_stack = self.value_stack
|
||||
states = self.parse_conf.states
|
||||
end_state = self.parse_conf.end_state
|
||||
callbacks = self.parse_conf.callbacks
|
||||
|
||||
while True:
|
||||
state = state_stack[-1]
|
||||
try:
|
||||
action, arg = states[state][token.type]
|
||||
except KeyError:
|
||||
expected = {s for s in states[state].keys() if s.isupper()}
|
||||
raise UnexpectedToken(token, expected, state=self, interactive_parser=None)
|
||||
|
||||
assert arg != end_state
|
||||
|
||||
if action is Shift:
|
||||
# shift once and return
|
||||
assert not is_end
|
||||
state_stack.append(arg)
|
||||
value_stack.append(token if token.type not in callbacks else callbacks[token.type](token))
|
||||
return
|
||||
else:
|
||||
# reduce+shift as many times as necessary
|
||||
rule = arg
|
||||
size = len(rule.expansion)
|
||||
if size:
|
||||
s = value_stack[-size:]
|
||||
del state_stack[-size:]
|
||||
del value_stack[-size:]
|
||||
else:
|
||||
s = []
|
||||
|
||||
value = callbacks[rule](s) if callbacks else s
|
||||
|
||||
_action, new_state = states[state_stack[-1]][rule.origin.name]
|
||||
assert _action is Shift
|
||||
state_stack.append(new_state)
|
||||
value_stack.append(value)
|
||||
|
||||
if is_end and state_stack[-1] == end_state:
|
||||
return value_stack[-1]
|
||||
###}
|
||||
@@ -0,0 +1,166 @@
|
||||
"""This module implements an Earley parser with a dynamic lexer
|
||||
|
||||
The core Earley algorithm used here is based on Elizabeth Scott's implementation, here:
|
||||
https://www.sciencedirect.com/science/article/pii/S1571066108001497
|
||||
|
||||
That is probably the best reference for understanding the algorithm here.
|
||||
|
||||
The Earley parser outputs an SPPF-tree as per that document. The SPPF tree format
|
||||
is better documented here:
|
||||
http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/
|
||||
|
||||
Instead of running a lexer beforehand, or using a costy char-by-char method, this parser
|
||||
uses regular expressions by necessity, achieving high-performance while maintaining all of
|
||||
Earley's power in parsing any CFG.
|
||||
"""
|
||||
|
||||
from typing import TYPE_CHECKING, Callable, Optional, List, Any
|
||||
from collections import defaultdict
|
||||
|
||||
from ..tree import Tree
|
||||
from ..exceptions import UnexpectedCharacters
|
||||
from ..lexer import Token
|
||||
from ..grammar import Terminal
|
||||
from .earley import Parser as BaseParser
|
||||
from .earley_forest import TokenNode
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from ..common import LexerConf, ParserConf
|
||||
|
||||
class Parser(BaseParser):
|
||||
def __init__(self, lexer_conf: 'LexerConf', parser_conf: 'ParserConf', term_matcher: Callable,
|
||||
resolve_ambiguity: bool=True, complete_lex: bool=False, debug: bool=False,
|
||||
tree_class: Optional[Callable[[str, List], Any]]=Tree, ordered_sets: bool=True):
|
||||
BaseParser.__init__(self, lexer_conf, parser_conf, term_matcher, resolve_ambiguity,
|
||||
debug, tree_class, ordered_sets)
|
||||
self.ignore = [Terminal(t) for t in lexer_conf.ignore]
|
||||
self.complete_lex = complete_lex
|
||||
|
||||
def _parse(self, stream, columns, to_scan, start_symbol=None):
|
||||
|
||||
def scan(i, to_scan):
|
||||
"""The core Earley Scanner.
|
||||
|
||||
This is a custom implementation of the scanner that uses the
|
||||
Lark lexer to match tokens. The scan list is built by the
|
||||
Earley predictor, based on the previously completed tokens.
|
||||
This ensures that at each phase of the parse we have a custom
|
||||
lexer context, allowing for more complex ambiguities."""
|
||||
|
||||
node_cache = {}
|
||||
|
||||
# 1) Loop the expectations and ask the lexer to match.
|
||||
# Since regexp is forward looking on the input stream, and we only
|
||||
# want to process tokens when we hit the point in the stream at which
|
||||
# they complete, we push all tokens into a buffer (delayed_matches), to
|
||||
# be held possibly for a later parse step when we reach the point in the
|
||||
# input stream at which they complete.
|
||||
for item in self.Set(to_scan):
|
||||
m = match(item.expect, stream, i)
|
||||
if m:
|
||||
t = Token(item.expect.name, m.group(0), i, text_line, text_column)
|
||||
delayed_matches[m.end()].append( (item, i, t) )
|
||||
|
||||
if self.complete_lex:
|
||||
s = m.group(0)
|
||||
for j in range(1, len(s)):
|
||||
m = match(item.expect, s[:-j])
|
||||
if m:
|
||||
t = Token(item.expect.name, m.group(0), i, text_line, text_column)
|
||||
delayed_matches[i+m.end()].append( (item, i, t) )
|
||||
|
||||
# XXX The following 3 lines were commented out for causing a bug. See issue #768
|
||||
# # Remove any items that successfully matched in this pass from the to_scan buffer.
|
||||
# # This ensures we don't carry over tokens that already matched, if we're ignoring below.
|
||||
# to_scan.remove(item)
|
||||
|
||||
# 3) Process any ignores. This is typically used for e.g. whitespace.
|
||||
# We carry over any unmatched items from the to_scan buffer to be matched again after
|
||||
# the ignore. This should allow us to use ignored symbols in non-terminals to implement
|
||||
# e.g. mandatory spacing.
|
||||
for x in self.ignore:
|
||||
m = match(x, stream, i)
|
||||
if m:
|
||||
# Carry over any items still in the scan buffer, to past the end of the ignored items.
|
||||
delayed_matches[m.end()].extend([(item, i, None) for item in to_scan ])
|
||||
|
||||
# If we're ignoring up to the end of the file, # carry over the start symbol if it already completed.
|
||||
delayed_matches[m.end()].extend([(item, i, None) for item in columns[i] if item.is_complete and item.s == start_symbol])
|
||||
|
||||
next_to_scan = self.Set()
|
||||
next_set = self.Set()
|
||||
columns.append(next_set)
|
||||
transitives.append({})
|
||||
|
||||
## 4) Process Tokens from delayed_matches.
|
||||
# This is the core of the Earley scanner. Create an SPPF node for each Token,
|
||||
# and create the symbol node in the SPPF tree. Advance the item that completed,
|
||||
# and add the resulting new item to either the Earley set (for processing by the
|
||||
# completer/predictor) or the to_scan buffer for the next parse step.
|
||||
for item, start, token in delayed_matches[i+1]:
|
||||
if token is not None:
|
||||
token.end_line = text_line
|
||||
token.end_column = text_column + 1
|
||||
token.end_pos = i + 1
|
||||
|
||||
new_item = item.advance()
|
||||
label = (new_item.s, new_item.start, i + 1)
|
||||
token_node = TokenNode(token, terminals[token.type])
|
||||
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
|
||||
new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token_node)
|
||||
else:
|
||||
new_item = item
|
||||
|
||||
if new_item.expect in self.TERMINALS:
|
||||
# add (B ::= Aai+1.B, h, y) to Q'
|
||||
next_to_scan.add(new_item)
|
||||
else:
|
||||
# add (B ::= Aa+1.B, h, y) to Ei+1
|
||||
next_set.add(new_item)
|
||||
|
||||
del delayed_matches[i+1] # No longer needed, so unburden memory
|
||||
|
||||
if not next_set and not delayed_matches and not next_to_scan:
|
||||
considered_rules = list(sorted(to_scan, key=lambda key: key.rule.origin.name))
|
||||
raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect.name for item in to_scan},
|
||||
set(to_scan), state=frozenset(i.s for i in to_scan),
|
||||
considered_rules=considered_rules
|
||||
)
|
||||
|
||||
return next_to_scan, node_cache
|
||||
|
||||
|
||||
delayed_matches = defaultdict(list)
|
||||
match = self.term_matcher
|
||||
terminals = self.lexer_conf.terminals_by_name
|
||||
|
||||
# Cache for nodes & tokens created in a particular parse step.
|
||||
transitives = [{}]
|
||||
|
||||
text_line = 1
|
||||
text_column = 1
|
||||
|
||||
## The main Earley loop.
|
||||
# Run the Prediction/Completion cycle for any Items in the current Earley set.
|
||||
# Completions will be added to the SPPF tree, and predictions will be recursively
|
||||
# processed down to terminals/empty nodes to be added to the scanner for the next
|
||||
# step.
|
||||
i = 0
|
||||
node_cache = {}
|
||||
for token in stream:
|
||||
self.predict_and_complete(i, to_scan, columns, transitives, node_cache)
|
||||
|
||||
to_scan, node_cache = scan(i, to_scan)
|
||||
|
||||
if token == '\n':
|
||||
text_line += 1
|
||||
text_column = 1
|
||||
else:
|
||||
text_column += 1
|
||||
i += 1
|
||||
|
||||
self.predict_and_complete(i, to_scan, columns, transitives, node_cache)
|
||||
|
||||
## Column is now the final column in the parse.
|
||||
assert i == len(columns)-1
|
||||
return to_scan
|
||||
Reference in New Issue
Block a user