First Commit
This commit is contained in:
@@ -0,0 +1,39 @@
|
||||
from .exceptions import (
|
||||
GrammarError,
|
||||
LarkError,
|
||||
LexError,
|
||||
ParseError,
|
||||
UnexpectedCharacters,
|
||||
UnexpectedEOF,
|
||||
UnexpectedInput,
|
||||
UnexpectedToken,
|
||||
)
|
||||
from .lark import Lark
|
||||
from .lexer import Token
|
||||
from .tree import ParseTree, Tree
|
||||
from .utils import logger, TextSlice
|
||||
from .visitors import Discard, Transformer, Transformer_NonRecursive, Visitor, v_args
|
||||
|
||||
# Version string of this package.
__version__: str = "1.3.1"

# Names exported by ``from <package> import *``; every entry is imported at the
# top of this module.
__all__ = (
    # imported from .exceptions
    "GrammarError",
    "LarkError",
    "LexError",
    "ParseError",
    "UnexpectedCharacters",
    "UnexpectedEOF",
    "UnexpectedInput",
    "UnexpectedToken",
    # imported from .lark, .lexer and .tree
    "Lark",
    "Token",
    "ParseTree",
    "Tree",
    # imported from .utils
    "logger",
    # imported from .visitors (except TextSlice, which comes from .utils)
    "Discard",
    "Transformer",
    "Transformer_NonRecursive",
    "TextSlice",
    "Visitor",
    "v_args",
)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,6 @@
|
||||
# For usage of lark with PyInstaller. See https://pyinstaller-sample-hook.readthedocs.io/en/latest/index.html
|
||||
|
||||
import os
|
||||
|
||||
def get_hook_dirs():
    """Return a one-element list holding the directory that contains this module."""
    hook_dir = os.path.dirname(__file__)
    return [hook_dir]
|
||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,14 @@
|
||||
#-----------------------------------------------------------------------------
|
||||
# Copyright (c) 2017-2020, PyInstaller Development Team.
|
||||
#
|
||||
# Distributed under the terms of the GNU General Public License (version 2
|
||||
# or later) with exception for distributing the bootloader.
|
||||
#
|
||||
# The full license is in the file COPYING.txt, distributed with this software.
|
||||
#
|
||||
# SPDX-License-Identifier: (GPL-2.0-or-later WITH Bootloader-exception)
|
||||
#-----------------------------------------------------------------------------
|
||||
|
||||
from PyInstaller.utils.hooks import collect_data_files
|
||||
|
||||
# Data files of the 'lark' package, as gathered by PyInstaller's
# collect_data_files helper.
# NOTE(review): presumably read by PyInstaller as this hook's ``datas`` global -
# confirm against the PyInstaller hook documentation.
datas = collect_data_files('lark')
|
||||
@@ -0,0 +1,59 @@
|
||||
"""
|
||||
Module of utilities for transforming a lark.Tree into a custom Abstract Syntax Tree (AST defined in classes)
|
||||
"""
|
||||
|
||||
import inspect, re
|
||||
import types
|
||||
from typing import Optional, Callable
|
||||
|
||||
from lark import Transformer, v_args
|
||||
|
||||
class Ast:
    """Marker base class for AST node classes.

    `create_transformer()` collects the subclasses of this class.
    """
|
||||
|
||||
class AsList:
    """Marker base class.

    A subclass receives the parse results as one list argument when it is
    instantiated, rather than as separate positional arguments.
    """
    pass
|
||||
|
||||
class WithMeta:
    """Marker base class.

    A subclass is instantiated with the Meta instance of the tree
    (see ``v_args`` for more detail).
    """
|
||||
|
||||
def camel_to_snake(name):
    """Convert a CamelCase name to snake_case, e.g. "CodeBlock" -> "code_block".

    An underscore is inserted before every uppercase ASCII letter that is not
    the first character, then the whole string is lower-cased.
    """
    with_underscores = re.sub(r'(?<!^)(?=[A-Z])', '_', name)
    return with_underscores.lower()
|
||||
|
||||
def create_transformer(ast_module: types.ModuleType,
                       transformer: Optional[Transformer]=None,
                       decorator_factory: Callable=v_args) -> Transformer:
    """Build a Lark transformer that instantiates the `Ast` subclasses found in a module.

    Every public `Ast` subclass of ``ast_module`` becomes a transformer attribute
    whose name is the class name converted from CamelCase to snake_case
    (for example "CodeBlock" -> "code_block").

    Classes whose name starts with an underscore (`_`) are skipped.

    Parameters:
        ast_module: A Python module containing all the subclasses of ``ast_utils.Ast``
        transformer (Optional[Transformer]): An initial transformer. Its attributes may be overwritten.
        decorator_factory (Callable): An optional callable accepting two booleans, inline, and meta,
            and returning a decorator for the methods of ``transformer``. (default: ``v_args``).

    Returns:
        The given ``transformer`` (or a fresh ``Transformer``) with the new attributes set.
    """
    target = transformer or Transformer()

    for cls_name, candidate in inspect.getmembers(ast_module):
        # Only public classes are considered.
        if cls_name.startswith('_') or not inspect.isclass(candidate):
            continue
        if not issubclass(candidate, Ast):
            continue

        # AsList subclasses take the children as one list (inline=False);
        # WithMeta subclasses additionally receive the tree's meta.
        decorate = decorator_factory(
            inline=not issubclass(candidate, AsList),
            meta=issubclass(candidate, WithMeta),
        )
        bound_callback = decorate(candidate).__get__(target)
        setattr(target, camel_to_snake(cls_name), bound_callback)

    return target
|
||||
@@ -0,0 +1,86 @@
|
||||
from copy import deepcopy
|
||||
import sys
|
||||
from types import ModuleType
|
||||
from typing import Callable, Collection, Dict, Optional, TYPE_CHECKING, List
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .lark import PostLex
|
||||
from .lexer import Lexer
|
||||
from .grammar import Rule
|
||||
from typing import Union, Type
|
||||
from typing import Literal
|
||||
if sys.version_info >= (3, 10):
|
||||
from typing import TypeAlias
|
||||
else:
|
||||
from typing_extensions import TypeAlias
|
||||
|
||||
from .utils import Serialize
|
||||
from .lexer import TerminalDef, Token
|
||||
|
||||
###{standalone
|
||||
|
||||
# String-literal alias: one of the listed parser names. Used below as the type
# of ``ParserConf.parser_type``.
_ParserArgType: 'TypeAlias' = 'Literal["earley", "lalr", "cyk", "auto"]'
# String-literal alias: one of the listed lexer names, or a ``Lexer`` class.
# Used below as the type of ``LexerConf.lexer_type``.
_LexerArgType: 'TypeAlias' = 'Union[Literal["auto", "basic", "contextual", "dynamic", "dynamic_complete"], Type[Lexer]]'
# A callable that receives a Token and returns a Token; the value type of
# ``LexerConf.callbacks``.
_LexerCallback = Callable[[Token], Token]
# Maps a string key to a callable; the type of ``ParserConf.callbacks``.
ParserCallbacks = Dict[str, Callable]
|
||||
|
||||
class LexerConf(Serialize):
    """Container for the settings a lexer is built from.

    The fields named in ``__serialize_fields__`` are the ones handled by the
    ``Serialize`` base class. ``terminals_by_name`` is derived from
    ``terminals`` and is rebuilt in ``_deserialize``.
    """
    __serialize_fields__ = 'terminals', 'ignore', 'g_regex_flags', 'use_bytes', 'lexer_type'
    __serialize_namespace__ = TerminalDef,

    terminals: Collection[TerminalDef]
    re_module: ModuleType
    ignore: Collection[str]
    postlex: 'Optional[PostLex]'
    callbacks: Dict[str, _LexerCallback]
    g_regex_flags: int
    skip_validation: bool
    use_bytes: bool
    lexer_type: Optional[_LexerArgType]
    strict: bool

    def __init__(self, terminals: Collection[TerminalDef], re_module: ModuleType, ignore: Collection[str]=(), postlex: 'Optional[PostLex]'=None,
                 callbacks: Optional[Dict[str, _LexerCallback]]=None, g_regex_flags: int=0, skip_validation: bool=False, use_bytes: bool=False, strict: bool=False):
        """Store the given settings and index the terminals by name.

        Terminal names must be unique; this is checked with an assertion.
        ``lexer_type`` always starts as ``None`` and is assigned by the caller.
        """
        self.terminals = terminals
        self.terminals_by_name = {t.name: t for t in self.terminals}
        assert len(self.terminals) == len(self.terminals_by_name)
        self.ignore = ignore
        self.postlex = postlex
        self.callbacks = callbacks or {}
        self.g_regex_flags = g_regex_flags
        self.re_module = re_module
        self.skip_validation = skip_validation
        self.use_bytes = use_bytes
        self.strict = strict
        self.lexer_type = None

    def _deserialize(self):
        """Rebuild the name -> terminal index from ``terminals``."""
        # NOTE(review): presumably invoked by Serialize after the serialized
        # fields are restored - confirm against utils.Serialize.
        self.terminals_by_name = {t.name: t for t in self.terminals}

    def __deepcopy__(self, memo=None):
        """Return a deep copy that keeps every configuration field.

        ``re_module`` is a module object, so it is shared rather than copied.
        """
        duplicate = type(self)(
            deepcopy(self.terminals, memo),
            self.re_module,
            deepcopy(self.ignore, memo),
            deepcopy(self.postlex, memo),
            deepcopy(self.callbacks, memo),
            deepcopy(self.g_regex_flags, memo),
            deepcopy(self.skip_validation, memo),
            deepcopy(self.use_bytes, memo),
            # Previously omitted, which reset ``strict`` to False on the copy.
            strict=deepcopy(self.strict, memo),
        )
        # __init__ always resets lexer_type to None, so carry it over explicitly.
        duplicate.lexer_type = self.lexer_type
        return duplicate
|
||||
|
||||
class ParserConf(Serialize):
    """Container for the settings a parser is built from.

    ``parser_type`` is declared and serialized but is not assigned by
    ``__init__``; it has to be set on the instance by the caller.
    """
    __serialize_fields__ = 'rules', 'start', 'parser_type'

    rules: List['Rule']
    callbacks: ParserCallbacks
    start: List[str]
    parser_type: _ParserArgType

    def __init__(self, rules: List['Rule'], callbacks: ParserCallbacks, start: List[str]):
        """Store the rules, the callbacks and the list of start symbols."""
        # ``start`` must be a real list, not a single name.
        assert isinstance(start, list)
        self.rules, self.callbacks, self.start = rules, callbacks, start
|
||||
|
||||
###}
|
||||
@@ -0,0 +1,291 @@
|
||||
from .utils import logger, NO_VALUE
|
||||
from typing import Mapping, Iterable, Callable, Union, TypeVar, Tuple, Any, List, Set, Optional, Collection, TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .lexer import Token
|
||||
from .parsers.lalr_interactive_parser import InteractiveParser
|
||||
from .tree import Tree
|
||||
|
||||
###{standalone
|
||||
|
||||
class LarkError(Exception):
    """Root of the exception hierarchy defined in this module."""
|
||||
|
||||
|
||||
class ConfigurationError(LarkError, ValueError):
    """Both a LarkError and a ValueError; raised by ``assert_config`` for a rejected value."""
|
||||
|
||||
|
||||
def assert_config(value, options: Collection, msg='Got %r, expected one of %s'):
    """Raise ConfigurationError unless ``value`` is a member of ``options``.

    ``msg`` is a %-format template that receives ``(value, options)``.
    """
    if value in options:
        return
    raise ConfigurationError(msg % (value, options))
|
||||
|
||||
|
||||
class GrammarError(LarkError):
    """LarkError subclass with no extra behavior.

    Judging by its name it reports a problem with a grammar; the raise sites
    are outside this module.
    """
|
||||
|
||||
|
||||
class ParseError(LarkError):
    """LarkError subclass; a base class of ``UnexpectedEOF`` and ``UnexpectedToken``."""
|
||||
|
||||
|
||||
class LexError(LarkError):
    """LarkError subclass; a base class of ``UnexpectedCharacters``."""
|
||||
|
||||
# Type of the labels passed to, and returned by, UnexpectedInput.match_examples.
T = TypeVar('T')
|
||||
|
||||
class UnexpectedInput(LarkError):
    """UnexpectedInput Error.

    Used as a base class for the following exceptions:

    - ``UnexpectedCharacters``: The lexer encountered an unexpected string
    - ``UnexpectedToken``: The parser received an unexpected token
    - ``UnexpectedEOF``: The parser expected a token, but the input ended

    After catching one of these exceptions, you may call the following helper methods to create a nicer error message.
    """
    line: int
    column: int
    pos_in_stream = None
    state: Any
    _terminals_by_name = None
    interactive_parser: 'InteractiveParser'

    def get_context(self, text: str, span: int=40) -> str:
        """Returns a pretty string pinpointing the error in the text,
        with span amount of context characters around it.

        The result is the line fragment around ``pos_in_stream`` followed by a
        second line carrying a ``^`` marker under the error position.

        Note:
            The parser doesn't hold a copy of the text it has to parse,
            so you have to provide it again
        """
        pos = self.pos_in_stream or 0
        start = max(pos - span, 0)
        end = pos + span
        if not isinstance(text, bytes):
            # Keep only the part of the window that lies on the error's own line.
            before = text[start:pos].rsplit('\n', 1)[-1]
            after = text[pos:end].split('\n', 1)[0]
            return before + after + '\n' + ' ' * len(before.expandtabs()) + '^\n'
        else:
            before = text[start:pos].rsplit(b'\n', 1)[-1]
            after = text[pos:end].split(b'\n', 1)[0]
            return (before + after + b'\n' + b' ' * len(before.expandtabs()) + b'^\n').decode("ascii", "backslashreplace")

    def match_examples(self, parse_fn: 'Callable[[str], Tree]',
                             examples: Union[Mapping[T, Iterable[str]], Iterable[Tuple[T, Iterable[str]]]],
                             token_type_match_fallback: bool=False,
                             use_accepts: bool=True
                         ) -> Optional[T]:
        """Allows you to detect what's wrong in the input text by matching
        against example errors.

        Each malformed example is parsed with ``parse_fn``; the error it raises
        is compared with this one. The label of the best matching example is
        returned. An exact token match returns immediately; otherwise a
        token-type match (if enabled) is preferred over a plain
        same-parser-state match.

        For an example usage, see `examples/error_reporting_lalr.py`

        Parameters:
            parse_fn: parse function (usually ``lark_instance.parse``)
            examples: mapping of ``{label: [malformed_string, ...]}``, or an
                iterable of ``(label, [malformed_string, ...])`` pairs.
            token_type_match_fallback: when True, an example whose unexpected
                token has the same type as this error's token is accepted as
                a fallback match.
            use_accepts: Recommended to keep this as ``use_accepts=True``.

        Returns:
            The label of the matching example, or None when nothing matched.
        """
        assert self.state is not None, "Not supported for this exception"

        if isinstance(examples, Mapping):
            examples = examples.items()

        # (label, matched_by_token_type)
        candidate = (None, False)
        for i, (label, example) in enumerate(examples):
            assert not isinstance(example, str), "Expecting a list"

            for j, malformed in enumerate(example):
                try:
                    parse_fn(malformed)
                except UnexpectedInput as ut:
                    if ut.state == self.state:
                        if (
                            use_accepts
                            and isinstance(self, UnexpectedToken)
                            and isinstance(ut, UnexpectedToken)
                            and ut.accepts != self.accepts
                        ):
                            # ``state`` is typed Any and need not be an int, so it
                            # is formatted with %s; %d raised TypeError for
                            # non-integer states.
                            logger.debug("Different accepts with same state[%s]: %s != %s at example [%s][%s]",
                                         self.state, self.accepts, ut.accepts, i, j)
                            continue
                        if (
                            isinstance(self, (UnexpectedToken, UnexpectedEOF))
                            and isinstance(ut, (UnexpectedToken, UnexpectedEOF))
                        ):
                            if ut.token == self.token:  # Try exact match first
                                logger.debug("Exact Match at example [%s][%s]", i, j)
                                return label

                            if token_type_match_fallback:
                                # Fallback to token types match
                                if (ut.token.type == self.token.type) and not candidate[-1]:
                                    logger.debug("Token Type Fallback at example [%s][%s]", i, j)
                                    candidate = label, True

                        if candidate[0] is None:
                            logger.debug("Same State match at example [%s][%s]", i, j)
                            candidate = label, False

        return candidate[0]

    def _format_expected(self, expected):
        """Render the expected terminal names as a bulleted "Expected one of" list.

        Names found in ``_terminals_by_name`` are replaced by that terminal's
        ``user_repr()``; other names are shown unchanged.
        """
        if self._terminals_by_name:
            d = self._terminals_by_name
            expected = [d[t_name].user_repr() if t_name in d else t_name for t_name in expected]
        return "Expected one of: \n\t* %s\n" % '\n\t* '.join(expected)
|
||||
|
||||
|
||||
class UnexpectedEOF(ParseError, UnexpectedInput):
    """Raised by the parser when the input ends although a token is still expected.

    ``token`` is a placeholder ``<EOF>`` token; ``pos_in_stream``, ``line`` and
    ``column`` are all set to -1.
    """
    expected: 'List[Token]'

    def __init__(self, expected, state=None, terminals_by_name=None):
        super().__init__()

        # Imported here rather than at module level.
        from .lexer import Token

        self.expected = expected
        self.state = state
        self.token = Token("<EOF>", "")
        self.pos_in_stream = -1
        self.line = -1
        self.column = -1
        self._terminals_by_name = terminals_by_name

    def __str__(self):
        return "Unexpected end-of-input. " + self._format_expected(self.expected)
|
||||
|
||||
|
||||
class UnexpectedCharacters(LexError, UnexpectedInput):
    """Raised by the lexer when the upcoming characters match none of its terminals.

    The offending character is stored in ``char`` and a ready-made context
    snippet (see ``get_context``) in ``_context``.
    """

    allowed: Set[str]
    considered_tokens: Set[Any]

    def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None,
                 terminals_by_name=None, considered_rules=None):
        super().__init__()

        # TODO considered_tokens and allowed can be figured out using state
        self.pos_in_stream = lex_pos
        self.line = line
        self.column = column
        self.state = state
        self._terminals_by_name = terminals_by_name

        self.allowed = allowed
        self.considered_tokens = considered_tokens
        self.considered_rules = considered_rules
        self.token_history = token_history

        if not isinstance(seq, bytes):
            self.char = seq[lex_pos]
        else:
            # Slice (rather than index) so that the result is bytes and can be decoded.
            self.char = seq[lex_pos:lex_pos + 1].decode("ascii", "backslashreplace")
        # get_context reads self.pos_in_stream, which is already set above.
        self._context = self.get_context(seq)

    def __str__(self):
        parts = [
            "No terminal matches '%s' in the current parser context, at line %d col %d" % (self.char, self.line, self.column),
            '\n\n' + self._context,
        ]
        if self.allowed:
            parts.append(self._format_expected(self.allowed))
        if self.token_history:
            parts.append('\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in self.token_history))
        return ''.join(parts)
|
||||
|
||||
|
||||
class UnexpectedToken(ParseError, UnexpectedInput):
    """An exception that is raised by the parser, when the token it received
    doesn't match any valid step forward.

    Parameters:
        token: The mismatched token
        expected: The set of expected tokens
        considered_rules: Which rules were considered, to deduce the expected tokens
        state: A value representing the parser state. Do not rely on its value or type.
        interactive_parser: An instance of ``InteractiveParser``, that is initialized to the point of failure,
                            and can be used for debugging and error handling.

    Note: These parameters are available as attributes of the instance.
    """

    expected: Set[str]
    considered_rules: Set[str]

    def __init__(self, token, expected, considered_rules=None, state=None, interactive_parser=None, terminals_by_name=None, token_history=None):
        super().__init__()

        # TODO considered_rules and expected can be figured out using state
        # Position information is taken from the token when it carries any.
        self.line = getattr(token, 'line', '?')
        self.column = getattr(token, 'column', '?')
        self.pos_in_stream = getattr(token, 'start_pos', None)
        self.state = state

        self.token = token
        self.expected = expected  # XXX deprecate? `accepts` is better
        self._accepts = NO_VALUE  # computed on first access of ``accepts``
        self.considered_rules = considered_rules
        self.interactive_parser = interactive_parser
        self._terminals_by_name = terminals_by_name
        self.token_history = token_history

    @property
    def accepts(self) -> Set[str]:
        """Result of ``interactive_parser.accepts()``, computed once and cached.

        When there is no interactive parser, the falsy ``interactive_parser``
        value itself is cached and returned.
        """
        if self._accepts is NO_VALUE:
            parser = self.interactive_parser
            self._accepts = parser and parser.accepts()
        return self._accepts

    def __str__(self):
        expected_text = self._format_expected(self.accepts or self.expected)
        message = "Unexpected token %r at line %s, column %s.\n%s" % (
            self.token, self.line, self.column, expected_text)
        if self.token_history:
            message += "Previous tokens: %r\n" % self.token_history

        return message
|
||||
|
||||
|
||||
|
||||
class VisitError(LarkError):
    """VisitError is raised when visitors are interrupted by an exception

    It provides the following attributes for inspection:

    Parameters:
        rule: the name of the visit rule that failed
        obj: the tree-node or token that was being processed
        orig_exc: the exception that cause it to fail

    Note: These parameters are available as attributes
    """

    obj: 'Union[Tree, Token]'
    orig_exc: Exception

    def __init__(self, rule, obj, orig_exc):
        # The exception message embeds the rule name and the original exception's text.
        super().__init__('Error trying to process rule "%s":\n\n%s' % (rule, orig_exc))

        self.rule, self.obj, self.orig_exc = rule, obj, orig_exc
|
||||
|
||||
|
||||
class MissingVariableError(LarkError):
    """LarkError subclass with no extra behavior; its raise sites are outside this module."""
|
||||
|
||||
###}
|
||||
@@ -0,0 +1,136 @@
|
||||
from typing import Any, Dict, Optional, Tuple, ClassVar, Sequence
|
||||
|
||||
from .utils import Serialize
|
||||
|
||||
###{standalone
|
||||
# NOTE(review): not referenced elsewhere in this module; presumably the priority
# given to a token/terminal when none is specified - confirm against callers.
TOKEN_DEFAULT_PRIORITY = 0
|
||||
|
||||
|
||||
class Symbol(Serialize):
    """Base class for a named grammar symbol.

    Two symbols compare equal when both ``is_term`` and ``name`` match; the
    hash depends on ``name`` only. Subclasses set ``is_term``.
    """
    __slots__ = ('name',)

    name: str
    is_term: ClassVar[bool] = NotImplemented

    def __init__(self, name: str) -> None:
        self.name = name

    def __eq__(self, other):
        if isinstance(other, Symbol):
            return self.is_term == other.is_term and self.name == other.name
        # Let Python try the reflected comparison for foreign types.
        return NotImplemented

    def __ne__(self, other):
        return not self == other

    def __hash__(self):
        return hash(self.name)

    def __repr__(self):
        return f'{type(self).__name__}({self.name!r})'

    # For the base class the full representation is simply the repr.
    fullrepr = property(__repr__)

    def renamed(self, f):
        """Return a new symbol of the same type whose name is ``f(self.name)``."""
        new_name = f(self.name)
        return type(self)(new_name)
|
||||
|
||||
|
||||
class Terminal(Symbol):
    """A symbol with ``is_term`` True that also carries a ``filter_out`` flag."""
    __serialize_fields__ = 'name', 'filter_out'

    is_term: ClassVar[bool] = True

    def __init__(self, name: str, filter_out: bool = False) -> None:
        self.name, self.filter_out = name, filter_out

    @property
    def fullrepr(self):
        """Representation that, unlike ``repr()``, also shows ``filter_out``."""
        return f'{type(self).__name__}({self.name!r}, {self.filter_out!r})'

    def renamed(self, f):
        """Return a new terminal named ``f(self.name)`` with the same ``filter_out``."""
        new_name = f(self.name)
        return type(self)(new_name, self.filter_out)
|
||||
|
||||
|
||||
class NonTerminal(Symbol):
    """A symbol with ``is_term`` False."""
    __serialize_fields__ = 'name',

    is_term: ClassVar[bool] = False

    def serialize(self, memo=None) -> Dict[str, Any]:
        """Return the serialized form, with the name coerced to a plain ``str``."""
        # TODO this is here because self.name can be a Token instance.
        #      remove this function when the issue is fixed. (backwards-incompatible)
        plain_name = str(self.name)
        return {'name': plain_name, '__type__': 'NonTerminal'}
|
||||
|
||||
|
||||
class RuleOptions(Serialize):
    """Plain container for the per-rule option values; all five fields are serialized."""
    __serialize_fields__ = 'keep_all_tokens', 'expand1', 'priority', 'template_source', 'empty_indices'

    keep_all_tokens: bool
    expand1: bool
    priority: Optional[int]
    template_source: Optional[str]
    empty_indices: Tuple[bool, ...]

    def __init__(self, keep_all_tokens: bool=False, expand1: bool=False, priority: Optional[int]=None, template_source: Optional[str]=None, empty_indices: Tuple[bool, ...]=()) -> None:
        self.keep_all_tokens, self.expand1 = keep_all_tokens, expand1
        self.priority, self.template_source = priority, template_source
        self.empty_indices = empty_indices

    def __repr__(self):
        # ``empty_indices`` is not part of the representation.
        shown = (self.keep_all_tokens, self.expand1, self.priority, self.template_source)
        return 'RuleOptions(%r, %r, %r, %r)' % shown
|
||||
|
||||
|
||||
class Rule(Serialize):
    """
        origin : a symbol
        expansion : a list of symbols
        order : index of this expansion amongst all rules of the same name

    Equality and hashing consider ``origin`` and ``expansion`` only; ``alias``,
    ``order`` and ``options`` do not take part. The hash is computed once and
    cached in ``_hash``.
    """
    __slots__ = ('origin', 'expansion', 'alias', 'options', 'order', '_hash')

    __serialize_fields__ = 'origin', 'expansion', 'order', 'alias', 'options'
    __serialize_namespace__ = Terminal, NonTerminal, RuleOptions

    origin: NonTerminal
    expansion: Sequence[Symbol]
    order: int
    alias: Optional[str]
    options: RuleOptions
    _hash: int

    def __init__(self, origin: NonTerminal, expansion: Sequence[Symbol],
                 order: int=0, alias: Optional[str]=None, options: Optional[RuleOptions]=None):
        self.origin = origin
        self.expansion = expansion
        self.alias = alias
        self.order = order
        # A falsy ``options`` is replaced by a default RuleOptions instance.
        self.options = options or RuleOptions()
        self._hash = hash((self.origin, tuple(self.expansion)))

    def _deserialize(self):
        """Recompute the cached hash, which is not one of the serialized fields."""
        self._hash = hash((self.origin, tuple(self.expansion)))

    def __str__(self):
        return '<%s : %s>' % (self.origin.name, ' '.join(x.name for x in self.expansion))

    def __repr__(self):
        return 'Rule(%r, %r, %r, %r)' % (self.origin, self.expansion, self.alias, self.options)

    def __hash__(self):
        return self._hash

    def __eq__(self, other):
        if not isinstance(other, Rule):
            # Same convention as Symbol.__eq__: defer to the other operand.
            # ``rule == x`` still evaluates to False for unrelated types.
            return NotImplemented
        return self.origin == other.origin and self.expansion == other.expansion
|
||||
|
||||
|
||||
###}
|
||||
Binary file not shown.
@@ -0,0 +1,59 @@
|
||||
// Basic terminals for common use


//
// Numbers
//

// A single decimal digit / a single hexadecimal digit (either case).
DIGIT: "0".."9"
HEXDIGIT: "a".."f"|"A".."F"|DIGIT

// Unsigned integer, integer with an optional sign, and a number containing
// a decimal point (digits may be missing on one side of the point).
INT: DIGIT+
SIGNED_INT: ["+"|"-"] INT
DECIMAL: INT "." INT? | "." INT

// float = /-?\d+(\.\d+)?([eE][+-]?\d+)?/
// Exponent part: "e" or "E" followed by a signed integer.
_EXP: ("e"|"E") SIGNED_INT
// An INT requires an exponent to count as FLOAT; a DECIMAL may omit it.
FLOAT: INT _EXP | DECIMAL _EXP?
SIGNED_FLOAT: ["+"|"-"] FLOAT

NUMBER: FLOAT | INT
SIGNED_NUMBER: ["+"|"-"] NUMBER

//
// Strings
//
// Non-greedy body of a string.
_STRING_INNER: /.*?/
// The body followed by a run of backslash pairs that is not preceded by
// another backslash.
_STRING_ESC_INNER: _STRING_INNER /(?<!\\)(\\\\)*?/

// Double-quoted string.
ESCAPED_STRING : "\"" _STRING_ESC_INNER "\""


//
// Names (Variables)
//
LCASE_LETTER: "a".."z"
UCASE_LETTER: "A".."Z"

LETTER: UCASE_LETTER | LCASE_LETTER
WORD: LETTER+

// Starts with "_" or a letter, continues with "_", letters or digits.
CNAME: ("_"|LETTER) ("_"|LETTER|DIGIT)*


//
// Whitespace
//
// Spaces and tabs only.
WS_INLINE: (" "|/\t/)+
// Spaces, tabs, form feeds, carriage returns and line feeds.
WS: /[ \t\f\r\n]/+

CR : /\r/
LF : /\n/
// One or more line endings, each an optional CR followed by LF.
NEWLINE: (CR? LF)+


// Comments
// Each single-line form runs from its marker ("#", "//" or "--") to the end
// of the line; C_COMMENT is a non-greedy "/*" ... "*/" block spanning lines.
SH_COMMENT: /#[^\n]*/
CPP_COMMENT: /\/\/[^\n]*/
C_COMMENT: "/*" /(.|\n)*?/ "*/"
SQL_COMMENT: /--[^\n]*/
|
||||
@@ -0,0 +1,62 @@
|
||||
# Lark grammar of Lark's syntax
# Note: Lark is not bootstrapped, its parser is implemented in load_grammar.py

# A grammar is a sequence of newline-separated items; empty lines are allowed.
start: (_item? _NL)* _item?

_item: rule
     | token
     | statement

# Definitions: a name, optional template parameters, an optional priority,
# then ":" and the expansions.
rule: RULE rule_params priority? ":" expansions
token: TOKEN token_params priority? ":" expansions

# Optional template parameter lists, written in braces.
rule_params: ["{" RULE ("," RULE)* "}"]
token_params: ["{" TOKEN ("," TOKEN)* "}"]

# Priority is written as "." followed by a number.
priority: "." NUMBER

# Directives, each starting with "%".
statement: "%ignore" expansions -> ignore
         | "%import" import_path ["->" name] -> import
         | "%import" import_path name_list -> multi_import
         | "%override" rule -> override_rule
         | "%declare" name+ -> declare

# Dotted path with an optional leading "."; the "!" prefix is on the rule name.
!import_path: "."? name ("." name)*
name_list: "(" name ("," name)* ")"

# Alternatives separated by "|".
?expansions: alias (_VBAR alias)*

# An expansion with an optional "-> name" alias.
?alias: expansion ["->" RULE]

?expansion: expr*

# An atom followed by an optional operator, or by "~ n" / "~ n..m".
?expr: atom [OP | "~" NUMBER [".." NUMBER]]

?atom: "(" expansions ")"
     | "[" expansions "]" -> maybe
     | value

?value: STRING ".." STRING -> literal_range
      | name
      | (REGEXP | STRING) -> literal
      | name "{" value ("," value)* "}" -> template_usage

name: RULE
    | TOKEN

# "|" may be preceded by a line break, so alternatives can continue on the next line.
_VBAR: _NL? "|"
OP: /[+*]|[?](?![a-z])/
# Rule names are lower-case, terminal names are upper-case.
RULE: /!?[_?]?[a-z][_a-z0-9]*/
TOKEN: /_?[A-Z][_A-Z0-9]*/
# A string literal with an optional "i" suffix.
STRING: _STRING "i"?
REGEXP: /\/(?!\/)(\\\/|\\\\|[^\/])*?\/[imslux]*/
_NL: /(\r?\n)+\s*/

%import common.ESCAPED_STRING -> _STRING
%import common.SIGNED_INT -> NUMBER
%import common.WS_INLINE

# Comments start with "//" or "#" and run to the end of the line.
COMMENT: /\s*/ "//" /[^\n]/* | /\s*/ "#" /[^\n]/*

%ignore WS_INLINE
%ignore COMMENT
|
||||
@@ -0,0 +1,302 @@
|
||||
// Python 3 grammar for Lark

// This grammar should parse all python 3.x code successfully.

// Adapted from: https://docs.python.org/3/reference/grammar.html

// Start symbols for the grammar:
//       single_input is a single interactive statement;
//       file_input is a module or sequence of commands read from an input file;
//       eval_input is the input for the eval() functions.
// NB: compound_stmt in single_input is followed by extra NEWLINE!
//

single_input: _NEWLINE | simple_stmt | compound_stmt _NEWLINE
file_input: (_NEWLINE | stmt)*
eval_input: testlist _NEWLINE*

// Decorators: one or more "@dotted_name" lines, each with an optional
// argument list, followed by the class or function being decorated.
decorator: "@" dotted_name [ "(" [arguments] ")" ] _NEWLINE
decorators: decorator+
decorated: decorators (classdef | funcdef | async_funcdef)

// Function definitions; "-> test" is the optional return annotation.
async_funcdef: "async" funcdef
funcdef: "def" name "(" [parameters] ")" ["->" test] ":" suite
|
||||
|
||||
// Parameter list of a "def": ordinary parameters, an optional "/" marker with
// further parameters after it, then optional star / double-star parameters.
parameters: paramvalue ("," paramvalue)* ["," SLASH ("," paramvalue)*] ["," [starparams | kwparams]]
          | starparams
          | kwparams

SLASH: "/" // Otherwise it will completely disappear and be indistinguishable in the result
// "*name" (starparam) or a bare "*" (starguard), followed by further parameters.
starparams: (starparam | starguard) poststarparams
starparam: "*" typedparam
starguard: "*"
poststarparams: ("," paramvalue)* ["," kwparams]
kwparams: "**" typedparam ","?

// A parameter with an optional "= default" / an optional ": annotation".
?paramvalue: typedparam ("=" test)?
?typedparam: name (":" test)?


// Lambda expressions; their parameters take no annotations.
lambdef: "lambda" [lambda_params] ":" test
lambdef_nocond: "lambda" [lambda_params] ":" test_nocond
lambda_params: lambda_paramvalue ("," lambda_paramvalue)* ["," [lambda_starparams | lambda_kwparams]]
          | lambda_starparams
          | lambda_kwparams
?lambda_paramvalue: name ("=" test)?
lambda_starparams: "*" [name] ("," lambda_paramvalue)* ["," [lambda_kwparams]]
lambda_kwparams: "**" name ","?
|
||||
|
||||
|
||||
// A simple statement is one or more ";"-separated small statements on one line.
?stmt: simple_stmt | compound_stmt
?simple_stmt: small_stmt (";" small_stmt)* [";"] _NEWLINE
?small_stmt: (expr_stmt | assign_stmt | del_stmt | pass_stmt | flow_stmt | import_stmt | global_stmt | nonlocal_stmt | assert_stmt)
expr_stmt: testlist_star_expr
assign_stmt: annassign | augassign | assign

// Annotated, plain (possibly chained) and augmented assignment.
annassign: testlist_star_expr ":" test ["=" test]
assign: testlist_star_expr ("=" (yield_expr|testlist_star_expr))+
augassign: testlist_star_expr augassign_op (yield_expr|testlist)
!augassign_op: "+=" | "-=" | "*=" | "@=" | "/=" | "%=" | "&=" | "|=" | "^=" | "<<=" | ">>=" | "**=" | "//="
// A single expression stays as it is; any comma makes the result a tuple.
?testlist_star_expr: test_or_star_expr
                   | test_or_star_expr ("," test_or_star_expr)+ ","?  -> tuple
                   | test_or_star_expr ","  -> tuple

// For normal and annotated assignments, additional restrictions enforced by the interpreter
del_stmt: "del" exprlist
pass_stmt: "pass"
?flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt
break_stmt: "break"
continue_stmt: "continue"
return_stmt: "return" [testlist]
yield_stmt: yield_expr
raise_stmt: "raise" [test ["from" test]]
import_stmt: import_name | import_from
import_name: "import" dotted_as_names
// note below: the ("." | "...") is necessary because "..." is tokenized as ELLIPSIS
// NOTE(review): the rule below spells the leading dots as `dots` ("."+), not
// as ("." | "...") - the note above may be out of date; confirm.
import_from: "from" (dots? dotted_name | dots) "import" ("*" | "(" import_as_names ")" | import_as_names)
!dots: "."+
import_as_name: name ["as" name]
dotted_as_name: dotted_name ["as" name]
import_as_names: import_as_name ("," import_as_name)* [","]
dotted_as_names: dotted_as_name ("," dotted_as_name)*
dotted_name: name ("." name)*
global_stmt: "global" name ("," name)*
nonlocal_stmt: "nonlocal" name ("," name)*
assert_stmt: "assert" test ["," test]
|
||||
|
||||
// Compound statements: statements that contain a nested suite.
?compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | match_stmt
              | with_stmt | funcdef | classdef | decorated | async_stmt
async_stmt: "async" (funcdef | with_stmt | for_stmt)
// "elifs" is always present in the tree, possibly with no children.
if_stmt: "if" test ":" suite elifs ["else" ":" suite]
elifs: elif_*
elif_: "elif" test ":" suite
while_stmt: "while" test ":" suite ["else" ":" suite]
for_stmt: "for" exprlist "in" testlist ":" suite ["else" ":" suite]
// A try needs either at least one except clause or a finally clause.
try_stmt: "try" ":" suite except_clauses ["else" ":" suite] [finally]
        | "try" ":" suite finally   -> try_finally
finally: "finally" ":" suite
except_clauses: except_clause+
except_clause: "except" [test ["as" name]] ":" suite
// NB compile.c makes sure that the default except clause is last


with_stmt: "with" with_items ":" suite
with_items: with_item ("," with_item)*
with_item: test ["as" name]
|
||||
|
||||
match_stmt: "match" test ":" _NEWLINE _INDENT case+ _DEDENT
|
||||
|
||||
case: "case" pattern ["if" test] ":" suite
|
||||
|
||||
?pattern: sequence_item_pattern "," _sequence_pattern -> sequence_pattern
|
||||
| as_pattern
|
||||
?as_pattern: or_pattern ("as" NAME)?
|
||||
?or_pattern: closed_pattern ("|" closed_pattern)*
|
||||
?closed_pattern: literal_pattern
|
||||
| NAME -> capture_pattern
|
||||
| "_" -> any_pattern
|
||||
| attr_pattern
|
||||
| "(" as_pattern ")"
|
||||
| "[" _sequence_pattern "]" -> sequence_pattern
|
||||
| "(" (sequence_item_pattern "," _sequence_pattern)? ")" -> sequence_pattern
|
||||
| "{" (mapping_item_pattern ("," mapping_item_pattern)* ","?)?"}" -> mapping_pattern
|
||||
| "{" (mapping_item_pattern ("," mapping_item_pattern)* ",")? "**" NAME ","? "}" -> mapping_star_pattern
|
||||
| class_pattern
|
||||
|
||||
literal_pattern: inner_literal_pattern
|
||||
|
||||
?inner_literal_pattern: "None" -> const_none
|
||||
| "True" -> const_true
|
||||
| "False" -> const_false
|
||||
| STRING -> string
|
||||
| number
|
||||
|
||||
attr_pattern: NAME ("." NAME)+ -> value
|
||||
|
||||
name_or_attr_pattern: NAME ("." NAME)* -> value
|
||||
|
||||
mapping_item_pattern: (literal_pattern|attr_pattern) ":" as_pattern
|
||||
|
||||
_sequence_pattern: (sequence_item_pattern ("," sequence_item_pattern)* ","?)?
|
||||
?sequence_item_pattern: as_pattern
|
||||
| "*" NAME -> star_pattern
|
||||
|
||||
class_pattern: name_or_attr_pattern "(" [arguments_pattern ","?] ")"
|
||||
arguments_pattern: pos_arg_pattern ["," keyws_arg_pattern]
|
||||
| keyws_arg_pattern -> no_pos_arguments
|
||||
|
||||
pos_arg_pattern: as_pattern ("," as_pattern)*
|
||||
keyws_arg_pattern: keyw_arg_pattern ("," keyw_arg_pattern)*
|
||||
keyw_arg_pattern: NAME "=" as_pattern
|
||||
|
||||
|
||||
|
||||
suite: simple_stmt | _NEWLINE _INDENT stmt+ _DEDENT
|
||||
|
||||
?test: or_test ("if" or_test "else" test)?
|
||||
| lambdef
|
||||
| assign_expr
|
||||
|
||||
assign_expr: name ":=" test
|
||||
|
||||
?test_nocond: or_test | lambdef_nocond
|
||||
|
||||
?or_test: and_test ("or" and_test)*
|
||||
?and_test: not_test_ ("and" not_test_)*
|
||||
?not_test_: "not" not_test_ -> not_test
|
||||
| comparison
|
||||
?comparison: expr (comp_op expr)*
|
||||
star_expr: "*" expr
|
||||
|
||||
?expr: or_expr
|
||||
?or_expr: xor_expr ("|" xor_expr)*
|
||||
?xor_expr: and_expr ("^" and_expr)*
|
||||
?and_expr: shift_expr ("&" shift_expr)*
|
||||
?shift_expr: arith_expr (_shift_op arith_expr)*
|
||||
?arith_expr: term (_add_op term)*
|
||||
?term: factor (_mul_op factor)*
|
||||
?factor: _unary_op factor | power
|
||||
|
||||
!_unary_op: "+"|"-"|"~"
|
||||
!_add_op: "+"|"-"
|
||||
!_shift_op: "<<"|">>"
|
||||
!_mul_op: "*"|"@"|"/"|"%"|"//"
|
||||
// <> isn't actually a valid comparison operator in Python. It's here for the
|
||||
// sake of a __future__ import described in PEP 401 (which really works :-)
|
||||
!comp_op: "<"|">"|"=="|">="|"<="|"<>"|"!="|"in"|"not" "in"|"is"|"is" "not"
|
||||
|
||||
?power: await_expr ("**" factor)?
|
||||
?await_expr: AWAIT? atom_expr
|
||||
AWAIT: "await"
|
||||
|
||||
?atom_expr: atom_expr "(" [arguments] ")" -> funccall
|
||||
| atom_expr "[" subscriptlist "]" -> getitem
|
||||
| atom_expr "." name -> getattr
|
||||
| atom
|
||||
|
||||
?atom: "(" yield_expr ")"
|
||||
| "(" _tuple_inner? ")" -> tuple
|
||||
| "(" comprehension{test_or_star_expr} ")" -> tuple_comprehension
|
||||
| "[" _exprlist? "]" -> list
|
||||
| "[" comprehension{test_or_star_expr} "]" -> list_comprehension
|
||||
| "{" _dict_exprlist? "}" -> dict
|
||||
| "{" comprehension{key_value} "}" -> dict_comprehension
|
||||
| "{" _exprlist "}" -> set
|
||||
| "{" comprehension{test} "}" -> set_comprehension
|
||||
| name -> var
|
||||
| number
|
||||
| string_concat
|
||||
| "(" test ")"
|
||||
| "..." -> ellipsis
|
||||
| "None" -> const_none
|
||||
| "True" -> const_true
|
||||
| "False" -> const_false
|
||||
|
||||
|
||||
?string_concat: string+
|
||||
|
||||
_tuple_inner: test_or_star_expr (("," test_or_star_expr)+ [","] | ",")
|
||||
|
||||
?test_or_star_expr: test
|
||||
| star_expr
|
||||
|
||||
?subscriptlist: subscript
|
||||
| subscript (("," subscript)+ [","] | ",") -> subscript_tuple
|
||||
?subscript: test | ([test] ":" [test] [sliceop]) -> slice
|
||||
sliceop: ":" [test]
|
||||
?exprlist: (expr|star_expr)
|
||||
| (expr|star_expr) (("," (expr|star_expr))+ [","]|",")
|
||||
?testlist: test | testlist_tuple
|
||||
testlist_tuple: test (("," test)+ [","] | ",")
|
||||
_dict_exprlist: (key_value | "**" expr) ("," (key_value | "**" expr))* [","]
|
||||
|
||||
key_value: test ":" test
|
||||
|
||||
_exprlist: test_or_star_expr ("," test_or_star_expr)* [","]
|
||||
|
||||
classdef: "class" name ["(" [arguments] ")"] ":" suite
|
||||
|
||||
|
||||
|
||||
arguments: argvalue ("," argvalue)* ("," [ starargs | kwargs])?
|
||||
| starargs
|
||||
| kwargs
|
||||
| comprehension{test}
|
||||
|
||||
starargs: stararg ("," stararg)* ("," argvalue)* ["," kwargs]
|
||||
stararg: "*" test
|
||||
kwargs: "**" test ("," argvalue)*
|
||||
|
||||
?argvalue: test ("=" test)?
|
||||
|
||||
|
||||
comprehension{comp_result}: comp_result comp_fors [comp_if]
|
||||
comp_fors: comp_for+
|
||||
comp_for: [ASYNC] "for" exprlist "in" or_test
|
||||
ASYNC: "async"
|
||||
?comp_if: "if" test_nocond
|
||||
|
||||
// not used in grammar, but may appear in "node" passed from Parser to Compiler
|
||||
encoding_decl: name
|
||||
|
||||
yield_expr: "yield" [testlist]
|
||||
| "yield" "from" test -> yield_from
|
||||
|
||||
number: DEC_NUMBER | HEX_NUMBER | BIN_NUMBER | OCT_NUMBER | FLOAT_NUMBER | IMAG_NUMBER
|
||||
string: STRING | LONG_STRING
|
||||
|
||||
// Other terminals
|
||||
|
||||
_NEWLINE: ( /\r?\n[\t ]*/ | COMMENT )+
|
||||
|
||||
%ignore /[\t \f]+/ // WS
|
||||
%ignore /\\[\t \f]*\r?\n/ // LINE_CONT
|
||||
%ignore COMMENT
|
||||
%declare _INDENT _DEDENT
|
||||
|
||||
|
||||
// Python terminals
|
||||
|
||||
!name: NAME | "match" | "case"
|
||||
NAME: /[^\W\d]\w*/
|
||||
COMMENT: /#[^\n]*/
|
||||
|
||||
STRING: /([ubf]?r?|r[ubf])("(?!"").*?(?<!\\)(\\\\)*?"|'(?!'').*?(?<!\\)(\\\\)*?')/i
|
||||
LONG_STRING: /([ubf]?r?|r[ubf])(""".*?(?<!\\)(\\\\)*?"""|'''.*?(?<!\\)(\\\\)*?''')/is
|
||||
|
||||
_SPECIAL_DEC: "0".."9" ("_"? "0".."9" )*
|
||||
DEC_NUMBER: "1".."9" ("_"? "0".."9" )*
|
||||
| "0" ("_"? "0" )* /(?![1-9])/
|
||||
HEX_NUMBER.2: "0" ("x" | "X") ("_"? ("0".."9" | "a".."f" | "A".."F"))+
|
||||
OCT_NUMBER.2: "0" ("o" | "O") ("_"? "0".."7" )+
|
||||
BIN_NUMBER.2: "0" ("b" | "B") ("_"? "0".."1" )+
|
||||
|
||||
_EXP: ("e"|"E") ["+" | "-"] _SPECIAL_DEC
|
||||
DECIMAL: "." _SPECIAL_DEC | _SPECIAL_DEC "." _SPECIAL_DEC?
|
||||
FLOAT_NUMBER.2: _SPECIAL_DEC _EXP | DECIMAL _EXP?
|
||||
IMAG_NUMBER.2: (_SPECIAL_DEC | FLOAT_NUMBER) ("J" | "j")
|
||||
|
||||
|
||||
// Comma-separated list (with an optional trailing comma)
|
||||
cs_list{item}: item ("," item)* ","?
|
||||
_cs_list{item}: item ("," item)* ","?
|
||||
@@ -0,0 +1,7 @@
|
||||
// TODO: LETTER, WORD, etc.
|
||||
|
||||
//
|
||||
// Whitespace
|
||||
//
|
||||
WS_INLINE: /[ \t\xa0]/+
|
||||
WS: /[ \t\xa0\f\r\n]/+
|
||||
@@ -0,0 +1,144 @@
|
||||
"Provides a post-lexer for implementing Python-style indentation."
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List, Iterator
|
||||
|
||||
from .exceptions import LarkError
|
||||
from .lark import PostLex
|
||||
from .lexer import Token
|
||||
|
||||
###{standalone
|
||||
|
||||
class DedentError(LarkError):
    """Raised by the indenter when a line dedents to a column that matches no open indentation level."""
|
||||
|
||||
class Indenter(PostLex, ABC):
    """This is a postlexer that "injects" indent/dedent tokens based on indentation.

    It keeps track of the current indentation, as well as the current level of parentheses.
    Inside parentheses, the indentation is ignored, and no indent/dedent tokens get generated.

    Note: This is an abstract class. To use it, inherit and implement all its abstract methods:
        - tab_len
        - NL_type
        - OPEN_PAREN_types, CLOSE_PAREN_types
        - INDENT_type, DEDENT_type

    See also: the ``postlex`` option in `Lark`.
    """
    # Number of currently open brackets; newline tokens are dropped while it is > 0.
    paren_level: int
    # Stack of active indentation widths; it starts as [0] and must end as [0].
    indent_level: List[int]

    def __init__(self) -> None:
        self.paren_level = 0
        self.indent_level = [0]
        assert self.tab_len > 0

    def handle_NL(self, token: Token) -> Iterator[Token]:
        """Yield the newline token followed by any INDENT/DEDENT tokens it implies.

        Nothing is yielded while inside parentheses. The indentation width is
        computed from the text after the last line break in ``token``: each
        space counts as 1 and each tab as ``tab_len``.

        Raises:
            DedentError: if the line dedents to a width that is not on the
                indentation stack.
        """
        if self.paren_level > 0:
            return

        yield token

        # A newline token may contain no line break at all (for instance when the
        # NL terminal also matches comments and the input ends with a comment).
        # There is then no following line whose indentation could be measured, so
        # leave the stack alone; `_process` emits the closing DEDENTs at the end.
        _, line_break, indent_str = token.rpartition('\n')  # Tabs and spaces
        if not line_break:
            return

        indent = indent_str.count(' ') + indent_str.count('\t') * self.tab_len

        if indent > self.indent_level[-1]:
            self.indent_level.append(indent)
            yield Token.new_borrow_pos(self.INDENT_type, indent_str, token)
        else:
            # Pop one level (and emit one DEDENT) for every level that was closed.
            while indent < self.indent_level[-1]:
                self.indent_level.pop()
                yield Token.new_borrow_pos(self.DEDENT_type, indent_str, token)

            if indent != self.indent_level[-1]:
                raise DedentError('Unexpected dedent to column %s. Expected dedent to %s' % (indent, self.indent_level[-1]))

    def _process(self, stream):
        """Generator behind `process`: pass tokens through, expanding newline tokens.

        Tracks the parenthesis depth from the open/close token types, and after
        the stream is exhausted emits one DEDENT per indentation level still open.
        """
        token = None
        for token in stream:
            if token.type == self.NL_type:
                yield from self.handle_NL(token)
            else:
                yield token

            if token.type in self.OPEN_PAREN_types:
                self.paren_level += 1
            elif token.type in self.CLOSE_PAREN_types:
                self.paren_level -= 1
                assert self.paren_level >= 0

        while len(self.indent_level) > 1:
            self.indent_level.pop()
            # Borrow the position of the last token seen; fall back to a zeroed
            # position when the stream was empty.
            yield Token.new_borrow_pos(self.DEDENT_type, '', token) if token else Token(self.DEDENT_type, '', 0, 0, 0, 0, 0, 0)

        assert self.indent_level == [0], self.indent_level

    def process(self, stream):
        """Reset the indentation/parenthesis state and return the processed token stream."""
        self.paren_level = 0
        self.indent_level = [0]
        return self._process(stream)

    # XXX Hack for ContextualLexer. Maybe there's a more elegant solution?
    @property
    def always_accept(self):
        return (self.NL_type,)

    @property
    @abstractmethod
    def NL_type(self) -> str:
        """The name of the newline token"""
        raise NotImplementedError()

    @property
    @abstractmethod
    def OPEN_PAREN_types(self) -> List[str]:
        """The names of the tokens that open a parenthesis"""
        raise NotImplementedError()

    @property
    @abstractmethod
    def CLOSE_PAREN_types(self) -> List[str]:
        """The names of the tokens that close a parenthesis
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def INDENT_type(self) -> str:
        """The name of the token that starts an indentation in the grammar.

        See also: %declare
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def DEDENT_type(self) -> str:
        """The name of the token that ends an indentation in the grammar.

        See also: %declare
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def tab_len(self) -> int:
        """How many spaces does a tab equal"""
        raise NotImplementedError()
|
||||
|
||||
|
||||
class PythonIndenter(Indenter):
    """Indenter configured for Python syntax: emits _INDENT/_DEDENT tokens around indented blocks.

    See also: the ``postlex`` option in `Lark`.
    """

    # A tab counts as this many columns.
    tab_len = 8

    # Terminal names the indenter reacts to / emits.
    NL_type = '_NEWLINE'
    INDENT_type = '_INDENT'
    DEDENT_type = '_DEDENT'

    # Bracket terminals; indentation is ignored while any of them is open.
    OPEN_PAREN_types = ['LPAR', 'LSQB', 'LBRACE']
    CLOSE_PAREN_types = ['RPAR', 'RSQB', 'RBRACE']
|
||||
|
||||
###}
|
||||
@@ -0,0 +1,680 @@
|
||||
from abc import ABC, abstractmethod
|
||||
import getpass
|
||||
import sys, os, pickle
|
||||
import tempfile
|
||||
import types
|
||||
import re
|
||||
from typing import (
|
||||
TypeVar, Type, List, Dict, Iterator, Callable, Union, Optional, Sequence,
|
||||
Tuple, Iterable, IO, Any, TYPE_CHECKING, Collection
|
||||
)
|
||||
if TYPE_CHECKING:
|
||||
from .parsers.lalr_interactive_parser import InteractiveParser
|
||||
from .tree import ParseTree
|
||||
from .visitors import Transformer
|
||||
from typing import Literal
|
||||
from .parser_frontends import ParsingFrontend
|
||||
|
||||
from .exceptions import ConfigurationError, assert_config, UnexpectedInput
|
||||
from .utils import Serialize, SerializeMemoizer, FS, logger, TextOrSlice, LarkInput
|
||||
from .load_grammar import load_grammar, FromPackageLoader, Grammar, verify_used_files, PackageResource, sha256_digest
|
||||
from .tree import Tree
|
||||
from .common import LexerConf, ParserConf, _ParserArgType, _LexerArgType
|
||||
|
||||
from .lexer import Lexer, BasicLexer, TerminalDef, LexerThread, Token
|
||||
from .parse_tree_builder import ParseTreeBuilder
|
||||
from .parser_frontends import _validate_frontend_args, _get_lexer_callbacks, _deserialize_parsing_frontend, _construct_parsing_frontend
|
||||
from .grammar import Rule
|
||||
|
||||
|
||||
try:
|
||||
import regex
|
||||
_has_regex = True
|
||||
except ImportError:
|
||||
_has_regex = False
|
||||
|
||||
|
||||
###{standalone
|
||||
|
||||
|
||||
class PostLex(ABC):
    """Abstract interface for a post-lexer: a filter applied to the token stream."""

    # Terminal names collected via `set(postlex.always_accept)` when Lark
    # decides which terminals to keep; empty by default.
    always_accept: Iterable[str] = ()

    @abstractmethod
    def process(self, stream: Iterator[Token]) -> Iterator[Token]:
        """Return the (possibly transformed) token stream; the base version returns it unchanged."""
        return stream
|
||||
|
||||
class LarkOptions(Serialize):
    """Specifies the options for Lark

    """

    start: List[str]
    debug: bool
    strict: bool
    transformer: 'Optional[Transformer]'
    propagate_positions: Union[bool, str]
    maybe_placeholders: bool
    cache: Union[bool, str]
    cache_grammar: bool
    regex: bool
    g_regex_flags: int
    keep_all_tokens: bool
    tree_class: Optional[Callable[[str, List], Any]]
    parser: _ParserArgType
    lexer: _LexerArgType
    ambiguity: 'Literal["auto", "resolve", "explicit", "forest"]'
    postlex: Optional[PostLex]
    priority: 'Optional[Literal["auto", "normal", "invert"]]'
    lexer_callbacks: Dict[str, Callable[[Token], Token]]
    use_bytes: bool
    ordered_sets: bool
    edit_terminals: Optional[Callable[[TerminalDef], TerminalDef]]
    import_paths: 'List[Union[str, Callable[[Union[None, str, PackageResource], str], Tuple[str, str]]]]'
    source_path: Optional[str]

    OPTIONS_DOC = r"""
    **=== General Options ===**

    start
            The start symbol. Either a string, or a list of strings for multiple possible starts (Default: "start")
    debug
            Display debug information and extra warnings. Use only when debugging (Default: ``False``)
            When used with Earley, it generates a forest graph as "sppf.png", if 'dot' is installed.
    strict
            Throw an exception on any potential ambiguity, including shift/reduce conflicts, and regex collisions.
    transformer
            Applies the transformer to every parse tree (equivalent to applying it after the parse, but faster)
    propagate_positions
            Propagates positional attributes into the 'meta' attribute of all tree branches.
            Sets attributes: (line, column, end_line, end_column, start_pos, end_pos,
                              container_line, container_column, container_end_line, container_end_column)
            Accepts ``False``, ``True``, or a callable, which will filter which nodes to ignore when propagating.
    maybe_placeholders
            When ``True``, the ``[]`` operator returns ``None`` when not matched.
            When ``False``, ``[]`` behaves like the ``?`` operator, and returns no value at all.
            (default= ``True``)
    cache
            Cache the results of the Lark grammar analysis, for x2 to x3 faster loading. LALR only for now.

            - When ``False``, does nothing (default)
            - When ``True``, caches to a temporary file in the local directory
            - When given a string, caches to the path pointed by the string
    cache_grammar
            For use with ``cache`` option. When ``True``, the unanalyzed grammar is also included in the cache.
            Useful for classes that require the ``Lark.grammar`` to be present (e.g. Reconstructor).
            (default= ``False``)
    regex
            When True, uses the ``regex`` module instead of the stdlib ``re``.
    g_regex_flags
            Flags that are applied to all terminals (both regex and strings)
    keep_all_tokens
            Prevent the tree builder from automagically removing "punctuation" tokens (Default: ``False``)
    tree_class
            Lark will produce trees comprised of instances of this class instead of the default ``lark.Tree``.

    **=== Algorithm Options ===**

    parser
            Decides which parser engine to use. Accepts "earley" or "lalr". (Default: "earley").
            (there is also a "cyk" option for legacy)
    lexer
            Decides whether or not to use a lexer stage

            - "auto" (default): Choose for me based on the parser
            - "basic": Use a basic lexer
            - "contextual": Stronger lexer (only works with parser="lalr")
            - "dynamic": Flexible and powerful (only with parser="earley")
            - "dynamic_complete": Same as dynamic, but tries *every* variation of tokenizing possible.
    ambiguity
            Decides how to handle ambiguity in the parse. Only relevant if parser="earley"

            - "resolve": The parser will automatically choose the simplest derivation
              (it chooses consistently: greedy for tokens, non-greedy for rules)
            - "explicit": The parser will return all derivations wrapped in "_ambig" tree nodes (i.e. a forest).
            - "forest": The parser will return the root of the shared packed parse forest.

    **=== Misc. / Domain Specific Options ===**

    postlex
            Lexer post-processing (Default: ``None``) Only works with the basic and contextual lexers.
    priority
            How priorities should be evaluated - "auto", ``None``, "normal", "invert" (Default: "auto")
    lexer_callbacks
            Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution.
    use_bytes
            Accept an input of type ``bytes`` instead of ``str``.
    ordered_sets
            Should Earley use ordered-sets to achieve stable output (~10% slower than regular sets. Default: True)
    edit_terminals
            A callback for editing the terminals before parse.
    import_paths
            A List of either paths or loader functions to specify from where grammars are imported
    source_path
            Override the source of from where the grammar was loaded. Useful for relative imports and unconventional grammar loading
    **=== End of Options ===**
    """
    if __doc__:
        __doc__ += OPTIONS_DOC

    # Adding a new option needs to be done in multiple places:
    # - In the dictionary below. This is the primary truth of which options `Lark.__init__` accepts
    # - In the docstring above. It is used both for the docstring of `LarkOptions` and `Lark`, and in readthedocs
    # - As an attribute of `LarkOptions` above
    # - Potentially in `_LOAD_ALLOWED_OPTIONS` below this class, when the option doesn't change how the grammar is loaded
    # - Potentially in `lark.tools.__init__`, if it makes sense, and it can easily be passed as a cmd argument
    _defaults: Dict[str, Any] = {
        'debug': False,
        'strict': False,
        'keep_all_tokens': False,
        'tree_class': None,
        'cache': False,
        'cache_grammar': False,
        'postlex': None,
        'parser': 'earley',
        'lexer': 'auto',
        'transformer': None,
        'start': 'start',
        'priority': 'auto',
        'ambiguity': 'auto',
        'regex': False,
        'propagate_positions': False,
        'lexer_callbacks': {},
        'maybe_placeholders': True,
        'edit_terminals': None,
        'g_regex_flags': 0,
        'use_bytes': False,
        'ordered_sets': True,
        'import_paths': [],
        'source_path': None,
        '_plugins': {},
    }

    def __init__(self, options_dict: Dict[str, Any]) -> None:
        """Build the option set from ``options_dict``, filling in defaults.

        Every key must be one of ``_defaults``. Values for boolean options
        (other than 'cache', 'use_bytes' and 'propagate_positions') are coerced
        with ``bool()``, and a string ``start`` is wrapped into a one-item list.

        Raises:
            ConfigurationError: on an unknown option, on a transformer combined
                with the Earley parser, or on ``cache_grammar`` without ``cache``.
        """
        from copy import copy

        o = dict(options_dict)

        options = {}
        for name, default in self._defaults.items():
            if name in o:
                value = o.pop(name)
                if isinstance(default, bool) and name not in ('cache', 'use_bytes', 'propagate_positions'):
                    value = bool(value)
            else:
                # Shallow-copy the default so that the mutable ones ({} / [])
                # are not shared between instances and the class-level table.
                value = copy(default)

            options[name] = value

        if isinstance(options['start'], str):
            options['start'] = [options['start']]

        # Written through __dict__ because __setattr__ only accepts option names.
        self.__dict__['options'] = options

        assert_config(self.parser, ('earley', 'lalr', 'cyk', None))

        if self.parser == 'earley' and self.transformer:
            raise ConfigurationError('Cannot specify an embedded transformer when using the Earley algorithm. '
                                     'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. LALR)')

        if self.cache_grammar and not self.cache:
            raise ConfigurationError('cache_grammar cannot be set when cache is disabled')

        # Anything left over was not a recognised option name.
        if o:
            raise ConfigurationError("Unknown options: %s" % o.keys())

    def __getattr__(self, name: str) -> Any:
        """Expose each option as an attribute; unknown names raise AttributeError."""
        try:
            return self.__dict__['options'][name]
        except KeyError as e:
            raise AttributeError(e) from e

    def __setattr__(self, name: str, value: Any) -> None:
        """Set an existing option; names that are not options are rejected by ``assert_config``."""
        assert_config(name, self.options.keys(), "%r isn't a valid option. Expected one of: %s")
        self.options[name] = value

    def serialize(self, memo = None) -> Dict[str, Any]:
        """Return the underlying options dictionary (the same object, not a copy)."""
        return self.options

    @classmethod
    def deserialize(cls, data: Dict[str, Any], memo: Dict[int, Union[TerminalDef, Rule]]) -> "LarkOptions":
        """Rebuild an instance from a serialized options dictionary; ``memo`` is unused."""
        return cls(data)
|
||||
|
||||
|
||||
# Options that may still be supplied when a Lark instance is restored from
# cache/standalone. They are only used outside of `load_grammar`.
_LOAD_ALLOWED_OPTIONS = {
    'postlex',
    'transformer',
    'lexer_callbacks',
    'use_bytes',
    'debug',
    'g_regex_flags',
    'regex',
    'propagate_positions',
    'tree_class',
    '_plugins',
}

# Accepted values for the `priority` and `ambiguity` options.
_VALID_PRIORITY_OPTIONS = ('auto', 'normal', 'invert', None)
_VALID_AMBIGUITY_OPTIONS = ('auto', 'resolve', 'explicit', 'forest')

# Type variable bound to `Lark`, used by the `Lark.load` classmethod.
_T = TypeVar('_T', bound="Lark")
|
||||
|
||||
class Lark(Serialize):
|
||||
"""Main interface for the library.
|
||||
|
||||
It's mostly a thin wrapper for the many different parsers, and for the tree constructor.
|
||||
|
||||
Parameters:
|
||||
grammar: a string or file-object containing the grammar spec (using Lark's ebnf syntax)
|
||||
options: a dictionary controlling various aspects of Lark.
|
||||
|
||||
Example:
|
||||
>>> Lark(r'''start: "foo" ''')
|
||||
Lark(...)
|
||||
"""
|
||||
|
||||
source_path: str
|
||||
source_grammar: str
|
||||
grammar: 'Grammar'
|
||||
options: LarkOptions
|
||||
lexer: Lexer
|
||||
parser: 'ParsingFrontend'
|
||||
terminals: Collection[TerminalDef]
|
||||
|
||||
__serialize_fields__ = ['parser', 'rules', 'options']
|
||||
|
||||
def __init__(self, grammar: 'Union[Grammar, str, IO[str]]', **options) -> None:
    """Load and compile ``grammar`` and build the lexer/parser selected by ``options``.

    Parameters:
        grammar: grammar text, a file-like object (anything with ``read``), or
            an already loaded ``Grammar`` instance.
        options: keyword options, validated by ``LarkOptions``.

    When the ``cache`` option is set, a previously saved instance is loaded
    from the cache file if its digest matches; otherwise the grammar is
    processed normally and the result is written to the cache file.

    Raises:
        ConfigurationError: on invalid or incompatible options.
        ImportError: if ``regex=True`` but the ``regex`` module is missing.
    """
    self.options = LarkOptions(options)
    re_module: types.ModuleType

    # Update which fields are serialized
    if self.options.cache_grammar:
        self.__serialize_fields__ = self.__serialize_fields__ + ['grammar']

    # Set regex or re module
    use_regex = self.options.regex
    if use_regex:
        if _has_regex:
            re_module = regex
        else:
            raise ImportError('`regex` module must be installed if calling `Lark(regex=True)`.')
    else:
        re_module = re

    # Some, but not all file-like objects have a 'name' attribute
    if self.options.source_path is None:
        try:
            self.source_path = grammar.name  # type: ignore[union-attr]
        except AttributeError:
            self.source_path = '<string>'
    else:
        self.source_path = self.options.source_path

    # Drain file-like objects to get their contents
    try:
        read = grammar.read  # type: ignore[union-attr]
    except AttributeError:
        pass
    else:
        grammar = read()

    cache_fn = None
    cache_sha256 = None
    if isinstance(grammar, str):
        self.source_grammar = grammar
        if self.options.use_bytes:
            if not grammar.isascii():
                raise ConfigurationError("Grammar must be ascii only, when use_bytes=True")

        if self.options.cache:
            if self.options.parser != 'lalr':
                raise ConfigurationError("cache only works with parser='lalr' for now")

            # The cache key covers the grammar text, the hashable options, the
            # library version and the Python major/minor version.
            unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals', '_plugins')
            options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable)
            from . import __version__
            s = grammar + options_str + __version__ + str(sys.version_info[:2])
            cache_sha256 = sha256_digest(s)

            if isinstance(self.options.cache, str):
                cache_fn = self.options.cache
            else:
                if self.options.cache is not True:
                    raise ConfigurationError("cache argument must be bool or str")

                try:
                    username = getpass.getuser()
                except Exception:
                    # The exception raised may be ImportError or OSError in
                    # the future. For the cache, we don't care about the
                    # specific reason - we just want a username.
                    username = "unknown"

                cache_fn = tempfile.gettempdir() + "/.lark_%s_%s_%s_%s_%s.tmp" % (
                    "cache_grammar" if self.options.cache_grammar else "cache", username, cache_sha256, *sys.version_info[:2])

            old_options = self.options
            try:
                with FS.open(cache_fn, 'rb') as f:
                    logger.debug('Loading grammar from cache: %s', cache_fn)
                    # Remove options that aren't relevant for loading from cache
                    for name in (set(options) - _LOAD_ALLOWED_OPTIONS):
                        del options[name]
                    file_sha256 = f.readline().rstrip(b'\n')
                    # NOTE(review): pickle.load trusts the cache file's contents; with
                    # cache=True that file lives in the temp directory - confirm that
                    # only trusted users can write there.
                    cached_used_files = pickle.load(f)
                    if file_sha256 == cache_sha256.encode('utf8') and verify_used_files(cached_used_files):
                        cached_parser_data = pickle.load(f)
                        self._load(cached_parser_data, **options)
                        return
            except FileNotFoundError:
                # The cache file doesn't exist; parse and compose the grammar as normal
                pass
            except Exception:  # We should probably narrow down which errors we catch here.
                logger.exception("Failed to load Lark from cache: %r. We will try to carry on.", cache_fn)

                # In theory, the Lark instance might have been messed up by the call to `_load`.
                # In practice the only relevant thing that might have been overwritten should be `options`
                self.options = old_options

        # Parse the grammar file and compose the grammars
        self.grammar, used_files = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens)
    else:
        assert isinstance(grammar, Grammar)
        self.grammar = grammar

    # Resolve lexer='auto' according to the chosen parser
    if self.options.lexer == 'auto':
        if self.options.parser == 'lalr':
            self.options.lexer = 'contextual'
        elif self.options.parser == 'earley':
            if self.options.postlex is not None:
                logger.info("postlex can't be used with the dynamic lexer, so we use 'basic' instead. "
                            "Consider using lalr with contextual instead of earley")
                self.options.lexer = 'basic'
            else:
                self.options.lexer = 'dynamic'
        elif self.options.parser == 'cyk':
            self.options.lexer = 'basic'
        else:
            assert False, self.options.parser
    lexer = self.options.lexer
    if isinstance(lexer, type):
        assert issubclass(lexer, Lexer)     # XXX Is this really important? Maybe just ensure interface compliance
    else:
        assert_config(lexer, ('basic', 'contextual', 'dynamic', 'dynamic_complete'))
        if self.options.postlex is not None and 'dynamic' in lexer:
            raise ConfigurationError("Can't use postlex with a dynamic lexer. Use basic or contextual instead")

    if self.options.ambiguity == 'auto':
        if self.options.parser == 'earley':
            self.options.ambiguity = 'resolve'
    else:
        assert_config(self.options.parser, ('earley', 'cyk'), "%r doesn't support disambiguation. Use one of these parsers instead: %s")

    if self.options.priority == 'auto':
        self.options.priority = 'normal'

    if self.options.priority not in _VALID_PRIORITY_OPTIONS:
        raise ConfigurationError("invalid priority option: %r. Must be one of %r" % (self.options.priority, _VALID_PRIORITY_OPTIONS))
    if self.options.ambiguity not in _VALID_AMBIGUITY_OPTIONS:
        raise ConfigurationError("invalid ambiguity option: %r. Must be one of %r" % (self.options.ambiguity, _VALID_AMBIGUITY_OPTIONS))

    if self.options.parser is None:
        terminals_to_keep = '*'  # For lexer-only mode, keep all terminals
    elif self.options.postlex is not None:
        terminals_to_keep = set(self.options.postlex.always_accept)
    else:
        terminals_to_keep = set()

    # Compile the EBNF grammar into BNF
    self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start, terminals_to_keep)

    if self.options.edit_terminals:
        for t in self.terminals:
            self.options.edit_terminals(t)

    self._terminals_dict = {t.name: t for t in self.terminals}

    # If the user asked to invert the priorities, negate them all here.
    if self.options.priority == 'invert':
        for rule in self.rules:
            if rule.options.priority is not None:
                rule.options.priority = -rule.options.priority
        for term in self.terminals:
            term.priority = -term.priority
    # Else, if the user asked to disable priorities, strip them from the
    # rules and terminals. This allows the Earley parsers to skip an extra forest walk
    # for improved performance, if you don't need them (or didn't specify any).
    elif self.options.priority is None:
        for rule in self.rules:
            if rule.options.priority is not None:
                rule.options.priority = None
        for term in self.terminals:
            term.priority = 0

    # TODO Deprecate lexer_callbacks?
    self.lexer_conf = LexerConf(
        self.terminals, re_module, self.ignore_tokens, self.options.postlex,
        self.options.lexer_callbacks, self.options.g_regex_flags, use_bytes=self.options.use_bytes, strict=self.options.strict
    )

    if self.options.parser:
        self.parser = self._build_parser()
    elif lexer:
        self.lexer = self._build_lexer()

    if cache_fn:
        logger.debug('Saving grammar to cache: %s', cache_fn)
        try:
            with FS.open(cache_fn, 'wb') as f:
                assert cache_sha256 is not None
                f.write(cache_sha256.encode('utf8') + b'\n')
                pickle.dump(used_files, f)
                self.save(f, _LOAD_ALLOWED_OPTIONS)
        except IOError:
            # logger.exception already records the active exception; passing it
            # as an extra argument would not match the single %r placeholder.
            logger.exception("Failed to save Lark to cache: %r.", cache_fn)
|
||||
|
||||
if __doc__:
|
||||
__doc__ += "\n\n" + LarkOptions.OPTIONS_DOC
|
||||
|
||||
def _build_lexer(self, dont_ignore: bool=False) -> BasicLexer:
    """Create a ``BasicLexer`` from ``self.lexer_conf``.

    With ``dont_ignore`` set, a shallow copy of the configuration is used
    whose ``ignore`` attribute is replaced by an empty tuple, leaving
    ``self.lexer_conf`` itself untouched.
    """
    if not dont_ignore:
        return BasicLexer(self.lexer_conf)

    from copy import copy
    conf_without_ignore = copy(self.lexer_conf)
    conf_without_ignore.ignore = ()
    return BasicLexer(conf_without_ignore)
|
||||
|
||||
def _prepare_callbacks(self) -> None:
    """Build ``self._callbacks`` from the tree builder and the transformer.

    Tree-building callbacks are only created when the ambiguity mode is not
    ``'forest'``; the lexer callbacks derived from the transformer are
    always merged in afterwards.
    """
    opts = self.options
    self._callbacks = {}
    # we don't need these callbacks if we aren't building a tree
    if opts.ambiguity != 'forest':
        builder = ParseTreeBuilder(
            self.rules,
            opts.tree_class or Tree,
            opts.propagate_positions,
            opts.parser != 'lalr' and opts.ambiguity == 'explicit',
            opts.maybe_placeholders,
        )
        self._parse_tree_builder = builder
        self._callbacks = builder.create_callback(opts.transformer)

    lexer_callbacks = _get_lexer_callbacks(opts.transformer, self.terminals)
    self._callbacks.update(lexer_callbacks)
|
||||
|
||||
def _build_parser(self) -> "ParsingFrontend":
    """Prepare the callbacks and construct the parsing frontend.

    The parser/lexer combination is validated before the frontend is built
    from the rules, callbacks, start symbol(s) and lexer configuration.
    """
    self._prepare_callbacks()
    opts = self.options
    _validate_frontend_args(opts.parser, opts.lexer)
    frontend_conf = ParserConf(self.rules, self._callbacks, opts.start)
    return _construct_parsing_frontend(
        opts.parser, opts.lexer, self.lexer_conf, frontend_conf, options=opts
    )
|
||||
|
||||
def save(self, f, exclude_options: Collection[str] = ()) -> None:
    """Saves the instance into the given file object

    Useful for caching and multiprocessing.

    Parameters:
        f: Writable binary file object that receives the pickled data.
        exclude_options: Names of options to leave out of the saved data.

    :raises NotImplementedError: If the parser is not ``'lalr'``.
    """
    if self.options.parser != 'lalr':
        raise NotImplementedError("Lark.save() is only implemented for the LALR(1) parser.")

    serialized, memo = self.memo_serialize([TerminalDef, Rule])
    if exclude_options:
        kept_options = {}
        for opt_name, opt_value in serialized["options"].items():
            if opt_name not in exclude_options:
                kept_options[opt_name] = opt_value
        serialized["options"] = kept_options

    payload = {'data': serialized, 'memo': memo}
    pickle.dump(payload, f, protocol=pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
@classmethod
def load(cls: Type[_T], f) -> _T:
    """Loads an instance from the given file object

    Useful for caching and multiprocessing.

    The instance is created without running ``__init__`` and is then filled
    in by ``_load``.
    """
    return cls.__new__(cls)._load(f)
|
||||
|
||||
def _deserialize_lexer_conf(self, data: Dict[str, Any], memo: Dict[int, Union[TerminalDef, Rule]], options: LarkOptions) -> LexerConf:
    """Rebuild the lexer configuration stored under ``data['lexer_conf']``.

    The runtime-only settings (callbacks, regex module, bytes mode, global
    regex flags and post-lexer) are taken from ``options``, and validation
    is switched off on the returned configuration.
    """
    conf = LexerConf.deserialize(data['lexer_conf'], memo)
    runtime_settings = {
        'callbacks': options.lexer_callbacks or {},
        're_module': regex if options.regex else re,
        'use_bytes': options.use_bytes,
        'g_regex_flags': options.g_regex_flags,
        'skip_validation': True,
        'postlex': options.postlex,
    }
    for attr, setting in runtime_settings.items():
        setattr(conf, attr, setting)
    return conf
|
||||
|
||||
def _load(self: _T, f: Any, **kwargs) -> _T:
    """Populate this instance from serialized parser data and return it.

    ``f`` is either a dict with ``'data'`` and ``'memo'`` entries, or a file
    object from which such a dict is unpickled. Keyword arguments override
    the stored options.

    NOTE(review): ``pickle.load`` can execute arbitrary code from the
    stream; only load data that comes from a trusted source.

    :raises ConfigurationError: If a keyword argument names a known option
        that is not listed in ``_LOAD_ALLOWED_OPTIONS``.
    """
    payload = f if isinstance(f, dict) else pickle.load(f)
    memo_json, data = payload['memo'], payload['data']

    assert memo_json
    memo = SerializeMemoizer.deserialize(memo_json, {'Rule': Rule, 'TerminalDef': TerminalDef}, {})
    if 'grammar' in data:
        self.grammar = Grammar.deserialize(data['grammar'], memo)

    merged_options = dict(data['options'])
    rejected = set(kwargs) - _LOAD_ALLOWED_OPTIONS
    if rejected & set(LarkOptions._defaults):
        raise ConfigurationError("Some options are not allowed when loading a Parser: {}"
                                 .format(rejected))
    merged_options.update(kwargs)

    self.options = LarkOptions.deserialize(merged_options, memo)
    self.rules = [Rule.deserialize(rule_data, memo) for rule_data in data['rules']]
    self.source_path = '<deserialized>'
    _validate_frontend_args(self.options.parser, self.options.lexer)

    parser_data = data['parser']
    self.lexer_conf = self._deserialize_lexer_conf(parser_data, memo, self.options)
    self.terminals = self.lexer_conf.terminals
    self._prepare_callbacks()
    self._terminals_dict = {term.name: term for term in self.terminals}
    self.parser = _deserialize_parsing_frontend(
        parser_data,
        memo,
        self.lexer_conf,
        self._callbacks,
        self.options,  # Not all, but multiple attributes are used
    )
    return self
|
||||
|
||||
@classmethod
|
||||
def _load_from_dict(cls, data, memo, **kwargs):
|
||||
inst = cls.__new__(cls)
|
||||
return inst._load({'data': data, 'memo': memo}, **kwargs)
|
||||
|
||||
@classmethod
def open(cls: Type[_T], grammar_filename: str, rel_to: Optional[str]=None, **options) -> _T:
    """Create an instance of Lark with the grammar given by its filename

    If ``rel_to`` is provided, the function will find the grammar filename in relation to it.

    The grammar file is read as UTF-8, and the remaining keyword arguments
    are passed on to the constructor.

    Example:

        >>> Lark.open("grammar_file.lark", rel_to=__file__, parser="lalr")
        Lark(...)

    """
    path = grammar_filename
    if rel_to:
        # Resolve relative to the directory that contains ``rel_to``.
        path = os.path.join(os.path.dirname(rel_to), grammar_filename)
    with open(path, encoding='utf8') as grammar_file:
        return cls(grammar_file, **options)
|
||||
|
||||
@classmethod
def open_from_package(cls: Type[_T], package: str, grammar_path: str, search_paths: 'Sequence[str]'=("",), **options) -> _T:
    """Create an instance of Lark with the grammar loaded from within the package `package`.
    This allows grammar loading from zipapps.

    Imports in the grammar will use the `package` and `search_paths` provided, through `FromPackageLoader`

    Any ``import_paths`` passed by the caller are kept; the package loader is
    added after them in a new list, so the caller's own sequence is never
    modified.

    Example:

        Lark.open_from_package(__name__, "example.lark", ("grammars",), parser=...)
    """
    package_loader = FromPackageLoader(package, search_paths)
    full_path, text = package_loader(None, grammar_path)
    options.setdefault('source_path', full_path)
    # Build a fresh list instead of appending in place: appending would mutate
    # the caller's object (accumulating loaders across calls) and would fail
    # outright if the caller passed a tuple.
    options['import_paths'] = [*(options.get('import_paths') or ()), package_loader]
    return cls(text, **options)
|
||||
|
||||
def __repr__(self):
|
||||
return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source_path, self.options.parser, self.options.lexer)
|
||||
|
||||
|
||||
def lex(self, text: TextOrSlice, dont_ignore: bool=False) -> Iterator[Token]:
    """Only lex (and postlex) the text, without parsing it. Only relevant when lexer='basic'

    When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore.

    :raises UnexpectedCharacters: In case the lexer cannot find a suitable match.
    """
    lexer: Lexer
    if hasattr(self, 'lexer') and not dont_ignore:
        # Reuse the lexer that was built together with this instance.
        lexer = self.lexer
    else:
        lexer = self._build_lexer(dont_ignore)

    tokens = LexerThread.from_text(lexer, text).lex(None)
    postlex = self.options.postlex
    return postlex.process(tokens) if postlex else tokens
|
||||
|
||||
def get_terminal(self, name: str) -> TerminalDef:
    """Get information about a terminal

    :raises KeyError: If no terminal with that name is known.
    """
    known_terminals = self._terminals_dict
    return known_terminals[name]
|
||||
|
||||
def parse_interactive(self, text: Optional[LarkInput]=None, start: Optional[str]=None) -> 'InteractiveParser':
    """Start an interactive parsing session. Only works when parser='lalr'.

    Parameters:
        text (LarkInput, optional): Text to be parsed. Required for ``resume_parse()``.
        start (str, optional): Start symbol

    Returns:
        A new InteractiveParser instance.

    See Also: ``Lark.parse()``
    """
    frontend = self.parser
    return frontend.parse_interactive(text, start=start)
|
||||
|
||||
def parse(self, text: LarkInput, start: Optional[str]=None, on_error: 'Optional[Callable[[UnexpectedInput], bool]]'=None) -> 'ParseTree':
    """Parse the given text, according to the options provided.

    Parameters:
        text (LarkInput): Text to be parsed, as `str` or `bytes`.
            TextSlice may also be used, but only when lexer='basic' or 'contextual'.
            If Lark was created with a custom lexer, this may be an object of any type.
        start (str, optional): Required if Lark was given multiple possible start symbols (using the start option).
        on_error (function, optional): if provided, will be called on UnexpectedInput error,
            with the exception as its argument. Return true to resume parsing, or false to raise the exception.
            LALR only. See examples/advanced/error_handling.py for an example of how to use on_error.

    Returns:
        If a transformer is supplied to ``__init__``, returns whatever is the
        result of the transformation. Otherwise, returns a Tree instance.

    :raises UnexpectedInput: On a parse error, one of these sub-exceptions will rise:
            ``UnexpectedCharacters``, ``UnexpectedToken``, or ``UnexpectedEOF``.
            For convenience, these sub-exceptions also inherit from ``ParseError`` and ``LexError``.
    :raises NotImplementedError: If ``on_error`` is given while the parser is not ``'lalr'``.

    """
    wants_recovery = on_error is not None
    if wants_recovery and self.options.parser != 'lalr':
        raise NotImplementedError("The on_error option is only implemented for the LALR(1) parser.")
    return self.parser.parse(text, start=start, on_error=on_error)
|
||||
|
||||
|
||||
###}
|
||||
@@ -0,0 +1,702 @@
|
||||
# Lexer Implementation
|
||||
|
||||
from abc import abstractmethod, ABC
|
||||
import re
|
||||
from typing import (
|
||||
TypeVar, Type, Dict, Iterator, Collection, Callable, Optional, FrozenSet, Any,
|
||||
ClassVar, TYPE_CHECKING, overload
|
||||
)
|
||||
from types import ModuleType
|
||||
import warnings
|
||||
try:
|
||||
import interegular
|
||||
except ImportError:
|
||||
pass
|
||||
if TYPE_CHECKING:
|
||||
from .common import LexerConf
|
||||
from .parsers.lalr_parser_state import ParserState
|
||||
|
||||
from .utils import classify, get_regexp_width, Serialize, logger, TextSlice, TextOrSlice
|
||||
from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken
|
||||
from .grammar import TOKEN_DEFAULT_PRIORITY
|
||||
|
||||
|
||||
###{standalone
|
||||
from contextlib import suppress
|
||||
from copy import copy
|
||||
|
||||
# For the standalone parser, we need to make sure that has_interegular is False to avoid NameErrors later on.
# The name ``interegular`` is only bound when its import succeeded; otherwise
# looking it up raises NameError, which is what this probe relies on.
try:
    has_interegular = bool(interegular)
except NameError:
    has_interegular = False
|
||||
|
||||
class Pattern(Serialize, ABC):
    "An abstraction over regular expressions."

    value: str
    flags: Collection[str]
    raw: Optional[str]
    type: ClassVar[str]

    def __init__(self, value: str, flags: Collection[str] = (), raw: Optional[str] = None) -> None:
        self.value, self.raw = value, raw
        # Kept as a frozenset so the flags can take part in hashing.
        self.flags = frozenset(flags)

    def __repr__(self):
        regexp = self.to_regexp()
        return repr(regexp)

    # Pattern Hashing assumes all subclasses have a different priority!
    def __hash__(self):
        identity = (type(self), self.value, self.flags)
        return hash(identity)

    def __eq__(self, other):
        # Patterns are equal when they are of the same class and carry the
        # same value and flags; ``raw`` is not part of the comparison.
        if not (type(self) == type(other)):
            return False
        return self.value == other.value and self.flags == other.flags

    @abstractmethod
    def to_regexp(self) -> str:
        "Return this pattern as a regular-expression string."
        raise NotImplementedError()

    @property
    @abstractmethod
    def min_width(self) -> int:
        raise NotImplementedError()

    @property
    @abstractmethod
    def max_width(self) -> int:
        raise NotImplementedError()

    def _get_flags(self, value):
        "Wrap ``value`` in one inline-flag group ``(?f:...)`` per flag."
        wrapped = value
        for flag in self.flags:
            wrapped = ('(?%s:%s)' % (flag, wrapped))
        return wrapped
|
||||
|
||||
|
||||
class PatternStr(Pattern):
    "A pattern that matches a literal string."

    __serialize_fields__ = 'value', 'flags', 'raw'

    type: ClassVar[str] = "str"

    def to_regexp(self) -> str:
        # The literal is escaped so that it is matched verbatim.
        escaped = re.escape(self.value)
        return self._get_flags(escaped)

    @property
    def min_width(self) -> int:
        # A literal always matches exactly its own length.
        return len(self.value)

    @property
    def max_width(self) -> int:
        return len(self.value)
|
||||
|
||||
|
||||
class PatternRE(Pattern):
    "A pattern given as a regular expression."

    __serialize_fields__ = 'value', 'flags', 'raw', '_width'

    type: ClassVar[str] = "re"

    # Cached result of get_regexp_width(); computed on first use.
    _width = None

    def to_regexp(self) -> str:
        return self._get_flags(self.value)

    def _get_width(self):
        "Return the cached width pair, computing it on the first call."
        width = self._width
        if width is None:
            width = self._width = get_regexp_width(self.to_regexp())
        return width

    @property
    def min_width(self) -> int:
        return self._get_width()[0]

    @property
    def max_width(self) -> int:
        return self._get_width()[1]
|
||||
|
||||
|
||||
class TerminalDef(Serialize):
    "A definition of a terminal"
    __serialize_fields__ = 'name', 'pattern', 'priority'
    __serialize_namespace__ = PatternStr, PatternRE

    name: str
    pattern: Pattern
    priority: int

    def __init__(self, name: str, pattern: Pattern, priority: int = TOKEN_DEFAULT_PRIORITY) -> None:
        assert isinstance(pattern, Pattern), pattern
        self.name, self.pattern, self.priority = name, pattern, priority

    def __repr__(self):
        fields = (type(self).__name__, self.name, self.pattern)
        return '%s(%r, %r)' % fields

    def user_repr(self) -> str:
        "Return a user-facing name for this terminal."
        if not self.name.startswith('__'):
            return self.name
        # We represent a generated terminal
        return self.pattern.raw or self.name
|
||||
|
||||
_T = TypeVar('_T', bound="Token")
|
||||
|
||||
class Token(str):
|
||||
"""A string with meta-information, that is produced by the lexer.
|
||||
|
||||
When parsing text, the resulting chunks of the input that haven't been discarded,
|
||||
will end up in the tree as Token instances. The Token class inherits from Python's ``str``,
|
||||
so normal string comparisons and operations will work as expected.
|
||||
|
||||
Attributes:
|
||||
type: Name of the token (as specified in grammar)
|
||||
value: Value of the token (redundant, as ``token.value == token`` will always be true)
|
||||
start_pos: The index of the token in the text
|
||||
line: The line of the token in the text (starting with 1)
|
||||
column: The column of the token in the text (starting with 1)
|
||||
end_line: The line where the token ends
|
||||
end_column: The next column after the end of the token. For example,
|
||||
if the token is a single character with a column value of 4,
|
||||
end_column will be 5.
|
||||
end_pos: the index where the token ends (basically ``start_pos + len(token)``)
|
||||
"""
|
||||
__slots__ = ('type', 'start_pos', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos')
|
||||
|
||||
__match_args__ = ('type', 'value')
|
||||
|
||||
type: str
|
||||
start_pos: Optional[int]
|
||||
value: Any
|
||||
line: Optional[int]
|
||||
column: Optional[int]
|
||||
end_line: Optional[int]
|
||||
end_column: Optional[int]
|
||||
end_pos: Optional[int]
|
||||
|
||||
|
||||
@overload
|
||||
def __new__(
|
||||
cls,
|
||||
type: str,
|
||||
value: Any,
|
||||
start_pos: Optional[int] = None,
|
||||
line: Optional[int] = None,
|
||||
column: Optional[int] = None,
|
||||
end_line: Optional[int] = None,
|
||||
end_column: Optional[int] = None,
|
||||
end_pos: Optional[int] = None
|
||||
) -> 'Token':
|
||||
...
|
||||
|
||||
@overload
|
||||
def __new__(
|
||||
cls,
|
||||
type_: str,
|
||||
value: Any,
|
||||
start_pos: Optional[int] = None,
|
||||
line: Optional[int] = None,
|
||||
column: Optional[int] = None,
|
||||
end_line: Optional[int] = None,
|
||||
end_column: Optional[int] = None,
|
||||
end_pos: Optional[int] = None
|
||||
) -> 'Token': ...
|
||||
|
||||
def __new__(cls, *args, **kwargs):
|
||||
if "type_" in kwargs:
|
||||
warnings.warn("`type_` is deprecated use `type` instead", DeprecationWarning)
|
||||
|
||||
if "type" in kwargs:
|
||||
raise TypeError("Error: using both 'type' and the deprecated 'type_' as arguments.")
|
||||
kwargs["type"] = kwargs.pop("type_")
|
||||
|
||||
return cls._future_new(*args, **kwargs)
|
||||
|
||||
|
||||
@classmethod
|
||||
def _future_new(cls, type, value, start_pos=None, line=None, column=None, end_line=None, end_column=None, end_pos=None):
|
||||
inst = super(Token, cls).__new__(cls, value)
|
||||
|
||||
inst.type = type
|
||||
inst.start_pos = start_pos
|
||||
inst.value = value
|
||||
inst.line = line
|
||||
inst.column = column
|
||||
inst.end_line = end_line
|
||||
inst.end_column = end_column
|
||||
inst.end_pos = end_pos
|
||||
return inst
|
||||
|
||||
@overload
|
||||
def update(self, type: Optional[str] = None, value: Optional[Any] = None) -> 'Token':
|
||||
...
|
||||
|
||||
@overload
|
||||
def update(self, type_: Optional[str] = None, value: Optional[Any] = None) -> 'Token':
|
||||
...
|
||||
|
||||
def update(self, *args, **kwargs):
|
||||
if "type_" in kwargs:
|
||||
warnings.warn("`type_` is deprecated use `type` instead", DeprecationWarning)
|
||||
|
||||
if "type" in kwargs:
|
||||
raise TypeError("Error: using both 'type' and the deprecated 'type_' as arguments.")
|
||||
kwargs["type"] = kwargs.pop("type_")
|
||||
|
||||
return self._future_update(*args, **kwargs)
|
||||
|
||||
def _future_update(self, type: Optional[str] = None, value: Optional[Any] = None) -> 'Token':
|
||||
return Token.new_borrow_pos(
|
||||
type if type is not None else self.type,
|
||||
value if value is not None else self.value,
|
||||
self
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def new_borrow_pos(cls: Type[_T], type_: str, value: Any, borrow_t: 'Token') -> _T:
|
||||
return cls(type_, value, borrow_t.start_pos, borrow_t.line, borrow_t.column, borrow_t.end_line, borrow_t.end_column, borrow_t.end_pos)
|
||||
|
||||
def __reduce__(self):
|
||||
return (self.__class__, (self.type, self.value, self.start_pos, self.line, self.column))
|
||||
|
||||
def __repr__(self):
|
||||
return 'Token(%r, %r)' % (self.type, self.value)
|
||||
|
||||
def __deepcopy__(self, memo):
|
||||
return Token(self.type, self.value, self.start_pos, self.line, self.column)
|
||||
|
||||
def __eq__(self, other):
|
||||
if isinstance(other, Token) and self.type != other.type:
|
||||
return False
|
||||
|
||||
return str.__eq__(self, other)
|
||||
|
||||
__hash__ = str.__hash__
|
||||
|
||||
|
||||
class LineCounter:
    "A utility class for keeping track of line & column information"

    __slots__ = 'char_pos', 'line', 'column', 'line_start_pos', 'newline_char'

    def __init__(self, newline_char):
        self.newline_char = newline_char
        # Position starts at offset 0, which is line 1, column 1.
        self.char_pos = self.line_start_pos = 0
        self.line = self.column = 1

    def __eq__(self, other):
        if isinstance(other, LineCounter):
            return self.char_pos == other.char_pos and self.newline_char == other.newline_char
        return NotImplemented

    def feed(self, token: TextOrSlice, test_newline=True):
        """Consume a token and calculate the new line & column.

        As an optional optimization, set test_newline=False if token doesn't contain a newline.
        """
        newline = self.newline_char
        newline_count = token.count(newline) if test_newline else 0
        if newline_count:
            self.line += newline_count
            # The new line starts right after the last newline in the token.
            self.line_start_pos = self.char_pos + token.rindex(newline) + 1

        self.char_pos += len(token)
        self.column = self.char_pos - self.line_start_pos + 1
|
||||
|
||||
|
||||
class UnlessCallback:
    """Token callback that retypes a token when its whole value matches one of
    the scanner's terminals; otherwise the token is returned unchanged.
    """

    def __init__(self, scanner: 'Scanner'):
        self.scanner = scanner

    def __call__(self, t: Token):
        matched_type = self.scanner.fullmatch(t.value)
        if matched_type is None:
            return t
        # The token is modified in place and the same object is returned.
        t.type = matched_type
        return t
|
||||
|
||||
|
||||
class CallChain:
    """Chain two token callbacks.

    ``callback1`` always runs. If ``cond`` holds for its result, that result
    is passed on to ``callback2`` and the latter's return value is returned;
    otherwise the result of ``callback1`` is returned as is.
    """

    def __init__(self, callback1, callback2, cond):
        self.callback1 = callback1
        self.callback2 = callback2
        self.cond = cond

    def __call__(self, t):
        t2 = self.callback1(t)
        # Feed the second callback the *output* of the first one. Passing the
        # original ``t`` would discard whatever callback1 returned whenever it
        # returns a new object instead of modifying ``t`` in place.
        return self.callback2(t2) if self.cond(t2) else t2
|
||||
|
||||
|
||||
def _get_match(re_, regexp, s, flags):
|
||||
m = re_.match(regexp, s, flags)
|
||||
if m:
|
||||
return m.group(0)
|
||||
|
||||
def _create_unless(terminals, g_regex_flags, re_, use_bytes):
    """Find string terminals that are fully matched by a regexp terminal of equal priority.

    For each such regexp terminal an ``UnlessCallback`` is registered under
    its name, built from the string terminals it matches.

    Returns a pair ``(new_terminals, callback)``: the terminals with the
    string terminals removed whose flags are a subset of the matching
    regexp's flags, and the dict of callbacks keyed by terminal name.
    """
    by_pattern_type = classify(terminals, lambda t: type(t.pattern))
    assert len(by_pattern_type) <= 2, by_pattern_type.keys()

    shadowed = set()
    callbacks = {}
    for re_term in by_pattern_type.get(PatternRE, []):
        overlapping = []
        for str_term in by_pattern_type.get(PatternStr, []):
            if str_term.priority == re_term.priority:
                literal = str_term.pattern.value
                if literal == _get_match(re_, re_term.pattern.to_regexp(), literal, g_regex_flags):
                    overlapping.append(str_term)
                    if str_term.pattern.flags <= re_term.pattern.flags:
                        shadowed.add(str_term)
        if overlapping:
            unless_scanner = Scanner(overlapping, g_regex_flags, re_, use_bytes=use_bytes)
            callbacks[re_term.name] = UnlessCallback(unless_scanner)

    remaining = [t for t in terminals if t not in shadowed]
    return remaining, callbacks
|
||||
|
||||
|
||||
class Scanner:
    """Matches text against a list of terminals using combined regexps.

    The terminals are joined into one or more alternations of named groups
    (one group per terminal, named after the terminal); the name of the group
    that matched identifies the terminal.
    """

    def __init__(self, terminals, g_regex_flags, re_, use_bytes):
        self.terminals = terminals
        self.g_regex_flags = g_regex_flags
        self.re_ = re_
        self.use_bytes = use_bytes

        self.allowed_types = {t.name for t in self.terminals}

        self._mres = self._build_mres(terminals, len(terminals))

    def _build_mres(self, terminals, max_size):
        """Compile ``terminals`` into a list of regexps of at most ``max_size`` groups each.

        :raises AssertionError: If even a single-terminal pattern is rejected
            by the regex module with an AssertionError.
        """
        # Python sets an unreasonable group limit (currently 100) in its re module
        # Worse, the only way to know we reached it is by catching an AssertionError!
        # This function recursively tries less and less groups until it's successful.
        mres = []
        while terminals:
            pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp()) for t in terminals[:max_size])
            if self.use_bytes:
                pattern = pattern.encode('latin-1')
            try:
                mre = self.re_.compile(pattern, self.g_regex_flags)
            except AssertionError:  # Yes, this is what Python provides us.. :/
                if max_size <= 1:
                    # Cannot split any further; retrying with size 0 would loop forever.
                    raise
                # Keep the regexps already compiled for earlier chunks and
                # retry only the remaining terminals with smaller chunks.
                return mres + self._build_mres(terminals, max_size // 2)

            mres.append(mre)
            terminals = terminals[max_size:]
        return mres

    def match(self, text: 'TextSlice', pos):
        """Match at ``pos`` within ``text``, not reading past ``text.end``.

        Returns ``(matched_text, terminal_name)`` for the first regexp that
        matches, or None when nothing matches.
        """
        for mre in self._mres:
            m = mre.match(text.text, pos, text.end)
            if m:
                return m.group(0), m.lastgroup
        return None

    def fullmatch(self, text: str) -> Optional[str]:
        "Return the name of the terminal that matches all of ``text``, or None."
        for mre in self._mres:
            m = mre.fullmatch(text)
            if m:
                return m.lastgroup
        return None
|
||||
|
||||
def _regexp_has_newline(r: str):
|
||||
r"""Expressions that may indicate newlines in a regexp:
|
||||
- newlines (\n)
|
||||
- escaped newline (\\n)
|
||||
- anything but ([^...])
|
||||
- any-char (.) when the flag (?s) exists
|
||||
- spaces (\s)
|
||||
"""
|
||||
return '\n' in r or '\\n' in r or '\\s' in r or '[^' in r or ('(?s' in r and '.' in r)
|
||||
|
||||
|
||||
class LexerState:
    """Represents the current state of the lexer as it scans the text
    (Lexer objects are only instantiated per grammar, not per text)
    """

    __slots__ = 'text', 'line_ctr', 'last_token'

    # The input being lexed.
    text: TextSlice
    # Current position (offset, line, column) within the input.
    line_ctr: LineCounter
    # The most recent token stored on this state, if any.
    last_token: Optional[Token]

    def __init__(self, text: TextSlice, line_ctr: Optional[LineCounter] = None, last_token: Optional[Token]=None):
        # The position set-up and bounds check only apply to TextSlice input;
        # any other object is stored as given.
        if isinstance(text, TextSlice):
            if line_ctr is None:
                # Use a newline of the same kind (bytes/str) as the text.
                line_ctr = LineCounter(b'\n' if isinstance(text.text, bytes) else '\n')

                # NOTE(review): the source this block was recovered from had lost its
                # indentation; this branch is assumed to apply only to a freshly
                # created counter - confirm against the original file.
                if text.start > 0:
                    # Advance the line-count until line_ctr.char_pos == text.start
                    line_ctr.feed(TextSlice(text.text, 0, text.start))

            if not (text.start <= line_ctr.char_pos <= text.end):
                raise ValueError("LineCounter.char_pos is out of bounds")

        self.text = text
        self.line_ctr = line_ctr
        self.last_token = last_token

    def __eq__(self, other):
        if not isinstance(other, LexerState):
            return NotImplemented

        return self.text == other.text and self.line_ctr == other.line_ctr and self.last_token == other.last_token

    def __copy__(self):
        # The line counter is copied so the two states advance independently;
        # the text and the last token are shared.
        return type(self)(self.text, copy(self.line_ctr), self.last_token)
|
||||
|
||||
|
||||
class LexerThread:
    """A thread that ties a lexer instance and a lexer state, to be used by the parser
    """

    def __init__(self, lexer: 'Lexer', lexer_state: Optional[LexerState]):
        self.lexer, self.state = lexer, lexer_state

    @classmethod
    def from_text(cls, lexer: 'Lexer', text_or_slice: TextOrSlice) -> 'LexerThread':
        "Create a thread whose state wraps ``text_or_slice``, cast to a TextSlice."
        state = LexerState(TextSlice.cast_from(text_or_slice))
        return cls(lexer, state)

    @classmethod
    def from_custom_input(cls, lexer: 'Lexer', text: Any) -> 'LexerThread':
        "Create a thread whose state holds ``text`` as given, without casting."
        state = LexerState(text)
        return cls(lexer, state)

    def lex(self, parser_state):
        """Run the lexer on the current state.

        :raises TypeError: If no lexer state has been assigned.
        """
        state = self.state
        if state is None:
            raise TypeError("Cannot lex: No text assigned to lexer state")
        return self.lexer.lex(state, parser_state)

    def __copy__(self):
        # The lexer is shared; only the state is copied.
        state_copy = copy(self.state)
        return type(self)(self.lexer, state_copy)
|
||||
|
||||
# Module-private alias of the Token class.
_Token = Token


# Type of a lexer callback: it is called with a Token and returns a Token.
_Callback = Callable[[Token], Token]
|
||||
|
||||
class Lexer(ABC):
    """Lexer interface

    Method Signatures:
        lex(self, lexer_state, parser_state) -> Iterator[Token]
    """

    @abstractmethod
    def lex(self, lexer_state: LexerState, parser_state: Any) -> Iterator[Token]:
        # Subclasses must override this; the base implementation yields nothing.
        return NotImplemented

    def make_lexer_state(self, text: str):
        "Deprecated"
        text_slice = TextSlice.cast_from(text)
        return LexerState(text_slice)
|
||||
|
||||
|
||||
def _check_regex_collisions(terminal_to_regexp: Dict[TerminalDef, str], comparator, strict_mode, max_collisions_to_show=8):
    """Report terminals of equal priority whose regexps can match the same text.

    Parameters:
        terminal_to_regexp: Maps each terminal to check to its regexp string.
        comparator: An ``interegular`` comparator to reuse; when falsy, a new
            one is created from ``terminal_to_regexp``.
        strict_mode: If true, the first collision raises instead of warning.
        max_collisions_to_show: Stop checking once the comparator has this
            many marked pairs.

    :raises LexError: In strict mode, on the first collision found.
    """
    if not comparator:
        comparator = interegular.Comparator.from_regexes(terminal_to_regexp)

    # When in strict mode, we only ever try to provide one example, so taking
    # a long time for that should be fine
    max_time = 2 if strict_mode else 0.2

    # We don't want to show too many collisions.
    if comparator.count_marked_pairs() >= max_collisions_to_show:
        return
    for group in classify(terminal_to_regexp, lambda t: t.priority).values():
        for a, b in comparator.check(group, skip_marked=True):
            assert a.priority == b.priority
            # Mark this pair to not repeat warnings when multiple different BasicLexers see the same collision
            comparator.mark(a, b)

            # Notify the user
            message = f"Collision between Terminals {a.name} and {b.name}. "
            try:
                example = comparator.get_example_overlap(a, b, max_time).format_multiline()
            except ValueError:
                # Couldn't find an example within max_time steps.
                example = "No example could be found fast enough. However, the collision does still exist"
            if strict_mode:
                raise LexError(f"{message}\n{example}")
            logger.warning("%s The lexer will choose between them arbitrarily.\n%s", message, example)
            if comparator.count_marked_pairs() >= max_collisions_to_show:
                # Report the configured limit rather than a hard-coded number.
                logger.warning("Found %d regex collisions, will not check for more.", max_collisions_to_show)
                return
|
||||
|
||||
|
||||
class AbstractBasicLexer(Lexer):
    """Base class for lexers that produce tokens one at a time via ``next_token``."""

    terminals_by_name: Dict[str, TerminalDef]

    @abstractmethod
    def __init__(self, conf: 'LexerConf', comparator=None) -> None:
        ...

    @abstractmethod
    def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token:
        ...

    def lex(self, state: LexerState, parser_state: Any) -> Iterator[Token]:
        "Yield tokens from ``next_token`` until it signals the end with EOFError."
        try:
            while True:
                yield self.next_token(state, parser_state)
        except EOFError:
            return
|
||||
|
||||
|
||||
class BasicLexer(AbstractBasicLexer):
    """Lexer that matches all of its terminals at every position.

    The scanner is built lazily, on first use.
    """

    terminals: Collection[TerminalDef]
    ignore_types: FrozenSet[str]
    newline_types: FrozenSet[str]
    user_callbacks: Dict[str, _Callback]
    callback: Dict[str, _Callback]
    re: ModuleType

    def __init__(self, conf: 'LexerConf', comparator=None) -> None:
        """Validate the terminals (unless ``conf.skip_validation``) and set up the lexer.

        :raises LexError: If a terminal does not compile, can match the empty
            string, an ignored terminal is undefined, or strict mode is
            requested without ``interegular`` being available.
        """
        terminals = list(conf.terminals)
        assert all(isinstance(t, TerminalDef) for t in terminals), terminals

        self.re = conf.re_module

        if not conf.skip_validation:
            # Sanitization
            terminal_to_regexp = {}
            for t in terminals:
                regexp = t.pattern.to_regexp()
                try:
                    self.re.compile(regexp, conf.g_regex_flags)
                except self.re.error:
                    raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))

                if t.pattern.min_width == 0:
                    raise LexError("Lexer does not allow zero-width terminals. (%s: %s)" % (t.name, t.pattern))
                if t.pattern.type == "re":
                    terminal_to_regexp[t] = regexp

            if not (set(conf.ignore) <= {t.name for t in terminals}):
                raise LexError("Ignore terminals are not defined: %s" % (set(conf.ignore) - {t.name for t in terminals}))

            if has_interegular:
                _check_regex_collisions(terminal_to_regexp, comparator, conf.strict)
            elif conf.strict:
                raise LexError("interegular must be installed for strict mode. Use `pip install 'lark[interegular]'`.")

        # Init
        self.newline_types = frozenset(t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp()))
        self.ignore_types = frozenset(conf.ignore)

        # Higher priority first, then wider patterns, then longer pattern text, then by name.
        terminals.sort(key=lambda x: (-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))
        self.terminals = terminals
        self.user_callbacks = conf.callbacks
        self.g_regex_flags = conf.g_regex_flags
        self.use_bytes = conf.use_bytes
        self.terminals_by_name = conf.terminals_by_name

        self._scanner: Optional[Scanner] = None

    def _build_scanner(self) -> Scanner:
        "Create the scanner and the per-terminal callback table."
        terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, self.re, self.use_bytes)
        assert all(self.callback.values())

        for type_, f in self.user_callbacks.items():
            if type_ in self.callback:
                # Already a callback there, probably UnlessCallback
                # ``type_`` is bound as a default argument: a plain closure would
                # see the loop variable's final value, so every chain would test
                # against the last callback's type and skip the user callback.
                self.callback[type_] = CallChain(self.callback[type_], f, lambda t, type_=type_: t.type == type_)
            else:
                self.callback[type_] = f

        return Scanner(terminals, self.g_regex_flags, self.re, self.use_bytes)

    @property
    def scanner(self) -> Scanner:
        # Built on first access and cached afterwards.
        if self._scanner is None:
            self._scanner = self._build_scanner()
        return self._scanner

    def match(self, text, pos):
        "Match ``text`` at ``pos`` using the scanner."
        return self.scanner.match(text, pos)

    def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token:
        """Return the next non-ignored token and advance ``lex_state``.

        :raises UnexpectedCharacters: If nothing matches at the current position.
        :raises LexError: If a callback for a non-ignored terminal returns a non-Token.
        :raises EOFError: When the end of the text has been reached.
        """
        line_ctr = lex_state.line_ctr
        while line_ctr.char_pos < lex_state.text.end:
            res = self.match(lex_state.text, line_ctr.char_pos)
            if not res:
                allowed = self.scanner.allowed_types - self.ignore_types
                if not allowed:
                    allowed = {"<END-OF-FILE>"}
                raise UnexpectedCharacters(lex_state.text.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
                                           allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token],
                                           state=parser_state, terminals_by_name=self.terminals_by_name)

            value, type_ = res

            ignored = type_ in self.ignore_types
            t = None
            # An ignored terminal still gets a token when it has a callback,
            # so that the callback can be invoked.
            if not ignored or type_ in self.callback:
                t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
            line_ctr.feed(value, type_ in self.newline_types)
            if t is not None:
                t.end_line = line_ctr.line
                t.end_column = line_ctr.column
                t.end_pos = line_ctr.char_pos
                if t.type in self.callback:
                    t = self.callback[t.type](t)
                if not ignored:
                    if not isinstance(t, Token):
                        raise LexError("Callbacks must return a token (returned %r)" % t)
                    lex_state.last_token = t
                    return t

        # EOF
        raise EOFError(self)
|
||||
|
||||
|
||||
class ContextualLexer(Lexer):
    """Lexer that chooses a sub-lexer according to the current parser state.

    One ``BasicLexer`` is built for each distinct set of accepted terminals
    (plus the ignored and always-accepted ones), and states with the same set
    share an instance. ``root_lexer`` covers all terminals and is only used to
    produce a better error when a state-specific lexer fails.
    """

    lexers: Dict[int, AbstractBasicLexer]
    root_lexer: AbstractBasicLexer

    BasicLexer: Type[AbstractBasicLexer] = BasicLexer

    def __init__(self, conf: 'LexerConf', states: Dict[int, Collection[str]], always_accept: Collection[str]=()) -> None:
        """Build the per-state lexers and the root lexer.

        Args:
            conf: lexer configuration; it is copied, not modified.
            states: maps a parser state to the terminal names it accepts.
            always_accept: terminal names added to every state's lexer.
        """
        terminals = list(conf.terminals)
        by_name = conf.terminals_by_name

        trad_conf = copy(conf)
        trad_conf.terminals = terminals

        comparator = None
        if has_interegular and not conf.skip_validation:
            comparator = interegular.Comparator.from_regexes({t: t.pattern.to_regexp() for t in terminals})

        # Cache keyed on the raw accepted set, so equivalent states share a lexer.
        shared: Dict[FrozenSet[str], AbstractBasicLexer] = {}
        self.lexers = {}
        for state, accepts in states.items():
            cache_key = frozenset(accepts)
            try:
                state_lexer = shared[cache_key]
            except KeyError:
                wanted = set(accepts) | set(conf.ignore) | set(always_accept)
                state_conf = copy(trad_conf)
                state_conf.terminals = [by_name[name] for name in wanted if name in by_name]
                state_lexer = self.BasicLexer(state_conf, comparator)
                shared[cache_key] = state_lexer

            self.lexers[state] = state_lexer

        assert trad_conf.terminals is terminals
        trad_conf.skip_validation = True  # We don't need to verify all terminals again
        self.root_lexer = self.BasicLexer(trad_conf, comparator)

    def lex(self, lexer_state: LexerState, parser_state: 'ParserState') -> Iterator[Token]:
        """Yield tokens, re-selecting the lexer from ``parser_state.position`` each time.

        Raises:
            UnexpectedToken: the input matches a terminal known to the root
                lexer but not allowed in the current parser state.
            UnexpectedCharacters: the input matches no terminal at all.
        """
        try:
            while True:
                active = self.lexers[parser_state.position]
                yield active.next_token(lexer_state, parser_state)
        except EOFError:
            pass
        except UnexpectedCharacters as original_error:
            # In the contextual lexer, UnexpectedCharacters can mean that the
            # terminal is defined, but not in the current context. Test the
            # input against the global context, to provide a nicer error.
            try:
                # Save last_token: root_lexer.next_token overwrites it with the wrong token.
                previous = lexer_state.last_token
                token = self.root_lexer.next_token(lexer_state, parser_state)
            except UnexpectedCharacters:
                # Raise the original error; the root lexer reports the wrong expected set.
                raise original_error
            else:
                raise UnexpectedToken(token, original_error.allowed, state=parser_state, token_history=[previous], terminals_by_name=self.root_lexer.terminals_by_name)
|
||||
|
||||
###}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,391 @@
|
||||
"""Provides functions for the automatic building and shaping of the parse-tree."""
|
||||
|
||||
from typing import List
|
||||
|
||||
from .exceptions import GrammarError, ConfigurationError
|
||||
from .lexer import Token
|
||||
from .tree import Tree
|
||||
from .visitors import Transformer_InPlace
|
||||
from .visitors import _vargs_meta, _vargs_meta_inline
|
||||
|
||||
###{standalone
|
||||
from functools import partial, wraps
|
||||
from itertools import product
|
||||
|
||||
|
||||
class ExpandSingleChild:
    """Node-builder wrapper that replaces a one-child node by that child."""

    def __init__(self, node_builder):
        self.node_builder = node_builder

    def __call__(self, children):
        """Return the only child, or build a node when there is not exactly one."""
        if len(children) != 1:
            return self.node_builder(children)
        return children[0]
|
||||
|
||||
|
||||
|
||||
class PropagatePositions:
    """Node-builder wrapper that copies source positions onto a built Tree's meta.

    Positions come from the first and last child that carry them. An optional
    ``node_filter`` predicate restricts which children are considered.
    """

    def __init__(self, node_builder, node_filter=None):
        self.node_builder = node_builder
        self.node_filter = node_filter

    def __call__(self, children):
        """Build the node; when it is a Tree, fill in its start/end metadata."""
        res = self.node_builder(children)
        if not isinstance(res, Tree):
            return res

        # Calculate positions while the tree is streaming, according to the rule:
        # - nodes start at the start of their first child's container,
        #   and end at the end of their last child's container.
        # Containers are nodes that take up space in text, but have been inlined in the tree.
        meta = res.meta

        head = self._pp_get_meta(children)
        if head is not None:
            # Only fill the start fields when they are absent; meta may already be
            # set, probably because the rule has been inlined (e.g. `?rule`).
            if not hasattr(meta, 'line'):
                meta.line = getattr(head, 'container_line', head.line)
                meta.column = getattr(head, 'container_column', head.column)
                meta.start_pos = getattr(head, 'container_start_pos', head.start_pos)
                meta.empty = False

            meta.container_line = getattr(head, 'container_line', head.line)
            meta.container_column = getattr(head, 'container_column', head.column)
            meta.container_start_pos = getattr(head, 'container_start_pos', head.start_pos)

        tail = self._pp_get_meta(reversed(children))
        if tail is not None:
            if not hasattr(meta, 'end_line'):
                meta.end_line = getattr(tail, 'container_end_line', tail.end_line)
                meta.end_column = getattr(tail, 'container_end_column', tail.end_column)
                meta.end_pos = getattr(tail, 'container_end_pos', tail.end_pos)
                meta.empty = False

            meta.container_end_line = getattr(tail, 'container_end_line', tail.end_line)
            meta.container_end_column = getattr(tail, 'container_end_column', tail.end_column)
            meta.container_end_pos = getattr(tail, 'container_end_pos', tail.end_pos)

        return res

    def _pp_get_meta(self, children):
        """Return position info from the first usable child, or None.

        A Tree contributes its meta when that meta is not empty, a Token
        contributes itself, and any other object contributes the result of its
        ``__lark_meta__()`` method when it has one.
        """
        accept = self.node_filter
        for child in children:
            if accept is not None and not accept(child):
                continue
            if isinstance(child, Tree):
                if not child.meta.empty:
                    return child.meta
            elif isinstance(child, Token):
                return child
            elif hasattr(child, '__lark_meta__'):
                return child.__lark_meta__()
        return None
|
||||
|
||||
def make_propagate_positions(option):
    """Translate the ``propagate_positions`` option into a wrapper factory.

    Returns ``PropagatePositions`` for ``True``, a ``partial`` binding
    ``node_filter`` for a callable, and ``None`` for ``False``.

    Raises:
        ConfigurationError: ``option`` is none of the above.
    """
    if callable(option):
        return partial(PropagatePositions, node_filter=option)
    if option is True:
        return PropagatePositions
    if option is False:
        return None

    raise ConfigurationError('Invalid option for propagate_positions: %r' % option)
|
||||
|
||||
|
||||
class ChildFilter:
    """Node-builder wrapper that selects, inlines and pads children.

    ``to_include`` holds ``(index, to_expand, add_none)`` triples: ``add_none``
    ``None`` placeholders are inserted first, then either the child at
    ``index`` or, when ``to_expand`` is set, that child's own children.
    ``append_none`` placeholders are added at the end.
    """

    def __init__(self, to_include, append_none, node_builder):
        self.node_builder = node_builder
        self.to_include = to_include
        self.append_none = append_none

    def __call__(self, children):
        """Build the node from a freshly created, filtered child list."""
        selected = []

        for index, inline, leading_nones in self.to_include:
            if leading_nones:
                selected += [None] * leading_nones
            child = children[index]
            if inline:
                selected += child.children
            else:
                selected.append(child)

        if self.append_none:
            selected += [None] * self.append_none

        return self.node_builder(selected)
|
||||
|
||||
|
||||
class ChildFilterLALR(ChildFilter):
    """Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)"""

    def __call__(self, children):
        """Filter children, reusing an inlined child's list when it comes first."""
        out = []
        for index, inline, leading_nones in self.to_include:
            if leading_nones:
                out += [None] * leading_nones
            if not inline:
                out.append(children[index])
            elif out:
                out += children[index].children
            else:
                # Optimize for left-recursion: adopt the child's list instead of
                # copying it. Later additions therefore mutate that list.
                out = children[index].children

        if self.append_none:
            out += [None] * self.append_none

        return self.node_builder(out)
|
||||
|
||||
|
||||
class ChildFilterLALR_NoPlaceholders(ChildFilter):
    """Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)

    Variant without ``None`` placeholders: ``to_include`` holds
    ``(index, to_expand)`` pairs.
    """

    def __init__(self, to_include, node_builder):
        self.node_builder = node_builder
        self.to_include = to_include

    def __call__(self, children):
        """Filter children, reusing an inlined child's list when it comes first."""
        out = []
        for index, inline in self.to_include:
            if not inline:
                out.append(children[index])
            elif out:
                out += children[index].children
            else:
                # Optimize for left-recursion: adopt the child's list instead of copying it.
                out = children[index].children
        return self.node_builder(out)
|
||||
|
||||
|
||||
def _should_expand(sym):
|
||||
return not sym.is_term and sym.name.startswith('_')
|
||||
|
||||
|
||||
def maybe_create_child_filter(expansion, keep_all_tokens, ambiguous, _empty_indices: List[bool]):
    """Return a child-filter factory for ``expansion``, or None if none is needed.

    A filter is needed when placeholders are requested (``_empty_indices``),
    when some symbol is dropped, or when some symbol must be inlined.

    Args:
        expansion: the symbols of the rule's right-hand side.
        keep_all_tokens: when true, filtered-out terminals are kept as well.
        ambiguous: selects the non-mutating ``ChildFilter``.
        _empty_indices: booleans in which each ``False`` stands for one symbol
            of ``expansion`` and each ``True`` for one ``None`` placeholder.
    """
    n_symbols = len(expansion)

    # Prepare empty_indices as: How many Nones to insert at each index?
    if _empty_indices:
        assert _empty_indices.count(False) == n_symbols
        bits = ''.join(str(int(flag)) for flag in _empty_indices)
        # Each run of '1's between '0's is the number of placeholders at that slot.
        empty_indices = [len(run) for run in bits.split('0')]
        assert len(empty_indices) == n_symbols + 1, (empty_indices, n_symbols)
    else:
        empty_indices = [0] * (n_symbols + 1)

    to_include = []
    pending_nones = 0
    for idx, sym in enumerate(expansion):
        pending_nones += empty_indices[idx]
        if keep_all_tokens or not (sym.is_term and sym.filter_out):
            to_include.append((idx, _should_expand(sym), pending_nones))
            pending_nones = 0

    pending_nones += empty_indices[n_symbols]

    needs_filter = (
        _empty_indices
        or len(to_include) < n_symbols
        or any(inline for _idx, inline, _nones in to_include)
    )
    if not needs_filter:
        return None

    if _empty_indices or ambiguous:
        filter_cls = ChildFilter if ambiguous else ChildFilterLALR
        return partial(filter_cls, to_include, pending_nones)

    # LALR without placeholders
    return partial(ChildFilterLALR_NoPlaceholders, [(idx, inline) for idx, inline, _nones in to_include])
|
||||
|
||||
|
||||
class AmbiguousExpander:
    """Deal with the case where we're expanding children ('_rule') into a parent but the children
       are ambiguous. i.e. (parent->_ambig->_expand_this_rule). In this case, make the parent itself
       ambiguous with as many copies as there are ambiguous children, and then copy the ambiguous children
       into the right parents in the right places, essentially shifting the ambiguity up the tree."""

    def __init__(self, to_expand, tree_class, node_builder):
        self.node_builder = node_builder
        self.tree_class = tree_class
        self.to_expand = to_expand

    def __call__(self, children):
        """Build one node, or an ``_ambig`` tree with one node per combination."""
        # -- When we're repeatedly expanding ambiguities we can end up with nested ambiguities.
        #    All children of an _ambig node should be a derivation of that ambig node, hence
        #    it is safe to assume that if we see an _ambig node nested within an ambig node
        #    it is safe to simply expand it into the parent _ambig node as an alternative derivation.
        ambig_positions = []
        for pos, child in enumerate(children):
            if getattr(child, 'data', None) != '_ambig':
                continue
            if pos in self.to_expand:
                ambig_positions.append(pos)
            child.expand_kids_by_data('_ambig')

        if not ambig_positions:
            return self.node_builder(children)

        # An ambiguous child offers its alternatives; any other child offers only itself.
        options = []
        for pos, child in enumerate(children):
            options.append(child.children if pos in ambig_positions else (child,))

        alternatives = [self.node_builder(list(combo)) for combo in product(*options)]
        return self.tree_class('_ambig', alternatives)
|
||||
|
||||
|
||||
def maybe_create_ambiguous_expander(tree_class, expansion, keep_all_tokens):
    """Return an ``AmbiguousExpander`` factory, or None when no position needs one.

    A position is selected when ``keep_all_tokens`` is set, or when its symbol
    is not a filtered-out terminal and ``_should_expand`` accepts it.
    """
    to_expand = []
    for idx, sym in enumerate(expansion):
        if keep_all_tokens or (not (sym.is_term and sym.filter_out) and _should_expand(sym)):
            to_expand.append(idx)

    if not to_expand:
        return None
    return partial(AmbiguousExpander, to_expand, tree_class)
|
||||
|
||||
|
||||
class AmbiguousIntermediateExpander:
    """
    Propagate ambiguous intermediate nodes and their derivations up to the
    current rule.

    In general, converts

    rule
      _iambig
        _inter
          someChildren1
          ...
        _inter
          someChildren2
          ...
        someChildren3
        ...

    to

    _ambig
      rule
        someChildren1
        ...
        someChildren3
        ...
      rule
        someChildren2
        ...
        someChildren3
        ...
      rule
        childrenFromNestedIambigs
        ...
        someChildren3
        ...
      ...

    propagating up any nested '_iambig' nodes along the way.
    """

    def __init__(self, tree_class, node_builder):
        self.node_builder = node_builder
        self.tree_class = tree_class

    def __call__(self, children):
        """Build one node, or an ``_ambig`` tree with one node per derivation."""

        def is_iambig(node):
            return hasattr(node, 'data') and node.data == '_iambig'

        def flatten(kids):
            """
            Recursively flatten the derivations of the parent of an '_iambig'
            node. Returns a list of '_inter' nodes guaranteed not
            to contain any nested '_iambig' nodes, or None if kids does
            not contain an '_iambig' node.
            """
            # Due to the structure of the SPPF,
            # an '_iambig' node can only appear as the first child
            if not (kids and is_iambig(kids[0])):
                return None

            trailing = kids[1:]
            derivations = []
            for inter in kids[0].children:
                nested = flatten(inter.children)
                if nested:
                    # Nested derivations are already '_inter' nodes: extend them in place.
                    for node in nested:
                        node.children += trailing
                    derivations += nested
                else:
                    derivations.append(self.tree_class('_inter', inter.children + trailing))
            return derivations

        flattened = flatten(children)
        if flattened:
            built = [self.node_builder(inter.children) for inter in flattened]
            return self.tree_class('_ambig', built)

        return self.node_builder(children)
|
||||
|
||||
|
||||
|
||||
def inplace_transformer(func):
    """Adapt a tree-taking callback so that it accepts a list of children.

    The returned function wraps the children in a ``Tree`` named after
    ``func`` and passes that tree to ``func``.
    """
    @wraps(func)
    def call_with_tree(children):
        # function name in a Transformer is a rule name.
        return func(Tree(func.__name__, children))

    return call_with_tree
|
||||
|
||||
|
||||
def apply_visit_wrapper(func, name, wrapper):
    """Adapt ``func`` to the children-only callback signature through ``wrapper``.

    The returned callable invokes ``wrapper(func, name, children, None)``.

    Raises:
        NotImplementedError: ``wrapper`` is one of the meta-passing wrappers,
            which are rejected here.
    """
    if any(wrapper is unsupported for unsupported in (_vargs_meta, _vargs_meta_inline)):
        raise NotImplementedError("Meta args not supported for internal transformer; use YourTransformer().transform(parser.parse()) instead")

    @wraps(func)
    def call_wrapped(children):
        return wrapper(func, name, children, None)

    return call_wrapped
|
||||
|
||||
|
||||
class ParseTreeBuilder:
    """Prepares, for every rule, the chain of wrappers that shapes its tree node."""

    def __init__(self, rules, tree_class, propagate_positions=False, ambiguous=False, maybe_placeholders=False):
        self.tree_class = tree_class
        self.propagate_positions = propagate_positions
        self.ambiguous = ambiguous
        self.maybe_placeholders = maybe_placeholders

        self.rule_builders = list(self._init_builders(rules))

    def _init_builders(self, rules):
        """Yield ``(rule, wrapper_chain)`` pairs; unused wrapper slots are dropped."""
        position_wrapper = make_propagate_positions(self.propagate_positions)

        for rule in rules:
            options = rule.options
            keep_all_tokens = options.keep_all_tokens

            candidates = []
            if options.expand1 and not rule.alias:
                candidates.append(ExpandSingleChild)
            candidates.append(maybe_create_child_filter(
                rule.expansion,
                keep_all_tokens,
                self.ambiguous,
                options.empty_indices if self.maybe_placeholders else None,
            ))
            candidates.append(position_wrapper)
            if self.ambiguous:
                candidates.append(maybe_create_ambiguous_expander(self.tree_class, rule.expansion, keep_all_tokens))
                candidates.append(partial(AmbiguousIntermediateExpander, self.tree_class))

            # Several factories return None when they do not apply.
            wrapper_chain = [wrapper for wrapper in candidates if wrapper]

            yield rule, wrapper_chain

    def create_callback(self, transformer=None):
        """Return a dict mapping every rule to its fully wrapped callback.

        The callback is looked up on ``transformer`` under the rule's alias,
        template source or origin name; if the lookup raises AttributeError,
        the transformer's ``__default__`` (or ``tree_class``) is used.

        Raises:
            GrammarError: the same rule appears twice.
        """
        callbacks = {}

        default_handler = getattr(transformer, '__default__', None)
        if default_handler:
            def default_callback(data, children):
                return default_handler(data, children, None)
        else:
            default_callback = self.tree_class

        for rule, wrapper_chain in self.rule_builders:
            user_callback_name = rule.alias or rule.options.template_source or rule.origin.name
            try:
                callback = getattr(transformer, user_callback_name)
                visit_wrapper = getattr(callback, 'visit_wrapper', None)
                if visit_wrapper is not None:
                    callback = apply_visit_wrapper(callback, user_callback_name, visit_wrapper)
                elif isinstance(transformer, Transformer_InPlace):
                    callback = inplace_transformer(callback)
            except AttributeError:
                callback = partial(default_callback, user_callback_name)

            # Each wrapper encloses the callback built so far.
            for wrap in wrapper_chain:
                callback = wrap(callback)

            if rule in callbacks:
                raise GrammarError("Rule '%s' already exists" % (rule,))

            callbacks[rule] = callback

        return callbacks
|
||||
|
||||
###}
|
||||
@@ -0,0 +1,284 @@
|
||||
from typing import Any, Callable, Dict, Optional, Collection, Union, TYPE_CHECKING
|
||||
|
||||
from .exceptions import ConfigurationError, GrammarError, assert_config
|
||||
from .utils import get_regexp_width, Serialize, TextOrSlice, TextSlice, LarkInput
|
||||
from .lexer import LexerThread, BasicLexer, ContextualLexer, Lexer
|
||||
from .parsers import earley, xearley, cyk
|
||||
from .parsers.lalr_parser import LALR_Parser
|
||||
from .tree import Tree
|
||||
from .common import LexerConf, ParserConf, _ParserArgType, _LexerArgType
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .parsers.lalr_analysis import ParseTableBase
|
||||
|
||||
|
||||
###{standalone
|
||||
|
||||
def _wrap_lexer(lexer_class):
|
||||
future_interface = getattr(lexer_class, '__future_interface__', 0)
|
||||
if future_interface == 2:
|
||||
return lexer_class
|
||||
elif future_interface == 1:
|
||||
class CustomLexerWrapper1(Lexer):
|
||||
def __init__(self, lexer_conf):
|
||||
self.lexer = lexer_class(lexer_conf)
|
||||
def lex(self, lexer_state, parser_state):
|
||||
if isinstance(lexer_state.text, TextSlice) and not lexer_state.text.is_complete_text():
|
||||
raise TypeError("Interface=1 Custom Lexer don't support TextSlice")
|
||||
lexer_state.text = lexer_state.text
|
||||
return self.lexer.lex(lexer_state, parser_state)
|
||||
return CustomLexerWrapper1
|
||||
elif future_interface == 0:
|
||||
class CustomLexerWrapper0(Lexer):
|
||||
def __init__(self, lexer_conf):
|
||||
self.lexer = lexer_class(lexer_conf)
|
||||
|
||||
def lex(self, lexer_state, parser_state):
|
||||
if isinstance(lexer_state.text, TextSlice):
|
||||
if not lexer_state.text.is_complete_text():
|
||||
raise TypeError("Interface=0 Custom Lexer don't support TextSlice")
|
||||
return self.lexer.lex(lexer_state.text.text)
|
||||
return self.lexer.lex(lexer_state.text)
|
||||
return CustomLexerWrapper0
|
||||
else:
|
||||
raise ValueError(f"Unknown __future_interface__ value {future_interface}, integer 0-2 expected")
|
||||
|
||||
|
||||
def _deserialize_parsing_frontend(data, memo, lexer_conf, callbacks, options):
    """Rebuild a ``ParsingFrontend`` from serialized ``data``.

    Args:
        data: mapping with the serialized ``'parser_conf'`` and ``'parser'``.
        memo: memo object passed on to the ``deserialize`` calls.
        lexer_conf: lexer configuration handed to the new frontend.
        callbacks: rule callbacks, given to the parser and stored on the
            parser configuration.
        options: parser options, or None. A ``'LALR_Parser'`` entry in
            ``options._plugins`` replaces the default parser class.
    """
    parser_conf = ParserConf.deserialize(data['parser_conf'], memo)
    cls = (options and options._plugins.get('LALR_Parser')) or LALR_Parser
    # The plugin lookup above tolerates options=None; do the same for the debug
    # flag, as create_lalr_parser does, instead of failing with AttributeError.
    debug = options.debug if options else False
    parser = cls.deserialize(data['parser'], memo, callbacks, debug)
    parser_conf.callbacks = callbacks
    return ParsingFrontend(lexer_conf, parser_conf, options, parser=parser)
|
||||
|
||||
|
||||
_parser_creators: 'Dict[str, Callable[[LexerConf, Any, Any], Any]]' = {}
|
||||
|
||||
|
||||
class ParsingFrontend(Serialize):
    """Couples a parser with the lexer that feeds it.

    The parser comes from ``_parser_creators`` (keyed on
    ``parser_conf.parser_type``) unless one is passed in. The lexer is built
    from ``lexer_conf.lexer_type``, which is either a lexer name or a custom
    ``Lexer`` subclass. For the dynamic lexer types no lexer is created and
    the raw input goes straight to the parser.
    """

    __serialize_fields__ = 'lexer_conf', 'parser_conf', 'parser'

    lexer_conf: LexerConf
    parser_conf: ParserConf
    options: Any

    def __init__(self, lexer_conf: LexerConf, parser_conf: ParserConf, options, parser=None):
        """Set up the parser and, unless the lexer is dynamic, the lexer.

        Raises:
            TypeError: ``lexer_conf.lexer_type`` is neither a class nor a string.
            KeyError: it is a string other than ``'basic'`` or ``'contextual'``
                (and not one of the dynamic types).
        """
        self.parser_conf = parser_conf
        self.lexer_conf = lexer_conf
        self.options = options

        # Set-up parser
        if parser:  # From cache
            self.parser = parser
        else:
            create_parser = _parser_creators.get(parser_conf.parser_type)
            assert create_parser is not None, "{} is not supported in standalone mode".format(
                    parser_conf.parser_type
                )
            self.parser = create_parser(lexer_conf, parser_conf, options)

        # Set-up lexer
        lexer_type = lexer_conf.lexer_type
        self.skip_lexer = False
        if lexer_type in ('dynamic', 'dynamic_complete'):
            assert lexer_conf.postlex is None
            self.skip_lexer = True
            return

        if isinstance(lexer_type, type):
            assert issubclass(lexer_type, Lexer)
            self.lexer = _wrap_lexer(lexer_type)(lexer_conf)
        elif isinstance(lexer_type, str):
            create_lexer = {
                'basic': create_basic_lexer,
                'contextual': create_contextual_lexer,
            }[lexer_type]
            self.lexer = create_lexer(lexer_conf, self.parser, lexer_conf.postlex, options)
        else:
            # The message must be an f-string, otherwise the placeholder is
            # printed literally and the offending value is never shown.
            raise TypeError(f"Bad value for lexer_type: {lexer_type}")

        if lexer_conf.postlex:
            self.lexer = PostLexConnector(self.lexer, lexer_conf.postlex)

    def _verify_start(self, start=None):
        """Return the start rule to use, defaulting to the only configured one.

        Raises:
            ConfigurationError: ``start`` is None while several start rules are
                configured, or ``start`` is not a configured start rule.
        """
        if start is None:
            start_decls = self.parser_conf.start
            if len(start_decls) > 1:
                raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start_decls)
            start ,= start_decls
        elif start not in self.parser_conf.start:
            raise ConfigurationError("Unknown start rule %s. Must be one of %r" % (start, self.parser_conf.start))
        return start

    def _make_lexer_thread(self, text: Optional[LarkInput]) -> Union[LarkInput, LexerThread, None]:
        """Wrap ``text`` in a lexer thread, or return it untouched when lexing is skipped.

        A ``'LexerThread'`` entry in ``options._plugins`` replaces the default class.
        """
        cls = (self.options and self.options._plugins.get('LexerThread')) or LexerThread
        if self.skip_lexer:
            return text
        if text is None:
            return cls(self.lexer, None)
        if isinstance(text, (str, bytes, TextSlice)):
            return cls.from_text(self.lexer, text)
        return cls.from_custom_input(self.lexer, text)

    def parse(self, text: Optional[LarkInput], start=None, on_error=None):
        """Parse ``text`` from ``start`` and return the parser's result.

        ``on_error`` is forwarded to the parser only when it is not None.

        Raises:
            TypeError: a dynamic lexer is given a partial ``TextSlice``.
        """
        if self.lexer_conf.lexer_type in ("dynamic", "dynamic_complete"):
            if isinstance(text, TextSlice) and not text.is_complete_text():
                raise TypeError(f"Lexer {self.lexer_conf.lexer_type} does not support text slices.")

        chosen_start = self._verify_start(start)
        kw = {} if on_error is None else {'on_error': on_error}
        stream = self._make_lexer_thread(text)
        return self.parser.parse(stream, chosen_start, **kw)

    def parse_interactive(self, text: Optional[TextOrSlice]=None, start=None):
        """Start an interactive parse of ``text``.

        Raises:
            ConfigurationError: the parser type is not ``'lalr'``.
        """
        # TODO BREAK - Change text from Optional[str] to text: str = ''.
        #   Would break behavior of exhaust_lexer(), which currently raises TypeError, and after the change would just return []
        chosen_start = self._verify_start(start)
        if self.parser_conf.parser_type != 'lalr':
            raise ConfigurationError("parse_interactive() currently only works with parser='lalr' ")
        stream = self._make_lexer_thread(text)
        return self.parser.parse_interactive(stream, chosen_start)
|
||||
|
||||
|
||||
def _validate_frontend_args(parser, lexer) -> None:
    """Check that ``parser`` is known and that ``lexer`` is compatible with it.

    A lexer given as a class (custom lexer) is not checked. Validation is
    delegated to ``assert_config``.
    """
    assert_config(parser, ('lalr', 'earley', 'cyk'))
    if isinstance(lexer, type):
        # Custom lexer: nothing to check.
        return

    lexers_by_parser = {
        'lalr': ('basic', 'contextual'),
        'earley': ('basic', 'dynamic', 'dynamic_complete'),
        'cyk': ('basic', ),
    }
    expected = lexers_by_parser[parser]
    assert_config(lexer, expected, 'Parser %r does not support lexer %%r, expected one of %%s' % parser)
|
||||
|
||||
|
||||
def _get_lexer_callbacks(transformer, terminals):
|
||||
result = {}
|
||||
for terminal in terminals:
|
||||
callback = getattr(transformer, terminal.name, None)
|
||||
if callback is not None:
|
||||
result[terminal.name] = callback
|
||||
return result
|
||||
|
||||
class PostLexConnector:
    """Lexer wrapper that pipes the lexer's output through a post-lexer."""

    def __init__(self, lexer, postlexer):
        self.lexer = lexer
        self.postlexer = postlexer

    def lex(self, lexer_state, parser_state):
        """Return ``postlexer.process`` applied to the wrapped lexer's token stream."""
        raw_tokens = self.lexer.lex(lexer_state, parser_state)
        return self.postlexer.process(raw_tokens)
|
||||
|
||||
|
||||
|
||||
def create_basic_lexer(lexer_conf, parser, postlex, options) -> BasicLexer:
    """Instantiate the basic lexer for ``lexer_conf``.

    A ``'BasicLexer'`` entry in ``options._plugins`` replaces the default
    class. ``parser`` and ``postlex`` are not used.
    """
    plugin = options._plugins.get('BasicLexer') if options else None
    lexer_cls = plugin or BasicLexer
    return lexer_cls(lexer_conf)
|
||||
|
||||
def create_contextual_lexer(lexer_conf: LexerConf, parser, postlex, options) -> ContextualLexer:
    """Instantiate the contextual lexer from the parser's parse table.

    Each parse-table state is mapped to the list of keys of its table entry.
    When ``postlex`` is given, its ``always_accept`` collection is passed
    along. A ``'ContextualLexer'`` entry in ``options._plugins`` replaces the
    default class.
    """
    plugin = options._plugins.get('ContextualLexer') if options else None
    lexer_cls = plugin or ContextualLexer

    parse_table: ParseTableBase[int] = parser._parse_table
    states: Dict[int, Collection[str]] = {}
    for state_id, transitions in parse_table.states.items():
        states[state_id] = list(transitions.keys())

    always_accept: Collection[str] = postlex.always_accept if postlex else ()
    return lexer_cls(lexer_conf, states, always_accept=always_accept)
|
||||
|
||||
def create_lalr_parser(lexer_conf: LexerConf, parser_conf: ParserConf, options=None) -> LALR_Parser:
    """Instantiate the LALR parser for ``parser_conf``.

    ``debug`` and ``strict`` are read from ``options`` and default to False
    when no options are given. A ``'LALR_Parser'`` entry in
    ``options._plugins`` replaces the default class. ``lexer_conf`` is not used.
    """
    if options:
        debug = options.debug
        strict = options.strict
        parser_cls = options._plugins.get('LALR_Parser') or LALR_Parser
    else:
        debug = False
        strict = False
        parser_cls = LALR_Parser
    return parser_cls(parser_conf, debug=debug, strict=strict)
|
||||
|
||||
_parser_creators['lalr'] = create_lalr_parser
|
||||
|
||||
###}
|
||||
|
||||
class EarleyRegexpMatcher:
    """Compiles one regexp per terminal and matches terminals against raw text."""

    def __init__(self, lexer_conf):
        """Compile the pattern of every terminal in ``lexer_conf.terminals``.

        Raises:
            GrammarError: a pattern's width cannot be computed, or the first
                value reported by ``get_regexp_width`` is 0.
        """
        self.regexps = {}
        compile_pattern = lexer_conf.re_module.compile
        for terminal in lexer_conf.terminals:
            pattern = terminal.pattern.to_regexp()
            try:
                # presumably the first element is the minimum width; verify against get_regexp_width
                min_width = get_regexp_width(pattern)[0]
            except ValueError:
                raise GrammarError("Bad regexp in token %s: %s" % (terminal.name, pattern))

            if min_width == 0:
                raise GrammarError("Dynamic Earley doesn't allow zero-width regexps", terminal)

            if lexer_conf.use_bytes:
                pattern = pattern.encode('utf-8')

            self.regexps[terminal.name] = compile_pattern(pattern, lexer_conf.g_regex_flags)

    def match(self, term, text, index=0):
        """Match the compiled pattern of ``term`` against ``text`` at ``index``."""
        compiled = self.regexps[term.name]
        return compiled.match(text, index)
|
||||
|
||||
|
||||
def create_earley_parser__dynamic(lexer_conf: LexerConf, parser_conf: ParserConf, **kw):
    """Create the Earley parser variant that matches regexps on the raw text.

    Extra keyword arguments are forwarded to ``xearley.Parser``.

    Raises:
        GrammarError: ``lexer_conf.callbacks`` is non-empty.
    """
    if lexer_conf.callbacks:
        raise GrammarError("Earley's dynamic lexer doesn't support lexer_callbacks.")

    matcher = EarleyRegexpMatcher(lexer_conf)
    term_matcher = matcher.match
    return xearley.Parser(lexer_conf, parser_conf, term_matcher, **kw)
|
||||
|
||||
def _match_earley_basic(term, token):
|
||||
return term.name == token.type
|
||||
|
||||
def create_earley_parser__basic(lexer_conf: LexerConf, parser_conf: ParserConf, **kw):
    """Create the Earley parser variant that compares terminal names to token types.

    Extra keyword arguments are forwarded to ``earley.Parser``.
    """
    term_matcher = _match_earley_basic
    return earley.Parser(lexer_conf, parser_conf, term_matcher, **kw)
|
||||
|
||||
def create_earley_parser(lexer_conf: LexerConf, parser_conf: ParserConf, options) -> earley.Parser:
    """Create the Earley parser that matches ``lexer_conf.lexer_type``.

    ``'dynamic'`` and ``'dynamic_complete'`` select the dynamic variant (the
    latter with ``complete_lex=True``); anything else selects the basic one.
    No tree class is passed when ``options.ambiguity`` is ``'forest'``.
    """
    resolve_ambiguity = options.ambiguity == 'resolve'
    debug = options.debug if options else False
    if options.ambiguity != 'forest':
        tree_class = options.tree_class or Tree
    else:
        tree_class = None

    lexer_type = lexer_conf.lexer_type
    extra = {}
    if lexer_type == 'dynamic':
        factory = create_earley_parser__dynamic
    elif lexer_type == 'dynamic_complete':
        extra['complete_lex'] = True
        factory = create_earley_parser__dynamic
    else:
        factory = create_earley_parser__basic

    return factory(
        lexer_conf,
        parser_conf,
        resolve_ambiguity=resolve_ambiguity,
        debug=debug,
        tree_class=tree_class,
        ordered_sets=options.ordered_sets,
        **extra
    )
|
||||
|
||||
|
||||
|
||||
class CYK_FrontEnd:
    """Runs the CYK parser and then applies the rule callbacks to its tree."""

    def __init__(self, lexer_conf, parser_conf, options=None):
        self.parser = cyk.Parser(parser_conf.rules)

        self.callbacks = parser_conf.callbacks

    def parse(self, lexer_thread, start):
        """Lex the whole input, parse it from ``start`` and transform the tree."""
        token_list = list(lexer_thread.lex(None))
        raw_tree = self.parser.parse(token_list, start)
        return self._transform(raw_tree)

    def _transform(self, tree):
        """Replace every subtree by the result of its rule's callback.

        The subtree list is materialized before any node is rewritten.
        """
        for node in list(tree.iter_subtrees()):
            rewritten = []
            for child in node.children:
                rewritten.append(self._apply_callback(child) if isinstance(child, Tree) else child)
            node.children = rewritten

        return self._apply_callback(tree)

    def _apply_callback(self, tree):
        """Call the callback registered for ``tree.rule`` on ``tree.children``."""
        callback = self.callbacks[tree.rule]
        return callback(tree.children)
|
||||
|
||||
|
||||
_parser_creators['earley'] = create_earley_parser
|
||||
_parser_creators['cyk'] = CYK_FrontEnd
|
||||
|
||||
|
||||
def _construct_parsing_frontend(
        parser_type: _ParserArgType,
        lexer_type: _LexerArgType,
        lexer_conf,
        parser_conf,
        options
):
    """Store the chosen parser and lexer types on the configs and build the frontend.

    Both configuration objects are modified in place.
    """
    assert isinstance(lexer_conf, LexerConf)
    assert isinstance(parser_conf, ParserConf)

    parser_conf.parser_type = parser_type
    lexer_conf.lexer_type = lexer_type

    frontend = ParsingFrontend(lexer_conf, parser_conf, options)
    return frontend
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,340 @@
|
||||
"""This module implements a CYK parser."""
|
||||
|
||||
# Author: https://github.com/ehudt (2018)
|
||||
#
|
||||
# Adapted by Erez
|
||||
|
||||
|
||||
from collections import defaultdict
|
||||
import itertools
|
||||
|
||||
from ..exceptions import ParseError
|
||||
from ..lexer import Token
|
||||
from ..tree import Tree
|
||||
from ..grammar import Terminal as T, NonTerminal as NT, Symbol
|
||||
|
||||
def match(t, s):
    """Tell whether the type of token ``s`` equals the name of terminal ``t``."""
    assert isinstance(t, T)
    terminal_name = t.name
    return terminal_name == s.type
|
||||
|
||||
|
||||
class Rule:
    """Context-free grammar rule.

    Equality compares ``lhs`` and ``rhs``; the hash is built from ``lhs`` and
    ``rhs`` as a tuple. ``weight`` and ``alias`` take no part in either.
    """

    def __init__(self, lhs, rhs, weight, alias):
        super().__init__()
        assert isinstance(lhs, NT), lhs
        assert all(isinstance(x, NT) or isinstance(x, T) for x in rhs), rhs
        self.lhs = lhs
        self.rhs = rhs
        self.weight = weight
        self.alias = alias

    def __str__(self):
        return '%s -> %s' % (str(self.lhs), ' '.join(str(x) for x in self.rhs))

    def __repr__(self):
        return str(self)

    def __hash__(self):
        return hash((self.lhs, tuple(self.rhs)))

    def __eq__(self, other):
        # Let Python fall back to its default handling for foreign types
        # instead of failing with AttributeError on the missing attributes.
        if not isinstance(other, Rule):
            return NotImplemented
        return self.lhs == other.lhs and self.rhs == other.rhs

    def __ne__(self, other):
        return not (self == other)
|
||||
|
||||
|
||||
class Grammar:
    """Context-free grammar.

    The rules are stored as a frozenset, so two grammars are equal when they
    contain the same rules, whatever the order they were given in.
    """

    def __init__(self, rules):
        self.rules = frozenset(rules)

    def __eq__(self, other):
        # Let Python fall back to its default handling for foreign types
        # instead of failing with AttributeError on the missing attribute.
        if not isinstance(other, Grammar):
            return NotImplemented
        return self.rules == other.rules

    def __hash__(self):
        # Defining __eq__ alone would make instances unhashable; hash on the
        # same data that __eq__ compares.
        return hash(self.rules)

    def __str__(self):
        return '\n' + '\n'.join(sorted(repr(x) for x in self.rules)) + '\n'

    def __repr__(self):
        return str(self)
|
||||
|
||||
|
||||
# Parse tree data structures
|
||||
class RuleNode:
    """A node in the parse tree, which also contains the full rhs rule."""

    def __init__(self, rule, children, weight=0):
        self.rule = rule
        self.children = children
        self.weight = weight

    def __repr__(self):
        rendered_children = ', '.join(str(child) for child in self.children)
        return f'RuleNode({self.rule.lhs!r}, [{rendered_children}])'
|
||||
|
||||
|
||||
|
||||
class Parser:
    """Parser wrapper.

    Converts lark rules to ``Rule`` objects, turns the resulting grammar into
    CNF via ``to_cnf``, and converts parse results back into lark ``Tree``s.
    """

    def __init__(self, rules):
        super(Parser, self).__init__()
        # Identity mapping used by _to_tree to get back the original lark rule.
        self.orig_rules = {rule: rule for rule in rules}
        converted = [self._to_rule(rule) for rule in rules]
        self.grammar = to_cnf(Grammar(converted))

    def _to_rule(self, lark_rule):
        """Converts a lark rule, (lhs, rhs, callback, options), to a Rule."""
        assert isinstance(lark_rule.origin, NT)
        assert all(isinstance(x, Symbol) for x in lark_rule.expansion)
        priority = lark_rule.options.priority
        return Rule(
            lark_rule.origin, lark_rule.expansion,
            weight=priority if priority else 0,
            alias=lark_rule)

    def parse(self, tokenized, start):  # pylint: disable=invalid-name
        """Parses input, which is a list of tokens.

        Raises:
            ParseError: no rule for ``start`` spans the whole input.
        """
        assert start
        start = NT(start)

        table, trees = _parse(tokenized, self.grammar)
        full_span = (0, len(tokenized) - 1)
        # Check if the parse succeeded.
        if all(rule.lhs != start for rule in table[full_span]):
            raise ParseError('Parsing failed.')
        best = trees[full_span][start]
        return self._to_tree(revert_cnf(best))

    def _to_tree(self, rule_node):
        """Converts a RuleNode parse tree to a lark Tree."""
        orig_rule = self.orig_rules[rule_node.rule.alias]
        converted_children = []
        for child in rule_node.children:
            if isinstance(child, RuleNode):
                converted_children.append(self._to_tree(child))
                continue
            # Leaves wrap the matched token in their ``name`` attribute.
            assert isinstance(child.name, Token)
            converted_children.append(child.name)

        tree = Tree(orig_rule.origin, converted_children)
        tree.rule = orig_rule
        return tree
|
||||
|
||||
|
||||
def print_parse(node, indent=0):
    """Print a parse tree, one node per line, indented two spaces per level.

    Args:
        node: A ``RuleNode`` or a leaf symbol.
        indent: Current nesting depth.
    """
    pad = ' ' * (indent * 2)
    if isinstance(node, RuleNode):
        print(pad + str(node.rule.lhs))
        for child in node.children:
            print_parse(child, indent + 1)
    else:
        # The rest of this module reads leaf symbols through ``.name``
        # (see ``_to_tree`` / ``revert_cnf``); ``.s`` is kept as a fallback
        # for symbol objects that still expose the older attribute.
        label = node.name if hasattr(node, 'name') else node.s
        print(pad + str(label))
|
||||
|
||||
|
||||
def _parse(s, g):
    """Parses sentence 's' using CNF grammar 'g'.

    Returns:
        ``(table, trees)``. ``table`` maps an inclusive ``(start, end)`` span
        to the set of rules deriving it. ``trees`` maps the same span to a
        dict from rule lhs to the lightest ``RuleNode`` found for that lhs.
    """
    table = defaultdict(set)
    trees = defaultdict(dict)

    # Base case: spans of length one, filled from terminal rules.
    for pos, word in enumerate(s):
        cell_key = (pos, pos)
        for terminal, rules in g.terminal_rules.items():
            if not match(terminal, word):
                continue
            for rule in rules:
                table[cell_key].add(rule)
                best = trees[cell_key]
                if rule.lhs not in best or rule.weight < best[rule.lhs].weight:
                    best[rule.lhs] = RuleNode(rule, [T(word)], weight=rule.weight)

    # Longer spans, shortest first, trying every split point.
    total_len = len(s)
    for length in range(2, total_len + 1):
        for begin in range(total_len - length + 1):
            whole = (begin, begin + length - 1)
            for split in range(begin + 1, begin + length):
                left = (begin, split - 1)
                right = (split, begin + length - 1)
                for r1, r2 in itertools.product(table[left], table[right]):
                    for rule in g.nonterminal_rules.get((r1.lhs, r2.lhs), []):
                        table[whole].add(rule)
                        r1_tree = trees[left][r1.lhs]
                        r2_tree = trees[right][r2.lhs]
                        combined = rule.weight + r1_tree.weight + r2_tree.weight
                        best = trees[whole]
                        # Keep only the lightest tree per lhs.
                        if rule.lhs not in best or combined < best[rule.lhs].weight:
                            best[rule.lhs] = RuleNode(rule, [r1_tree, r2_tree], weight=combined)
    return table, trees
|
||||
|
||||
|
||||
# This section implements context-free grammar converter to Chomsky normal form.
|
||||
# It also implements a conversion of parse trees from its CNF to the original
|
||||
# grammar.
|
||||
# Overview:
|
||||
# Applies the following operations in this order:
|
||||
# * TERM: Eliminates non-solitary terminals from all rules
|
||||
# * BIN: Eliminates rules with more than 2 symbols on their right-hand-side.
|
||||
# * UNIT: Eliminates non-terminal unit rules
|
||||
#
|
||||
# The following grammar characteristics aren't featured:
|
||||
# * Start symbol appears on RHS
|
||||
# * Empty rules (epsilon rules)
|
||||
|
||||
|
||||
class CnfWrapper:
    """CNF wrapper for grammar.

    Validates that the input grammar is CNF and provides helper data structures.

    Attributes:
        grammar: The wrapped grammar.
        rules: ``grammar.rules``.
        terminal_rules: Maps a terminal to the rules ``A -> terminal``.
        nonterminal_rules: Maps an ``(NT, NT)`` tuple to the rules ``A -> NT NT``.
    """

    def __init__(self, grammar):
        """Index the rules of ``grammar`` and check each one is in CNF.

        Raises:
            ParseError: If a rule has an empty or over-long right-hand side.
            AssertionError: If a rule has 1 or 2 symbols but is neither
                ``A -> terminal`` nor ``A -> NT NT``.
        """
        super().__init__()
        self.grammar = grammar
        self.rules = grammar.rules
        self.terminal_rules = defaultdict(list)
        self.nonterminal_rules = defaultdict(list)
        for r in self.rules:
            assert isinstance(r.lhs, NT), r
            if len(r.rhs) == 0:
                raise ParseError("CYK doesn't support empty rules")
            if len(r.rhs) > 2:
                # Previously reported with the "empty rules" message.
                raise ParseError("CYK requires rules with at most 2 symbols, got: %s" % (r,))
            if len(r.rhs) == 1 and isinstance(r.rhs[0], T):
                self.terminal_rules[r.rhs[0]].append(r)
            elif len(r.rhs) == 2 and all(isinstance(x, NT) for x in r.rhs):
                self.nonterminal_rules[tuple(r.rhs)].append(r)
            else:
                # Raised explicitly so the check survives ``python -O``.
                raise AssertionError(r)

    def __eq__(self, other):
        """Wrappers are equal when their wrapped grammars are equal."""
        return self.grammar == other.grammar

    def __repr__(self):
        """Delegate to the wrapped grammar's ``repr``."""
        return repr(self.grammar)
|
||||
|
||||
|
||||
class UnitSkipRule(Rule):
    """A rule that records NTs that were skipped during transformation.

    Attributes:
        skipped_rules: The rules collapsed into this one while removing unit
            rules, in the order they have to be re-expanded.
    """

    def __init__(self, lhs, rhs, skipped_rules, weight, alias):
        super().__init__(lhs, rhs, weight, alias)
        self.skipped_rules = skipped_rules

    def __eq__(self, other):
        """Compare lhs, rhs and the skipped chain.

        Comparing only ``skipped_rules`` made skip rules with different
        left-hand sides equal, so filtering with ``!=`` could drop unrelated
        rules. It also broke the eq/hash contract with the inherited hash.
        NOTE(review): assumes ``Rule.__hash__`` is derived from lhs/rhs only -
        confirm against ``Rule``.
        """
        return (isinstance(other, type(self))
                and self.lhs == other.lhs
                and self.rhs == other.rhs
                and self.skipped_rules == other.skipped_rules)

    # Defining __eq__ resets __hash__ to None, so restore the base hash.
    __hash__ = Rule.__hash__
|
||||
|
||||
|
||||
def build_unit_skiprule(unit_rule, target_rule):
    """Fuse unit rule ``A -> B`` with ``B -> rhs`` into a skip rule ``A -> rhs``.

    The skipped chain is: what ``unit_rule`` already skipped, then
    ``target_rule`` itself, then what ``target_rule`` already skipped.
    Weights are summed; the alias of ``unit_rule`` is kept.
    """
    chain = []
    if isinstance(unit_rule, UnitSkipRule):
        chain.extend(unit_rule.skipped_rules)
    chain.append(target_rule)
    if isinstance(target_rule, UnitSkipRule):
        chain.extend(target_rule.skipped_rules)
    combined_weight = unit_rule.weight + target_rule.weight
    return UnitSkipRule(unit_rule.lhs, target_rule.rhs, chain,
                        weight=combined_weight, alias=unit_rule.alias)
|
||||
|
||||
|
||||
def get_any_nt_unit_rule(g):
    """Returns a non-terminal unit rule from 'g', or None if there is none."""
    unit_rules = (
        candidate for candidate in g.rules
        if len(candidate.rhs) == 1 and isinstance(candidate.rhs[0], NT)
    )
    return next(unit_rules, None)
|
||||
|
||||
|
||||
def _remove_unit_rule(g, rule):
    """Removes 'rule' from 'g' without changing the language produced by 'g'.

    Every rule whose lhs is the target of ``rule`` is re-attached to the lhs
    of ``rule`` through ``build_unit_skiprule``.
    """
    kept = [existing for existing in g.rules if existing != rule]
    targets = [existing for existing in g.rules if existing.lhs == rule.rhs[0]]
    for target in targets:
        kept.append(build_unit_skiprule(rule, target))
    return Grammar(kept)
|
||||
|
||||
|
||||
def _split(rule):
    """Splits a rule whose len(rhs) > 2 into shorter rules.

    Yields a chain of binary rules linked through generated ``__SP_...``
    non-terminals. Only the first rule keeps the weight and alias of ``rule``.
    """
    symbols = rule.rhs
    rule_str = str(rule.lhs) + '__' + '_'.join(str(sym) for sym in symbols)
    name_template = '__SP_%s' % (rule_str) + '_%d'
    last_index = len(symbols) - 2

    yield Rule(rule.lhs, [symbols[0], NT(name_template % 1)], weight=rule.weight, alias=rule.alias)
    for idx in range(1, last_index):
        link = NT(name_template % (idx + 1))
        yield Rule(NT(name_template % idx), [symbols[idx], link], weight=0, alias='Split')
    yield Rule(NT(name_template % last_index), symbols[-2:], weight=0, alias='Split')
|
||||
|
||||
|
||||
def _term(g):
    """Applies the TERM rule on 'g' (see top comment).

    Every terminal that appears next to other symbols in a right-hand side
    is replaced by a generated ``__T_<terminal>`` non-terminal, and the rule
    ``__T_<terminal> -> terminal`` is added.
    """
    all_t = {x for rule in g.rules for x in rule.rhs if isinstance(x, T)}
    t_rules = {t: Rule(NT('__T_%s' % str(t)), [t], weight=0, alias='Term') for t in all_t}
    new_rules = []
    for rule in g.rules:
        terminals = [x for x in rule.rhs if isinstance(x, T)]
        if len(rule.rhs) > 1 and terminals:
            new_rhs = [t_rules[x].lhs if isinstance(x, T) else x for x in rule.rhs]
            new_rules.append(Rule(rule.lhs, new_rhs, weight=rule.weight, alias=rule.alias))
            # Look up only this rule's terminals instead of scanning every
            # terminal of the grammar; Grammar de-duplicates the rule set.
            new_rules.extend(t_rules[t] for t in terminals)
        else:
            new_rules.append(rule)
    return Grammar(new_rules)
|
||||
|
||||
|
||||
def _bin(g):
    """Applies the BIN rule to 'g' (see top comment)."""
    def binarized(rule):
        # Rules with at most two symbols pass through untouched.
        return _split(rule) if len(rule.rhs) > 2 else (rule,)

    return Grammar([piece for rule in g.rules for piece in binarized(rule)])
|
||||
|
||||
|
||||
def _unit(g):
    """Applies the UNIT rule to 'g' (see top comment)."""
    while True:
        pending = get_any_nt_unit_rule(g)
        if not pending:
            return g
        g = _remove_unit_rule(g, pending)
|
||||
|
||||
|
||||
def to_cnf(g):
    """Creates a CNF grammar from a general context-free grammar 'g'."""
    # Order matters: TERM, then BIN, then UNIT.
    for transform in (_term, _bin, _unit):
        g = transform(g)
    return CnfWrapper(g)
|
||||
|
||||
|
||||
def unroll_unit_skiprule(lhs, orig_rhs, skipped_rules, children, weight, alias):
    """Rebuild the chain of unit-rule nodes that a skip rule collapsed.

    With no skipped rules left, returns the node ``lhs -> orig_rhs`` over
    ``children``. Otherwise returns ``lhs -> skipped[0].lhs`` wrapping the
    recursively unrolled remainder.
    """
    if not skipped_rules:
        return RuleNode(Rule(lhs, orig_rhs, weight=weight, alias=alias), children, weight=weight)

    head = skipped_rules[0]
    weight = weight - head.weight
    outer_rule = Rule(lhs, [head.lhs], weight=weight, alias=alias)
    inner = unroll_unit_skiprule(head.lhs, orig_rhs, skipped_rules[1:], children,
                                 head.weight, head.alias)
    return RuleNode(outer_rule, [inner], weight=weight)
|
||||
|
||||
|
||||
def revert_cnf(node):
    """Reverts a parse tree (RuleNode) to its original non-CNF form (Node)."""
    if isinstance(node, T):
        return node

    rule = node.rule
    # Reverts TERM rule.
    if rule.lhs.name.startswith('__T_'):
        return node.children[0]

    children = []
    for child in map(revert_cnf, node.children):
        # Reverts BIN rule: splice the children of generated split nodes.
        if isinstance(child, RuleNode) and child.rule.lhs.name.startswith('__SP_'):
            children += child.children
        else:
            children.append(child)

    # Reverts UNIT rule.
    if isinstance(rule, UnitSkipRule):
        return unroll_unit_skiprule(rule.lhs, rule.rhs, rule.skipped_rules,
                                    children, rule.weight, rule.alias)
    return RuleNode(rule, children)
|
||||
@@ -0,0 +1,312 @@
|
||||
"""This module implements an Earley parser.
|
||||
|
||||
The core Earley algorithm used here is based on Elizabeth Scott's implementation, here:
|
||||
https://www.sciencedirect.com/science/article/pii/S1571066108001497
|
||||
|
||||
That is probably the best reference for understanding the algorithm here.
|
||||
|
||||
The Earley parser outputs an SPPF-tree as per that document. The SPPF tree format
|
||||
is explained here: https://lark-parser.readthedocs.io/en/latest/_static/sppf/sppf.html
|
||||
"""
|
||||
|
||||
from typing import TYPE_CHECKING, Callable, Optional, List, Any
|
||||
from collections import deque
|
||||
|
||||
from ..lexer import Token
|
||||
from ..tree import Tree
|
||||
from ..exceptions import UnexpectedEOF, UnexpectedToken
|
||||
from ..utils import logger, OrderedSet, dedup_list
|
||||
from .grammar_analysis import GrammarAnalyzer
|
||||
from ..grammar import NonTerminal
|
||||
from .earley_common import Item
|
||||
from .earley_forest import ForestSumVisitor, SymbolNode, StableSymbolNode, TokenNode, ForestToParseTree
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from ..common import LexerConf, ParserConf
|
||||
|
||||
class Parser:
    """Earley parser that records its derivations in an SPPF.

    ``parse`` seeds the first Earley set from the start symbol, ``_parse``
    alternates predict/complete with scanning for each token, and the final
    forest is either returned as-is or converted with ``ForestToParseTree``.
    """
    lexer_conf: 'LexerConf'
    parser_conf: 'ParserConf'
    debug: bool

    def __init__(self, lexer_conf: 'LexerConf', parser_conf: 'ParserConf', term_matcher: Callable,
                 resolve_ambiguity: bool=True, debug: bool=False,
                 tree_class: Optional[Callable[[str, List], Any]]=Tree, ordered_sets: bool=True):
        """Precompute the grammar tables used while parsing.

        Args:
            lexer_conf: Lexer configuration (terminals, lexer type).
            parser_conf: Parser configuration (rules, callbacks).
            term_matcher: Callable ``(terminal, token)`` used by the scanner.
            resolve_ambiguity: Passed on to ``ForestToParseTree``.
            debug: If true, ``parse`` tries to render the forest to ``sppf.png``.
            tree_class: Tree constructor; ``None`` makes ``parse`` return the SPPF root.
            ordered_sets: Use ``OrderedSet`` / ``StableSymbolNode`` instead of
                plain ``set`` / ``SymbolNode``.
        """
        analysis = GrammarAnalyzer(parser_conf)
        self.lexer_conf = lexer_conf
        self.parser_conf = parser_conf
        self.resolve_ambiguity = resolve_ambiguity
        self.debug = debug
        self.Tree = tree_class
        if ordered_sets:
            self.Set = OrderedSet
            self.SymbolNode = StableSymbolNode
        else:
            self.Set = set
            self.SymbolNode = SymbolNode

        self.FIRST = analysis.FIRST
        self.NULLABLE = analysis.NULLABLE
        self.callbacks = parser_conf.callbacks
        # TODO add typing info
        self.predictions = {}   # type: ignore[var-annotated]

        ## These could be moved to the grammar analyzer. Pre-computing these is *much* faster than
        #  the slow 'isupper' in is_terminal.
        self.TERMINALS = set()
        self.NON_TERMINALS = set()
        for r in parser_conf.rules:
            for sym in r.expansion:
                if sym.is_term:
                    self.TERMINALS.add(sym)
                else:
                    self.NON_TERMINALS.add(sym)

        self.forest_sum_visitor = None
        for rule in parser_conf.rules:
            origin = rule.origin
            if origin not in self.predictions:
                self.predictions[origin] = [x.rule for x in analysis.expand_rule(origin)]

            ## Detect if any rules/terminals have priorities set. If the user specified priority = None, then
            #  the priorities will be stripped from all rules/terminals before they reach us, allowing us to
            #  skip the extra tree walk. We'll also skip this if the user just didn't specify priorities
            #  on any rules/terminals.
            if self.forest_sum_visitor is None and rule.options.priority is not None:
                self.forest_sum_visitor = ForestSumVisitor

        # Check terminals for priorities
        # Ignore terminal priorities if the basic lexer is used
        if self.lexer_conf.lexer_type != 'basic' and self.forest_sum_visitor is None:
            for term in self.lexer_conf.terminals:
                if term.priority:
                    self.forest_sum_visitor = ForestSumVisitor
                    break

        self.term_matcher = term_matcher

    def predict_and_complete(self, i, to_scan, columns, transitives, node_cache):
        """The core Earley Predictor and Completer.

        At each stage of the input, we handle any completed items (things
        that matched on the last cycle) and use those to predict what should
        come next in the input stream. The completions and any predicted
        non-terminals are recursively processed until only items expecting a
        terminal remain; those are added to ``to_scan`` for the next scanner
        cycle.

        Args:
            i: Index of the current Earley set.
            to_scan: Scan buffer; items expecting a terminal are added to it.
            columns: List of Earley sets; ``columns[i]`` is extended in place.
            transitives: Per-column dicts consulted by the right-recursion branch.
            node_cache: Maps ``(s, start, end)`` labels to forest nodes.
        """
        # Held Completions (H in E.Scotts paper).
        held_completions = {}

        column = columns[i]
        # R (items) = Ei (column.items)
        items = deque(column)

        def node_for(label):
            # One forest node per label; build it only on a cache miss.
            if label not in node_cache:
                node_cache[label] = self.SymbolNode(*label)
            return node_cache[label]

        def enqueue(candidate):
            if candidate.expect in self.TERMINALS:
                # Add (B :: aC.B, h, y) to Q
                to_scan.add(candidate)
            elif candidate not in column:
                # Add (B :: aC.B, h, y) to Ei and R
                column.add(candidate)
                items.append(candidate)

        while items:
            item = items.pop()    # remove an element, A say, from R

            ### The Earley completer
            if item.is_complete:   ### (item.s == string)
                if item.node is None:
                    item.node = node_for((item.s, item.start, i))
                    item.node.add_family(item.s, item.rule, item.start, None, None)

                # create_leo_transitives(item.rule.origin, item.start)

                ###R Joop Leo right recursion Completer
                if item.rule.origin in transitives[item.start]:
                    transitive = transitives[item.start][item.s]
                    if transitive.previous in transitives[transitive.column]:
                        root_transitive = transitives[transitive.column][transitive.previous]
                    else:
                        root_transitive = transitive

                    new_item = Item(transitive.rule, transitive.ptr, transitive.start)
                    new_item.node = node_for((root_transitive.s, root_transitive.start, i))
                    new_item.node.add_path(root_transitive, item.node)
                    enqueue(new_item)
                ###R Regular Earley completer
                else:
                    # Empty has 0 length. If we complete an empty symbol in a particular
                    # parse step, we need to be able to use that same empty symbol to complete
                    # any predictions that result, that themselves require empty. Avoids
                    # infinite recursion on empty symbols.
                    # held_completions is 'H' in E.Scott's paper.
                    if item.start == i:
                        held_completions[item.rule.origin] = item.node

                    # Snapshot first: the column may grow while we advance these.
                    originators = [
                        candidate for candidate in columns[item.start]
                        if candidate.expect is not None and candidate.expect == item.s
                    ]
                    for originator in originators:
                        new_item = originator.advance()
                        new_item.node = node_for((new_item.s, originator.start, i))
                        new_item.node.add_family(new_item.s, new_item.rule, i, originator.node, item.node)
                        enqueue(new_item)

            ### The Earley predictor
            elif item.expect in self.NON_TERMINALS:  ### (item.s == lr0)
                new_items = [Item(rule, 0, i) for rule in self.predictions[item.expect]]

                # Process any held completions (H).
                if item.expect in held_completions:
                    new_item = item.advance()
                    new_item.node = node_for((new_item.s, item.start, i))
                    new_item.node.add_family(new_item.s, new_item.rule, new_item.start,
                                             item.node, held_completions[item.expect])
                    new_items.append(new_item)

                for new_item in new_items:
                    enqueue(new_item)

    def _parse(self, lexer, columns, to_scan, start_symbol=None):
        """Run the Earley loop over the tokens produced by ``lexer``.

        Args:
            lexer: Object whose ``lex(expects)`` yields the tokens.
            columns: List holding the first Earley set; one set is appended per token.
            to_scan: Initial scan buffer.
            start_symbol: Start non-terminal (only read by ``is_quasi_complete``).

        Returns:
            The scan buffer left after the last token.

        Raises:
            UnexpectedToken: If a token advances no item.
        """

        def is_quasi_complete(item):
            # Not called anywhere in this method; kept as in the original.
            if item.is_complete:
                return True

            quasi = item.advance()
            while not quasi.is_complete:
                if quasi.expect not in self.NULLABLE:
                    return False
                if quasi.rule.origin == start_symbol and quasi.expect == start_symbol:
                    return False
                quasi = quasi.advance()
            return True

        # def create_leo_transitives(origin, start):
        #   ...   # removed at commit 4c1cfb2faf24e8f8bff7112627a00b94d261b420

        def scan(i, token, to_scan):
            """The core Earley Scanner.

            This is a custom implementation of the scanner that uses the
            Lark lexer to match tokens. The scan list is built by the
            Earley predictor, based on the previously completed tokens.
            This ensures that at each phase of the parse we have a custom
            lexer context, allowing for more complex ambiguities."""
            next_to_scan = self.Set()
            next_set = self.Set()
            columns.append(next_set)
            transitives.append({})
            node_cache = {}

            for item in self.Set(to_scan):
                if not match(item.expect, token):
                    continue
                new_item = item.advance()
                label = (new_item.s, new_item.start, i + 1)
                # 'terminals' may not contain token.type when using %declare
                # Additionally, token is not always a Token
                # For example, it can be a Tree when using TreeMatcher
                term = terminals.get(token.type) if isinstance(token, Token) else None
                # Set the priority of the token node to 0 so that the
                # terminal priorities do not affect the Tree chosen by
                # ForestSumVisitor after the basic lexer has already
                # "used up" the terminal priorities
                token_node = TokenNode(token, term, priority=0)
                if label not in node_cache:
                    node_cache[label] = self.SymbolNode(*label)
                new_item.node = node_cache[label]
                new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token_node)

                if new_item.expect in self.TERMINALS:
                    # add (B ::= Aai+1.B, h, y) to Q'
                    next_to_scan.add(new_item)
                else:
                    # add (B ::= Aa+1.B, h, y) to Ei+1
                    next_set.add(new_item)

            if not next_set and not next_to_scan:
                expect = {pending.expect.name for pending in to_scan}
                raise UnexpectedToken(token, expect, considered_rules=set(to_scan),
                                      state=frozenset(pending.s for pending in to_scan))

            return next_to_scan, node_cache

        # Define parser functions
        match = self.term_matcher

        terminals = self.lexer_conf.terminals_by_name

        # Cache for nodes & tokens created in a particular parse step.
        transitives = [{}]

        ## The main Earley loop.
        # Run the Prediction/Completion cycle for any Items in the current Earley set.
        # Completions will be added to the SPPF tree, and predictions will be recursively
        # processed down to terminals/empty nodes to be added to the scanner for the next
        # step.
        expects = {pending.expect for pending in to_scan}
        i = 0
        node_cache = {}
        for token in lexer.lex(expects):
            self.predict_and_complete(i, to_scan, columns, transitives, node_cache)

            to_scan, node_cache = scan(i, token, to_scan)
            i += 1

            # Refill the same set object that was handed to lexer.lex().
            expects.clear()
            expects |= {pending.expect for pending in to_scan}

        self.predict_and_complete(i, to_scan, columns, transitives, node_cache)

        ## Column is now the final column in the parse.
        assert i == len(columns)-1
        return to_scan

    def parse(self, lexer, start):
        """Parse the token stream of ``lexer`` starting from rule ``start``.

        Returns:
            The tree built by ``ForestToParseTree`` when a tree class was
            configured, otherwise the root node of the SPPF.

        Raises:
            UnexpectedEOF: If the start symbol was not completed at the end of input.
            RuntimeError: If more than one distinct root node is found.
        """
        assert start, start
        start_symbol = NonTerminal(start)

        columns = [self.Set()]
        to_scan = self.Set()     # The scan buffer. 'Q' in E.Scott's paper.

        ## Predict for the start_symbol.
        # Add predicted items to the first Earley set (for the predictor) if they
        # result in a non-terminal, or the scanner if they result in a terminal.
        for rule in self.predictions[start_symbol]:
            item = Item(rule, 0, 0)
            destination = to_scan if item.expect in self.TERMINALS else columns[0]
            destination.add(item)

        to_scan = self._parse(lexer, columns, to_scan, start_symbol)

        # If the parse was successful, the start
        # symbol should have been completed in the last step of the Earley cycle, and will be in
        # this column. Find the item for the start_symbol, which is the root of the SPPF tree.
        solutions = dedup_list(
            n.node for n in columns[-1]
            if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0
        )
        if not solutions:
            expected_terminals = [t.expect.name for t in to_scan]
            raise UnexpectedEOF(expected_terminals, state=frozenset(pending.s for pending in to_scan))
        if len(solutions) > 1:
            raise RuntimeError('Earley should not generate multiple start symbol items! Please report this bug.')
        (solution,) = solutions

        if self.debug:
            from .earley_forest import ForestToPyDotVisitor
            try:
                debug_walker = ForestToPyDotVisitor()
            except ImportError:
                logger.warning("Cannot find dependency 'pydot', will not generate sppf debug image")
            else:
                debug_walker.visit(solution, "sppf.png")

        if self.Tree is None:
            # return the root of the SPPF
            return solution

        # Perform our SPPF -> AST conversion
        # Disable the ForestToParseTree cache when ambiguity='resolve'
        # to prevent a tree construction bug. See issue #1283
        use_cache = not self.resolve_ambiguity
        transformer = ForestToParseTree(self.Tree, self.callbacks,
                                        self.forest_sum_visitor and self.forest_sum_visitor(),
                                        self.resolve_ambiguity, use_cache)
        return transformer.transform(solution)
|
||||
@@ -0,0 +1,42 @@
|
||||
"""This module implements useful building blocks for the Earley parser
|
||||
"""
|
||||
|
||||
|
||||
class Item:
    """An Earley Item, the atom of the algorithm.

    An item is a rule, a dot position ``ptr`` inside its expansion and the
    input position ``start`` where matching began. ``s`` is the rule origin
    for complete items and the ``(rule, ptr)`` pair otherwise.
    """

    __slots__ = ('s', 'rule', 'ptr', 'start', 'is_complete', 'expect', 'previous', 'node', '_hash')

    def __init__(self, rule, ptr, start):
        expansion = rule.expansion
        self.is_complete = len(expansion) == ptr
        self.rule = rule    # rule
        self.ptr = ptr      # ptr
        self.start = start  # j
        self.node = None    # w
        if self.is_complete:
            self.s = rule.origin
            self.expect = None
        else:
            self.s = (rule, ptr)
            self.expect = expansion[ptr]
        # Symbol just before the dot, if there is one.
        self.previous = expansion[ptr - 1] if ptr > 0 and len(expansion) else None
        self._hash = hash((self.s, self.start, self.rule))

    def advance(self):
        """Return a new item with the dot moved one symbol to the right."""
        return Item(self.rule, self.ptr + 1, self.start)

    def __eq__(self, other):
        if self is other:
            return True
        return self.s == other.s and self.start == other.start and self.rule == other.rule

    def __hash__(self):
        return self._hash

    def __repr__(self):
        names = [sym.name for sym in self.rule.expansion]
        before = ' '.join(names[:self.ptr])
        after = ' '.join(names[self.ptr:])
        symbol = "{} ::= {}* {}".format(self.rule.origin.name, before, after)
        return '%s (%d)' % (symbol, self.start)
|
||||
|
||||
|
||||
# class TransitiveItem(Item):
|
||||
# ... # removed at commit 4c1cfb2faf24e8f8bff7112627a00b94d261b420
|
||||
@@ -0,0 +1,802 @@
|
||||
"""This module implements an SPPF (Shared Packed Parse Forest).
|
||||
|
||||
This is used as the primary output mechanism for the Earley parser
|
||||
in order to store complex ambiguities.
|
||||
|
||||
Full reference and more details is here:
|
||||
https://web.archive.org/web/20190616123959/http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/
|
||||
"""
|
||||
|
||||
from typing import Type, AbstractSet
|
||||
from random import randint
|
||||
from collections import deque
|
||||
from operator import attrgetter
|
||||
from importlib import import_module
|
||||
from functools import partial
|
||||
|
||||
from ..parse_tree_builder import AmbiguousIntermediateExpander
|
||||
from ..visitors import Discard
|
||||
from ..utils import logger, OrderedSet
|
||||
from ..tree import Tree
|
||||
|
||||
class ForestNode:
    """Common base class of the forest node types defined in this module."""
|
||||
|
||||
class SymbolNode(ForestNode):
    """
    A Symbol Node represents a symbol (or Intermediate LR0).

    Symbol nodes are keyed by the symbol (s). For intermediate nodes
    s will be an LR0, stored as a tuple of (rule, ptr). For completed symbol
    nodes, s will be a string representing the non-terminal origin (i.e.
    the left hand side of the rule).

    The children of a Symbol or Intermediate Node will always be Packed Nodes;
    with each Packed Node child representing a single derivation of a production.

    Hence a Symbol Node with a single child is unambiguous.

    Parameters:
        s: A Symbol, or a tuple of (rule, ptr) for an intermediate node.
        start: For dynamic lexers, the index of the start of the substring matched by this symbol (inclusive).
        end: For dynamic lexers, the index of the end of the substring matched by this symbol (exclusive).

    Properties:
        is_intermediate: True if this node is an intermediate node.
        priority: The priority of the node's symbol.
    """
    Set: Type[AbstractSet] = set   # Overridden by StableSymbolNode
    __slots__ = ('s', 'start', 'end', '_children', 'paths', 'paths_loaded', 'priority', 'is_intermediate')

    def __init__(self, s, start, end):
        self.s = s
        self.start = start
        self.end = end
        self._children = self.Set()
        self.paths = self.Set()
        self.paths_loaded = False

        ### We use inf here as it can be safely negated without resorting to conditionals,
        #   unlike None or float('NaN'), and sorts appropriately.
        self.priority = float('-inf')
        self.is_intermediate = isinstance(s, tuple)

    def add_family(self, lr0, rule, start, left, right):
        """Record one derivation of this node as a ``PackedNode`` child."""
        family = PackedNode(self, lr0, rule, start, left, right)
        self._children.add(family)

    def add_path(self, transitive, node):
        """Remember a ``(transitive, node)`` pair to be expanded by ``load_paths``."""
        self.paths.add((transitive, node))

    def load_paths(self):
        """Turn every stored path into a family, then mark paths as loaded."""
        for transitive, node in self.paths:
            following = transitive.next_titem
            if following is not None:
                # Chain through a fresh node of this same class.
                right = type(self)(following.s, following.start, self.end)
                right.add_path(following, node)
            else:
                right = node
            reduction = transitive.reduction
            self.add_family(reduction.rule.origin, reduction.rule, reduction.start, reduction.node, right)
        self.paths_loaded = True

    @property
    def is_ambiguous(self):
        """Returns True if this node is ambiguous."""
        return len(self.children) > 1

    @property
    def children(self):
        """Returns a list of this node's children sorted from greatest to
        least priority."""
        if not self.paths_loaded:
            self.load_paths()
        ordered = list(self._children)
        ordered.sort(key=attrgetter('sort_key'))
        return ordered

    def __iter__(self):
        return iter(self._children)

    def __repr__(self):
        if not self.is_intermediate:
            symbol = self.s.name
        else:
            rule, ptr = self.s[0], self.s[1]
            before = ' '.join(expansion.name for expansion in rule.expansion[:ptr])
            after = ' '.join(expansion.name for expansion in rule.expansion[ptr:])
            symbol = "{} ::= {}* {}".format(rule.origin.name, before, after)
        return "({}, {}, {}, {})".format(symbol, self.start, self.end, self.priority)
|
||||
|
||||
class StableSymbolNode(SymbolNode):
    """A version of SymbolNode that uses OrderedSet for output stability"""

    # Container class used for the children and paths of each instance.
    Set = OrderedSet
|
||||
|
||||
class PackedNode(ForestNode):
    """
    A Packed Node represents a single derivation in a symbol node.

    Parameters:
        rule: The rule associated with this node.
        parent: The parent of this node.
        left: The left child of this node. ``None`` if one does not exist.
        right: The right child of this node. ``None`` if one does not exist.
        priority: The priority of this node.
    """
    __slots__ = ('parent', 's', 'rule', 'start', 'left', 'right', 'priority', '_hash')

    def __init__(self, parent, s, rule, start, left, right):
        self.parent = parent
        self.s = s
        self.start = start
        self.rule = rule
        self.left = left
        self.right = right
        self.priority = float('-inf')
        # Identity of a packed node is its pair of children only.
        self._hash = hash((left, right))

    @property
    def is_empty(self):
        """True when the node has neither a left nor a right child."""
        return self.left is None and self.right is None

    @property
    def sort_key(self):
        """
        Used to sort PackedNode children of SymbolNodes.
        A SymbolNode has multiple PackedNodes if it matched
        ambiguously. Hence, we use the sort order to identify
        the order in which ambiguous children should be considered.
        """
        return self.is_empty, -self.priority, self.rule.order

    @property
    def children(self):
        """Returns a list of this node's children."""
        return [child for child in (self.left, self.right) if child is not None]

    def __iter__(self):
        yield self.left
        yield self.right

    def __eq__(self, other):
        if self is other:
            return True
        if not isinstance(other, PackedNode):
            return False
        return self.left == other.left and self.right == other.right

    def __hash__(self):
        return self._hash

    def __repr__(self):
        if not isinstance(self.s, tuple):
            symbol = self.s.name
        else:
            rule, ptr = self.s[0], self.s[1]
            before = ' '.join(expansion.name for expansion in rule.expansion[:ptr])
            after = ' '.join(expansion.name for expansion in rule.expansion[ptr:])
            symbol = "{} ::= {}* {}".format(rule.origin.name, before, after)
        return "({}, {}, {}, {})".format(symbol, self.start, self.priority, self.rule.order)
|
||||
|
||||
class TokenNode(ForestNode):
    """
    A Token Node represents a matched terminal and is always a leaf node.

    Parameters:
        token: The Token associated with this node.
        term: The TerminalDef matched by the token.
        priority: The priority of this node.
    """
    __slots__ = ('token', 'term', 'priority', '_hash')

    def __init__(self, token, term, priority=None):
        self.token = token
        self.term = term
        if priority is None:
            # Fall back to the terminal's own priority, or 0 without a terminal.
            priority = term.priority if term is not None else 0
        self.priority = priority
        self._hash = hash(token)

    def __eq__(self, other):
        if self is other:
            return True
        if not isinstance(other, TokenNode):
            return False
        return self.token == other.token

    def __hash__(self):
        return self._hash

    def __repr__(self):
        return repr(self.token)
|
||||
|
||||
class ForestVisitor:
    """
    Abstract base class for visitors that walk an SPPF depth-first.

    The walk never enters a cycle: when a node that is currently being
    visited is reached again, ``on_cycle`` is called and the walk backtracks.

    Subclasses define their behavior by overriding the ``visit*node*``
    methods. The return value of each ``visit*node_in`` method steers the
    walk: the node(s) it returns are scheduled to be visited, and when it
    returns nothing the visitor starts to backtrack.

    Parameters:
        single_visit: If ``True``, non-Token nodes will only be visited once.
    """

    def __init__(self, single_visit=False):
        self.single_visit = single_visit

    def visit_token_node(self, node):
        """Hook called with the ``Token`` of each token node. Tokens are always leaves."""
        pass

    def visit_symbol_node_in(self, node):
        """Hook called when a symbol node is entered.

        Returned nodes are scheduled to be visited. Intermediate nodes also
        arrive here unless ``visit_intermediate_node_in`` is implemented.
        """
        pass

    def visit_symbol_node_out(self, node):
        """Hook called once everything returned by the matching
        ``visit_symbol_node_in`` call has been visited.

        Intermediate nodes also arrive here unless
        ``visit_intermediate_node_out`` is implemented.
        """
        pass

    def visit_packed_node_in(self, node):
        """Hook called when a packed node is entered. Returned nodes are
        scheduled to be visited."""
        pass

    def visit_packed_node_out(self, node):
        """Hook called once everything returned by the matching
        ``visit_packed_node_in`` call has been visited."""
        pass

    def on_cycle(self, node, path):
        """Hook called when the walk is about to re-enter a node it is
        still visiting.

        Parameters:
            node: The node that causes a cycle.
            path: The nodes that have been entered but not yet exited. The
                first element is the root of the visit and the last element
                is the most recently entered node. Treat ``path`` as
                read-only.
        """
        pass

    def get_cycle_in_path(self, node, path):
        """Return the tail of ``path`` that starts at ``node``.

        Meant for use inside ``on_cycle``: the returned slice holds only the
        nodes that make up the cycle. The search runs from the end of
        ``path`` and compares by identity.
        """
        position = len(path) - 1
        while path[position] is not node:
            position -= 1
        return path[position:]

    def visit(self, root):
        """Walk the forest starting at ``root``, calling the ``visit*`` hooks."""
        # ids of all nodes that have been entered but not yet exited. This
        # set serves two purposes: meeting such a node again on top of the
        # stack means its children are done and the "out" hook must fire,
        # and, since the SPPF can have cycles, it reveals an attempt to
        # descend into a node that is already on the stack.
        in_progress = set()

        # ids of all nodes whose visit has completed
        finished = set()

        # the nodes currently being visited, in entry order; only used for
        # the `on_cycle` callback
        path = []

        # An explicit stack replaces recursion because of Python's limited
        # call-stack depth.
        stack = deque([root])

        # The hooks are bound to locals once: they are called many times in
        # large parses.
        packed_out = self.visit_packed_node_out
        packed_in = self.visit_packed_node_in
        symbol_out = self.visit_symbol_node_out
        symbol_in = self.visit_symbol_node_in
        inter_out = getattr(self, 'visit_intermediate_node_out', symbol_out)
        inter_in = getattr(self, 'visit_intermediate_node_in', symbol_in)
        token_hook = self.visit_token_node
        cycle_hook = self.on_cycle

        while stack:
            current = stack[-1]
            try:
                upcoming = next(current)
            except StopIteration:
                # iterator exhausted: drop it and resume with what is below
                stack.pop()
                continue
            except TypeError:
                # ``current`` is not an iterator: treat it as a node below
                pass
            else:
                if upcoming is not None:
                    if id(upcoming) in in_progress:
                        cycle_hook(upcoming, path)
                    else:
                        stack.append(upcoming)
                continue

            if isinstance(current, TokenNode):
                token_hook(current.token)
                stack.pop()
                continue

            current_id = id(current)
            if current_id in in_progress:
                # back on top after its children were handled: exit the node
                if isinstance(current, PackedNode):
                    packed_out(current)
                elif current.is_intermediate:
                    inter_out(current)
                else:
                    symbol_out(current)
                stack.pop()
                path.pop()
                in_progress.remove(current_id)
                finished.add(current_id)
            elif self.single_visit and current_id in finished:
                stack.pop()
            else:
                # first encounter: enter the node and schedule what it returns
                in_progress.add(current_id)
                path.append(current)
                if isinstance(current, PackedNode):
                    upcoming = packed_in(current)
                elif current.is_intermediate:
                    upcoming = inter_in(current)
                else:
                    upcoming = symbol_in(current)
                if upcoming is None:
                    continue

                if not isinstance(upcoming, ForestNode):
                    upcoming = iter(upcoming)
                elif id(upcoming) in in_progress:
                    cycle_hook(upcoming, path)
                    continue

                stack.append(upcoming)
|
||||
|
||||
class ForestTransformer(ForestVisitor):
    """Base class for bottom-up transformations of a forest. Most users will
    prefer ``TreeForestTransformer``, which has a friendlier interface and
    covers most use cases.

    A transformation is defined by subclassing and overriding the
    ``transform*node`` methods.

    ``transform_token_node`` receives a ``Token``. Every other method
    receives the node being transformed together with a list holding the
    transformation results of that node's children, and returns the node's
    own transformation result.

    When a transformation returns ``Discard``, nothing from that node is
    passed on to its parent's transformation.
    """

    def __init__(self):
        super().__init__()
        # transformation results collected so far, keyed by ``id(node)``
        self.data = {}
        # keys of the nodes currently open, innermost last
        self.node_stack = deque()

    def transform(self, root):
        """Perform a transformation on an SPPF.

        Returns the single result gathered for ``root``, or ``None`` when
        nothing was gathered.
        """
        # 'result' acts as the parent slot of the root node.
        self.node_stack.append('result')
        self.data['result'] = []
        self.visit(root)
        outcome = self.data['result']
        assert len(outcome) <= 1
        return outcome[0] if outcome else None

    def transform_symbol_node(self, node, data):
        """Transform a symbol node."""
        return node

    def transform_intermediate_node(self, node, data):
        """Transform an intermediate node."""
        return node

    def transform_packed_node(self, node, data):
        """Transform a packed node."""
        return node

    def transform_token_node(self, node):
        """Transform a ``Token``."""
        return node

    def visit_symbol_node_in(self, node):
        key = id(node)
        self.node_stack.append(key)
        self.data[key] = []
        return node.children

    def visit_packed_node_in(self, node):
        key = id(node)
        self.node_stack.append(key)
        self.data[key] = []
        return node.children

    def visit_token_node(self, node):
        result = self.transform_token_node(node)
        if result is Discard:
            return
        parent_key = self.node_stack[-1]
        self.data[parent_key].append(result)

    def _visit_node_out_helper(self, node, method):
        # Close ``node``: transform it with ``method`` and hand the result
        # to the node that is now on top of the stack.
        key = id(node)
        self.node_stack.pop()
        result = method(node, self.data[key])
        if result is not Discard:
            self.data[self.node_stack[-1]].append(result)
        del self.data[key]

    def visit_symbol_node_out(self, node):
        self._visit_node_out_helper(node, self.transform_symbol_node)

    def visit_intermediate_node_out(self, node):
        self._visit_node_out_helper(node, self.transform_intermediate_node)

    def visit_packed_node_out(self, node):
        self._visit_node_out_helper(node, self.transform_packed_node)
|
||||
|
||||
|
||||
class ForestSumVisitor(ForestVisitor):
    """
    A visitor that assigns priorities to the ambiguous parts of the forest.

    It is used when explicit rule priorities are requested (normal or
    inverted). It walks the forest, or a subset of it, and cascades
    priorities upwards from the leaves.

    Doing this while parsing would be ideal, but it would require processing
    each Earley item multiple times, which is a big performance drawback.
    A separate forest walk is the lesser of two evils: parsing can create
    significantly more Earley items than there are SPPF nodes in the final
    tree.
    """
    def __init__(self):
        super().__init__(single_visit=True)

    def visit_packed_node_in(self, node):
        # Generator: each child is read only when the walk asks for it.
        for side in ('left', 'right'):
            yield getattr(node, side)

    def visit_symbol_node_in(self, node):
        return iter(node.children)

    def visit_packed_node_out(self, node):
        # The rule's own priority counts only when the parent is not an
        # intermediate node and the priority is truthy.
        own = 0
        if not node.parent.is_intermediate:
            rule_priority = node.rule.options.priority
            if rule_priority:
                own = rule_priority
        # Children without a ``priority`` attribute contribute 0.
        node.priority = own + getattr(node.right, 'priority', 0) + getattr(node.left, 'priority', 0)

    def visit_symbol_node_out(self, node):
        # A symbol node takes the highest priority among its children.
        node.priority = max([child.priority for child in node.children])
|
||||
|
||||
class PackedData():
    """Used in transformations of packed nodes to tell apart the data that
    comes from the left child and the data that comes from the right child.

    ``left`` and ``right`` hold ``NO_DATA`` when the corresponding side
    produced nothing.
    """

    class _NoData():
        pass

    # Sentinel meaning "this side has no data".
    NO_DATA = _NoData()

    def __init__(self, node, data):
        left = right = self.NO_DATA
        if data:
            if node.left is None:
                # no left child: the first item belongs to the right side
                right = data[0]
            else:
                left = data[0]
                if len(data) > 1:
                    right = data[1]
        self.left = left
        self.right = right
|
||||
|
||||
class ForestToParseTree(ForestTransformer):
    """Used by the earley parser when ambiguity equals 'resolve' or
    'explicit'. Turns an SPPF into a parse tree, which may be ambiguous.

    Parameters:
        tree_class: The tree class to use for construction
        callbacks: A dictionary of rules to functions that output a tree
        prioritizer: A ``ForestVisitor`` that manipulates the priorities of ForestNodes
        resolve_ambiguity: If True, ambiguities will be resolved based on
                        priorities. Otherwise, `_ambig` nodes will be in the resulting tree.
        use_cache: If True, the results of packed node transformations will be cached.
    """

    def __init__(self, tree_class=Tree, callbacks=dict(), prioritizer=ForestSumVisitor(), resolve_ambiguity=True, use_cache=True):
        super().__init__()
        self.tree_class = tree_class
        self.callbacks = callbacks
        self.prioritizer = prioritizer
        self.resolve_ambiguity = resolve_ambiguity
        self._use_cache = use_cache
        # packed-node results keyed by ``id(node)``; reset after every visit
        self._cache = {}
        # cycle bookkeeping, set by ``on_cycle`` and cleared by ``_check_cycle``
        self._on_cycle_retreat = False
        self._cycle_node = None
        # ids of parents that had a packed node exit outside a cycle retreat
        self._successful_visits = set()

    def visit(self, root):
        """Run the prioritizer (if there is one), then the transformation walk."""
        if self.prioritizer:
            self.prioritizer.visit(root)
        super().visit(root)
        self._cache = {}

    def on_cycle(self, node, path):
        logger.debug("Cycle encountered in the SPPF at node: %s. "
                "As infinite ambiguities cannot be represented in a tree, "
                "this family of derivations will be discarded.", node)
        self._cycle_node = node
        self._on_cycle_retreat = True

    def _check_cycle(self, node):
        # During a cycle retreat, return ``Discard`` unless ``node`` is the
        # node that caused the cycle or already has a successful visit; in
        # those two cases the retreat ends.
        if not self._on_cycle_retreat:
            return None
        if node is self._cycle_node or id(node) in self._successful_visits:
            self._cycle_node = None
            self._on_cycle_retreat = False
            return None
        return Discard

    def _collapse_ambig(self, children):
        # Splice the children of nested '_ambig' trees into one flat list.
        flattened = []
        for child in children:
            if getattr(child, 'data', None) == '_ambig':
                flattened.extend(child.children)
            else:
                flattened.append(child)
        return flattened

    def _call_rule_func(self, node, data):
        # called when transforming children of symbol nodes
        # data is a list of trees or tokens that correspond to the
        # symbol's rule expansion
        callback = self.callbacks[node.rule]
        return callback(data)

    def _call_ambig_func(self, node, data):
        # called when transforming a symbol node
        # data is a list of trees where each tree's data is
        # equal to the name of the symbol or one of its aliases.
        if len(data) > 1:
            return self.tree_class('_ambig', data)
        if data:
            return data[0]
        return Discard

    def transform_symbol_node(self, node, data):
        node_id = id(node)
        if node_id not in self._successful_visits:
            return Discard
        verdict = self._check_cycle(node)
        if verdict is Discard:
            return verdict
        self._successful_visits.remove(node_id)
        return self._call_ambig_func(node, self._collapse_ambig(data))

    def transform_intermediate_node(self, node, data):
        node_id = id(node)
        if node_id not in self._successful_visits:
            return Discard
        verdict = self._check_cycle(node)
        if verdict is Discard:
            return verdict
        self._successful_visits.remove(node_id)
        if len(data) > 1:
            alternatives = [self.tree_class('_inter', c) for c in data]
            return self.tree_class('_iambig', alternatives)
        return data[0]

    def transform_packed_node(self, node, data):
        verdict = self._check_cycle(node)
        if verdict is Discard:
            return verdict
        # When resolving ambiguity, skip once the parent has a successful visit.
        if self.resolve_ambiguity and id(node.parent) in self._successful_visits:
            return Discard
        node_id = id(node)
        if self._use_cache and node_id in self._cache:
            return self._cache[node_id]
        assert len(data) <= 2
        packed = PackedData(node, data)
        children = []
        if packed.left is not PackedData.NO_DATA:
            if node.left.is_intermediate and isinstance(packed.left, list):
                # an intermediate left child hands over a list: splice it in
                children.extend(packed.left)
            else:
                children.append(packed.left)
        if packed.right is not PackedData.NO_DATA:
            children.append(packed.right)
        if node.parent.is_intermediate:
            transformed = children
        else:
            transformed = self._call_rule_func(node, children)
        if self._use_cache:
            self._cache[node_id] = transformed
        return transformed

    def visit_symbol_node_in(self, node):
        super().visit_symbol_node_in(node)
        if self._on_cycle_retreat:
            return None
        return node.children

    def visit_packed_node_in(self, node):
        self._on_cycle_retreat = False
        to_visit = super().visit_packed_node_in(node)
        # Nothing to descend into if the parent is already resolved ...
        if self.resolve_ambiguity and id(node.parent) in self._successful_visits:
            return None
        # ... or if this node's result is already cached.
        if self._use_cache and id(node) in self._cache:
            return None
        return to_visit

    def visit_packed_node_out(self, node):
        super().visit_packed_node_out(node)
        if self._on_cycle_retreat:
            return
        self._successful_visits.add(id(node.parent))
|
||||
|
||||
def handles_ambiguity(func):
    """Decorator for methods of ``TreeForestTransformer`` subclasses.

    Marks the method as one that should receive a list of transformed
    derivations. The function is tagged with a ``handles_ambiguity``
    attribute set to ``True`` and returned unchanged otherwise.
    """
    setattr(func, 'handles_ambiguity', True)
    return func
|
||||
|
||||
class TreeForestTransformer(ForestToParseTree):
    """A ``ForestTransformer`` with a tree ``Transformer``-like interface.
    By default, it will construct a tree.

    Methods supplied by a subclass are looked up by the rule/symbol names of
    the nodes in the forest.

    A method that acts on a rule receives a list with the transformation
    results of the rule's children; by default these are trees and tokens.

    A method that acts on a token receives the token.

    A rule method may instead be decorated with ``handles_ambiguity``. It
    then receives a list of the transformations of all the derivations of
    the rule; by default, a list of trees where each tree.data is equal to
    the rule name or one of its aliases.

    Non-tree transformations are possible by overriding ``__default__``,
    ``__default_token__``, and ``__default_ambig__``.

    Note:
        Tree shaping features such as inlined rules and token filtering are
        not built into the transformation. Positions are also not propagated.

    Parameters:
        tree_class: The tree class to use for construction
        prioritizer: A ``ForestVisitor`` that manipulates the priorities of nodes in the SPPF.
        resolve_ambiguity: If True, ambiguities will be resolved based on priorities.
        use_cache (bool): If True, caches the results of some transformations,
                          potentially improving performance when ``resolve_ambiguity==False``.
                          Only use if you know what you are doing: i.e. All transformation
                          functions are pure and referentially transparent.
    """

    def __init__(self, tree_class=Tree, prioritizer=ForestSumVisitor(), resolve_ambiguity=True, use_cache=False):
        # No callbacks table is needed: rule functions are looked up by name.
        super().__init__(tree_class, {}, prioritizer, resolve_ambiguity, use_cache)

    def __default__(self, name, data):
        """Default operation on tree (for override).

        Returns a tree named ``name`` with ``data`` as its children.
        """
        return self.tree_class(name, data)

    def __default_ambig__(self, name, data):
        """Default operation on ambiguous rule (for override).

        Wraps ``data`` in an ``_ambig`` node if it has more than one
        element, returns the single element if there is exactly one, and
        returns ``Discard`` if ``data`` is empty.
        """
        if len(data) > 1:
            return self.tree_class('_ambig', data)
        if data:
            return data[0]
        return Discard

    def __default_token__(self, node):
        """Default operation on ``Token`` (for override).

        Returns ``node``.
        """
        return node

    def transform_token_node(self, node):
        # Dispatch on the token type, falling back to ``__default_token__``.
        handler = getattr(self, node.type, self.__default_token__)
        return handler(node)

    def _call_rule_func(self, node, data):
        rule = node.rule
        name = rule.alias or rule.options.template_source or rule.origin.name
        user_func = getattr(self, name, self.__default__)
        # Methods marked ``handles_ambiguity`` are not called per derivation;
        # they are replaced by the default here.
        if user_func == self.__default__ or hasattr(user_func, 'handles_ambiguity'):
            user_func = partial(self.__default__, name)
        if not self.resolve_ambiguity:
            user_func = AmbiguousIntermediateExpander(self.tree_class, user_func)
        return user_func(data)

    def _call_ambig_func(self, node, data):
        name = node.s.name
        user_func = getattr(self, name, self.__default_ambig__)
        # Only methods marked ``handles_ambiguity`` are used at this level.
        if user_func == self.__default_ambig__ or not hasattr(user_func, 'handles_ambiguity'):
            user_func = partial(self.__default_ambig__, name)
        return user_func(data)
|
||||
|
||||
class ForestToPyDotVisitor(ForestVisitor):
    """
    A Forest visitor which writes the SPPF to a PNG.

    The SPPF can get really large, really quickly because
    of the amount of meta-data it stores, so this is probably
    only useful for trivial trees and learning how the SPPF
    is structured.

    Parameters:
        rankdir: Passed to ``pydot.Dot`` as the graph's ``rankdir``.
    """
    def __init__(self, rankdir="TB"):
        super(ForestToPyDotVisitor, self).__init__(single_visit=True)
        # pydot is imported here, when a visitor is created.
        self.pydot = import_module('pydot')
        self.graph = self.pydot.Dot(graph_type='digraph', rankdir=rankdir)

    def visit(self, root, filename):
        """Walk the forest from ``root`` and write the graph to ``filename`` as a PNG.

        A ``FileNotFoundError`` raised while writing is logged, not propagated.
        """
        super(ForestToPyDotVisitor, self).visit(root)
        try:
            self.graph.write_png(filename)
        except FileNotFoundError as e:
            # The exception is a %-style argument, so the message needs a placeholder.
            logger.error("Could not write png: %s", e)

    def visit_token_node(self, node):
        # ``node`` is the token itself, so the graph node id is the token's id.
        graph_node_id = str(id(node))
        graph_node_label = "\"{}\"".format(node.value.replace('"', '\\"'))
        graph_node_color = 0x808080
        graph_node_style = "\"filled,rounded\""
        graph_node_shape = "diamond"
        graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label)
        self.graph.add_node(graph_node)

    def visit_packed_node_in(self, node):
        graph_node_id = str(id(node))
        graph_node_label = repr(node)
        graph_node_color = 0x808080
        graph_node_style = "filled"
        graph_node_shape = "diamond"
        graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label)
        self.graph.add_node(graph_node)
        yield node.left
        yield node.right

    def visit_packed_node_out(self, node):
        graph_node_id = str(id(node))
        graph_node = self.graph.get_node(graph_node_id)[0]
        for child in [node.left, node.right]:
            if child is not None:
                # Token graph nodes are keyed by the token's id (see visit_token_node).
                child_graph_node_id = str(id(child.token if isinstance(child, TokenNode) else child))
                child_graph_node = self.graph.get_node(child_graph_node_id)[0]
                self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node))
            else:
                # A missing child gets an invisible placeholder node and edge.
                #### Try and be above the Python object ID range; probably impl. specific, but maybe this is okay.
                child_graph_node_id = str(randint(100000000000000000000000000000,123456789012345678901234567890))
                child_graph_node_style = "invis"
                child_graph_node = self.pydot.Node(child_graph_node_id, style=child_graph_node_style, label="None")
                child_edge_style = "invis"
                self.graph.add_node(child_graph_node)
                self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node, style=child_edge_style))

    def visit_symbol_node_in(self, node):
        graph_node_id = str(id(node))
        graph_node_label = repr(node)
        graph_node_color = 0x808080
        graph_node_style = "\"filled\""
        # Intermediate nodes are drawn as ellipses, symbol nodes as rectangles.
        if node.is_intermediate:
            graph_node_shape = "ellipse"
        else:
            graph_node_shape = "rectangle"
        graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label)
        self.graph.add_node(graph_node)
        return iter(node.children)

    def visit_symbol_node_out(self, node):
        # Connect the symbol node to the graph node of each of its children.
        graph_node_id = str(id(node))
        graph_node = self.graph.get_node(graph_node_id)[0]
        for child in node.children:
            child_graph_node_id = str(id(child))
            child_graph_node = self.graph.get_node(child_graph_node_id)[0]
            self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node))
|
||||
@@ -0,0 +1,203 @@
|
||||
"Provides for superficial grammar analysis."
|
||||
|
||||
from collections import Counter, defaultdict
|
||||
from typing import List, Dict, Iterator, FrozenSet, Set
|
||||
|
||||
from ..utils import bfs, fzset, classify, OrderedSet
|
||||
from ..exceptions import GrammarError
|
||||
from ..grammar import Rule, Terminal, NonTerminal, Symbol
|
||||
from ..common import ParserConf
|
||||
|
||||
|
||||
class RulePtr:
    """A position inside the expansion of a rule.

    ``index`` is the number of expansion symbols that lie before the
    position; it may equal the expansion length, meaning the end.
    """
    __slots__ = ('rule', 'index')
    rule: Rule
    index: int

    def __init__(self, rule: Rule, index: int):
        assert isinstance(rule, Rule)
        assert index <= len(rule.expansion)
        self.rule = rule
        self.index = index

    def __repr__(self):
        expansion = self.rule.expansion
        passed = ' '.join(sym.name for sym in expansion[:self.index])
        pending = ' '.join(sym.name for sym in expansion[self.index:])
        return '<%s : %s * %s>' % (self.rule.origin.name, passed, pending)

    @property
    def next(self) -> Symbol:
        """The expansion symbol at the current position."""
        expansion = self.rule.expansion
        return expansion[self.index]

    def advance(self, sym: Symbol) -> 'RulePtr':
        """Return a new pointer one symbol further; ``sym`` must be the next symbol."""
        assert self.next == sym
        return RulePtr(self.rule, self.index+1)

    @property
    def is_satisfied(self) -> bool:
        """True when the position is at the end of the expansion."""
        return self.index == len(self.rule.expansion)

    def __eq__(self, other) -> bool:
        if isinstance(other, RulePtr):
            return self.rule == other.rule and self.index == other.index
        return NotImplemented

    def __hash__(self) -> int:
        key = (self.rule, self.index)
        return hash(key)
|
||||
|
||||
|
||||
State = FrozenSet[RulePtr]
|
||||
|
||||
# state generation ensures no duplicate LR0ItemSets
|
||||
class LR0ItemSet:
    """An LR(0) item set: kernel and closure items plus its transitions and lookaheads."""
    __slots__ = ('kernel', 'closure', 'transitions', 'lookaheads')

    kernel: State
    closure: State
    transitions: Dict[Symbol, 'LR0ItemSet']
    lookaheads: Dict[Symbol, Set[Rule]]

    def __init__(self, kernel, closure):
        self.kernel = fzset(kernel)
        self.closure = fzset(closure)
        # Both start out empty.
        self.transitions = {}
        self.lookaheads = defaultdict(set)

    def __repr__(self):
        kernel_text = ', '.join(map(repr, self.kernel))
        closure_text = ', '.join(map(repr, self.closure))
        return '{%s | %s}' % (kernel_text, closure_text)
|
||||
|
||||
|
||||
def update_set(set1, set2):
    """Add every element of ``set2`` to ``set1`` in place.

    Parameters:
        set1: The set to grow; it is modified in place.
        set2: The set whose elements are added.

    Returns:
        ``True`` if ``set1`` gained at least one element, ``False`` otherwise.
    """
    if not set2:
        return False

    # A union can only add elements, so a change in size is exactly a change
    # in content. This avoids copying ``set1`` just to compare afterwards.
    size_before = len(set1)
    set1 |= set2
    return len(set1) != size_before
|
||||
|
||||
def calculate_sets(rules):
    """Calculate the FIRST, FOLLOW and NULLABLE sets of a grammar.

    Returns the tuple ``(FIRST, FOLLOW, NULLABLE)``. ``FIRST`` and ``FOLLOW``
    map each symbol to a set of symbols; ``NULLABLE`` is a set of rule
    origins.

    Adapted from: http://lara.epfl.ch/w/cc09:algorithm_for_first_and_follow_sets"""
    symbols = {sym for rule in rules for sym in rule.expansion} | {rule.origin for rule in rules}

    # foreach grammar rule X ::= Y(1) ... Y(k)
    # if k=0 or {Y(1),...,Y(k)} subset of NULLABLE then
    #   NULLABLE = NULLABLE union {X}
    # for i = 1 to k
    #   if i=1 or {Y(1),...,Y(i-1)} subset of NULLABLE then
    #     FIRST(X) = FIRST(X) union FIRST(Y(i))
    #   for j = i+1 to k
    #     if i=k or {Y(i+1),...Y(k)} subset of NULLABLE then
    #       FOLLOW(Y(i)) = FOLLOW(Y(i)) union FOLLOW(X)
    #     if i+1=j or {Y(i+1),...,Y(j-1)} subset of NULLABLE then
    #       FOLLOW(Y(i)) = FOLLOW(Y(i)) union FIRST(Y(j))
    # until none of NULLABLE,FIRST,FOLLOW changed in last iteration

    NULLABLE = set()
    # A terminal's FIRST set is the terminal itself; all others start empty.
    FIRST = {sym: ({sym} if sym.is_term else set()) for sym in symbols}
    FOLLOW = {sym: set() for sym in symbols}

    # Calculate NULLABLE and FIRST, repeating until a pass changes nothing
    changed = True
    while changed:
        changed = False

        for rule in rules:
            expansion = rule.expansion
            if set(expansion) <= NULLABLE:
                if update_set(NULLABLE, {rule.origin}):
                    changed = True

            # A symbol contributes to FIRST(origin) while everything before
            # it is nullable, so stop after the first non-nullable symbol.
            for sym in expansion:
                if update_set(FIRST[rule.origin], FIRST[sym]):
                    changed = True
                if sym not in NULLABLE:
                    break

    # Calculate FOLLOW, repeating until a pass changes nothing
    changed = True
    while changed:
        changed = False

        for rule in rules:
            expansion = rule.expansion
            for i, sym in enumerate(expansion):
                trailing = expansion[i+1:]
                # If all that follows ``sym`` is nullable (or nothing
                # follows), whatever follows the origin also follows ``sym``.
                if set(trailing) <= NULLABLE:
                    if update_set(FOLLOW[sym], FOLLOW[rule.origin]):
                        changed = True

                # A later symbol contributes its FIRST set while everything
                # between it and ``sym`` is nullable.
                for later in trailing:
                    if update_set(FOLLOW[sym], FIRST[later]):
                        changed = True
                    if later not in NULLABLE:
                        break

    return FIRST, FOLLOW, NULLABLE
|
||||
|
||||
|
||||
class GrammarAnalyzer:
    """Checks a grammar's rules and precomputes the tables derived from them.

    On construction it adds a ``$root_<start>`` rule per start symbol,
    rejects duplicate rules and undefined rule references, and stores the
    start/end states, their LR(0) counterparts and the FIRST, FOLLOW and
    NULLABLE sets.
    """

    def __init__(self, parser_conf: ParserConf, debug: bool=False, strict: bool=False):
        self.debug = debug
        self.strict = strict

        # One synthetic root rule per start symbol: ``$root_X -> X $END``.
        root_rules = {}
        for start in parser_conf.start:
            root_rules[start] = Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal('$END')])

        rules = parser_conf.rules + list(root_rules.values())
        self.rules_by_origin: Dict[NonTerminal, List[Rule]] = classify(rules, lambda r: r.origin)

        if len(set(rules)) != len(rules):
            duplicates = [item for item, count in Counter(rules).items() if count > 1]
            raise GrammarError("Rules defined twice: %s" % ', '.join(str(i) for i in duplicates))

        # Every non-terminal used in an expansion must have a rule of its own.
        for rule in rules:
            for sym in rule.expansion:
                if sym.is_term or sym in self.rules_by_origin:
                    continue
                raise GrammarError("Using an undefined rule: %s" % sym)

        self.start_states = {name: self.expand_rule(root.origin)
                             for name, root in root_rules.items()}

        self.end_states = {name: fzset({RulePtr(root, len(root.expansion))})
                           for name, root in root_rules.items()}

        # The LR(0) root rules are the same, but without the ``$END`` terminal.
        lr0_root_rules = {}
        for start in parser_conf.start:
            lr0_root_rules[start] = Rule(NonTerminal('$root_' + start), [NonTerminal(start)])

        lr0_rules = parser_conf.rules + list(lr0_root_rules.values())
        assert len(lr0_rules) == len(set(lr0_rules))

        self.lr0_rules_by_origin = classify(lr0_rules, lambda r: r.origin)

        self.lr0_start_states = {
            name: LR0ItemSet([RulePtr(root, 0)], self.expand_rule(root.origin, self.lr0_rules_by_origin))
            for name, root in lr0_root_rules.items()
        }

        self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules)

    def expand_rule(self, source_rule: NonTerminal, rules_by_origin=None) -> OrderedSet[RulePtr]:
        "Returns all init_ptrs accessible by rule (recursive)"

        # Without an explicit table, use the one built in ``__init__``.
        if rules_by_origin is None:
            rules_by_origin = self.rules_by_origin

        init_ptrs = OrderedSet[RulePtr]()

        def _expand_rule(rule: NonTerminal) -> Iterator[NonTerminal]:
            # Record the initial pointer of each rule of ``rule`` and yield
            # the non-terminals that start their expansions.
            assert not rule.is_term, rule

            for candidate in rules_by_origin[rule]:
                start_ptr = RulePtr(candidate, 0)
                init_ptrs.add(start_ptr)

                if not candidate.expansion:  # empty rule: nothing to follow
                    continue
                leading = start_ptr.next
                if not leading.is_term:
                    assert isinstance(leading, NonTerminal)
                    yield leading

        # The traversal is run only for its side effect of filling ``init_ptrs``.
        for _ in bfs([source_rule], _expand_rule):
            pass

        return init_ptrs
|
||||
@@ -0,0 +1,334 @@
|
||||
"""This module builds a LALR(1) transition-table for lalr_parser.py
|
||||
|
||||
For now, shift/reduce conflicts are automatically resolved as shifts.
|
||||
"""
|
||||
|
||||
# Author: Erez Shinan (2017)
|
||||
# Email : erezshin@gmail.com
|
||||
|
||||
from typing import Dict, Set, Iterator, Tuple, List, TypeVar, Generic
|
||||
from collections import defaultdict
|
||||
|
||||
from ..utils import classify, classify_bool, bfs, fzset, Enumerator, logger
|
||||
from ..exceptions import GrammarError
|
||||
|
||||
from .grammar_analysis import GrammarAnalyzer, Terminal, LR0ItemSet, RulePtr, State
|
||||
from ..grammar import Rule, Symbol
|
||||
from ..common import ParserConf
|
||||
|
||||
###{standalone
|
||||
|
||||
class Action:
    """A named parse-table action. ``str()`` and ``repr()`` both give the name."""

    def __init__(self, name):
        self.name = name

    def __str__(self):
        label = self.name
        return label

    def __repr__(self):
        return str(self)


# The two action instances; the parse-table code compares against them by identity.
Shift = Action('Shift')
Reduce = Action('Reduce')
|
||||
|
||||
StateT = TypeVar("StateT")
|
||||
|
||||
class ParseTableBase(Generic[StateT]):
    """Container for a parse table: per-state actions plus start/end states.

    ``states`` maps a state to ``{token_name: (action, arg)}``, where ``action``
    is ``Shift`` or ``Reduce``.
    """
    states: Dict[StateT, Dict[str, Tuple]]
    start_states: Dict[str, StateT]
    end_states: Dict[str, StateT]

    def __init__(self, states, start_states, end_states):
        self.states, self.start_states, self.end_states = states, start_states, end_states

    def serialize(self, memo):
        """Return a plain-dict form of the table.

        Token names are replaced by the ids handed out by an ``Enumerator``;
        actions are encoded as ``(1, serialized_rule)`` for Reduce and
        ``(0, arg)`` otherwise.
        """
        tokens = Enumerator()

        packed_states = {}
        for state, actions in self.states.items():
            row = {}
            for token, (action, arg) in actions.items():
                token_id = tokens.get(token)
                if action is Reduce:
                    row[token_id] = (1, arg.serialize(memo))
                else:
                    row[token_id] = (0, arg)
            packed_states[state] = row

        return {
            'tokens': tokens.reversed(),
            'states': packed_states,
            'start_states': self.start_states,
            'end_states': self.end_states,
        }

    @classmethod
    def deserialize(cls, data, memo):
        """Rebuild a table from the dict produced by ``serialize``."""
        tokens = data['tokens']

        unpacked_states = {}
        for state, actions in data['states'].items():
            row = {}
            for token, (action, arg) in actions.items():
                token_name = tokens[token]
                if action == 1:
                    row[token_name] = (Reduce, Rule.deserialize(arg, memo))
                else:
                    row[token_name] = (Shift, arg)
            unpacked_states[state] = row

        return cls(unpacked_states, data['start_states'], data['end_states'])
|
||||
|
||||
class ParseTable(ParseTableBase['State']):
    """Parse-table keyed by ``State`` (i.e. set[RulePtr]).

    Slower than IntParseTable, but handy when debugging, since the keys
    show the actual items of each state.
    """
|
||||
|
||||
|
||||
class IntParseTable(ParseTableBase[int]):
    """Parse-table whose states are plain ints. Best for performance."""

    @classmethod
    def from_ParseTable(cls, parse_table: ParseTable):
        """Build an int-keyed table from ``parse_table``.

        Each state is numbered by its position in ``parse_table.states``;
        Shift targets, start states and end states are renumbered to match.
        Non-Shift entries are carried over unchanged.
        """
        ordered_states = list(parse_table.states)
        state_to_idx: Dict['State', int] = {}
        for idx, original_state in enumerate(ordered_states):
            state_to_idx[original_state] = idx

        int_states = {}
        for original_state, lookahead_actions in parse_table.states.items():
            renumbered = {}
            for key, entry in lookahead_actions.items():
                if entry[0] is Shift:
                    # only Shift entries point at another state
                    entry = (entry[0], state_to_idx[entry[1]])
                renumbered[key] = entry
            int_states[state_to_idx[original_state]] = renumbered

        start_states = {}
        for start, original_state in parse_table.start_states.items():
            start_states[start] = state_to_idx[original_state]

        end_states = {}
        for start, original_state in parse_table.end_states.items():
            end_states[start] = state_to_idx[original_state]

        return cls(int_states, start_states, end_states)
|
||||
|
||||
###}
|
||||
|
||||
|
||||
# digraph and traverse, see The Theory and Practice of Compiler Writing

# computes F(x) = G(x) union (union { G(y) | x R y })
# X: nodes
# R: relation (function mapping node -> list of nodes that satisfy the relation)
# G: set valued function
def digraph(X, R, G):
    """Compute F(x) = G(x) | union(F(y) for every y with x R y) for all x in X.

    Parameters:
        X - iterable of (hashable) nodes.
        R - mapping node -> iterable of related nodes (every related node must be in X).
        G - mapping node -> iterable of initial values for that node.

    Returns a dict mapping every node to its resulting set. Nodes that belong to
    the same strongly-connected component of R share a single set object.

    The sets held by G are never modified: each node starts from a copy.
    """
    F = {}
    S = []
    N = dict.fromkeys(X, 0)
    for x in X:
        # this is always true for the first iteration, but N[x] may be updated in traverse below
        if N[x] == 0:
            traverse(x, S, N, X, R, G, F)
    return F

# x: single node
# S: stack
# N: weights
# X: nodes
# R: relation (see above)
# G: set valued function
# F: set valued function we are computing (map of input -> output)
def traverse(x, S, N, X, R, G, F):
    """Depth-first helper for ``digraph``; fills in F for x and everything reachable from it.

    N[node] is 0 for unvisited nodes, the stack depth while a node is on the
    stack S, and -1 once the node's component has been completed.
    """
    S.append(x)
    d = len(S)
    N[x] = d
    # Copy instead of aliasing G[x]: the set is updated in place below, and G's
    # values may be shared between nodes (e.g. when G is the result of a previous
    # digraph() call). Aliasing would both mutate the caller's data and leak one
    # node's result into every node sharing that set.
    F[x] = set(G[x])
    for y in R[x]:
        if N[y] == 0:
            traverse(y, S, N, X, R, G, F)
        n_x = N[x]
        assert n_x > 0
        n_y = N[y]
        assert n_y != 0
        if 0 < n_y < n_x:
            # y is still on the stack, so x and y belong to the same component
            N[x] = n_y
        F[x].update(F[y])
    if N[x] == d:
        # x is the root of its component: pop the component and share one result set
        f_x = F[x]
        while True:
            z = S.pop()
            N[z] = -1
            F[z] = f_x
            if z == x:
                break
|
||||
|
||||
|
||||
class LALR_Analyzer(GrammarAnalyzer):
    """Builds the LALR(1) parse table for a grammar.

    The work is split into the ``compute_*`` steps below, which ``compute_lalr``
    runs in order. The final table is stored in ``self.parse_table``.
    """
    # All LR(0) item sets discovered by compute_lr0_states()
    lr0_itemsets: Set[LR0ItemSet]
    # (state, nonterminal) pairs, in the order compute_reads_relations() found them
    nonterminal_transitions: List[Tuple[LR0ItemSet, Symbol]]
    # Relations between nonterminal transitions, filled in by the compute_* steps
    lookback: Dict[Tuple[LR0ItemSet, Symbol], Set[Tuple[LR0ItemSet, Rule]]]
    includes: Dict[Tuple[LR0ItemSet, Symbol], Set[Tuple[LR0ItemSet, Symbol]]]
    reads: Dict[Tuple[LR0ItemSet, Symbol], Set[Tuple[LR0ItemSet, Symbol]]]
    directly_reads: Dict[Tuple[LR0ItemSet, Symbol], Set[Symbol]]


    def __init__(self, parser_conf: ParserConf, debug: bool=False, strict: bool=False):
        GrammarAnalyzer.__init__(self, parser_conf, debug, strict)
        self.nonterminal_transitions = []
        self.directly_reads = defaultdict(set)
        self.reads = defaultdict(set)
        self.includes = defaultdict(set)
        self.lookback = defaultdict(set)


    def compute_lr0_states(self) -> None:
        """Explore all LR(0) item sets reachable from the start states.

        Fills ``self.lr0_itemsets`` and each item set's ``transitions``.
        """
        self.lr0_itemsets = set()
        # map of kernels to LR0ItemSets
        cache: Dict['State', LR0ItemSet] = {}

        def step(state: LR0ItemSet) -> Iterator[LR0ItemSet]:
            # Only items that still expect a symbol can lead to another state
            _, unsat = classify_bool(state.closure, lambda rp: rp.is_satisfied)

            # Group the unsatisfied items by the symbol they expect next
            d = classify(unsat, lambda rp: rp.next)
            for sym, rps in d.items():
                kernel = fzset({rp.advance(sym) for rp in rps})
                new_state = cache.get(kernel, None)
                if new_state is None:
                    # First time this kernel is seen: compute its closure
                    closure = set(kernel)
                    for rp in kernel:
                        if not rp.is_satisfied and not rp.next.is_term:
                            closure |= self.expand_rule(rp.next, self.lr0_rules_by_origin)
                    new_state = LR0ItemSet(kernel, closure)
                    cache[kernel] = new_state

                state.transitions[sym] = new_state
                yield new_state

            # Runs once the generator is exhausted, i.e. after all successors were yielded
            self.lr0_itemsets.add(state)

        # Drain the traversal; the results are collected as side effects of step()
        for _ in bfs(self.lr0_start_states.values(), step):
            pass

    def compute_reads_relations(self):
        """Fill ``nonterminal_transitions``, ``directly_reads`` and ``reads``."""
        # handle start state
        for root in self.lr0_start_states.values():
            assert(len(root.kernel) == 1)
            for rp in root.kernel:
                assert(rp.index == 0)
                self.directly_reads[(root, rp.next)] = set([ Terminal('$END') ])

        for state in self.lr0_itemsets:
            # nonterminals already handled for this state
            seen = set()
            for rp in state.closure:
                if rp.is_satisfied:
                    continue
                s = rp.next
                # if s is not a nonterminal
                if s not in self.lr0_rules_by_origin:
                    continue
                if s in seen:
                    continue
                seen.add(s)
                nt = (state, s)
                self.nonterminal_transitions.append(nt)
                dr = self.directly_reads[nt]
                r = self.reads[nt]
                next_state = state.transitions[s]
                for rp2 in next_state.closure:
                    if rp2.is_satisfied:
                        continue
                    s2 = rp2.next
                    # if s2 is a terminal
                    if s2 not in self.lr0_rules_by_origin:
                        dr.add(s2)
                    if s2 in self.NULLABLE:
                        r.add((next_state, s2))

    def compute_includes_lookback(self):
        """Fill ``includes`` and ``lookback`` for every nonterminal transition."""
        for nt in self.nonterminal_transitions:
            state, nonterminal = nt
            includes = []
            lookback = self.lookback[nt]
            for rp in state.closure:
                if rp.rule.origin != nonterminal:
                    continue
                # traverse the states for rp(.rule)
                state2 = state
                for i in range(rp.index, len(rp.rule.expansion)):
                    s = rp.rule.expansion[i]
                    nt2 = (state2, s)
                    state2 = state2.transitions[s]
                    # only transitions registered in self.reads are considered
                    if nt2 not in self.reads:
                        continue
                    # nt2 is included only if everything after position i is nullable
                    for j in range(i + 1, len(rp.rule.expansion)):
                        if rp.rule.expansion[j] not in self.NULLABLE:
                            break
                    else:
                        includes.append(nt2)
                # state2 is at the final state for rp.rule
                if rp.index == 0:
                    for rp2 in state2.closure:
                        if (rp2.rule == rp.rule) and rp2.is_satisfied:
                            lookback.add((state2, rp2.rule))
            for nt2 in includes:
                self.includes[nt2].add(nt)

    def compute_lookaheads(self):
        """Propagate the relations through ``digraph`` and record the lookaheads.

        For every lookback pair ``(state, rule)`` of a nonterminal transition,
        each symbol in that transition's follow set is registered in
        ``state.lookaheads``.
        """
        read_sets = digraph(self.nonterminal_transitions, self.reads, self.directly_reads)
        follow_sets = digraph(self.nonterminal_transitions, self.includes, read_sets)

        for nt, lookbacks in self.lookback.items():
            for state, rule in lookbacks:
                for s in follow_sets[nt]:
                    state.lookaheads[s].add(rule)

    def compute_lalr1_states(self) -> None:
        """Turn the item sets and lookaheads into ``self.parse_table``.

        Reduce/Reduce conflicts are resolved by rule priority when one rule has
        a strictly higher priority; otherwise they are collected and reported
        together in a GrammarError. Shift/Reduce conflicts keep the shift
        (raising GrammarError instead when ``self.strict`` is set).
        """
        m: Dict[LR0ItemSet, Dict[str, Tuple]] = {}
        reduce_reduce = []
        for itemset in self.lr0_itemsets:
            # Start with the shift actions; reductions are added below
            actions: Dict[Symbol, Tuple] = {la: (Shift, next_state.closure)
                                                      for la, next_state in itemset.transitions.items()}
            for la, rules in itemset.lookaheads.items():
                if len(rules) > 1:
                    # Try to resolve conflict based on priority
                    p = [(r.options.priority or 0, r) for r in rules]
                    p.sort(key=lambda r: r[0], reverse=True)
                    best, second_best = p[:2]
                    if best[0] > second_best[0]:
                        rules = {best[1]}
                    else:
                        reduce_reduce.append((itemset, la, rules))
                        continue

                # exactly one rule remains at this point
                rule ,= rules
                if la in actions:
                    # A shift already exists for this lookahead: keep it (or fail in strict mode)
                    if self.strict:
                        msg = f'Shift/Reduce conflict for terminal {la.name}. [strict-mode]\n' \
                            f' * {rule}\n'
                        raise GrammarError(msg)
                    elif self.debug:
                        logger.warning('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name)
                        logger.warning(' * %s', rule)
                    else:
                        logger.debug('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name)
                        logger.debug(' * %s', rule)
                else:
                    actions[la] = (Reduce, rule)
            m[itemset] = { k.name: v for k, v in actions.items() }

        if reduce_reduce:
            msgs = []
            for itemset, la, rules in reduce_reduce:
                msg = 'Reduce/Reduce collision in %s between the following rules: %s' % (la, ''.join([ '\n\t- ' + str(r) for r in rules ]))
                if self.debug:
                    msg += '\n collision occurred in state: {%s\n }' % ''.join(['\n\t' + str(x) for x in itemset.closure])
                msgs.append(msg)
            raise GrammarError('\n\n'.join(msgs))

        # Key the table by each item set's closure
        states = { k.closure: v for k, v in m.items() }

        # compute end states
        end_states: Dict[str, 'State'] = {}
        for state in states:
            for rp in state:
                for start in self.lr0_start_states:
                    if rp.rule.origin.name == ('$root_' + start) and rp.is_satisfied:
                        assert start not in end_states
                        end_states[start] = state

        start_states = { start: state.closure for start, state in self.lr0_start_states.items() }
        _parse_table = ParseTable(states, start_states, end_states)

        # In debug mode keep the State-keyed table; otherwise convert to int states
        if self.debug:
            self.parse_table = _parse_table
        else:
            self.parse_table = IntParseTable.from_ParseTable(_parse_table)

    def compute_lalr(self):
        """Run every analysis step, in order, ending with ``self.parse_table`` set."""
        self.compute_lr0_states()
        self.compute_reads_relations()
        self.compute_includes_lookback()
        self.compute_lookaheads()
        self.compute_lalr1_states()
|
||||
@@ -0,0 +1,158 @@
|
||||
# This module provides a LALR interactive parser, which is used for debugging and error handling
|
||||
|
||||
from typing import Iterator, List
|
||||
from copy import copy
|
||||
import warnings
|
||||
|
||||
from lark.exceptions import UnexpectedToken
|
||||
from lark.lexer import Token, LexerThread
|
||||
from .lalr_parser_state import ParserState
|
||||
|
||||
###{standalone
|
||||
|
||||
class InteractiveParser:
    """Step-by-step driver for the LALR parser, for fine-grained control and error handling.

    For a simpler interface, see the ``on_error`` argument to ``Lark.parse()``.
    """
    def __init__(self, parser, parser_state: ParserState, lexer_thread: LexerThread):
        self.parser = parser
        self.parser_state = parser_state
        self.lexer_thread = lexer_thread
        self.result = None

    @property
    def lexer_state(self) -> LexerThread:
        """Deprecated alias of ``lexer_thread``; emits a DeprecationWarning."""
        warnings.warn("lexer_state will be removed in subsequent releases. Use lexer_thread instead.", DeprecationWarning)
        return self.lexer_thread

    def feed_token(self, token: Token):
        """Advance the parser with ``token``, as though the lexer had produced it.

        ``token`` must be an instance of ``Token``. A token of type ``'$END'``
        is fed as the end-of-input marker.
        """
        is_end = token.type == '$END'
        return self.parser_state.feed_token(token, is_end)

    def iter_parse(self) -> Iterator[Token]:
        """Lex and parse lazily, yielding each token before it is fed to the parser.

        After every token is fed, ``InteractiveParser.result`` is updated with
        whatever ``feed_token`` returned; once the parse is over, the resulting
        tree can be found there.
        """
        for lexed in self.lexer_thread.lex(self.parser_state):
            yield lexed
            self.result = self.feed_token(lexed)

    def exhaust_lexer(self) -> List[Token]:
        """Feed everything the lexer still has into the parser; return the tokens seen.

        Modifies the instance in place. Does not feed an '$END' Token.
        """
        return [lexed for lexed in self.iter_parse()]


    def feed_eof(self, last_token=None):
        """Feed a '$END' Token. Borrows from 'last_token' if given."""
        if last_token is not None:
            eof = Token.new_borrow_pos('$END', '', last_token)
        else:
            eof = self.lexer_thread._Token('$END', '', 0, 1, 1)
        return self.feed_token(eof)


    def __copy__(self):
        """Return an independent interactive parser (see ``copy``).

        Calls to feed_token() won't affect the old instance, and vice-versa.
        """
        return self.copy()

    def copy(self, deepcopy_values=True):
        """Clone this parser: same ``parser``, copied parser state and lexer thread."""
        cls = type(self)
        return cls(
            self.parser,
            self.parser_state.copy(deepcopy_values=deepcopy_values),
            copy(self.lexer_thread),
        )

    def __eq__(self, other):
        if isinstance(other, InteractiveParser):
            return self.parser_state == other.parser_state and self.lexer_thread == other.lexer_thread
        return False

    def as_immutable(self):
        """Convert to an ``ImmutableInteractiveParser``."""
        clone = copy(self)
        return ImmutableInteractiveParser(clone.parser, clone.parser_state, clone.lexer_thread)

    def pretty(self):
        """Render the output of ``choices()`` as human-readable text."""
        lines = ["Parser choices:"]
        lines.extend('\t- %s -> %r' % (name, action) for name, action in self.choices().items())
        lines.append('stack size: %s' % len(self.parser_state.state_stack))
        return '\n'.join(lines)

    def choices(self):
        """Return the parse-table row of the current state: ``{symbol name: action}``.

        Only symbols accepted by the current state are present.

        Updated by ``feed_token()``.
        """
        state = self.parser_state
        table = state.parse_conf.parse_table
        return table.states[state.position]

    def accepts(self):
        """Returns the set of possible tokens that will advance the parser into a new valid state."""
        accepted = set()
        # Probe with callbacks disabled: they may have arbitrary side effects
        # and would only slow the probing down.
        silent_conf = copy(self.parser_state.parse_conf)
        silent_conf.callbacks = {}
        for name in self.choices():
            if not name.isupper():  # not a terminal
                continue
            probe = self.copy(deepcopy_values=False)
            probe.parser_state.parse_conf = silent_conf
            try:
                probe.feed_token(self.lexer_thread._Token(name, ''))
            except UnexpectedToken:
                continue
            accepted.add(name)
        return accepted

    def resume_parse(self):
        """Continue automated parsing from the current state."""
        last_token = self.lexer_thread.state.last_token
        return self.parser.parse_from_state(self.parser_state, last_token=last_token)
|
||||
|
||||
|
||||
|
||||
class ImmutableInteractiveParser(InteractiveParser):
    """Same as ``InteractiveParser``, but operations create a new instance instead
    of changing it in-place.
    """

    result = None

    def __hash__(self):
        # ParserState defines __eq__ without __hash__, which makes it unhashable,
        # so hashing it directly always raised TypeError. Hash the same fields
        # that ParserState.__eq__ compares (stack depth and current position),
        # which keeps the hash consistent with InteractiveParser.__eq__.
        # NOTE(review): this still relies on lexer_thread being hashable.
        state = self.parser_state
        return hash((len(state.state_stack), state.position, self.lexer_thread))

    def feed_token(self, token):
        """Return a copy of this parser advanced by ``token``.

        The copy's ``result`` holds whatever feeding the token returned; this
        instance is left untouched.
        """
        c = copy(self)
        c.result = InteractiveParser.feed_token(c, token)
        return c

    def exhaust_lexer(self):
        """Try to feed the rest of the lexer state into the parser.

        Note that this returns a new ImmutableInteractiveParser and does not feed an '$END' Token"""
        cursor = self.as_mutable()
        cursor.exhaust_lexer()
        return cursor.as_immutable()

    def as_mutable(self):
        """Convert to an ``InteractiveParser``."""
        p = copy(self)
        return InteractiveParser(p.parser, p.parser_state, p.lexer_thread)
|
||||
|
||||
###}
|
||||
@@ -0,0 +1,122 @@
|
||||
"""This module implements a LALR(1) Parser
|
||||
"""
|
||||
# Author: Erez Shinan (2017)
|
||||
# Email : erezshin@gmail.com
|
||||
from typing import Dict, Any, Optional
|
||||
from ..lexer import Token, LexerThread
|
||||
from ..utils import Serialize
|
||||
from ..common import ParserConf, ParserCallbacks
|
||||
|
||||
from .lalr_analysis import LALR_Analyzer, IntParseTable, ParseTableBase
|
||||
from .lalr_interactive_parser import InteractiveParser
|
||||
from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken
|
||||
from .lalr_parser_state import ParserState, ParseConf
|
||||
|
||||
###{standalone
|
||||
|
||||
class LALR_Parser(Serialize):
    """LALR(1) parser front-end: builds (or deserializes) the parse table and
    delegates the actual parsing to a ``_Parser``.
    """
    def __init__(self, parser_conf: ParserConf, debug: bool=False, strict: bool=False):
        # Run the grammar analysis up-front; analysis.parse_table holds the result
        analysis = LALR_Analyzer(parser_conf, debug=debug, strict=strict)
        analysis.compute_lalr()
        callbacks = parser_conf.callbacks

        self._parse_table = analysis.parse_table
        self.parser_conf = parser_conf
        self.parser = _Parser(analysis.parse_table, callbacks, debug)

    @classmethod
    def deserialize(cls, data, memo, callbacks, debug=False):
        """Rebuild a parser from serialized table ``data``, skipping grammar analysis.

        Note that ``__init__`` is bypassed, so ``parser_conf`` is not set on the
        returned instance.
        """
        inst = cls.__new__(cls)
        inst._parse_table = IntParseTable.deserialize(data, memo)
        inst.parser = _Parser(inst._parse_table, callbacks, debug)
        return inst

    def serialize(self, memo: Any = None) -> Dict[str, Any]:
        """Serialize the parse table (see ``ParseTableBase.serialize``)."""
        return self._parse_table.serialize(memo)

    def parse_interactive(self, lexer: LexerThread, start: str):
        """Return an ``InteractiveParser`` positioned at the start of the input."""
        return self.parser.parse(lexer, start, start_interactive=True)

    def parse(self, lexer, start, on_error=None):
        """Parse the input, optionally recovering from errors through ``on_error``.

        Parameters:
            lexer - the lexer thread supplying tokens.
            start - name of the start rule.
            on_error - optional callable taking the ``UnexpectedInput`` error.
                If it returns a falsy value the error is raised; otherwise
                parsing resumes from the error's ``interactive_parser``.
        """
        try:
            return self.parser.parse(lexer, start)
        except UnexpectedInput as e:
            if on_error is None:
                raise

            # Keep resuming until the parse succeeds or on_error gives up.
            # `e` is rebound below to each subsequent error.
            while True:
                if isinstance(e, UnexpectedCharacters):
                    # Remember the lexer position, to detect below whether on_error moved it
                    s = e.interactive_parser.lexer_thread.state
                    p = s.line_ctr.char_pos

                if not on_error(e):
                    raise e

                if isinstance(e, UnexpectedCharacters):
                    # If user didn't change the character position, then we should
                    # feed the offending character to the line counter ourselves
                    # (presumably to step past it, so the same error isn't hit again)
                    if p == s.line_ctr.char_pos:
                        s.line_ctr.feed(s.text.text[p:p+1])

                try:
                    return e.interactive_parser.resume_parse()
                except UnexpectedToken as e2:
                    if (isinstance(e, UnexpectedToken)
                        and e.token.type == e2.token.type == '$END'
                        and e.interactive_parser == e2.interactive_parser):
                        # Prevent infinite loop
                        raise e2
                    e = e2
                except UnexpectedCharacters as e2:
                    e = e2
|
||||
|
||||
|
||||
class _Parser:
    """The LALR parsing loop, driven by a parse table and a set of callbacks."""
    parse_table: ParseTableBase
    callbacks: ParserCallbacks
    debug: bool

    def __init__(self, parse_table: ParseTableBase, callbacks: ParserCallbacks, debug: bool=False):
        self.parse_table = parse_table
        self.callbacks = callbacks
        self.debug = debug

    def parse(self, lexer: LexerThread, start: str, value_stack=None, state_stack=None, start_interactive=False):
        """Parse the lexer's input from the ``start`` rule.

        With ``start_interactive`` set, nothing is parsed yet; an
        ``InteractiveParser`` wrapping the initial state is returned instead.
        """
        parse_conf = ParseConf(self.parse_table, self.callbacks, start)
        parser_state = ParserState(parse_conf, lexer, state_stack, value_stack)
        if start_interactive:
            return InteractiveParser(self, parser_state, parser_state.lexer)
        return self.parse_from_state(parser_state)


    def parse_from_state(self, state: ParserState, last_token: Optional[Token]=None):
        """Run the main LALR parser loop

        Parameters:
            state - the initial state. Changed in-place.
            last_token - Used only for line information in case of an empty lexer.

        Raises:
            UnexpectedInput - re-raised with ``interactive_parser`` attached when possible.
        """
        try:
            token = last_token
            for token in state.lexer.lex(state):
                assert token is not None
                state.feed_token(token)

            # Compare against None explicitly (as InteractiveParser.feed_eof does):
            # a token's truthiness would depend on its text, and an empty-valued
            # token must still lend its position to '$END'.
            if token is not None:
                end_token = Token.new_borrow_pos('$END', '', token)
            else:
                end_token = Token('$END', '', 0, 1, 1)
            return state.feed_token(end_token, True)
        except UnexpectedInput as e:
            try:
                e.interactive_parser = InteractiveParser(self, state, state.lexer)
            except NameError:
                # InteractiveParser isn't defined in this build; raise without it
                # NOTE(review): presumably the standalone parser - confirm
                pass
            raise e
        except Exception:
            if self.debug:
                print("")
                print("STATE STACK DUMP")
                print("----------------")
                for i, s in enumerate(state.state_stack):
                    print('%d)' % i , s)
                print("")

            raise
|
||||
###}
|
||||
@@ -0,0 +1,110 @@
|
||||
from copy import deepcopy, copy
|
||||
from typing import Dict, Any, Generic, List
|
||||
from ..lexer import Token, LexerThread
|
||||
from ..common import ParserCallbacks
|
||||
|
||||
from .lalr_analysis import Shift, ParseTableBase, StateT
|
||||
from lark.exceptions import UnexpectedToken
|
||||
|
||||
###{standalone
|
||||
|
||||
class ParseConf(Generic[StateT]):
    """Per-parse configuration: the parse table, the callbacks, and the
    start/end states that belong to the chosen start rule.
    """
    __slots__ = 'parse_table', 'callbacks', 'start', 'start_state', 'end_state', 'states'

    parse_table: ParseTableBase[StateT]
    callbacks: ParserCallbacks
    start: str

    start_state: StateT
    end_state: StateT
    states: Dict[StateT, Dict[str, tuple]]

    def __init__(self, parse_table: ParseTableBase[StateT], callbacks: ParserCallbacks, start: str):
        self.parse_table = parse_table
        self.callbacks = callbacks
        self.start = start

        # Values looked up once here, so the parsing loop can read them directly
        self.start_state = parse_table.start_states[start]
        self.end_state = parse_table.end_states[start]
        self.states = parse_table.states
|
||||
|
||||
class ParserState(Generic[StateT]):
    """Mutable state of one LALR parse: the state stack and the value stack."""
    __slots__ = 'parse_conf', 'lexer', 'state_stack', 'value_stack'

    parse_conf: ParseConf[StateT]
    lexer: LexerThread
    state_stack: List[StateT]
    value_stack: list

    def __init__(self, parse_conf: ParseConf[StateT], lexer: LexerThread, state_stack=None, value_stack=None):
        self.parse_conf = parse_conf
        self.lexer = lexer
        # A missing (or empty) stack means "start from the beginning"
        if not state_stack:
            state_stack = [self.parse_conf.start_state]
        self.state_stack = state_stack
        if not value_stack:
            value_stack = []
        self.value_stack = value_stack

    @property
    def position(self) -> StateT:
        """The current parser state, i.e. the top of the state stack."""
        return self.state_stack[-1]

    # Necessary for match_examples() to work
    def __eq__(self, other) -> bool:
        if isinstance(other, ParserState):
            same_depth = len(self.state_stack) == len(other.state_stack)
            return same_depth and self.position == other.position
        return NotImplemented

    def __copy__(self):
        return self.copy()

    def copy(self, deepcopy_values=True) -> 'ParserState[StateT]':
        """Return a new state with its own stacks.

        The value stack is deep-copied unless ``deepcopy_values`` is false, in
        which case it is copied shallowly. ``parse_conf`` and ``lexer`` are shared.
        """
        stack_clone = copy(self.state_stack)
        if deepcopy_values:
            values_clone = deepcopy(self.value_stack)
        else:
            values_clone = copy(self.value_stack)
        return type(self)(
            self.parse_conf,
            self.lexer, # XXX copy
            stack_clone,
            values_clone,
        )

    def feed_token(self, token: Token, is_end=False) -> Any:
        """Feed one token to the parser, performing reductions until it is shifted.

        Returns None after a shift. When ``is_end`` is set and a reduction
        reaches the end state, returns the value on top of the value stack.

        Raises UnexpectedToken if the current state has no action for the token.
        """
        conf = self.parse_conf
        stack = self.state_stack
        values = self.value_stack
        table = conf.states
        final_state = conf.end_state
        callbacks = conf.callbacks

        while True:
            current = stack[-1]
            try:
                action, arg = table[current][token.type]
            except KeyError:
                expected = {name for name in table[current].keys() if name.isupper()}
                raise UnexpectedToken(token, expected, state=self, interactive_parser=None)

            assert arg != final_state

            if action is Shift:
                # shift once and return
                assert not is_end
                stack.append(arg)
                if token.type in callbacks:
                    values.append(callbacks[token.type](token))
                else:
                    values.append(token)
                return

            # reduce+shift as many times as necessary
            rule = arg
            size = len(rule.expansion)
            if size:
                children = values[-size:]
                del stack[-size:]
                del values[-size:]
            else:
                children = []

            value = callbacks[rule](children) if callbacks else children

            _action, new_state = table[stack[-1]][rule.origin.name]
            assert _action is Shift
            stack.append(new_state)
            values.append(value)

            if is_end and stack[-1] == final_state:
                return values[-1]
|
||||
###}
|
||||
@@ -0,0 +1,166 @@
|
||||
"""This module implements an Earley parser with a dynamic lexer
|
||||
|
||||
The core Earley algorithm used here is based on Elizabeth Scott's implementation, here:
|
||||
https://www.sciencedirect.com/science/article/pii/S1571066108001497
|
||||
|
||||
That is probably the best reference for understanding the algorithm here.
|
||||
|
||||
The Earley parser outputs an SPPF-tree as per that document. The SPPF tree format
|
||||
is better documented here:
|
||||
http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/
|
||||
|
||||
Instead of running a lexer beforehand, or using a costly char-by-char method, this parser
|
||||
uses regular expressions by necessity, achieving high-performance while maintaining all of
|
||||
Earley's power in parsing any CFG.
|
||||
"""
|
||||
|
||||
from typing import TYPE_CHECKING, Callable, Optional, List, Any
|
||||
from collections import defaultdict
|
||||
|
||||
from ..tree import Tree
|
||||
from ..exceptions import UnexpectedCharacters
|
||||
from ..lexer import Token
|
||||
from ..grammar import Terminal
|
||||
from .earley import Parser as BaseParser
|
||||
from .earley_forest import TokenNode
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from ..common import LexerConf, ParserConf
|
||||
|
||||
class Parser(BaseParser):
    """Earley parser whose scanner matches terminals directly against the input
    text (through ``term_matcher``) instead of consuming a pre-lexed token stream.
    """
    def __init__(self, lexer_conf: 'LexerConf', parser_conf: 'ParserConf', term_matcher: Callable,
                 resolve_ambiguity: bool=True, complete_lex: bool=False, debug: bool=False,
                 tree_class: Optional[Callable[[str, List], Any]]=Tree, ordered_sets: bool=True):
        BaseParser.__init__(self, lexer_conf, parser_conf, term_matcher, resolve_ambiguity,
                            debug, tree_class, ordered_sets)
        # Terminals that may be skipped between tokens (see step 3 of scan())
        self.ignore = [Terminal(t) for t in lexer_conf.ignore]
        # When set, every shorter prefix of a match is also tried as a match
        self.complete_lex = complete_lex

    def _parse(self, stream, columns, to_scan, start_symbol=None):
        """Run the Earley loop over ``stream``, one element per step.

        Parameters:
            stream - the input; iterated element by element and also passed,
                with the current index, to ``term_matcher``.
            columns - list of Earley sets; one new set is appended per step.
            to_scan - the items expecting a terminal at position 0.
            start_symbol - items of this symbol that are already complete are
                carried over ignored text (see step 3 of scan()).

        Returns the to_scan set of the final column.
        """

        def scan(i, to_scan):
            """The core Earley Scanner.

            This is a custom implementation of the scanner that uses the
            Lark lexer to match tokens. The scan list is built by the
            Earley predictor, based on the previously completed tokens.
            This ensures that at each phase of the parse we have a custom
            lexer context, allowing for more complex ambiguities."""

            node_cache = {}

            # 1) Loop the expectations and ask the lexer to match.
            # Since regexp is forward looking on the input stream, and we only
            # want to process tokens when we hit the point in the stream at which
            # they complete, we push all tokens into a buffer (delayed_matches), to
            # be held possibly for a later parse step when we reach the point in the
            # input stream at which they complete.
            for item in self.Set(to_scan):
                m = match(item.expect, stream, i)
                if m:
                    t = Token(item.expect.name, m.group(0), i, text_line, text_column)
                    delayed_matches[m.end()].append( (item, i, t) )

                    if self.complete_lex:
                        # Also register every shorter prefix of the match that matches on its own
                        s = m.group(0)
                        for j in range(1, len(s)):
                            m = match(item.expect, s[:-j])
                            if m:
                                t = Token(item.expect.name, m.group(0), i, text_line, text_column)
                                # m is relative to the substring here, hence the i+ offset
                                delayed_matches[i+m.end()].append( (item, i, t) )

                    # XXX The following 3 lines were commented out for causing a bug. See issue #768
                    # # Remove any items that successfully matched in this pass from the to_scan buffer.
                    # # This ensures we don't carry over tokens that already matched, if we're ignoring below.
                    # to_scan.remove(item)

            # 3) Process any ignores. This is typically used for e.g. whitespace.
            # We carry over any unmatched items from the to_scan buffer to be matched again after
            # the ignore. This should allow us to use ignored symbols in non-terminals to implement
            # e.g. mandatory spacing.
            for x in self.ignore:
                m = match(x, stream, i)
                if m:
                    # Carry over any items still in the scan buffer, to past the end of the ignored items.
                    delayed_matches[m.end()].extend([(item, i, None) for item in to_scan ])

                    # If we're ignoring up to the end of the file, # carry over the start symbol if it already completed.
                    delayed_matches[m.end()].extend([(item, i, None) for item in columns[i] if item.is_complete and item.s == start_symbol])

            next_to_scan = self.Set()
            next_set = self.Set()
            columns.append(next_set)
            transitives.append({})

            ## 4) Process Tokens from delayed_matches.
            # This is the core of the Earley scanner. Create an SPPF node for each Token,
            # and create the symbol node in the SPPF tree. Advance the item that completed,
            # and add the resulting new item to either the Earley set (for processing by the
            # completer/predictor) or the to_scan buffer for the next parse step.
            for item, start, token in delayed_matches[i+1]:
                if token is not None:
                    token.end_line = text_line
                    token.end_column = text_column + 1
                    token.end_pos = i + 1

                    new_item = item.advance()
                    label = (new_item.s, new_item.start, i + 1)
                    token_node = TokenNode(token, terminals[token.type])
                    new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
                    new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token_node)
                else:
                    # token is None for items carried over ignored text: keep them as they are
                    new_item = item

                if new_item.expect in self.TERMINALS:
                    # add (B ::= Aai+1.B, h, y) to Q'
                    next_to_scan.add(new_item)
                else:
                    # add (B ::= Aa+1.B, h, y) to Ei+1
                    next_set.add(new_item)

            del delayed_matches[i+1]    # No longer needed, so unburden memory

            # Nothing completed here and nothing is pending for later positions: the input can't be parsed
            if not next_set and not delayed_matches and not next_to_scan:
                considered_rules = list(sorted(to_scan, key=lambda key: key.rule.origin.name))
                raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect.name for item in to_scan},
                                           set(to_scan), state=frozenset(i.s for i in to_scan),
                                           considered_rules=considered_rules
                                           )

            return next_to_scan, node_cache


        # Matches waiting for the parse to reach their end position, keyed by that position
        delayed_matches = defaultdict(list)
        match = self.term_matcher
        terminals = self.lexer_conf.terminals_by_name

        # Cache for nodes & tokens created in a particular parse step.
        transitives = [{}]

        text_line = 1
        text_column = 1

        ## The main Earley loop.
        # Run the Prediction/Completion cycle for any Items in the current Earley set.
        # Completions will be added to the SPPF tree, and predictions will be recursively
        # processed down to terminals/empty nodes to be added to the scanner for the next
        # step.
        i = 0
        node_cache = {}
        for token in stream:
            self.predict_and_complete(i, to_scan, columns, transitives, node_cache)

            to_scan, node_cache = scan(i, to_scan)

            # Track line/column for the tokens created in scan()
            if token == '\n':
                text_line += 1
                text_column = 1
            else:
                text_column += 1
            i += 1

        self.predict_and_complete(i, to_scan, columns, transitives, node_cache)

        ## Column is now the final column in the parse.
        assert i == len(columns)-1
        return to_scan
|
||||
@@ -0,0 +1,107 @@
|
||||
"""This is an experimental tool for reconstructing text from a shaped tree, based on a Lark grammar.
|
||||
"""
|
||||
|
||||
from typing import Dict, Callable, Iterable, Optional
|
||||
|
||||
from .lark import Lark
|
||||
from .tree import Tree, ParseTree
|
||||
from .visitors import Transformer_InPlace
|
||||
from .lexer import Token, PatternStr, TerminalDef
|
||||
from .grammar import Terminal, NonTerminal, Symbol
|
||||
|
||||
from .tree_matcher import TreeMatcher, is_discarded_terminal
|
||||
from .utils import is_id_continue
|
||||
|
||||
def is_iter_empty(i):
    """Report whether the iterator ``i`` has no further elements.

    Probing advances ``i``: when it is not empty, one element is consumed
    and discarded.
    """
    exhausted = object()  # unique marker that no real element can equal
    return next(i, exhausted) is exhausted
|
||||
|
||||
|
||||
class WriteTokensTransformer(Transformer_InPlace):
    """Re-insert filtered-out terminals at the positions given by the matched expansion.

    Attributes:
        tokens: terminal definitions, keyed by terminal name.
        term_subs: callables keyed by terminal name; each is called with the
            symbol and returns the text to emit for that discarded terminal.
    """

    tokens: Dict[str, TerminalDef]
    term_subs: Dict[str, Callable[[Symbol], str]]

    def __init__(self, tokens: Dict[str, TerminalDef], term_subs: Dict[str, Callable[[Symbol], str]]) -> None:
        self.tokens = tokens
        self.term_subs = term_subs

    def __default__(self, data, children, meta):
        """Flatten a matched node into a list of texts, tokens and subtrees.

        Nodes whose meta is not flagged ``match_tree`` are rebuilt as plain trees.
        """
        if not getattr(meta, 'match_tree', False):
            return Tree(data, children)

        pending = iter(children)
        written = []
        for symbol in meta.orig_expansion:
            if not is_discarded_terminal(symbol):
                # A kept symbol corresponds to the next child, in order.
                child = next(pending)
                if isinstance(child, list):
                    # Already-flattened output of a nested match: splice it in.
                    written.extend(child)
                    continue
                if isinstance(child, Token):
                    assert Terminal(child.type) == symbol, child
                else:
                    assert NonTerminal(child.data) == symbol, (symbol, child)
                written.append(child)
                continue

            # A discarded terminal: prefer the user substitution, else fall
            # back to the literal value of a plain-string pattern.
            try:
                text = self.term_subs[symbol.name](symbol)
            except KeyError:
                terminal = self.tokens[symbol.name]
                if not isinstance(terminal.pattern, PatternStr):
                    raise NotImplementedError("Reconstructing regexps not supported yet: %s" % terminal)

                text = terminal.pattern.value
            written.append(text)

        # Every child must have been consumed by the expansion.
        assert is_iter_empty(pending)
        return written
|
||||
|
||||
|
||||
class Reconstructor(TreeMatcher):
    """
    A Reconstructor that will, given a full parse Tree, generate source code.

    Note:
        Values cannot be generated from regexps. To emit discarded regexp
        terminals (newlines, for instance), supply replacements through
        `term_subs`.

    Parameters:
        parser: a Lark instance
        term_subs: a dictionary of [Terminal name as str] to a callable that
            receives the symbol and returns the output text as str
    """

    write_tokens: WriteTokensTransformer

    def __init__(self, parser: Lark, term_subs: Optional[Dict[str, Callable[[Symbol], str]]]=None) -> None:
        TreeMatcher.__init__(self, parser)

        terminals_by_name = {t.name: t for t in self.tokens}
        self.write_tokens = WriteTokensTransformer(terminals_by_name, term_subs or {})

    def _reconstruct(self, tree):
        """Yield the output pieces for ``tree``, recursing into nested subtrees."""
        matched = self.match_tree(tree, tree.data)

        for piece in self.write_tokens.transform(matched):
            if not isinstance(piece, Tree):
                yield piece
                continue
            # TODO use orig_expansion.rulename to support templates
            yield from self._reconstruct(piece)

    def reconstruct(self, tree: ParseTree, postproc: Optional[Callable[[Iterable[str]], Iterable[str]]]=None, insert_spaces: bool=True) -> str:
        """Return the text reconstructed from ``tree``.

        Parameters:
            tree: the parse tree to turn back into text
            postproc: optional callable applied to the stream of pieces before joining
            insert_spaces: when true, put a single space between two adjacent
                pieces that would otherwise join identifier characters
        """
        pieces = self._reconstruct(tree)
        if postproc:
            pieces = postproc(pieces)

        joined = []
        previous = ''
        for piece in pieces:
            needs_space = (
                insert_spaces
                and previous
                and piece
                and is_id_continue(previous[-1])
                and is_id_continue(piece[0])
            )
            if needs_space:
                joined.append(' ')
            joined.append(piece)
            previous = piece
        return ''.join(joined)
|
||||
@@ -0,0 +1,70 @@
|
||||
import sys
|
||||
from argparse import ArgumentParser, FileType
|
||||
from textwrap import indent
|
||||
from logging import DEBUG, INFO, WARN, ERROR
|
||||
from typing import Optional
|
||||
import warnings
|
||||
|
||||
from lark import Lark, logger
|
||||
try:
|
||||
from interegular import logger as interegular_logger
|
||||
has_interegular = True
|
||||
except ImportError:
|
||||
has_interegular = False
|
||||
|
||||
# Base argument parser shared by the LALR command-line tools (used via `parents=[...]`).
lalr_argparser = ArgumentParser(add_help=False, epilog='Look at the Lark documentation for more info on the options')

# Boolean switches: either a bare long name, or a (short, long) pair.
flags = [
    ('d', 'debug'),
    'keep_all_tokens',
    'regex',
    'propagate_positions',
    'maybe_placeholders',
    'use_bytes'
]

# Namespace attributes that build_lalr() forwards as keyword options.
options = ['start', 'lexer']

lalr_argparser.add_argument('-v', '--verbose', action='count', default=0, help="Increase Logger output level, up to three times")
lalr_argparser.add_argument('-s', '--start', action='append', default=[])
lalr_argparser.add_argument('-l', '--lexer', default='contextual', choices=('basic', 'contextual'))
lalr_argparser.add_argument('-o', '--out', type=FileType('w', encoding='utf-8'), default=sys.stdout, help='the output file (default=stdout)')
lalr_argparser.add_argument('grammar_file', type=FileType('r', encoding='utf-8'), help='A valid .lark file')


def _register_flag(flag):
    """Add one boolean switch to `lalr_argparser` and record its name in `options`."""
    if isinstance(flag, tuple):
        option_strings = ('-' + flag[0], '--' + flag[1])
        long_name = flag[1]
    elif isinstance(flag, str):
        option_strings = ('--' + flag,)
        long_name = flag
    else:
        raise NotImplementedError("flags must only contain strings or tuples of strings")
    options.append(long_name)
    lalr_argparser.add_argument(*option_strings, action='store_true')


for flag in flags:
    _register_flag(flag)
|
||||
|
||||
|
||||
def build_lalr(namespace):
    """Create an LALR ``Lark`` instance from parsed command-line arguments.

    Sets the logger level from ``namespace.verbose`` (capped at three steps),
    defaults the start rule to ``'start'`` when none was given, and forwards
    every attribute named in ``options`` as a keyword argument.

    Returns:
        A ``(lark_instance, output_file)`` tuple.
    """
    levels = (ERROR, WARN, INFO, DEBUG)
    logger.setLevel(levels[min(namespace.verbose, 3)])
    if has_interegular:
        # Keep interegular's logger in step with Lark's.
        interegular_logger.setLevel(logger.getEffectiveLevel())

    if len(namespace.start) == 0:
        namespace.start.append('start')

    lark_kwargs = {}
    for option_name in options:
        lark_kwargs[option_name] = getattr(namespace, option_name)

    lark_inst = Lark(namespace.grammar_file, parser='lalr', **lark_kwargs)
    return lark_inst, namespace.out
|
||||
|
||||
|
||||
def showwarning_as_comment(message, category, filename, lineno, file=None, line=None):
    """Write a formatted warning with every line prefixed by ``'# '``.

    Drop-in replacement for ``warnings.showwarning``. The text goes to
    ``file``, or to ``sys.stderr`` when ``file`` is None; nothing is written
    if no stream is available, and ``OSError`` from the write is ignored.
    """
    # Based on warnings._showwarnmsg_impl
    commented = indent(warnings.formatwarning(message, category, filename, lineno, line), '# ')

    stream = sys.stderr if file is None else file
    if stream is None:
        return

    try:
        stream.write(commented)
    except OSError:
        pass
|
||||
|
||||
|
||||
def make_warnings_comments():
    """Install `showwarning_as_comment` as the process-wide ``warnings.showwarning`` hook."""
    warnings.showwarning = showwarning_as_comment
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,202 @@
|
||||
"Converts Nearley grammars to Lark"
|
||||
|
||||
import os.path
|
||||
import sys
|
||||
import codecs
|
||||
import argparse
|
||||
|
||||
|
||||
from lark import Lark, Transformer, v_args
|
||||
|
||||
nearley_grammar = r"""
|
||||
start: (ruledef|directive)+
|
||||
|
||||
directive: "@" NAME (STRING|NAME)
|
||||
| "@" JS -> js_code
|
||||
ruledef: NAME "->" expansions
|
||||
| NAME REGEXP "->" expansions -> macro
|
||||
expansions: expansion ("|" expansion)*
|
||||
|
||||
expansion: expr+ js
|
||||
|
||||
?expr: item (":" /[+*?]/)?
|
||||
|
||||
?item: rule|string|regexp|null
|
||||
| "(" expansions ")"
|
||||
|
||||
rule: NAME
|
||||
string: STRING
|
||||
regexp: REGEXP
|
||||
null: "null"
|
||||
JS: /{%.*?%}/s
|
||||
js: JS?
|
||||
|
||||
NAME: /[a-zA-Z_$]\w*/
|
||||
COMMENT: /#[^\n]*/
|
||||
REGEXP: /\[.*?\]/
|
||||
|
||||
STRING: _STRING "i"?
|
||||
|
||||
%import common.ESCAPED_STRING -> _STRING
|
||||
%import common.WS
|
||||
%ignore WS
|
||||
%ignore COMMENT
|
||||
|
||||
"""
|
||||
|
||||
nearley_grammar_parser = Lark(nearley_grammar, parser='earley', lexer='basic')
|
||||
|
||||
def _get_rulename(name):
|
||||
name = {'_': '_ws_maybe', '__': '_ws'}.get(name, name)
|
||||
return 'n_' + name.replace('$', '__DOLLAR__').lower()
|
||||
|
||||
@v_args(inline=True)
class NearleyToLark(Transformer):
    """Transform a parsed Nearley grammar tree into Lark grammar text.

    Attributes:
        extra_rules: generated helper rules, name -> rule text.
        extra_rules_rev: reverse lookup of ``extra_rules``, rule text -> name.
        alias_js_code: generated alias name -> JavaScript post-processing code.
    """

    def __init__(self):
        self._count = 0
        self.extra_rules = {}
        self.extra_rules_rev = {}
        self.alias_js_code = {}

    def _new_function(self, code):
        """Store ``code`` under a freshly numbered alias and return the alias."""
        alias = 'alias_%d' % self._count
        self._count += 1

        self.alias_js_code[alias] = code
        return alias

    def _extra_rule(self, rule):
        """Return the helper-rule name for ``rule``, creating the rule on first use."""
        try:
            return self.extra_rules_rev[rule]
        except KeyError:
            pass

        new_name = 'xrule_%d' % len(self.extra_rules)
        assert new_name not in self.extra_rules
        self.extra_rules[new_name] = rule
        self.extra_rules_rev[rule] = new_name
        return new_name

    def rule(self, name):
        return _get_rulename(name)

    def ruledef(self, name, exps):
        lark_name = _get_rulename(name)
        return '!%s: %s' % (lark_name, exps)

    def expr(self, item, op):
        return self._extra_rule('(%s)%s' % (item, op))

    def regexp(self, r):
        return '/%s/' % r

    def null(self):
        return ''

    def string(self, s):
        return self._extra_rule(s)

    def expansion(self, *x):
        # The last child is always the (possibly empty) `js` node.
        x, js = x[:-1], x[-1]
        body = ' '.join(x)
        if not js.children:
            return body

        (js_token,) = js.children
        # Drop the surrounding `{%` and `%}` delimiters.
        alias = self._new_function(js_token[2:-2])
        return body + '-> ' + alias

    def expansions(self, *x):
        return '%s' % ('\n |'.join(x))

    def start(self, *rules):
        non_empty = filter(None, rules)
        return '\n'.join(non_empty)
|
||||
|
||||
def _nearley_to_lark(g, builtin_path, n2l, js_code, folder_path, includes):
    """Convert Nearley grammar text into a list of Lark rule definitions.

    Parameters:
        g: Nearley grammar source text.
        builtin_path: directory searched for ``@builtin`` files.
        n2l: transformer applied to each rule definition.
        js_code: list that collected top-level JavaScript snippets are appended to.
        folder_path: directory searched for ``@include`` files.
        includes: set of already-included paths; updated in place so each
            file is processed only once.
    """
    collected = []

    parsed = nearley_grammar_parser.parse(g)
    for statement in parsed.children:
        if statement.data == 'directive':
            directive, arg = statement.children
            if directive not in ('builtin', 'include'):
                assert False, directive
                continue

            base_dir = folder_path
            if directive == 'builtin':
                base_dir = builtin_path
            # `arg` still carries its surrounding quotes.
            path = os.path.join(base_dir, arg[1:-1])
            if path in includes:
                continue

            includes.add(path)
            with codecs.open(path, encoding='utf8') as f:
                included_text = f.read()
            included_dir = os.path.abspath(os.path.dirname(path))
            collected += _nearley_to_lark(included_text, builtin_path, n2l, js_code, included_dir, includes)
        elif statement.data == 'js_code':
            (snippet,) = statement.children
            # Drop the surrounding `{%` and `%}` delimiters.
            js_code.append(snippet[2:-2])
        elif statement.data == 'macro':
            pass  # TODO Add support for macros!
        elif statement.data == 'ruledef':
            collected.append(n2l.transform(statement))
        else:
            raise Exception("Unknown statement: %s" % statement)

    return collected
|
||||
|
||||
|
||||
def create_code_for_nearley_grammar(g, start, builtin_path, folder_path, es6=False):
    """Return Python source for a Lark-based parser equivalent to Nearley grammar ``g``.

    Parameters:
        g: Nearley grammar source text.
        start: Nearley rule used as the start rule.
        builtin_path: directory searched for ``@builtin`` files.
        folder_path: directory searched for ``@include`` files.
        es6: translate the JavaScript with ``js2py.translate_js6`` instead of
            ``js2py.translate_js``.
    """
    import js2py

    chunks = []

    def emit(x=None):
        # Each call produces one output line; a falsy `x` gives an empty line.
        if x:
            chunks.append(x)
        chunks.append('\n')

    js_code = ['function id(x) {return x[0];}']
    n2l = NearleyToLark()
    rule_defs = _nearley_to_lark(g, builtin_path, n2l, js_code, folder_path, set())

    extra_defs = '\n'.join('!%s: %s' % pair for pair in n2l.extra_rules.items())
    lark_g = '\n'.join(rule_defs) + ('\n' + extra_defs)

    emit('from lark import Lark, Transformer')
    emit()
    emit('grammar = ' + repr(lark_g))
    emit()

    js_code.extend('%s = (%s);' % (alias, code) for alias, code in n2l.alias_js_code.items())

    js_source = '\n'.join(js_code)
    if es6:
        emit(js2py.translate_js6(js_source))
    else:
        emit(js2py.translate_js(js_source))

    emit('class TransformNearley(Transformer):')
    for alias in n2l.alias_js_code:
        emit(" %s = var.get('%s').to_python()" % (alias, alias))
    emit(" __default__ = lambda self, n, c, m: c if c else None")

    emit()
    emit('parser = Lark(grammar, start="n_%s", maybe_placeholders=False)' % start)
    emit('def parse(text):')
    emit(' return TransformNearley().transform(parser.parse(text))')

    return ''.join(chunks)
|
||||
|
||||
def main(fn, start, nearley_lib, es6=False):
    """Read the Nearley grammar file ``fn`` and return the generated Python source.

    Builtins are looked up in ``<nearley_lib>/builtin``; includes are resolved
    relative to the directory containing ``fn``.
    """
    with codecs.open(fn, encoding='utf8') as f:
        grammar = f.read()

    builtin_dir = os.path.join(nearley_lib, 'builtin')
    grammar_dir = os.path.abspath(os.path.dirname(fn))
    return create_code_for_nearley_grammar(grammar, start, builtin_dir, grammar_dir, es6=es6)
|
||||
|
||||
def get_arg_parser():
    """Build the command-line argument parser for the Nearley converter."""
    parser = argparse.ArgumentParser(description='Reads a Nearley grammar (with js functions), and outputs an equivalent lark parser.')

    positionals = (
        ('nearley_grammar', 'Path to the file containing the nearley grammar'),
        ('start_rule', 'Rule within the nearley grammar to make the base rule'),
        ('nearley_lib', 'Path to root directory of nearley codebase (used for including builtins)'),
    )
    for dest, description in positionals:
        parser.add_argument(dest, help=description)

    parser.add_argument('--es6', help='Enable experimental ES6 support', action='store_true')
    return parser
|
||||
|
||||
# Script entry point: with no arguments, print usage to stderr and exit with
# status 1; otherwise convert the grammar and print the generated code.
if __name__ == '__main__':
    parser = get_arg_parser()
    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)
    args = parser.parse_args()
    print(main(fn=args.nearley_grammar, start=args.start_rule, nearley_lib=args.nearley_lib, es6=args.es6))
|
||||
@@ -0,0 +1,32 @@
|
||||
import sys
|
||||
import json
|
||||
|
||||
from lark.grammar import Rule
|
||||
from lark.lexer import TerminalDef
|
||||
from lark.tools import lalr_argparser, build_lalr
|
||||
|
||||
import argparse
|
||||
|
||||
# Command-line parser for the serialize tool; inherits the shared LALR options.
argparser = argparse.ArgumentParser(prog='python -m lark.tools.serialize', parents=[lalr_argparser],
                                    description="Lark Serialization Tool - Stores Lark's internal state & LALR analysis as a JSON file",
                                    epilog='Look at the Lark documentation for more info on the options')
|
||||
|
||||
|
||||
def serialize(lark_inst, outfile):
    """Write the serialized state of ``lark_inst`` to ``outfile`` as a JSON object.

    The object has two members, ``"data"`` and ``"memo"``, taken from
    ``lark_inst.memo_serialize``.
    """
    data, memo = lark_inst.memo_serialize([TerminalDef, Rule])

    write = outfile.write
    write('{\n')
    write(' "data": %s,\n' % json.dumps(data))
    write(' "memo": %s\n' % json.dumps(memo))
    write('}\n')
|
||||
|
||||
|
||||
def main():
    """Run the serialize tool from the command line.

    With no arguments, prints usage to stderr and exits with status 1.
    """
    if len(sys.argv) == 1:
        argparser.print_help(sys.stderr)
        sys.exit(1)

    namespace = argparser.parse_args()
    lark_inst, outfile = build_lalr(namespace)
    serialize(lark_inst, outfile)
|
||||
|
||||
|
||||
# Run the tool only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|
||||
@@ -0,0 +1,196 @@
|
||||
###{standalone
|
||||
#
|
||||
#
|
||||
# Lark Stand-alone Generator Tool
|
||||
# ----------------------------------
|
||||
# Generates a stand-alone LALR(1) parser
|
||||
#
|
||||
# Git: https://github.com/erezsh/lark
|
||||
# Author: Erez Shinan (erezshin@gmail.com)
|
||||
#
|
||||
#
|
||||
# >>> LICENSE
|
||||
#
|
||||
# This tool and its generated code use a separate license from Lark,
|
||||
# and are subject to the terms of the Mozilla Public License, v. 2.0.
|
||||
# If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at https://mozilla.org/MPL/2.0/.
|
||||
#
|
||||
# If you wish to purchase a commercial license for this tool and its
|
||||
# generated code, you may contact me via email or otherwise.
|
||||
#
|
||||
# If MPL2 is incompatible with your free or open-source project,
|
||||
# contact me and we'll work it out.
|
||||
#
|
||||
#
|
||||
|
||||
from copy import deepcopy
|
||||
from abc import ABC, abstractmethod
|
||||
from types import ModuleType
|
||||
from typing import (
|
||||
TypeVar, Generic, Type, Tuple, List, Dict, Iterator, Collection, Callable, Optional, FrozenSet, Any,
|
||||
Union, Iterable, IO, TYPE_CHECKING, overload, Sequence,
|
||||
Pattern as REPattern, ClassVar, Set, Mapping
|
||||
)
|
||||
###}
|
||||
|
||||
import sys
|
||||
import token, tokenize
|
||||
import os
|
||||
from os import path
|
||||
from collections import defaultdict
|
||||
from functools import partial
|
||||
from argparse import ArgumentParser
|
||||
|
||||
import lark
|
||||
from lark.tools import lalr_argparser, build_lalr, make_warnings_comments
|
||||
|
||||
|
||||
from lark.grammar import Rule
|
||||
from lark.lexer import TerminalDef
|
||||
|
||||
# Directory of this module, and its parent directory, which the relative
# paths below are joined onto.
_dir = path.dirname(__file__)
_larkdir = path.join(_dir, path.pardir)


# Source files whose 'standalone' sections are copied into the generated
# parser, in output order. gen_standalone() leaves the first entry (this
# file) unstripped and strips docstrings/comments from the rest.
EXTRACT_STANDALONE_FILES = [
    'tools/standalone.py',
    'exceptions.py',
    'utils.py',
    'tree.py',
    'visitors.py',
    'grammar.py',
    'lexer.py',
    'common.py',
    'parse_tree_builder.py',
    'parsers/lalr_analysis.py',
    'parsers/lalr_parser_state.py',
    'parsers/lalr_parser.py',
    'parsers/lalr_interactive_parser.py',
    'parser_frontends.py',
    'lark.py',
    'indenter.py',
]
|
||||
|
||||
def extract_sections(lines):
    """Collect the text of named sections delimited by ``###{name`` / ``###}`` lines.

    Parameters:
        lines: iterable of text lines, line endings included.

    Returns:
        A dict mapping each section name to the concatenated lines found
        between its opening and closing markers. Sections that share a name
        are concatenated in order of appearance. Lines outside any section,
        and a section that is never closed, are dropped.

    Raises:
        ValueError: a line starts with ``###`` but is followed by neither
            ``{`` nor ``}``.
    """
    section = None
    text = []
    sections = defaultdict(list)
    for line in lines:
        if line.startswith('###'):
            # Slice rather than index, so a bare '###' with no trailing
            # newline is reported as a bad marker instead of an IndexError.
            marker = line[3:4]
            if marker == '{':
                section = line[4:].strip()
            elif marker == '}':
                sections[section] += text
                section = None
                text = []
            else:
                raise ValueError(line)
        elif section:
            text.append(line)

    return {name: ''.join(chunks) for name, chunks in sections.items()}
|
||||
|
||||
|
||||
def strip_docstrings(line_gen):
    """ Strip comments and docstrings from a file.

    ``line_gen`` is a readline-style callable handed to
    ``tokenize.generate_tokens``. A string token that directly follows an
    INDENT token (or is the very first token) is replaced with ``#--``;
    every comment is replaced with ``##`` plus a newline. All other tokens
    are emitted unchanged, with column gaps refilled with spaces.

    Based on code from: https://stackoverflow.com/questions/1769332/script-to-remove-python-comments-docstrings
    """
    pieces = []

    previous_kind = token.INDENT
    previous_end_line = -1
    previous_end_col = 0

    for tok in tokenize.generate_tokens(line_gen):
        kind, text = tok[0], tok[1]
        start_line, start_col = tok[2]
        end_line, end_col = tok[3]

        # A token on a later line is padded from column 0.
        if start_line > previous_end_line:
            previous_end_col = 0
        gap = start_col - previous_end_col
        if gap > 0:
            pieces.append(" " * gap)

        if kind == token.STRING and previous_kind == token.INDENT:
            # Docstring
            replacement = "#--"
        elif kind == tokenize.COMMENT:
            # Comment
            replacement = "##\n"
        else:
            replacement = text
        pieces.append(replacement)

        previous_kind = kind
        previous_end_col = end_col
        previous_end_line = end_line

    return ''.join(pieces)
|
||||
|
||||
|
||||
def gen_standalone(lark_inst, output=None, out=sys.stdout, compress=False):
    """Emit the source of a stand-alone parser for ``lark_inst``.

    Parameters:
        lark_inst: the Lark instance whose serialized state is embedded.
        output: callable that receives each chunk of generated code; defaults
            to ``print`` writing to ``out``.
        out: file object used by the default ``output``.
        compress: when true, embed DATA and MEMO as base64-encoded,
            zlib-compressed pickles that the generated code decodes itself.
    """
    if output is None:
        output = partial(print, file=out)

    import pickle, zlib, base64

    def compressed_output(obj):
        pickled = pickle.dumps(obj, pickle.HIGHEST_PROTOCOL)
        encoded = base64.b64encode(zlib.compress(pickled))
        output(repr(encoded))

    def output_decompress(name):
        output('%(name)s = pickle.loads(zlib.decompress(base64.b64decode(%(name)s)))' % {'name': name})

    version = lark.__version__
    output('# The file was automatically generated by Lark v%s' % version)
    output('__version__ = "%s"' % version)
    output()

    for index, pyfile in enumerate(EXTRACT_STANDALONE_FILES):
        with open(os.path.join(_larkdir, pyfile)) as f:
            code = extract_sections(f)['standalone']
            if index != 0:  # the first entry is this file, which is emitted as-is
                line_iter = iter(code.splitlines(True))
                code = strip_docstrings(partial(next, line_iter))
            output(code)

    data, m = lark_inst.memo_serialize([TerminalDef, Rule])
    output('import pickle, zlib, base64')

    # DATA and MEMO are emitted the same way, one after the other.
    for name, payload in (('DATA', data), ('MEMO', m)):
        output('%s = (' % name)
        if compress:
            compressed_output(payload)
        else:
            output(payload)
        output(')')
        if compress:
            output_decompress(name)

    output('Shift = 0')
    output('Reduce = 1')
    output("def Lark_StandAlone(**kwargs):")
    output(" return Lark._load_from_dict(DATA, MEMO, **kwargs)")
|
||||
|
||||
|
||||
|
||||
|
||||
def main():
    """Run the stand-alone generator from the command line.

    Routes warnings through `make_warnings_comments` so they are written as
    comment lines, builds the LALR parser from the parsed arguments, writes
    the generated code to the chosen output file, then closes the output and
    grammar files. With no arguments, prints usage to stderr and exits with
    status 1.
    """
    make_warnings_comments()
    # `prog` previously held the literal text "prog='python -m ...'", which
    # showed up verbatim in the usage and help output.
    parser = ArgumentParser(prog='python -m lark.tools.standalone', description="Lark Stand-alone Generator Tool",
                            parents=[lalr_argparser], epilog='Look at the Lark documentation for more info on the options')
    parser.add_argument('-c', '--compress', action='store_true', default=0, help="Enable compression")
    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)
    ns = parser.parse_args()

    lark_inst, out = build_lalr(ns)
    gen_standalone(lark_inst, out=out, compress=ns.compress)

    ns.out.close()
    ns.grammar_file.close()
|
||||
|
||||
|
||||
# Run the generator only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|
||||
@@ -0,0 +1,281 @@
|
||||
import sys
|
||||
from copy import deepcopy
|
||||
|
||||
from typing import List, Callable, Iterator, Union, Optional, Generic, TypeVar, TYPE_CHECKING
|
||||
|
||||
from .lexer import Token
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .lexer import TerminalDef
|
||||
try:
|
||||
import rich
|
||||
except ImportError:
|
||||
pass
|
||||
from typing import Literal
|
||||
|
||||
###{standalone
|
||||
|
||||
class Meta:
    """Attribute container attached to a `Tree` (see `Tree.meta`).

    Only ``empty`` is assigned on construction; the remaining annotated
    attributes are declared here and filled in elsewhere.
    """

    empty: bool
    line: int
    column: int
    start_pos: int
    end_line: int
    end_column: int
    end_pos: int
    orig_expansion: 'List[TerminalDef]'
    match_tree: bool

    def __init__(self):
        # A fresh Meta starts out flagged as empty.
        self.empty = True
|
||||
|
||||
|
||||
# Type of the leaf values stored in a tree.
_Leaf_T = TypeVar("_Leaf_T")
# A child of a tree: either a leaf value or a nested tree with the same leaf type.
Branch = Union[_Leaf_T, 'Tree[_Leaf_T]']
|
||||
|
||||
|
||||
class Tree(Generic[_Leaf_T]):
    """The main tree class.

    Creates a new tree, and stores "data" and "children" in attributes of the same name.
    Trees can be hashed and compared.

    Parameters:
        data: The name of the rule or alias
        children: List of matched sub-rules and terminals
        meta: Line & Column numbers (if ``propagate_positions`` is enabled).
            meta attributes: (line, column, end_line, end_column, start_pos, end_pos,
                              container_line, container_column, container_end_line, container_end_column)
            container_* attributes consider all symbols, including those that have been inlined in the tree.
            For example, in the rule 'a: _A B _C', the regular attributes will mark the start and end of B,
            but the container_* attributes will also include _A and _C in the range. However, rules that
            contain 'a' will consider it in full, including _A and _C for all attributes.
    """

    data: str
    children: 'List[Branch[_Leaf_T]]'

    def __init__(self, data: str, children: 'List[Branch[_Leaf_T]]', meta: Optional[Meta]=None) -> None:
        self.data = data
        self.children = children
        self._meta = meta

    @property
    def meta(self) -> Meta:
        """The tree's `Meta` object, created on first access if none was supplied."""
        if self._meta is None:
            self._meta = Meta()
        return self._meta

    def __repr__(self):
        return 'Tree(%r, %r)' % (self.data, self.children)

    # Enables positional structural pattern matching: `case Tree(data, children)`.
    __match_args__ = ("data", "children")

    def _pretty_label(self):
        """Label printed for this node by `pretty()`."""
        return self.data

    def _pretty(self, level, indent_str):
        """Yield the string fragments of the indented representation, starting at depth ``level``."""
        yield f'{indent_str*level}{self._pretty_label()}'
        if len(self.children) == 1 and not isinstance(self.children[0], Tree):
            # A single leaf child is printed on the same line, after a tab.
            yield f'\t{self.children[0]}\n'
        else:
            yield '\n'
            for n in self.children:
                if isinstance(n, Tree):
                    yield from n._pretty(level+1, indent_str)
                else:
                    yield f'{indent_str*(level+1)}{n}\n'

    def pretty(self, indent_str: str=' ') -> str:
        """Returns an indented string representation of the tree.

        Great for debugging.
        """
        return ''.join(self._pretty(0, indent_str))

    def __rich__(self, parent:Optional['rich.tree.Tree']=None) -> 'rich.tree.Tree':
        """Returns a tree widget for the 'rich' library.

        Example:
            ::
                from rich import print
                from lark import Tree

                tree = Tree('root', ['node1', 'node2'])
                print(tree)
        """
        return self._rich(parent)

    def _rich(self, parent):
        """Add this tree under ``parent``, or create a new root widget when ``parent`` is falsy."""
        if parent:
            tree = parent.add(f'[bold]{self.data}[/bold]')
        else:
            import rich.tree
            tree = rich.tree.Tree(self.data)

        for c in self.children:
            if isinstance(c, Tree):
                c._rich(tree)
            else:
                tree.add(f'[green]{c}[/green]')

        return tree

    def __eq__(self, other):
        # Objects lacking `data`/`children` attributes compare unequal.
        try:
            return self.data == other.data and self.children == other.children
        except AttributeError:
            return False

    def __ne__(self, other):
        return not (self == other)

    def __hash__(self) -> int:
        return hash((self.data, tuple(self.children)))

    def iter_subtrees(self) -> 'Iterator[Tree[_Leaf_T]]':
        """Depth-first iteration.

        Iterates over all the subtrees, never returning to the same node twice (Lark's parse-tree is actually a DAG).
        """
        queue = [self]
        # Keyed by id() so a subtree object reachable through several parents is visited once.
        subtrees = dict()
        for subtree in queue:
            subtrees[id(subtree)] = subtree
            queue += [c for c in reversed(subtree.children)
                      if isinstance(c, Tree) and id(c) not in subtrees]

        del queue
        # Reverse the discovery order so that this tree (the root) comes last.
        return reversed(list(subtrees.values()))

    def iter_subtrees_topdown(self):
        """Depth-first, pre-order (top-down) iteration.

        Iterates over all the subtrees, returning nodes in the same order as pretty() does.
        Non-tree children are skipped.
        """
        stack = [self]
        stack_append = stack.append
        stack_pop = stack.pop
        while stack:
            node = stack_pop()
            if not isinstance(node, Tree):
                continue
            yield node
            # Push children reversed so the leftmost child is popped first.
            for child in reversed(node.children):
                stack_append(child)

    def find_pred(self, pred: 'Callable[[Tree[_Leaf_T]], bool]') -> 'Iterator[Tree[_Leaf_T]]':
        """Returns all nodes of the tree that evaluate pred(node) as true."""
        return filter(pred, self.iter_subtrees())

    def find_data(self, data: str) -> 'Iterator[Tree[_Leaf_T]]':
        """Returns all nodes of the tree whose data equals the given data."""
        return self.find_pred(lambda t: t.data == data)

###}

    def find_token(self, token_type: str) -> Iterator[_Leaf_T]:
        """Returns all tokens whose type equals the given token_type.

        This is a recursive function that will find tokens in all the subtrees.

        Example:
            >>> term_tokens = tree.find_token('TERM')
        """
        return self.scan_values(lambda v: isinstance(v, Token) and v.type == token_type)

    def expand_kids_by_data(self, *data_values):
        """Expand (inline) children with any of the given data values. Returns True if anything changed"""
        changed = False
        # Walk backwards so splicing children in does not shift indices still to be visited.
        for i in range(len(self.children)-1, -1, -1):
            child = self.children[i]
            if isinstance(child, Tree) and child.data in data_values:
                self.children[i:i+1] = child.children
                changed = True
        return changed


    def scan_values(self, pred: 'Callable[[Branch[_Leaf_T]], bool]') -> Iterator[_Leaf_T]:
        """Return all values in the tree that evaluate pred(value) as true.

        This can be used to find all the tokens in the tree.

        Example:
            >>> all_tokens = tree.scan_values(lambda v: isinstance(v, Token))
        """
        for c in self.children:
            if isinstance(c, Tree):
                for t in c.scan_values(pred):
                    yield t
            else:
                if pred(c):
                    yield c

    def __deepcopy__(self, memo):
        # Children are deep-copied; the meta object is passed on as-is (shared, not copied).
        return type(self)(self.data, deepcopy(self.children, memo), meta=self._meta)

    def copy(self) -> 'Tree[_Leaf_T]':
        """Return a new tree of the same type with the same data and the same children list object (meta is not carried over)."""
        return type(self)(self.data, self.children)

    def set(self, data: str, children: 'List[Branch[_Leaf_T]]') -> None:
        """Replace this tree's data and children in place."""
        self.data = data
        self.children = children
|
||||
|
||||
|
||||
# Alias for a tree whose leaves are Tokens.
ParseTree = Tree['Token']
|
||||
|
||||
|
||||
class SlottedTree(Tree):
    """`Tree` subclass that declares ``__slots__`` for its attributes."""
    __slots__ = 'data', 'children', 'rule', '_meta'
|
||||
|
||||
|
||||
def pydot__tree_to_png(tree: Tree, filename: str, rankdir: 'Literal["TB", "LR", "BT", "RL"]'="LR", **kwargs) -> None:
    """Render ``tree`` with pydot and write the result to ``filename`` as a PNG image.

    ``rankdir`` and ``kwargs`` are passed on to `pydot__tree_to_graph`.
    """
    pydot__tree_to_graph(tree, rankdir, **kwargs).write_png(filename)
|
||||
|
||||
|
||||
def pydot__tree_to_dot(tree: Tree, filename, rankdir="LR", **kwargs):
    """Build the pydot graph for ``tree`` and write it to ``filename``.

    ``rankdir`` and ``kwargs`` are passed on to `pydot__tree_to_graph`.
    """
    pydot__tree_to_graph(tree, rankdir, **kwargs).write(filename)
|
||||
|
||||
|
||||
def pydot__tree_to_graph(tree: Tree, rankdir="LR", **kwargs):
    """Creates a colorful image that represents the tree (data+children, without meta)

    Possible values for `rankdir` are "TB", "LR", "BT", "RL", corresponding to
    directed graphs drawn from top to bottom, from left to right, from bottom to
    top, and from right to left, respectively.

    `kwargs` can be any graph attribute (e. g. `dpi=200`). For a list of
    possible attributes, see https://www.graphviz.org/doc/info/attrs.html.
    """

    import pydot  # type: ignore[import-not-found]
    graph = pydot.Dot(graph_type='digraph', rankdir=rankdir, **kwargs)

    next_id = 0

    def _add_node(**attrs):
        # Nodes are numbered consecutively, in creation order.
        nonlocal next_id
        node = pydot.Node(next_id, **attrs)
        next_id += 1
        graph.add_node(node)
        return node

    def _to_pydot(subtree):
        # Derive a fill colour from the rule name; OR-ing with 0x808080 sets
        # the top bit of every channel.
        color = (hash(subtree.data) & 0xffffff) | 0x808080

        # Children are created (and numbered) before their parent.
        subnodes = []
        for child in subtree.children:
            if isinstance(child, Tree):
                subnodes.append(_to_pydot(child))
            else:
                subnodes.append(_add_node(label=repr(child)))

        node = _add_node(style="filled", fillcolor="#%x" % color, label=subtree.data)

        for subnode in subnodes:
            graph.add_edge(pydot.Edge(node, subnode))

        return node

    _to_pydot(tree)
    return graph
|
||||
@@ -0,0 +1,199 @@
|
||||
"""Tree matcher based on Lark grammar"""
|
||||
|
||||
import re
|
||||
from typing import List, Dict
|
||||
from collections import defaultdict
|
||||
|
||||
from . import Tree, Token, Lark
|
||||
from .common import ParserConf
|
||||
from .exceptions import ConfigurationError
|
||||
from .parsers import earley
|
||||
from .grammar import Rule, Terminal, NonTerminal
|
||||
|
||||
|
||||
def is_discarded_terminal(t):
    """Return a truthy value when symbol ``t`` is a terminal that is marked ``filter_out``."""
    return t.is_term and t.filter_out
|
||||
|
||||
|
||||
class _MakeTreeMatch:
    """Callable that builds a `Tree` tagged as a match result.

    The produced tree's meta gets ``match_tree = True`` and carries the
    expansion supplied at construction as ``orig_expansion``.
    """

    def __init__(self, name, expansion):
        self.name = name
        self.expansion = expansion

    def __call__(self, args):
        t = Tree(self.name, args)
        t.meta.match_tree = True
        t.meta.orig_expansion = self.expansion
        return t
|
||||
|
||||
|
||||
def _best_from_group(seq, group_key, cmp_key):
|
||||
d = {}
|
||||
for item in seq:
|
||||
key = group_key(item)
|
||||
if key in d:
|
||||
v1 = cmp_key(item)
|
||||
v2 = cmp_key(d[key])
|
||||
if v2 > v1:
|
||||
d[key] = item
|
||||
else:
|
||||
d[key] = item
|
||||
return list(d.values())
|
||||
|
||||
|
||||
def _best_rules_from_group(rules: List[Rule]) -> List[Rule]:
    """De-duplicate ``rules`` via `_best_from_group`, then order them by ascending expansion length."""
    def negated_length(rule):
        return -len(rule.expansion)

    def identity(rule):
        return rule

    chosen = _best_from_group(rules, identity, negated_length)
    return sorted(chosen, key=lambda rule: len(rule.expansion))
|
||||
|
||||
|
||||
def _match(term, token):
    """Tell whether ``token`` (a Tree or a Token) matches the symbol ``term``.

    A Tree matches when its ``data`` equals the rule name parsed from
    ``term.name`` (template arguments ignored); a Token matches when ``term``
    equals a Terminal of the token's type. Any other kind of ``token`` is a
    programming error.
    """
    if isinstance(token, Tree):
        rule_name = parse_rulename(term.name)[0]
        return token.data == rule_name
    if isinstance(token, Token):
        return term == Terminal(token.type)
    assert False, (term, token)
|
||||
|
||||
|
||||
def make_recons_rule(origin, expansion, old_expansion):
    """Build a rule for ``origin`` whose alias creates a match tree that records ``old_expansion``."""
    tree_builder = _MakeTreeMatch(origin.name, old_expansion)
    return Rule(origin, expansion, alias=tree_builder)
|
||||
|
||||
|
||||
def make_recons_rule_to_term(origin, term):
    """Build a reconstruction rule that expands ``origin`` to the single terminal named like ``term``."""
    new_expansion = [Terminal(term.name)]
    old_expansion = [term]
    return make_recons_rule(origin, new_expansion, old_expansion)
|
||||
|
||||
|
||||
def parse_rulename(s):
    "Parse rule names that may contain a template syntax (like rule{a, b, ...})"
    matched = re.match(r'(\w+)(?:{(.+)})?', s)
    name, raw_args = matched.groups()
    if not raw_args:
        # No template part: hand back whatever the optional group gave (None).
        return name, raw_args
    return name, [part.strip() for part in raw_args.split(',')]
|
||||
|
||||
|
||||
|
||||
class ChildrenLexer:
    """Lexer stand-in whose ``lex`` hands back a pre-built sequence of children."""

    def __init__(self, children):
        self.children = children

    def lex(self, parser_state):
        # ``parser_state`` is not consulted; the stored children are returned as-is.
        stored = self.children
        return stored
|
||||
|
||||
class TreeMatcher:
    """Match the elements of a tree node, based on an ontology
    provided by a Lark grammar.

    Supports templates and inlined rules (`rule{a, b,..}` and `_rule`)

    Initialize with an instance of Lark.
    """
    rules_for_root: Dict[str, List[Rule]]
    rules: List[Rule]
    parser: Lark

    def __init__(self, parser: Lark):
        # XXX TODO calling compile twice returns different results!
        assert not parser.options.maybe_placeholders

        postlex = parser.options.postlex
        if postlex and postlex.always_accept:
            # If postlexer's always_accept is used, we need to recompile the grammar with empty terminals-to-keep
            if not hasattr(parser, 'grammar'):
                raise ConfigurationError('Source grammar not available from cached parser, use cache_grammar=True'
                                         if parser.options.cache else "Source grammar not available!")
            self.tokens, rules, _extra = parser.grammar.compile(parser.options.start, set())
        else:
            self.tokens = list(parser.terminals)
            rules = list(parser.rules)

        # Filled as a side effect of consuming _build_recons_rules() below.
        self.rules_for_root = defaultdict(list)

        recons_rules = list(self._build_recons_rules(rules))
        recons_rules.reverse()

        # Choose the best rule from each group of {rule => [rule.alias]}, since we only really need one derivation.
        self.rules = _best_rules_from_group(recons_rules)

        self.parser = parser
        self._parser_cache: Dict[str, earley.Parser] = {}

    def _build_recons_rules(self, rules: List[Rule]):
        "Convert tree-parsing/construction rules to tree-matching rules"
        expand1s = {rule.origin for rule in rules if rule.options.expand1}

        aliases = defaultdict(list)
        for rule in rules:
            if rule.alias:
                aliases[rule.origin].append(rule.alias)

        origins = {rule.origin for rule in rules}
        # Symbols that stay non-terminals in the matching grammar:
        # inlined (`_name`) rules, expand1 rules and rules that have aliases.
        nonterminals = {origin for origin in origins
                        if origin.name.startswith('_') or origin in expand1s or origin in aliases}

        seen = set()
        for rule in rules:
            recons_exp = [part if part in nonterminals else Terminal(part.name)
                          for part in rule.expansion if not is_discarded_terminal(part)]

            # Skip self-recursive constructs
            if recons_exp == [rule.origin] and rule.alias is None:
                continue

            sym = NonTerminal(rule.alias) if rule.alias else rule.origin
            recons_rule = make_recons_rule(sym, recons_exp, rule.expansion)

            if sym in expand1s and len(recons_exp) != 1:
                self.rules_for_root[sym.name].append(recons_rule)

                if sym.name not in seen:
                    yield make_recons_rule_to_term(sym, sym)
                    seen.add(sym.name)
            elif sym.name.startswith('_') or sym in expand1s:
                yield recons_rule
            else:
                self.rules_for_root[sym.name].append(recons_rule)

        for origin, rule_aliases in aliases.items():
            for alias in rule_aliases:
                yield make_recons_rule_to_term(origin, NonTerminal(alias))
            yield make_recons_rule_to_term(origin, origin)

    def match_tree(self, tree: Tree, rulename: str) -> Tree:
        """Match the elements of `tree` to the symbols of rule `rulename`.

        Parameters:
            tree (Tree): the tree node to match
            rulename (str): The expected full rule name (including template args)

        Returns:
            Tree: an unreduced tree that matches `rulename`

        Raises:
            UnexpectedToken: If no match was found.

        Note:
            It's the callers' responsibility to match the tree recursively.
        """
        if not rulename:
            rulename = tree.data
        else:
            # validate
            name, _args = parse_rulename(rulename)
            assert tree.data == name

        # TODO: ambiguity?
        parser = self._parser_cache.get(rulename)
        if parser is None:
            # First request for this root rule: build a dedicated parser and cache it.
            rules = self.rules + _best_rules_from_group(self.rules_for_root[rulename])

            # TODO pass callbacks through dict, instead of alias?
            callbacks = {rule: rule.alias for rule in rules}
            conf = ParserConf(rules, callbacks, [rulename])  # type: ignore[arg-type]
            parser = earley.Parser(self.parser.lexer_conf, conf, _match, resolve_ambiguity=True)
            self._parser_cache[rulename] = parser

        # find a full derivation
        unreduced_tree: Tree = parser.parse(ChildrenLexer(tree.children), rulename)
        assert unreduced_tree.data == rulename
        return unreduced_tree
|
||||
@@ -0,0 +1,180 @@
|
||||
"""This module defines utilities for matching and translating tree templates.
|
||||
|
||||
A tree template is a tree that contains nodes that are template variables.
|
||||
|
||||
"""
|
||||
|
||||
from typing import Union, Optional, Mapping, Dict, Tuple, Iterator
|
||||
|
||||
from lark import Tree, Transformer
|
||||
from lark.exceptions import MissingVariableError
|
||||
|
||||
Branch = Union[Tree[str], str]
|
||||
TreeOrCode = Union[Tree[str], str]
|
||||
MatchResult = Dict[str, Tree]
|
||||
_TEMPLATE_MARKER = '$'
|
||||
|
||||
|
||||
class TemplateConf:
    """Template Configuration

    Allows customization for different uses of Template

    parse() must return a Tree instance.
    """

    def __init__(self, parse=None):
        self._parse = parse

    def test_var(self, var: Union[Tree[str], str]) -> Optional[str]:
        """Given a tree node, if it is a template variable return its name. Otherwise, return None.

        This method may be overridden for customization

        Parameters:
            var: Tree | str - The tree node to test

        """
        if isinstance(var, str):
            return _get_template_name(var)

        # A `var` tree whose first child is a string is also a variable candidate.
        if isinstance(var, Tree) and var.data == "var":
            children = var.children
            if len(children) > 0 and isinstance(children[0], str):
                return _get_template_name(children[0])

        return None

    def _get_tree(self, template: TreeOrCode) -> Tree[str]:
        """Return ``template`` as a Tree, parsing it first when it is a string."""
        tree = template
        if isinstance(tree, str):
            assert self._parse
            tree = self._parse(tree)

        if isinstance(tree, Tree):
            return tree
        raise TypeError("template parser must return a Tree instance")

    def __call__(self, template: Tree[str]) -> 'Template':
        return Template(template, conf=self)

    def _match_tree_template(self, template: TreeOrCode, tree: Branch) -> Optional[MatchResult]:
        """Returns dict of {var: match} if found a match, else None
        """
        var_name = self.test_var(template)
        if var_name:
            if not isinstance(tree, Tree):
                raise TypeError(f"Template variables can only match Tree instances. Not {tree!r}")
            return {var_name: tree}

        if isinstance(template, str):
            # Plain string leaves match only by equality and bind nothing.
            return {} if template == tree else None

        assert isinstance(template, Tree) and isinstance(tree, Tree), f"template={template} tree={tree}"

        same_shape = template.data == tree.data and len(template.children) == len(tree.children)
        if not same_shape:
            return None

        bindings = {}
        for template_child, tree_child in zip(template.children, tree.children):
            child_bindings = self._match_tree_template(template_child, tree_child)
            if child_bindings is None:
                return None
            bindings.update(child_bindings)

        return bindings
|
||||
|
||||
|
||||
class _ReplaceVars(Transformer[str, Tree[str]]):
    """Transformer that swaps template-variable nodes for their mapped trees."""

    def __init__(self, conf: TemplateConf, vars: Mapping[str, Tree[str]]) -> None:
        super().__init__()
        self._vars = vars
        self._conf = conf

    def __default__(self, data, children, meta) -> Tree[str]:
        node = super().__default__(data, children, meta)

        var_name = self._conf.test_var(node)
        if not var_name:
            return node

        try:
            return self._vars[var_name]
        except KeyError:
            raise MissingVariableError(f"No mapping for template variable ({var_name})")
|
||||
|
||||
|
||||
class Template:
    """Represents a tree template, tied to a specific configuration

    A tree template is a tree that contains nodes that are template variables.
    Those variables will match any tree.
    (future versions may support annotations on the variables, to allow more complex templates)
    """

    def __init__(self, tree: Tree[str], conf: TemplateConf = TemplateConf()):
        self.conf = conf
        self.tree = conf._get_tree(tree)

    def match(self, tree: TreeOrCode) -> Optional[MatchResult]:
        """Match a tree template to a tree.

        A tree template without variables will only match ``tree`` if it is equal to the template.

        Parameters:
            tree (Tree): The tree to match to the template

        Returns:
            Optional[Dict[str, Tree]]: If match is found, returns a dictionary mapping
                template variable names to their matching tree nodes.
                If no match was found, returns None.
        """
        tree = self.conf._get_tree(tree)
        return self.conf._match_tree_template(self.tree, tree)

    def search(self, tree: TreeOrCode) -> Iterator[Tuple[Tree[str], MatchResult]]:
        """Search for all occurrences of the tree template inside ``tree``.

        Yields ``(subtree, match_result)`` for every subtree that matches.
        For a template without variables ``match_result`` is an empty dict.
        """
        tree = self.conf._get_tree(tree)
        for subtree in tree.iter_subtrees():
            res = self.match(subtree)
            # ``match`` signals failure with None only; an empty dict is a
            # successful match of a variable-free template and must not be
            # dropped by a truthiness test.
            if res is not None:
                yield subtree, res

    def apply_vars(self, vars: Mapping[str, Tree[str]]) -> Tree[str]:
        """Apply vars to the template tree
        """
        return _ReplaceVars(self.conf, vars).transform(self.tree)
|
||||
|
||||
|
||||
def translate(t1: Template, t2: Template, tree: TreeOrCode):
    """Search tree and translate each occurrence of t1 into t2.

    Every subtree reported by ``t1.search`` is rewritten in place with the
    data and children of ``t2`` instantiated with the matched variables.
    Returns the (possibly parsed) tree.
    """
    tree = t1.conf._get_tree(tree)      # ensure it's a tree, parse if necessary and possible
    for matched_subtree, bindings in t1.search(tree):
        replacement = t2.apply_vars(bindings)
        matched_subtree.set(replacement.data, replacement.children)
    return tree
|
||||
|
||||
|
||||
class TemplateTranslator:
    """Utility class for translating a collection of patterns
    """

    def __init__(self, translations: Mapping[Template, Template]):
        assert all(isinstance(template, Template)
                   for pair in translations.items()
                   for template in pair)
        self.translations = translations

    def translate(self, tree: Tree[str]):
        """Apply every source -> target translation to ``tree``, in mapping order."""
        result = tree
        for source, target in self.translations.items():
            result = translate(source, target, result)
        return result
|
||||
|
||||
|
||||
def _get_template_name(value: str) -> Optional[str]:
    """Return ``value`` without its leading template marker(s), or None.

    None is returned when ``value`` does not start with the marker. All
    leading marker characters are stripped, not just the first one.
    """
    if not value.startswith(_TEMPLATE_MARKER):
        return None
    return value.lstrip(_TEMPLATE_MARKER)
|
||||
@@ -0,0 +1,416 @@
|
||||
import unicodedata
|
||||
import os
|
||||
from itertools import product
|
||||
from collections import deque
|
||||
from typing import Callable, Iterator, List, Optional, Tuple, Type, TypeVar, Union, Dict, Any, Sequence, Iterable, AbstractSet
|
||||
|
||||
###{standalone
|
||||
import sys, re
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from typing import Generic, AnyStr
|
||||
|
||||
logger: logging.Logger = logging.getLogger("lark")
|
||||
logger.addHandler(logging.StreamHandler())
|
||||
# Set to highest level, since we have some warnings amongst the code
|
||||
# By default, we should not output any log messages
|
||||
logger.setLevel(logging.CRITICAL)
|
||||
|
||||
|
||||
NO_VALUE = object()
|
||||
|
||||
T = TypeVar("T")
|
||||
|
||||
|
||||
def classify(seq: Iterable, key: Optional[Callable] = None, value: Optional[Callable] = None) -> Dict:
    """Group the items of ``seq`` into a dict of lists.

    ``key(item)`` selects the group and ``value(item)`` the stored element;
    when either callable is omitted the item itself is used. Groups and their
    elements keep first-seen order.
    """
    groups: Dict[Any, Any] = {}
    for item in seq:
        group = item if key is None else key(item)
        element = item if value is None else value(item)
        groups.setdefault(group, []).append(element)
    return groups
|
||||
|
||||
|
||||
def _deserialize(data: Any, namespace: Dict[str, Any], memo: Dict) -> Any:
|
||||
if isinstance(data, dict):
|
||||
if '__type__' in data: # Object
|
||||
class_ = namespace[data['__type__']]
|
||||
return class_.deserialize(data, memo)
|
||||
elif '@' in data:
|
||||
return memo[data['@']]
|
||||
return {key:_deserialize(value, namespace, memo) for key, value in data.items()}
|
||||
elif isinstance(data, list):
|
||||
return [_deserialize(value, namespace, memo) for value in data]
|
||||
return data
|
||||
|
||||
|
||||
_T = TypeVar("_T", bound="Serialize")
|
||||
|
||||
class Serialize:
    """Safe-ish serialization interface that doesn't rely on Pickle

    Attributes:
        __serialize_fields__ (List[str]): Fields (aka attributes) to serialize.
        __serialize_namespace__ (list): List of classes that deserialization is allowed to instantiate.
                                        Should include all field types that aren't builtin types.
    """

    def memo_serialize(self, types_to_memoize: List) -> Any:
        """Serialize ``self`` with a fresh memoizer; return ``(data, serialized_memo)``."""
        memoizer = SerializeMemoizer(types_to_memoize)
        return self.serialize(memoizer), memoizer.serialize()

    def serialize(self, memo = None) -> Dict[str, Any]:
        """Return a dict of the serialized fields plus a ``'__type__'`` tag.

        Instances of a memoized type are replaced by an ``{'@': index}`` reference.
        """
        if memo and memo.in_types(self):
            return {'@': memo.memoized.get(self)}

        field_names = getattr(self, '__serialize_fields__')
        serialized = {name: _serialize(getattr(self, name), memo) for name in field_names}
        serialized['__type__'] = type(self).__name__
        if hasattr(self, '_serialize'):
            # Optional per-class hook that may adjust the result in place.
            self._serialize(serialized, memo)
        return serialized

    @classmethod
    def deserialize(cls: Type[_T], data: Dict[str, Any], memo: Dict[int, Any]) -> _T:
        """Rebuild an instance of ``cls`` from ``data`` without calling ``__init__``."""
        allowed_classes = getattr(cls, '__serialize_namespace__', [])
        namespace = {klass.__name__: klass for klass in allowed_classes}

        field_names = getattr(cls, '__serialize_fields__')

        if '@' in data:
            return memo[data['@']]

        inst = cls.__new__(cls)
        for name in field_names:
            try:
                setattr(inst, name, _deserialize(data[name], namespace, memo))
            except KeyError as e:
                raise KeyError("Cannot find key for class", cls, e)

        if hasattr(inst, '_deserialize'):
            # Optional per-class hook run once all fields are set.
            inst._deserialize()

        return inst
|
||||
|
||||
|
||||
class SerializeMemoizer(Serialize):
    "A version of serialize that memoizes objects to reduce space"

    __serialize_fields__ = ('memoized',)

    def __init__(self, types_to_memoize: List) -> None:
        self.memoized = Enumerator()
        self.types_to_memoize = tuple(types_to_memoize)

    def in_types(self, value: Serialize) -> bool:
        """Tell whether ``value`` is an instance of one of the memoized types."""
        return isinstance(value, self.types_to_memoize)

    def serialize(self) -> Dict[int, Any]:  # type: ignore[override]
        """Serialize the index -> object table of everything memoized so far."""
        index_to_object = self.memoized.reversed()
        return _serialize(index_to_object, None)

    @classmethod
    def deserialize(cls, data: Dict[int, Any], namespace: Dict[str, Any], memo: Dict[Any, Any]) -> Dict[int, Any]:  # type: ignore[override]
        return _deserialize(data, namespace, memo)
|
||||
|
||||
|
||||
try:
|
||||
import regex
|
||||
_has_regex = True
|
||||
except ImportError:
|
||||
_has_regex = False
|
||||
|
||||
if sys.version_info >= (3, 11):
|
||||
import re._parser as sre_parse
|
||||
import re._constants as sre_constants
|
||||
else:
|
||||
import sre_parse
|
||||
import sre_constants
|
||||
|
||||
categ_pattern = re.compile(r'\\p{[A-Za-z_]+}')
|
||||
|
||||
def get_regexp_width(expr: str) -> Union[Tuple[int, int], List[int]]:
    """Return the (minimum, maximum) match width of the regular expression ``expr``.

    Raises:
        ImportError: ``expr`` uses ``\\p{...}`` categories but `regex` is not installed.
        ValueError: ``expr`` cannot be parsed and `regex` is not installed.
    """
    if not _has_regex:
        if categ_pattern.search(expr):
            raise ImportError('`regex` module must be installed in order to use Unicode categories.', expr)
        regexp_final = expr
    else:
        # Since `sre_parse` cannot deal with Unicode categories of the form `\p{Mn}`, we replace these with
        # a simple letter, which makes no difference as we are only trying to get the possible lengths of the regex
        # match here below.
        regexp_final = categ_pattern.sub('A', expr)

    try:
        # Fixed in next version (past 0.960) of typeshed
        return [int(bound) for bound in sre_parse.parse(regexp_final).getwidth()]
    except sre_constants.error:
        if not _has_regex:
            raise ValueError(expr)

        # sre_parse does not support the new features in regex. To not completely fail in that case,
        # we manually test for the most important info (whether the empty string is matched)
        compiled = regex.compile(regexp_final)
        # Python 3.11.7 introduced sre_parse.MAXWIDTH that is used instead of MAXREPEAT
        # See lark-parser/lark#1376 and python/cpython#109859
        MAXWIDTH = getattr(sre_parse, "MAXWIDTH", sre_constants.MAXREPEAT)
        min_width = 1 if compiled.match('') is None else 0
        # MAXREPEAT is a non-picklable subclass of int, therefore needs to be converted to enable caching
        return min_width, int(MAXWIDTH)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class TextSlice(Generic[AnyStr]):
    """A view of a string or bytes object, between the start and end indices.

    Never creates a copy.

    Lark accepts instances of TextSlice as input (instead of a string),
    when the lexer is 'basic' or 'contextual'.

    Args:
        text (str or bytes): The text to slice.
        start (int): The start index. Negative indices are supported.
        end (int): The end index. Negative indices are supported.
            ``None`` means "up to the end of the text".

    Raises:
        TypeError: If `text` is not a `str` or `bytes`.
        AssertionError: If `start` or `end` are out of bounds.

    Examples:
        >>> TextSlice("Hello, World!", 7, -1)
        TextSlice(text='Hello, World!', start=7, end=12)

        >>> TextSlice("Hello, World!", 7, None).count("o")
        1

    """
    text: AnyStr
    start: int
    end: int

    def __post_init__(self):
        if not isinstance(self.text, (str, bytes)):
            raise TypeError("text must be str or bytes")

        text_len = len(self.text)

        # The dataclass is frozen, so normalised indices are stored through
        # object.__setattr__.
        if self.start < 0:
            object.__setattr__(self, 'start', self.start + text_len)
        # Check both bounds after normalisation, for every input.
        assert 0 <= self.start <= text_len

        if self.end is None:
            object.__setattr__(self, 'end', text_len)
        elif self.end < 0:
            object.__setattr__(self, 'end', self.end + text_len)
        # A negative `end` that is still negative after normalisation, or a
        # positive `end` past the text, is out of bounds.
        assert 0 <= self.end <= text_len

    @classmethod
    def cast_from(cls, text: 'TextOrSlice') -> 'TextSlice[AnyStr]':
        """Return ``text`` itself if it is a TextSlice, else a slice covering all of it."""
        if isinstance(text, TextSlice):
            return text

        return cls(text, 0, len(text))

    def is_complete_text(self):
        """Tell whether the slice spans the whole underlying text."""
        return self.start == 0 and self.end == len(self.text)

    def __len__(self):
        return self.end - self.start

    def count(self, substr: AnyStr):
        """Count occurrences of ``substr`` between ``start`` and ``end``."""
        return self.text.count(substr, self.start, self.end)

    def rindex(self, substr: AnyStr):
        """Return the highest index of ``substr`` between ``start`` and ``end``.

        The index is relative to the underlying text, not to the slice.
        """
        return self.text.rindex(substr, self.start, self.end)
|
||||
|
||||
|
||||
TextOrSlice = Union[AnyStr, 'TextSlice[AnyStr]']
|
||||
LarkInput = Union[AnyStr, TextSlice[AnyStr], Any]
|
||||
|
||||
###}
|
||||
|
||||
|
||||
_ID_START = 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Mn', 'Mc', 'Pc'
|
||||
_ID_CONTINUE = _ID_START + ('Nd', 'Nl',)
|
||||
|
||||
def _test_unicode_category(s: str, categories: Sequence[str]) -> bool:
|
||||
if len(s) != 1:
|
||||
return all(_test_unicode_category(char, categories) for char in s)
|
||||
return s == '_' or unicodedata.category(s) in categories
|
||||
|
||||
def is_id_continue(s: str) -> bool:
    """
    Checks that every character in `s` is an underscore or belongs to one of the `_ID_CONTINUE`
    Unicode categories (letters, marks, connector punctuation, plus decimal and letter numbers).
    Modelled on Python's `ID_CONTINUE` identifier class. See PEP 3131 for details.
    """
    allowed_categories = _ID_CONTINUE
    return _test_unicode_category(s, allowed_categories)
|
||||
|
||||
def is_id_start(s: str) -> bool:
    """
    Checks that every character in `s` is an underscore or belongs to one of the `_ID_START`
    Unicode categories (letters, marks and connector punctuation; digits do not pass).
    Modelled on Python's `ID_START` identifier class. See PEP 3131 for details.
    """
    allowed_categories = _ID_START
    return _test_unicode_category(s, allowed_categories)
|
||||
|
||||
|
||||
def dedup_list(l: Iterable[T]) -> List[T]:
    """Return the items of ``l`` with duplicates removed,
       keeping the order in which each item first appears.
       The items must be hashable."""
    first_seen = {}
    for item in l:
        # Dict keys keep insertion order; later duplicates leave it untouched.
        first_seen.setdefault(item, None)
    return list(first_seen)
|
||||
|
||||
|
||||
class Enumerator(Serialize):
    """Hands out consecutive integer ids (0, 1, 2, ...) to distinct items."""

    def __init__(self) -> None:
        self.enums: Dict[Any, int] = {}

    def get(self, item) -> int:
        """Return the id of ``item``, assigning the next free id on first sight."""
        return self.enums.setdefault(item, len(self.enums))

    def __len__(self):
        return len(self.enums)

    def reversed(self) -> Dict[int, Any]:
        """Return the inverse mapping, from id to item."""
        by_id = dict(zip(self.enums.values(), self.enums.keys()))
        assert len(by_id) == len(self.enums)
        return by_id
|
||||
|
||||
|
||||
|
||||
def combine_alternatives(lists):
    """
    Accepts a list of alternatives, and enumerates all their possible concatenations.

    Each concatenation is a tuple, as produced by ``itertools.product``.
    An empty ``lists`` yields a single empty concatenation, ``[[]]``.

    Examples:
        >>> combine_alternatives([range(2), [4,5]])
        [(0, 4), (0, 5), (1, 4), (1, 5)]

        >>> combine_alternatives(["abc", "xy", '$'])
        [('a', 'x', '$'), ('a', 'y', '$'), ('b', 'x', '$'), ('b', 'y', '$'), ('c', 'x', '$'), ('c', 'y', '$')]

        >>> combine_alternatives([])
        [[]]
    """
    if lists:
        # Every alternative must offer at least one option.
        assert all(l for l in lists), lists
        return [*product(*lists)]
    return [[]]
|
||||
|
||||
try:
|
||||
import atomicwrites
|
||||
_has_atomicwrites = True
|
||||
except ImportError:
|
||||
_has_atomicwrites = False
|
||||
|
||||
class FS:
    """Small file-system facade: ``exists`` and an ``open`` that can write atomically."""

    exists = staticmethod(os.path.exists)

    @staticmethod
    def open(name, mode="r", **kwargs):
        """Open ``name``; write modes go through `atomicwrites` when it is installed."""
        wants_atomic = _has_atomicwrites and "w" in mode
        if not wants_atomic:
            return open(name, mode, **kwargs)
        return atomicwrites.atomic_write(name, mode=mode, overwrite=True, **kwargs)
|
||||
|
||||
|
||||
class fzset(frozenset):
    """A ``frozenset`` whose repr is the compact ``{a, b, ...}`` form."""

    def __repr__(self):
        body = ', '.join(repr(member) for member in self)
        return '{%s}' % body
|
||||
|
||||
|
||||
def classify_bool(seq: Iterable, pred: Callable) -> Any:
    """Partition ``seq`` by ``pred`` into ``(true_elems, false_elems)``.

    ``pred`` is called once per element, in order; both lists keep input order.
    """
    true_elems: list = []
    false_elems: list = []
    for elem in seq:
        bucket = true_elems if pred(elem) else false_elems
        bucket.append(elem)
    return true_elems, false_elems
|
||||
|
||||
|
||||
def bfs(initial: Iterable, expand: Callable) -> Iterator:
    """Breadth-first traversal starting from the nodes in ``initial``.

    ``expand(node)`` returns the neighbours of ``node``. Neighbours already
    seen are not queued again; nodes must be hashable.
    """
    pending = deque(initial)
    visited = set(pending)
    while pending:
        current = pending.popleft()
        yield current
        for neighbour in expand(current):
            if neighbour in visited:
                continue
            visited.add(neighbour)
            pending.append(neighbour)
|
||||
|
||||
def bfs_all_unique(initial, expand):
    "bfs, but doesn't keep track of visited (aka seen), because there can be no repetitions"
    pending = deque(initial)
    while pending:
        current = pending.popleft()
        yield current
        pending.extend(expand(current))
|
||||
|
||||
|
||||
def _serialize(value: Any, memo: Optional[SerializeMemoizer]) -> Any:
    """Recursively convert ``value`` into its serialized form.

    ``Serialize`` instances serialize themselves; lists and dicts are
    converted element by element; a frozenset becomes a plain list of its
    members. Any other value is returned unchanged.
    """
    if isinstance(value, Serialize):
        return value.serialize(memo)
    if isinstance(value, list):
        return [_serialize(item, memo) for item in value]
    if isinstance(value, frozenset):
        return list(value)  # TODO reversible?
    if isinstance(value, dict):
        return {key: _serialize(item, memo) for key, item in value.items()}
    # assert value is None or isinstance(value, (int, float, str, tuple)), value
    return value
|
||||
|
||||
|
||||
|
||||
|
||||
def small_factors(n: int, max_factor: int) -> List[Tuple[int, int]]:
    """
    Splits n up into smaller factors and summands <= max_factor.
    Returns a list of [(a, b), ...]
    so that the following code returns n:

    n = 1
    for a, b in values:
        n = n * a + b

    Currently, we also keep a + b <= max_factor, but that might change
    """
    assert n >= 0
    assert max_factor > 2

    # Peel off one (factor, summand) step at a time, last step first.
    steps_from_last: List[Tuple[int, int]] = []
    while n > max_factor:
        for a in range(max_factor, 1, -1):
            r, b = divmod(n, a)
            if a + b <= max_factor:
                steps_from_last.append((a, b))
                n = r
                break
        else:
            assert False, "Failed to factorize %s" % n

    return [(n, 0)] + steps_from_last[::-1]
|
||||
|
||||
|
||||
class OrderedSet(AbstractSet[T]):
    """A minimal OrderedSet implementation, using a dictionary.

    (relies on the dictionary being ordered)
    """
    def __init__(self, items: Iterable[T] =()):
        # Only the keys matter; every value is None.
        self.d = dict.fromkeys(items)

    def add(self, item: T):
        self.d[item] = None

    def remove(self, item: T):
        del self.d[item]

    def __contains__(self, item: Any) -> bool:
        return item in self.d

    def __iter__(self) -> Iterator[T]:
        return iter(self.d)

    def __len__(self) -> int:
        return len(self.d)

    def __bool__(self):
        return bool(self.d)

    def __repr__(self):
        members = ', '.join(map(repr, self))
        return f"{type(self).__name__}({members})"
|
||||
@@ -0,0 +1,596 @@
|
||||
from typing import TypeVar, Tuple, List, Callable, Generic, Type, Union, Optional, Any, cast
|
||||
from abc import ABC
|
||||
|
||||
from .utils import combine_alternatives
|
||||
from .tree import Tree, Branch
|
||||
from .exceptions import VisitError, GrammarError
|
||||
from .lexer import Token
|
||||
|
||||
###{standalone
|
||||
from functools import wraps, update_wrapper
|
||||
from inspect import getmembers, getmro
|
||||
|
||||
_Return_T = TypeVar('_Return_T')
|
||||
_Return_V = TypeVar('_Return_V')
|
||||
_Leaf_T = TypeVar('_Leaf_T')
|
||||
_Leaf_U = TypeVar('_Leaf_U')
|
||||
_R = TypeVar('_R')
|
||||
_FUNC = Callable[..., _Return_T]
|
||||
_DECORATED = Union[_FUNC, type]
|
||||
|
||||
class _DiscardType:
|
||||
"""When the Discard value is returned from a transformer callback,
|
||||
that node is discarded and won't appear in the parent.
|
||||
|
||||
Note:
|
||||
This feature is disabled when the transformer is provided to Lark
|
||||
using the ``transformer`` keyword (aka Tree-less LALR mode).
|
||||
|
||||
Example:
|
||||
::
|
||||
|
||||
class T(Transformer):
|
||||
def ignore_tree(self, children):
|
||||
return Discard
|
||||
|
||||
def IGNORE_TOKEN(self, token):
|
||||
return Discard
|
||||
"""
|
||||
|
||||
def __repr__(self):
|
||||
return "lark.visitors.Discard"
|
||||
|
||||
Discard = _DiscardType()
|
||||
|
||||
# Transformers
|
||||
|
||||
class _Decoratable:
    "Provides support for decorating methods with @v_args"

    @classmethod
    def _apply_v_args(cls, visit_wrapper):
        """Wrap every public callable defined on ``cls`` in a ``_VArgsWrapper``; return ``cls``."""
        mro = getmro(cls)
        assert mro[0] is cls

        inherited_names = set()
        for base in mro[1:]:
            inherited_names.update(name for name, _ in getmembers(base))

        own_namespace = cls.__dict__
        for name, value in getmembers(cls):
            if name.startswith('_'):
                continue
            # Make sure the function isn't inherited (unless it's overwritten)
            if name in inherited_names and name not in own_namespace:
                continue
            if not callable(value):
                continue

            current = own_namespace[name]
            # Skip if v_args already applied (at the function level)
            if isinstance(current, _VArgsWrapper):
                continue

            setattr(cls, name, _VArgsWrapper(current, visit_wrapper))
        return cls

    def __class_getitem__(cls, _):
        # Subscripting the class (``Cls[...]``) returns the class itself.
        return cls
|
||||
|
||||
|
||||
class Transformer(_Decoratable, ABC, Generic[_Leaf_T, _Return_T]):
    """Transformers work bottom-up (or depth-first), starting with visiting the leaves and working
    their way up until ending at the root of the tree.

    For each node visited, the transformer will call the appropriate method (callbacks), according to the
    node's ``data``, and use the returned value to replace the node, thereby creating a new tree structure.

    Transformers can be used to implement map & reduce patterns. Because nodes are reduced from leaf to root,
    at any point the callbacks may assume the children have already been transformed (if applicable).

    If the transformer cannot find a method with the right name, it will instead call ``__default__``, which by
    default creates a copy of the node.

    To discard a node, return Discard (``lark.visitors.Discard``).

    ``Transformer`` can do anything ``Visitor`` can do, but because it reconstructs the tree,
    it is slightly less efficient.

    A transformer without methods essentially performs a non-memoized partial deepcopy.

    All these classes implement the transformer interface:

    - ``Transformer`` - Recursively transforms the tree. This is the one you probably want.
    - ``Transformer_InPlace`` - Non-recursive. Changes the tree in-place instead of returning new instances
    - ``Transformer_InPlaceRecursive`` - Recursive. Changes the tree in-place instead of returning new instances

    Parameters:
        visit_tokens (bool, optional): Should the transformer visit tokens in addition to rules.
                                       Setting this to ``False`` is slightly faster. Defaults to ``True``.
                                       (For processing ignored tokens, use the ``lexer_callbacks`` options)

    """
    __visit_tokens__ = True   # For backwards compatibility

    def __init__(self, visit_tokens: bool=True) -> None:
        self.__visit_tokens__ = visit_tokens

    def _call_userfunc(self, tree, new_children=None):
        """Invoke the callback named after ``tree.data`` (or ``__default__`` if there is none)."""
        # Assumes tree is already transformed
        children = tree.children if new_children is None else new_children
        try:
            callback = getattr(self, tree.data)
        except AttributeError:
            return self.__default__(tree.data, children, tree.meta)

        try:
            wrapper = getattr(callback, 'visit_wrapper', None)
            if wrapper is None:
                return callback(children)
            return wrapper(callback, tree.data, children, tree.meta)
        except GrammarError:
            raise
        except Exception as e:
            # Any other failure in user code is reported as a VisitError.
            raise VisitError(tree.data, tree, e)

    def _call_userfunc_token(self, token):
        """Invoke the callback named after ``token.type`` (or ``__default_token__`` if there is none)."""
        try:
            callback = getattr(self, token.type)
        except AttributeError:
            return self.__default_token__(token)

        try:
            return callback(token)
        except GrammarError:
            raise
        except Exception as e:
            raise VisitError(token.type, token, e)

    def _transform_children(self, children):
        """Yield each transformed child, leaving out those that became ``Discard``."""
        for child in children:
            if isinstance(child, Tree):
                result = self._transform_tree(child)
            elif self.__visit_tokens__ and isinstance(child, Token):
                result = self._call_userfunc_token(child)
            else:
                result = child

            if result is Discard:
                continue
            yield result

    def _transform_tree(self, tree):
        transformed_children = list(self._transform_children(tree.children))
        return self._call_userfunc(tree, transformed_children)

    def transform(self, tree: Tree[_Leaf_T]) -> _Return_T:
        "Transform the given tree, and return the final result"
        results = list(self._transform_children([tree]))
        if not results:
            # The root itself was discarded.
            return None     # type: ignore[return-value]
        assert len(results) == 1
        return results[0]

    def __mul__(
            self: 'Transformer[_Leaf_T, Tree[_Leaf_U]]',
            other: 'Union[Transformer[_Leaf_U, _Return_V], TransformerChain[_Leaf_U, _Return_V,]]'
    ) -> 'TransformerChain[_Leaf_T, _Return_V]':
        """Chain two transformers together, returning a new transformer.
        """
        return TransformerChain(self, other)

    def __default__(self, data, children, meta):
        """Default function that is called if there is no attribute matching ``data``

        Can be overridden. Defaults to creating a new copy of the tree node (i.e. ``return Tree(data, children, meta)``)
        """
        return Tree(data, children, meta)

    def __default_token__(self, token):
        """Default function that is called if there is no attribute matching ``token.type``

        Can be overridden. Defaults to returning the token as-is.
        """
        return token
|
||||
|
||||
|
||||
def merge_transformers(base_transformer=None, **transformers_to_merge):
    """Merge several transformers into ``base_transformer``, each under its own 'namespace'.

    Every public callable found on a transformer passed by keyword is attached to
    ``base_transformer`` under the name ``prefix__methodname``, where ``prefix`` is
    the keyword it was passed with. Names starting with ``_`` and the name
    ``transform`` are not copied.

    This is especially useful for grammars that import other grammars, which puts
    some of their rules in a 'namespace' (i.e. with a consistent name prefix).
    In that case, the keyword for a transformer should match the name of the
    imported grammar.

    Parameters:
        base_transformer (Transformer, optional): The transformer that receives all the merged
            methods. A new ``Transformer()`` is created when omitted.
        **transformers_to_merge: Keyword arguments, in the form of ``name_prefix = transformer``.

    Returns:
        ``base_transformer`` (or the newly created one), with the prefixed methods set on it.

    Raises:
        AttributeError: In case of a name collision in the merged methods

    Example:
        ::

            class TBase(Transformer):
                def start(self, children):
                    return children[0] + 'bar'

            class TImportedGrammar(Transformer):
                def foo(self, children):
                    return "foo"

            composed_transformer = merge_transformers(TBase(), imported=TImportedGrammar())

            t = Tree('start', [ Tree('imported__foo', []) ])

            assert composed_transformer.transform(t) == 'foobar'

    """
    target = Transformer() if base_transformer is None else base_transformer

    for prefix, source in transformers_to_merge.items():
        for name in dir(source):
            member = getattr(source, name)
            # Only public callables are merged; ``transform`` is the entry point, not a callback.
            if not callable(member) or name.startswith("_") or name == "transform":
                continue

            prefixed = prefix + "__" + name
            if hasattr(target, prefixed):
                raise AttributeError("Cannot merge: method '%s' appears more than once" % prefixed)

            setattr(target, prefixed, member)

    return target
|
||||
|
||||
|
||||
class InlineTransformer(Transformer):   # XXX Deprecated
    def _call_userfunc(self, tree, new_children=None):
        """Call the callback for ``tree`` with its children spread as positional arguments.

        Falls back to ``__default__`` when no attribute matches ``tree.data``.
        """
        # Assumes tree is already transformed
        children = tree.children if new_children is None else new_children
        try:
            handler = getattr(self, tree.data)
        except AttributeError:
            return self.__default__(tree.data, children, tree.meta)
        return handler(*children)
|
||||
|
||||
|
||||
class TransformerChain(Generic[_Leaf_T, _Return_T]):
    """A sequence of transformers applied one after another.

    Each transformer receives the output of the previous one.
    """

    transformers: 'Tuple[Union[Transformer, TransformerChain], ...]'

    def __init__(self, *transformers: 'Union[Transformer, TransformerChain]') -> None:
        self.transformers = transformers

    def transform(self, tree: Tree[_Leaf_T]) -> _Return_T:
        """Feed ``tree`` through every transformer in order and return the last result."""
        current = tree
        for step in self.transformers:
            current = step.transform(current)
        return cast(_Return_T, current)

    def __mul__(
            self: 'TransformerChain[_Leaf_T, Tree[_Leaf_U]]',
            other: 'Union[Transformer[_Leaf_U, _Return_V], TransformerChain[_Leaf_U, _Return_V]]'
    ) -> 'TransformerChain[_Leaf_T, _Return_V]':
        """Return a new chain made of this chain's transformers followed by ``other``."""
        extended = self.transformers + (other,)
        return TransformerChain(*extended)
|
||||
|
||||
|
||||
class Transformer_InPlace(Transformer[_Leaf_T, _Return_T]):
    """Same as Transformer, but non-recursive, and changes the tree in-place instead of returning new instances

    Each subtree gets its ``children`` replaced by their transformed versions.

    Useful for huge trees. Conservative in memory.
    """

    def _transform_tree(self, tree):  # Cancel recursion
        # The loop in ``transform`` deals with the children, so only the callback runs here.
        return self._call_userfunc(tree)

    def transform(self, tree: Tree[_Leaf_T]) -> _Return_T:
        """Transform ``tree`` in place and return the result of the root's callback."""
        for node in tree.iter_subtrees():
            node.children = [*self._transform_children(node.children)]

        return self._transform_tree(tree)
|
||||
|
||||
|
||||
class Transformer_NonRecursive(Transformer[_Leaf_T, _Return_T]):
    """Same as Transformer but non-recursive.

    Like Transformer, it doesn't change the original tree.

    Useful for huge trees.
    """

    def transform(self, tree: Tree[_Leaf_T]) -> _Return_T:
        """Transform the given tree without recursion, and return the final result.

        The tree is first flattened into postfix order, then rebuilt bottom-up
        with an explicit stack. Children whose callback returns ``Discard`` are
        left out of their parent's arguments, and ``None`` is returned when the
        root itself is discarded (same as ``Transformer.transform``).
        """
        # Tree to postfix
        rev_postfix = []
        q: List[Branch[_Leaf_T]] = [tree]
        while q:
            t = q.pop()
            rev_postfix.append(t)
            if isinstance(t, Tree):
                q += t.children

        # Postfix to tree
        #
        # Every node pushes exactly one entry, even when its result is Discard.
        # That way the last ``len(x.children)`` entries always belong to x; if
        # discarded children pushed nothing, a parent would slice off results
        # that belong to its earlier siblings.
        stack: List = []
        for x in reversed(rev_postfix):
            if isinstance(x, Tree):
                size = len(x.children)
                if size:
                    # Drop the Discard placeholders only now, when collecting the arguments.
                    args = [arg for arg in stack[-size:] if arg is not Discard]
                    del stack[-size:]
                else:
                    # stack[-0:] would select the whole stack, so handle leaves explicitly.
                    args = []

                res = self._call_userfunc(x, args)
            elif self.__visit_tokens__ and isinstance(x, Token):
                res = self._call_userfunc_token(x)
            else:
                res = x

            stack.append(res)

        result, = stack  # Only the root's entry should remain
        if result is Discard:
            return None  # type: ignore[return-value]

        # There are no guarantees on the type of the value produced by calling a user func for a
        # child will produce. This means type system can't statically know that the final result is
        # _Return_T. As a result a cast is required.
        return cast(_Return_T, result)
|
||||
|
||||
|
||||
class Transformer_InPlaceRecursive(Transformer[_Leaf_T, _Return_T]):
    """Same as Transformer, recursive, but changes the tree in-place instead of returning new instances"""

    def _transform_tree(self, tree):
        # Overwrite the node's children with their transformed versions, then run its callback.
        transformed = self._transform_children(tree.children)
        tree.children = [*transformed]
        return self._call_userfunc(tree)
|
||||
|
||||
|
||||
# Visitors
|
||||
|
||||
class VisitorBase:
    def _call_userfunc(self, tree):
        # Dispatch on the node name; ``__default__`` handles names with no matching attribute.
        handler = getattr(self, tree.data, self.__default__)
        return handler(tree)

    def __default__(self, tree):
        """Fallback callback, used when no attribute matches ``tree.data``.

        Can be overridden. The stock behaviour does nothing and hands the tree back.
        """
        return tree

    def __class_getitem__(cls, _):
        # Subscripting the class (``VisitorBase[X]``) just gives the class itself.
        return cls
|
||||
|
||||
|
||||
class Visitor(VisitorBase, ABC, Generic[_Leaf_T]):
    """Tree visitor, non-recursive (can handle huge trees).

    Visiting a node calls its methods (provided by the user via inheritance) according to ``tree.data``
    """

    def visit(self, tree: Tree[_Leaf_T]) -> Tree[_Leaf_T]:
        """Visits the tree, starting with the leaves and finally the root (bottom-up)

        Returns the same ``tree`` object that was passed in.
        """
        for node in tree.iter_subtrees():
            self._call_userfunc(node)

        return tree

    def visit_topdown(self, tree: Tree[_Leaf_T]) -> Tree[_Leaf_T]:
        """Visit the tree, starting at the root, and ending at the leaves (top-down)

        Returns the same ``tree`` object that was passed in.
        """
        for node in tree.iter_subtrees_topdown():
            self._call_userfunc(node)

        return tree
|
||||
|
||||
|
||||
class Visitor_Recursive(VisitorBase, Generic[_Leaf_T]):
    """Bottom-up visitor, recursive.

    Visiting a node calls its methods (provided by the user via inheritance) according to ``tree.data``

    Slightly faster than the non-recursive version.
    """

    def visit(self, tree: Tree[_Leaf_T]) -> Tree[_Leaf_T]:
        """Visits the tree, starting with the leaves and finally the root (bottom-up)"""
        for child in tree.children:
            if not isinstance(child, Tree):
                continue
            self.visit(child)

        # All subtrees are done; now handle this node itself.
        self._call_userfunc(tree)
        return tree

    def visit_topdown(self, tree: Tree[_Leaf_T]) -> Tree[_Leaf_T]:
        """Visit the tree, starting at the root, and ending at the leaves (top-down)"""
        # Handle this node before descending into its subtrees.
        self._call_userfunc(tree)

        for child in tree.children:
            if not isinstance(child, Tree):
                continue
            self.visit_topdown(child)

        return tree
|
||||
|
||||
|
||||
class Interpreter(_Decoratable, ABC, Generic[_Leaf_T, _Return_T]):
    """Interpreter walks the tree starting at the root.

    Visits the tree, starting with the root and finally the leaves (top-down)

    For each tree node, it calls its methods (provided by user via inheritance) according to ``tree.data``.

    Unlike ``Transformer`` and ``Visitor``, the Interpreter doesn't automatically visit its sub-branches.
    The user has to explicitly call ``visit``, ``visit_children``, or use the ``@visit_children_decor``.
    This allows the user to implement branching and loops.
    """

    def visit(self, tree: Tree[_Leaf_T]) -> _Return_T:
        """Run the callback matching ``tree.data`` on ``tree`` and return its result."""
        # There are no guarantees on the type of the value produced by calling a user func for a
        # child will produce. So only annotate the public method and use an internal method when
        # visiting child trees.
        return self._visit_tree(tree)

    def _visit_tree(self, tree: Tree[_Leaf_T]):
        callback = getattr(self, tree.data)
        if getattr(callback, 'visit_wrapper', None) is None:
            # Plain callback: it receives the tree itself.
            return callback(tree)
        # Callback carrying a ``visit_wrapper``: the wrapper decides how it gets called.
        return callback.visit_wrapper(callback, tree.data, tree.children, tree.meta)

    def visit_children(self, tree: Tree[_Leaf_T]) -> List:
        """Visit each subtree among the children; non-tree children are kept as they are.

        Returns the list of results, in the order of the children.
        """
        results = []
        for child in tree.children:
            value = self._visit_tree(child) if isinstance(child, Tree) else child
            results.append(value)
        return results

    def __getattr__(self, name):
        # Only reached when regular attribute lookup fails: unknown names resolve to ``__default__``.
        return self.__default__

    def __default__(self, tree):
        # Without a dedicated callback, a node is handled by visiting its children.
        return self.visit_children(tree)
|
||||
|
||||
|
||||
_InterMethod = Callable[[Type[Interpreter], _Return_T], _R]


def visit_children_decor(func: _InterMethod) -> _InterMethod:
    """Decorator for ``Interpreter`` callbacks. See Interpreter.

    The decorated callback is called with the result of ``visit_children(tree)``
    in place of the tree.
    """
    @wraps(func)
    def inner(cls, tree):
        return func(cls, cls.visit_children(tree))

    return inner
|
||||
|
||||
# Decorators
|
||||
|
||||
def _apply_v_args(obj, visit_wrapper):
|
||||
try:
|
||||
_apply = obj._apply_v_args
|
||||
except AttributeError:
|
||||
return _VArgsWrapper(obj, visit_wrapper)
|
||||
else:
|
||||
return _apply(visit_wrapper)
|
||||
|
||||
|
||||
class _VArgsWrapper:
|
||||
"""
|
||||
A wrapper around a Callable. It delegates `__call__` to the Callable.
|
||||
If the Callable has a `__get__`, that is also delegate and the resulting function is wrapped.
|
||||
Otherwise, we use the original function mirroring the behaviour without a __get__.
|
||||
We also have the visit_wrapper attribute to be used by Transformers.
|
||||
"""
|
||||
base_func: Callable
|
||||
|
||||
def __init__(self, func: Callable, visit_wrapper: Callable[[Callable, str, list, Any], Any]):
|
||||
if isinstance(func, _VArgsWrapper):
|
||||
func = func.base_func
|
||||
self.base_func = func
|
||||
self.visit_wrapper = visit_wrapper
|
||||
update_wrapper(self, func)
|
||||
|
||||
def __call__(self, *args, **kwargs):
|
||||
return self.base_func(*args, **kwargs)
|
||||
|
||||
def __get__(self, instance, owner=None):
|
||||
try:
|
||||
# Use the __get__ attribute of the type instead of the instance
|
||||
# to fully mirror the behavior of getattr
|
||||
g = type(self.base_func).__get__
|
||||
except AttributeError:
|
||||
return self
|
||||
else:
|
||||
return _VArgsWrapper(g(self.base_func, instance, owner), self.visit_wrapper)
|
||||
|
||||
def __set_name__(self, owner, name):
|
||||
try:
|
||||
f = type(self.base_func).__set_name__
|
||||
except AttributeError:
|
||||
return
|
||||
else:
|
||||
f(self.base_func, owner, name)
|
||||
|
||||
|
||||
def _vargs_inline(f, _data, children, _meta):
|
||||
return f(*children)
|
||||
def _vargs_meta_inline(f, _data, children, meta):
|
||||
return f(meta, *children)
|
||||
def _vargs_meta(f, _data, children, meta):
|
||||
return f(meta, children)
|
||||
def _vargs_tree(f, data, children, meta):
    """Visit wrapper: call ``f`` with a single ``Tree`` built from the given parts."""
    node = Tree(data, children, meta)
    return f(node)
|
||||
|
||||
|
||||
def v_args(inline: bool = False, meta: bool = False, tree: bool = False, wrapper: Optional[Callable] = None) -> Callable[[_DECORATED], _DECORATED]:
    """A convenience decorator factory for modifying the behavior of user-supplied callback methods of ``Transformer`` classes.

    By default, transformer callback methods accept one argument - a list of the node's children.

    ``v_args`` can modify this behavior. When used on a ``Transformer`` class definition, it applies to
    all the callback methods inside it.

    ``v_args`` can be applied to a single method, or to an entire class. When applied to both,
    the options given to the method take precedence.

    Parameters:
        inline (bool, optional): Children are provided as ``*args`` instead of a list argument (not recommended for very long lists).
        meta (bool, optional): Provides two arguments: ``meta`` and ``children`` (instead of just the latter); ``meta`` isn't available for transformers supplied to Lark using the ``transformer`` parameter (aka internal transformers).
        tree (bool, optional): Provides the entire tree as the argument, instead of the children.
        wrapper (function, optional): Provide a function to decorate all methods.

    Raises:
        ValueError: If ``tree`` is combined with ``meta`` or ``inline``, or if ``wrapper`` is
            combined with any of ``tree``, ``meta`` or ``inline``.

    Example:
        ::

            @v_args(inline=True)
            class SolveArith(Transformer):
                def add(self, left, right):
                    return left + right

                @v_args(meta=True)
                def mul(self, meta, children):
                    logger.info(f'mul at line {meta.line}')
                    left, right = children
                    return left * right


            class ReverseNotation(Transformer_InPlace):
                @v_args(tree=True)
                def tree_node(self, tree):
                    tree.children = tree.children[::-1]
    """
    if tree and (meta or inline):
        raise ValueError("Visitor functions cannot combine 'tree' with 'meta' or 'inline'.")

    # Pick the built-in visit wrapper matching the requested flags (None when no flag is set).
    visit_wrapper = None
    if meta:
        visit_wrapper = _vargs_meta_inline if inline else _vargs_meta
    elif inline:
        visit_wrapper = _vargs_inline
    elif tree:
        visit_wrapper = _vargs_tree

    if wrapper is not None:
        # A custom wrapper replaces the built-in ones, so it cannot be mixed with the flags.
        if visit_wrapper is not None:
            raise ValueError("Cannot use 'wrapper' along with 'tree', 'meta' or 'inline'.")
        visit_wrapper = wrapper

    def _visitor_args_dec(obj):
        return _apply_v_args(obj, visit_wrapper)

    return _visitor_args_dec
|
||||
|
||||
|
||||
###}
|
||||
|
||||
|
||||
# --- Visitor Utilities ---
|
||||
|
||||
class CollapseAmbiguities(Transformer):
    """
    Transforms a tree that contains any number of _ambig nodes into a list of trees,
    each one containing an unambiguous tree.

    The length of the resulting list is the product of the length of all _ambig nodes.

    Warning: This may quickly explode for highly ambiguous trees.

    """
    def _ambig(self, options):
        # Each option is already a list of unambiguous trees; the alternatives
        # are the concatenation of all of them. Flatten in a single pass:
        # ``sum(options, [])`` builds a new list per addition, which is quadratic.
        return [alternative for option in options for alternative in option]

    def __default__(self, data, children_lists, meta):
        # One tree per combination of the children's alternatives.
        return [Tree(data, children, meta) for children in combine_alternatives(children_lists)]

    def __default_token__(self, t):
        # A token has a single alternative: itself.
        return [t]
|
||||
Reference in New Issue
Block a user