Hello community, here is the log from the commit of package python-lark-parser for openSUSE:Factory checked in at 2019-11-04 17:09:17 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/python-lark-parser (Old) and /work/SRC/openSUSE:Factory/.python-lark-parser.new.2990 (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "python-lark-parser" Mon Nov 4 17:09:17 2019 rev:6 rq:743738 version:0.7.7 Changes: -------- --- /work/SRC/openSUSE:Factory/python-lark-parser/python-lark-parser.changes 2019-09-23 12:08:14.173897188 +0200 +++ /work/SRC/openSUSE:Factory/.python-lark-parser.new.2990/python-lark-parser.changes 2019-11-04 17:09:19.620448337 +0100 @@ -1,0 +2,10 @@ +Sun Oct 27 14:28:26 UTC 2019 - Lars Vogdt <l...@linux-schulserver.de> + +- Update to 0.7.7: + * Fixed a bug in Earley where running it from different threads + produced bad results + * Improved error reporting when using LALR + * Added 'edit_terminals' option, to allow programmatical manipulation + of terminals, for example to support keywords in different languages. + +------------------------------------------------------------------- Old: ---- lark-parser-0.7.5.tar.gz New: ---- lark-0.7.7.tar.gz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ python-lark-parser.spec ++++++ --- /var/tmp/diff_new_pack.Z7O5HJ/_old 2019-11-04 17:09:20.208448966 +0100 +++ /var/tmp/diff_new_pack.Z7O5HJ/_new 2019-11-04 17:09:20.212448970 +0100 @@ -18,13 +18,13 @@ %{?!python_module:%define python_module() python-%{**} python3-%{**}} Name: python-lark-parser -Version: 0.7.5 +Version: 0.7.7 Release: 0 Summary: A parsing library for Python License: MIT Group: Development/Languages/Python -URL: https://github.com/erezsh/lark -Source: https://github.com/lark-parser/lark/archive/%{version}.tar.gz#/lark-parser-%{version}.tar.gz +URL: https://github.com/lark-parser +Source: https://github.com/lark-parser/lark/archive/%{version}.tar.gz#/lark-%{version}.tar.gz # extracted test gramars from nearley -> https://github.com/kach/nearley Source1: testdata.tar.gz BuildRequires: %{python_module Js2Py} ++++++ lark-parser-0.7.5.tar.gz -> lark-0.7.7.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/lark-0.7.5/docs/grammar.md new/lark-0.7.7/docs/grammar.md --- old/lark-0.7.5/docs/grammar.md 2019-09-06 07:18:42.000000000 +0200 +++ new/lark-0.7.7/docs/grammar.md 2019-10-03 10:29:49.000000000 +0200 @@ -1,5 +1,13 @@ # Grammar Reference +Table of contents: + +1. [Definitions](#defs) +1. [Terminals](#terms) +1. [Rules](#rules) +1. [Directives](#dirs) + +<a name="defs"></a> ## Definitions **A grammar** is a list of rules and terminals, that together define a language. @@ -25,6 +33,7 @@ Names of rules are always in lowercase, while names of terminals are always in uppercase. This distinction has practical effects, for the shape of the generated parse-tree, and the automatic construction of the lexer (aka tokenizer, or scanner). +<a name="terms"></a> ## Terminals Terminals are used to match text into symbols. They can be defined as a combination of literals and other terminals. @@ -70,6 +79,53 @@ SQL_SELECT: "select"i ``` +### Regular expressions & Ambiguity + +Each terminal is eventually compiled to a regular expression. All the operators and references inside it are mapped to their respective expressions. + +For example, in the following grammar, `A1` and `A2`, are equivalent: +```perl +A1: "a" | "b" +A2: /a|b/ +``` + +This means that inside terminals, Lark cannot detect or resolve ambiguity, even when using Earley. + +For example, for this grammar: +```perl +start : (A | B)+ +A : "a" | "ab" +B : "b" +``` +We get this behavior: + +```bash +>>> p.parse("ab") +Tree(start, [Token(A, 'a'), Token(B, 'b')]) +``` + +This is happening because Python's regex engine always returns the first matching option. + +If you find yourself in this situation, the recommended solution is to use rules instead. + +Example: + +```python +>>> p = Lark("""start: (a | b)+ +... !a: "a" | "ab" +... !b: "b" +... """, ambiguity="explicit") +>>> print(p.parse("ab").pretty()) +_ambig + start + a ab + start + a a + b b +``` + + +<a name="rules"></a> ## Rules **Syntax:** @@ -114,6 +170,7 @@ Priority can be either positive or negative. In not specified for a terminal, it's assumed to be 1 (i.e. the default). +<a name="dirs"></a> ## Directives ### %ignore diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/lark-0.7.5/docs/json_tutorial.md new/lark-0.7.7/docs/json_tutorial.md --- old/lark-0.7.5/docs/json_tutorial.md 2019-09-06 07:18:42.000000000 +0200 +++ new/lark-0.7.7/docs/json_tutorial.md 2019-10-03 10:29:49.000000000 +0200 @@ -230,7 +230,8 @@ class MyTransformer(Transformer): def list(self, items): return list(items) - def pair(self, (k,v)): + def pair(self, key_value): + k, v = key_value return k, v def dict(self, items): return dict(items) @@ -251,9 +252,11 @@ from lark import Transformer class TreeToJson(Transformer): - def string(self, (s,)): + def string(self, s): + (s,) = s return s[1:-1] - def number(self, (n,)): + def number(self, n): + (n,) = n return float(n) list = list @@ -315,9 +318,11 @@ """ class TreeToJson(Transformer): - def string(self, (s,)): + def string(self, s): + (s,) = s return s[1:-1] - def number(self, (n,)): + def number(self, n): + (n,) = n return float(n) list = list diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/lark-0.7.5/lark/__init__.py new/lark-0.7.7/lark/__init__.py --- old/lark-0.7.5/lark/__init__.py 2019-09-06 07:18:42.000000000 +0200 +++ new/lark-0.7.7/lark/__init__.py 2019-10-03 10:29:49.000000000 +0200 @@ -5,4 +5,4 @@ from .lexer import Token from .lark import Lark -__version__ = "0.7.5" +__version__ = "0.7.7" diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/lark-0.7.5/lark/lark.py new/lark-0.7.7/lark/lark.py --- old/lark-0.7.5/lark/lark.py 2019-09-06 07:18:42.000000000 +0200 +++ new/lark-0.7.7/lark/lark.py 2019-10-03 10:29:49.000000000 +0200 @@ -69,6 +69,7 @@ 'propagate_positions': False, 'lexer_callbacks': {}, 'maybe_placeholders': False, + 'edit_terminals': None, } def __init__(self, options_dict): @@ -205,6 +206,10 @@ # Compile the EBNF grammar into BNF self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start) + if self.options.edit_terminals: + for t in self.terminals: + self.options.edit_terminals(t) + self._terminals_dict = {t.name:t for t in self.terminals} # If the user asked to invert the priorities, negate them all here. diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/lark-0.7.5/lark/lexer.py new/lark-0.7.7/lark/lexer.py --- old/lark-0.7.5/lark/lexer.py 2019-09-06 07:18:42.000000000 +0200 +++ new/lark-0.7.7/lark/lexer.py 2019-10-03 10:29:49.000000000 +0200 @@ -3,7 +3,7 @@ import re from .utils import Str, classify, get_regexp_width, Py36, Serialize -from .exceptions import UnexpectedCharacters, LexError +from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken ###{standalone @@ -43,7 +43,7 @@ __serialize_fields__ = 'value', 'flags' type = "str" - + def to_regexp(self): return self._get_flags(re.escape(self.value)) @@ -166,36 +166,33 @@ while line_ctr.char_pos < len(stream): lexer = self.lexer - for mre, type_from_index in lexer.mres: - m = mre.match(stream, line_ctr.char_pos) - if not m: - continue - - t = None - value = m.group(0) - type_ = type_from_index[m.lastindex] - if type_ not in ignore_types: - t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) - if t.type in lexer.callback: - t = lexer.callback[t.type](t) - if not isinstance(t, Token): - raise ValueError("Callbacks must return a token (returned %r)" % t) - last_token = t - yield t - else: - if type_ in lexer.callback: - t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) - lexer.callback[type_](t) - - line_ctr.feed(value, type_ in newline_types) - if t: - t.end_line = line_ctr.line - t.end_column = line_ctr.column + res = lexer.match(stream, line_ctr.char_pos) + if not res: + allowed = {v for m, tfi in lexer.mres for v in tfi.values()} - ignore_types + if not allowed: + allowed = {"<END-OF-FILE>"} + raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state, token_history=last_token and [last_token]) - break + value, type_ = res + + t = None + if type_ not in ignore_types: + t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) + if t.type in lexer.callback: + t = lexer.callback[t.type](t) + if not isinstance(t, Token): + raise ValueError("Callbacks must return a token (returned %r)" % t) + last_token = t + yield t else: - allowed = {v for m, tfi in lexer.mres for v in tfi.values()} - raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state, token_history=last_token and [last_token]) + if type_ in lexer.callback: + t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) + lexer.callback[type_](t) + + line_ctr.feed(value, type_ in newline_types) + if t: + t.end_line = line_ctr.line + t.end_column = line_ctr.column class UnlessCallback: @@ -330,6 +327,11 @@ self.mres = build_mres(terminals) + def match(self, stream, pos): + for mre, type_from_index in self.mres: + m = mre.match(stream, pos) + if m: + return m.group(0), type_from_index[m.lastindex] def lex(self, stream): return _Lex(self).lex(stream, self.newline_types, self.ignore_types) @@ -367,9 +369,21 @@ def lex(self, stream): l = _Lex(self.lexers[self.parser_state], self.parser_state) - for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types): - yield x - l.lexer = self.lexers[self.parser_state] - l.state = self.parser_state + try: + for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types): + yield x + l.lexer = self.lexers[self.parser_state] + l.state = self.parser_state + except UnexpectedCharacters as e: + # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, + # but not in the current context. + # This tests the input against the global context, to provide a nicer error. + root_match = self.root_lexer.match(stream, e.pos_in_stream) + if not root_match: + raise + + value, type_ = root_match + t = Token(type_, value, e.pos_in_stream, e.line, e.column) + raise UnexpectedToken(t, e.allowed) ###} diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/lark-0.7.5/lark/parsers/earley.py new/lark-0.7.7/lark/parsers/earley.py --- old/lark-0.7.5/lark/parsers/earley.py 2019-09-06 07:18:42.000000000 +0200 +++ new/lark-0.7.7/lark/parsers/earley.py 2019-10-03 10:29:49.000000000 +0200 @@ -46,12 +46,8 @@ # skip the extra tree walk. We'll also skip this if the user just didn't specify priorities # on any rules. if self.forest_sum_visitor is None and rule.options and rule.options.priority is not None: - self.forest_sum_visitor = ForestSumVisitor() + self.forest_sum_visitor = ForestSumVisitor - if resolve_ambiguity: - self.forest_tree_visitor = ForestToTreeVisitor(self.callbacks, self.forest_sum_visitor) - else: - self.forest_tree_visitor = ForestToAmbiguousTreeVisitor(self.callbacks, self.forest_sum_visitor) self.term_matcher = term_matcher @@ -316,7 +312,10 @@ assert False, 'Earley should not generate multiple start symbol items!' # Perform our SPPF -> AST conversion using the right ForestVisitor. - return self.forest_tree_visitor.visit(solutions[0]) + forest_tree_visitor_cls = ForestToTreeVisitor if self.resolve_ambiguity else ForestToAmbiguousTreeVisitor + forest_tree_visitor = forest_tree_visitor_cls(self.callbacks, self.forest_sum_visitor and self.forest_sum_visitor()) + + return forest_tree_visitor.visit(solutions[0]) class ApplyCallbacks(Transformer_InPlace):