Author: Carl Friedrich Bolz-Tereick <cfb...@gmx.de>
Branch: 
Changeset: r94730:e85e93d7927e
Date: 2018-06-06 15:11 +0200
http://bitbucket.org/pypy/pypy/changeset/e85e93d7927e/
Log:	merge pyparser-improvements-3

	some small refactorings in interpreter/pyparser and module/parser

diff --git a/pypy/doc/whatsnew-head.rst b/pypy/doc/whatsnew-head.rst
--- a/pypy/doc/whatsnew-head.rst
+++ b/pypy/doc/whatsnew-head.rst
@@ -27,3 +27,8 @@
 The reverse-debugger branch has been merged.  For more information, see
 https://bitbucket.org/pypy/revdb
+
+
+.. branch: pyparser-improvements-3
+
+Small refactorings in the Python parser.
diff --git a/pypy/interpreter/pyparser/future.py b/pypy/interpreter/pyparser/future.py
--- a/pypy/interpreter/pyparser/future.py
+++ b/pypy/interpreter/pyparser/future.py
@@ -43,7 +43,7 @@
         self.tok = self.tokens[index]
 
     def skip(self, n):
-        if self.tok[0] == n:
+        if self.tok.token_type == n:
             self.next()
             return True
         else:
@@ -51,7 +51,7 @@
 
     def skip_name(self, name):
         from pypy.interpreter.pyparser import pygram
-        if self.tok[0] == pygram.tokens.NAME and self.tok[1] == name:
+        if self.tok.token_type == pygram.tokens.NAME and self.tok.value == name:
             self.next()
             return True
         else:
@@ -59,8 +59,8 @@
 
     def next_feature_name(self):
         from pypy.interpreter.pyparser import pygram
-        if self.tok[0] == pygram.tokens.NAME:
-            name = self.tok[1]
+        if self.tok.token_type == pygram.tokens.NAME:
+            name = self.tok.value
             self.next()
             if self.skip_name("as"):
                 self.skip(pygram.tokens.NAME)
@@ -101,7 +101,7 @@
         # somewhere inside the last __future__ import statement
         # (at the start would be fine too, but it's easier to grab a
         # random position inside)
-        last_position = (it.tok[2], it.tok[3])
+        last_position = (it.tok.lineno, it.tok.column)
         result |= future_flags.get_compiler_feature(it.next_feature_name())
         while it.skip(pygram.tokens.COMMA):
             result |= future_flags.get_compiler_feature(it.next_feature_name())
diff --git a/pypy/interpreter/pyparser/parser.py b/pypy/interpreter/pyparser/parser.py
--- a/pypy/interpreter/pyparser/parser.py
+++ b/pypy/interpreter/pyparser/parser.py
@@ -34,6 +34,18 @@
         new.token_ids = self.token_ids
         return new
 
+
+    def classify(self, token):
+        """Find the label for a token."""
+        if token.token_type == self.KEYWORD_TOKEN:
+            label_index = self.keyword_ids.get(token.value, -1)
+            if label_index != -1:
+                return label_index
+        label_index = self.token_ids.get(token.token_type, -1)
+        if label_index == -1:
+            raise ParseError("invalid token", token)
+        return label_index
+
     def _freeze_(self):
         # Remove some attributes not used in parsing.
         try:
@@ -66,6 +78,33 @@
             b[pos] |= bit
     return str(b)
 
+
+class Token(object):
+    def __init__(self, token_type, value, lineno, column, line):
+        self.token_type = token_type
+        self.value = value
+        self.lineno = lineno
+        # 0-based offset
+        self.column = column
+        self.line = line
+
+    def __repr__(self):
+        return "Token(%s, %s)" % (self.token_type, self.value)
+
+    def __eq__(self, other):
+        # for tests
+        return (
+            self.token_type == other.token_type and
+            self.value == other.value and
+            self.lineno == other.lineno and
+            self.column == other.column and
+            self.line == other.line
+        )
+
+    def __ne__(self, other):
+        return not self == other
+
+
 class Node(object):
 
     __slots__ = ("type", )
@@ -106,6 +145,11 @@
         self.lineno = lineno
         self.column = column
 
+    @staticmethod
+    def fromtoken(token):
+        return Terminal(
+            token.token_type, token.value, token.lineno, token.column)
+
     def __repr__(self):
         return "Terminal(type=%s, value=%r)" % (self.type, self.value)
 
@@ -194,20 +238,14 @@
 
 
 class ParseError(Exception):
-    def __init__(self, msg, token_type, value, lineno, column, line,
-                 expected=-1, expected_str=None):
+    def __init__(self, msg, token, expected=-1, expected_str=None):
         self.msg = msg
-        self.token_type = token_type
-        self.value = value
-        self.lineno = lineno
-        # this is a 0-based index
-        self.column = column
-        self.line = line
+        self.token = token
         self.expected = expected
         self.expected_str = expected_str
 
     def __str__(self):
-        return "ParserError(%s, %r)" % (self.token_type, self.value)
+        return "ParserError(%s)" % (self.token, )
 
 
 class StackEntry(object):
@@ -250,8 +288,8 @@
         self.root = None
         self.stack = StackEntry(None, self.grammar.dfas[start - 256], 0)
 
-    def add_token(self, token_type, value, lineno, column, line):
-        label_index = self.classify(token_type, value, lineno, column, line)
+    def add_token(self, token):
+        label_index = self.grammar.classify(token)
         sym_id = 0 # for the annotator
         while True:
             dfa = self.stack.dfa
@@ -262,7 +300,7 @@
                 sym_id = self.grammar.labels[i]
                 if label_index == i:
                     # We matched a non-terminal.
-                    self.shift(next_state, token_type, value, lineno, column)
+                    self.shift(next_state, token)
                     state = states[next_state]
                     # While the only possible action is to accept, pop nodes off
                     # the stack.
@@ -279,8 +317,7 @@
                     sub_node_dfa = self.grammar.dfas[sym_id - 256]
                     # Check if this token can start a child node.
                     if sub_node_dfa.could_match_token(label_index):
-                        self.push(sub_node_dfa, next_state, sym_id, lineno,
-                                  column)
+                        self.push(sub_node_dfa, next_state, sym_id)
                         break
             else:
                 # We failed to find any arcs to another state, so unless this
@@ -288,8 +325,7 @@
                 if is_accepting:
                     self.pop()
                     if self.stack is None:
-                        raise ParseError("too much input", token_type, value,
-                                         lineno, column, line)
+                        raise ParseError("too much input", token)
                 else:
                     # If only one possible input would satisfy, attach it to the
                     # error.
@@ -300,28 +336,16 @@
                 else:
                     expected = -1
                     expected_str = None
-                raise ParseError("bad input", token_type, value, lineno,
-                                 column, line, expected, expected_str)
+                raise ParseError("bad input", token, expected, expected_str)
 
-    def classify(self, token_type, value, lineno, column, line):
-        """Find the label for a token."""
-        if token_type == self.grammar.KEYWORD_TOKEN:
-            label_index = self.grammar.keyword_ids.get(value, -1)
-            if label_index != -1:
-                return label_index
-        label_index = self.grammar.token_ids.get(token_type, -1)
-        if label_index == -1:
-            raise ParseError("invalid token", token_type, value, lineno, column,
-                             line)
-        return label_index
-
-    def shift(self, next_state, token_type, value, lineno, column):
+    def shift(self, next_state, token):
         """Shift a non-terminal and prepare for the next state."""
-        new_node = Terminal(token_type, value, lineno, column)
+        new_node = Terminal.fromtoken(token)
         self.stack.node_append_child(new_node)
         self.stack.state = next_state
 
-    def push(self, next_dfa, next_state, node_type, lineno, column):
+    def push(self, next_dfa, next_state, node_type):
         """Push a terminal and adjust the current state."""
         self.stack.state = next_state
         self.stack = self.stack.push(next_dfa, 0)
diff --git a/pypy/interpreter/pyparser/pyparse.py b/pypy/interpreter/pyparser/pyparse.py
--- a/pypy/interpreter/pyparser/pyparse.py
+++ b/pypy/interpreter/pyparser/pyparse.py
@@ -147,37 +147,37 @@
         flags &= ~consts.PyCF_DONT_IMPLY_DEDENT
 
         self.prepare(_targets[compile_info.mode])
-        tp = 0
         try:
             try:
                 # Note: we no longer pass the CO_FUTURE_* to the tokenizer,
                 # which is expected to work independently of them. It's
                 # certainly the case for all futures in Python <= 2.7.
                 tokens = pytokenizer.generate_tokens(source_lines, flags)
-
-                newflags, last_future_import = (
-                    future.add_future_flags(self.future_flags, tokens))
-                compile_info.last_future_import = last_future_import
-                compile_info.flags |= newflags
-
-                self.grammar = pygram.choose_grammar(
-                    print_function=compile_info.flags & consts.CO_FUTURE_PRINT_FUNCTION,
-                    revdb=self.space.config.translation.reverse_debugger)
-
-                for tp, value, lineno, column, line in tokens:
-                    if self.add_token(tp, value, lineno, column, line):
-                        break
             except error.TokenError as e:
                 e.filename = compile_info.filename
                 raise
             except error.TokenIndentationError as e:
                 e.filename = compile_info.filename
                 raise
+
+            newflags, last_future_import = (
+                future.add_future_flags(self.future_flags, tokens))
+            compile_info.last_future_import = last_future_import
+            compile_info.flags |= newflags
+
+            self.grammar = pygram.choose_grammar(
+                print_function=compile_info.flags & consts.CO_FUTURE_PRINT_FUNCTION,
+                revdb=self.space.config.translation.reverse_debugger)
+
+            try:
+                for token in tokens:
+                    if self.add_token(token):
+                        break
             except parser.ParseError as e:
                 # Catch parse errors, pretty them up and reraise them as a
                 # SyntaxError.
                 new_err = error.IndentationError
-                if tp == pygram.tokens.INDENT:
+                if token.token_type == pygram.tokens.INDENT:
                     msg = "unexpected indent"
                 elif e.expected == pygram.tokens.INDENT:
                     msg = "expected an indented block"
@@ -189,7 +189,7 @@
 
                 # parser.ParseError(...).column is 0-based, but the offsets in the
                 # exceptions in the error module are 1-based, hence the '+ 1'
-                raise new_err(msg, e.lineno, e.column + 1, e.line,
+                raise new_err(msg, e.token.lineno, e.token.column + 1, e.token.line,
                               compile_info.filename)
             else:
                 tree = self.root
diff --git a/pypy/interpreter/pyparser/pytokenize.py b/pypy/interpreter/pyparser/pytokenize.py
--- a/pypy/interpreter/pyparser/pytokenize.py
+++ b/pypy/interpreter/pyparser/pytokenize.py
@@ -1,9 +1,6 @@
 # ______________________________________________________________________
 """Module pytokenize
 
-THIS FILE WAS COPIED FROM pypy/module/parser/pytokenize.py AND ADAPTED
-TO BE ANNOTABLE (Mainly made lists homogeneous)
-
 This is a modified version of Ka-Ping Yee's tokenize module found in
 the Python standard library.
 
@@ -12,7 +9,6 @@
 expressions have been replaced with hand built DFA's using the
 basil.util.automata module.
 
-$Id: pytokenize.py,v 1.3 2003/10/03 16:31:53 jriehl Exp $
 """
 # ______________________________________________________________________
 
@@ -65,22 +61,3 @@
     single_quoted[t] = t
 
 tabsize = 8
-
-# PYPY MODIFICATION: removed TokenError class as it's not needed here
-
-# PYPY MODIFICATION: removed StopTokenizing class as it's not needed here
-
-# PYPY MODIFICATION: removed printtoken() as it's not needed here
-
-# PYPY MODIFICATION: removed tokenize() as it's not needed here
-
-# PYPY MODIFICATION: removed tokenize_loop() as it's not needed here
-
-# PYPY MODIFICATION: removed generate_tokens() as it was copied / modified
-# in pythonlexer.py
-
-# PYPY MODIFICATION: removed main() as it's not needed here
-
-# ______________________________________________________________________
-# End of pytokenize.py
-
diff --git a/pypy/interpreter/pyparser/pytokenizer.py b/pypy/interpreter/pyparser/pytokenizer.py
--- a/pypy/interpreter/pyparser/pytokenizer.py
+++ b/pypy/interpreter/pyparser/pytokenizer.py
@@ -1,4 +1,5 @@
 from pypy.interpreter.pyparser import automata
+from pypy.interpreter.pyparser.parser import Token
 from pypy.interpreter.pyparser.pygram import tokens
 from pypy.interpreter.pyparser.pytoken import python_opmap
 from pypy.interpreter.pyparser.error import TokenError, TokenIndentationError
@@ -103,7 +104,7 @@
             endmatch = endDFA.recognize(line)
             if endmatch >= 0:
                 pos = end = endmatch
-                tok = (tokens.STRING, contstr + line[:end], strstart[0],
+                tok = Token(tokens.STRING, contstr + line[:end], strstart[0],
                        strstart[1], line)
                 token_list.append(tok)
                 last_comment = ''
@@ -111,7 +112,7 @@
                 contline = None
             elif (needcont and not line.endswith('\\\n') and
                                not line.endswith('\\\r\n')):
-                tok = (tokens.ERRORTOKEN, contstr + line, strstart[0],
+                tok = Token(tokens.ERRORTOKEN, contstr + line, strstart[0],
                        strstart[1], line)
                 token_list.append(tok)
                 last_comment = ''
@@ -140,11 +141,11 @@
 
             if column > indents[-1]:           # count indents or dedents
                 indents.append(column)
-                token_list.append((tokens.INDENT, line[:pos], lnum, 0, line))
+                token_list.append(Token(tokens.INDENT, line[:pos], lnum, 0, line))
                 last_comment = ''
             while column < indents[-1]:
                 indents.pop()
-                token_list.append((tokens.DEDENT, '', lnum, pos, line))
+                token_list.append(Token(tokens.DEDENT, '', lnum, pos, line))
                 last_comment = ''
             if column != indents[-1]:
                 err = "unindent does not match any outer indentation level"
@@ -177,11 +178,11 @@
             token, initial = line[start:end], line[start]
             if initial in numchars or \
                (initial == '.' and token != '.'):      # ordinary number
-                token_list.append((tokens.NUMBER, token, lnum, start, line))
+                token_list.append(Token(tokens.NUMBER, token, lnum, start, line))
                 last_comment = ''
             elif initial in '\r\n':
                 if not parenstack:
-                    tok = (tokens.NEWLINE, last_comment, lnum, start, line)
+                    tok = Token(tokens.NEWLINE, last_comment, lnum, start, line)
                     token_list.append(tok)
                 last_comment = ''
             elif initial == '#':
@@ -193,7 +194,7 @@
                 if endmatch >= 0:                     # all on one line
                     pos = endmatch
                     token = line[start:pos]
-                    tok = (tokens.STRING, token, lnum, start, line)
+                    tok = Token(tokens.STRING, token, lnum, start, line)
                     token_list.append(tok)
                     last_comment = ''
                 else:
@@ -212,16 +213,16 @@
                         contline = line
                         break
                 else:                                  # ordinary string
-                    tok = (tokens.STRING, token, lnum, start, line)
+                    tok = Token(tokens.STRING, token, lnum, start, line)
                     token_list.append(tok)
                     last_comment = ''
             elif initial in namechars:                 # ordinary name
-                token_list.append((tokens.NAME, token, lnum, start, line))
+                token_list.append(Token(tokens.NAME, token, lnum, start, line))
                 last_comment = ''
             elif initial == '\\':                      # continued stmt
                 continued = 1
             elif initial == '$':
-                token_list.append((tokens.REVDBMETAVAR, token,
+                token_list.append(Token(tokens.REVDBMETAVAR, token,
                                    lnum, start, line))
                 last_comment = ''
             else:
@@ -246,7 +247,7 @@
                     punct = python_opmap[token]
                 else:
                     punct = tokens.OP
-                token_list.append((punct, token, lnum, start, line))
+                token_list.append(Token(punct, token, lnum, start, line))
                 last_comment = ''
         else:
             start = whiteSpaceDFA.recognize(line, pos)
@@ -255,22 +256,22 @@
             if start<max and line[start] in single_quoted:
                 raise TokenError("end of line (EOL) while scanning string literal",
                          line, lnum, start+1, token_list)
-            tok = (tokens.ERRORTOKEN, line[pos], lnum, pos, line)
+            tok = Token(tokens.ERRORTOKEN, line[pos], lnum, pos, line)
             token_list.append(tok)
             last_comment = ''
             pos = pos + 1
 
     lnum -= 1
     if not (flags & consts.PyCF_DONT_IMPLY_DEDENT):
-        if token_list and token_list[-1][0] != tokens.NEWLINE:
-            tok = (tokens.NEWLINE, '', lnum, 0, '\n')
+        if token_list and token_list[-1].token_type != tokens.NEWLINE:
+            tok = Token(tokens.NEWLINE, '', lnum, 0, '\n')
             token_list.append(tok)
         for indent in indents[1:]:                # pop remaining indent levels
-            token_list.append((tokens.DEDENT, '', lnum, pos, line))
-    tok = (tokens.NEWLINE, '', lnum, 0, '\n')
+            token_list.append(Token(tokens.DEDENT, '', lnum, pos, line))
+    tok = Token(tokens.NEWLINE, '', lnum, 0, '\n')
     token_list.append(tok)
-    token_list.append((tokens.ENDMARKER, '', lnum, pos, line))
+    token_list.append(Token(tokens.ENDMARKER, '', lnum, pos, line))
     return token_list
diff --git a/pypy/interpreter/pyparser/test/test_automata.py b/pypy/interpreter/pyparser/test/test_automata.py
--- a/pypy/interpreter/pyparser/test/test_automata.py
+++ b/pypy/interpreter/pyparser/test/test_automata.py
@@ -1,4 +1,4 @@
-from pypy.interpreter.pyparser.automata import DFA, DEFAULT
+from pypy.interpreter.pyparser.automata import DFA, NonGreedyDFA, DEFAULT
 
 def test_states():
     d = DFA([{"\x00": 1}, {"\x01": 0}], [False, True])
@@ -10,3 +10,20 @@
     assert d.states == "\x01\x00"
     assert d.defaults == "\xff\x00"
     assert d.max_char == 1
+
+def test_recognize():
+    d = DFA([{"a": 1}, {"b": 0}], [False, True])
+    assert d.recognize("ababab") == 5
+    assert d.recognize("c") == -1
+
+    d = DFA([{"a": 1}, {DEFAULT: 0}], [False, True])
+    assert d.recognize("a,a?ab") == 5
+    assert d.recognize("c") == -1
+
+    d = NonGreedyDFA([{"a": 1}, {"b": 0}], [False, True])
+    assert d.recognize("ababab") == 1
+    assert d.recognize("c") == -1
+
+    d = NonGreedyDFA([{"a": 1}, {DEFAULT: 0}], [False, True])
+    assert d.recognize("a,a?ab") == 1
+    assert d.recognize("c") == -1
diff --git a/pypy/interpreter/pyparser/test/test_parser.py b/pypy/interpreter/pyparser/test/test_parser.py
--- a/pypy/interpreter/pyparser/test/test_parser.py
+++ b/pypy/interpreter/pyparser/test/test_parser.py
@@ -20,7 +20,7 @@
         rl = StringIO.StringIO(input + "\n").readline
         gen = tokenize.generate_tokens(rl)
         for tp, value, begin, end, line in gen:
-            if self.add_token(tp, value, begin[0], begin[1], line):
+            if self.add_token(parser.Token(tp, value, begin[0], begin[1], line)):
                 py.test.raises(StopIteration, gen.next)
         return self.root
 
diff --git a/pypy/interpreter/pyparser/test/test_pytokenizer.py b/pypy/interpreter/pyparser/test/test_pytokenizer.py
--- a/pypy/interpreter/pyparser/test/test_pytokenizer.py
+++ b/pypy/interpreter/pyparser/test/test_pytokenizer.py
@@ -1,5 +1,6 @@
 import pytest
 from pypy.interpreter.pyparser import pytokenizer
+from pypy.interpreter.pyparser.parser import Token
 from pypy.interpreter.pyparser.pygram import tokens
 from pypy.interpreter.pyparser.error import TokenError
 
@@ -22,12 +23,12 @@
         line = "a+1"
         tks = tokenize(line)
         assert tks == [
-            (tokens.NAME, 'a', 1, 0, line),
-            (tokens.PLUS, '+', 1, 1, line),
-            (tokens.NUMBER, '1', 1, 2, line),
-            (tokens.NEWLINE, '', 2, 0, '\n'),
-            (tokens.NEWLINE, '', 2, 0, '\n'),
-            (tokens.ENDMARKER, '', 2, 0, ''),
+            Token(tokens.NAME, 'a', 1, 0, line),
+            Token(tokens.PLUS, '+', 1, 1, line),
+            Token(tokens.NUMBER, '1', 1, 2, line),
+            Token(tokens.NEWLINE, '', 2, 0, '\n'),
+            Token(tokens.NEWLINE, '', 2, 0, '\n'),
+            Token(tokens.ENDMARKER, '', 2, 0, ''),
             ]
 
     def test_error_parenthesis(self):
diff --git a/pypy/module/parser/pyparser.py b/pypy/module/parser/pyparser.py
--- a/pypy/module/parser/pyparser.py
+++ b/pypy/module/parser/pyparser.py
@@ -133,10 +133,9 @@
                          space.newtext(message))
 
 
-def get_node_type(space, w_tuple):
+def get_node_type(space, tup_w, w_tuple):
     try:
-        w_type = space.getitem(w_tuple, space.newint(0))
-        return space.int_w(w_type)
+        return space.int_w(tup_w[0])
     except OperationError:
         raise parser_error(space, w_tuple, "Illegal component tuple.")
 
@@ -145,44 +144,47 @@
         self.lineno = 0
 
 def build_node_tree(space, w_tuple):
-    type = get_node_type(space, w_tuple)
+    tup_w = space.unpackiterable(w_tuple)
+    if len(tup_w) == 0:
+        raise parser_error(space, w_tuple, "tuple too short")
+
+    type = get_node_type(space, tup_w, w_tuple)
     node_state = NodeState()
     if 0 <= type < 256:
         # The tuple is simple, but it doesn't start with a start symbol.
         # Raise an exception now and be done with it.
         raise parser_error(space, w_tuple,
                            "Illegal syntax-tree; cannot start with terminal symbol.")
+    return build_node_children(space, type, tup_w, node_state)
+
+def build_node_children(space, type, tup_w, node_state):
     node = pyparse.parser.Nonterminal(type)
-    build_node_children(space, w_tuple, node, node_state)
-    return node
-
-def build_node_children(space, w_tuple, node, node_state):
-    for w_elem in space.unpackiterable(w_tuple)[1:]:
-        type = get_node_type(space, w_elem)
+    for i in range(1, len(tup_w)):
+        w_elem = tup_w[i]
+        subtup_w = space.unpackiterable(w_elem)
+        type = get_node_type(space, subtup_w, w_elem)
         if type < 256:  # Terminal node
-            length = space.len_w(w_elem)
+            length = len(subtup_w)
             if length == 2:
-                _, w_obj = space.unpackiterable(w_elem, 2)
+                _, w_obj = subtup_w
             elif length == 3:
-                _, w_obj, w_lineno = space.unpackiterable(w_elem, 3)
+                _, w_obj, w_lineno = subtup_w
             else:
                 raise parse_error(
                     space, "terminal nodes must have 2 or 3 entries")
             strn = space.text_w(w_obj)
             child = pyparse.parser.Terminal(type, strn, node_state.lineno, 0)
         else:
-            child = pyparse.parser.Nonterminal(type)
+            child = build_node_children(space, type, subtup_w, node_state)
         node.append_child(child)
-        if type >= 256:  # Nonterminal node
-            build_node_children(space, w_elem, child, node_state)
-        elif type == pyparse.pygram.tokens.NEWLINE:
+        if type == pyparse.pygram.tokens.NEWLINE:
            node_state.lineno += 1
+    return node
 
 
-def validate_node(space, tree):
+def validate_node(space, tree, parser):
     assert tree.type >= 256
     type = tree.type - 256
-    parser = pyparse.PythonParser(space)
     if type >= len(parser.grammar.dfas):
         raise parse_error(space, "Unrecognized node type %d." % type)
     dfa = parser.grammar.dfas[type]
@@ -195,7 +197,7 @@
             if label == ch.type:
                 # The child is acceptable; validate it recursively
                 if ch.type >= 256:
-                    validate_node(space, ch)
+                    validate_node(space, ch, parser)
                 # Update the state, and move on to the next child.
                 arcs, is_accepting = dfa.states[next_state]
                 break
@@ -209,5 +211,6 @@
 def tuple2st(space, w_sequence):
     # Convert the tree to the internal form before checking it
     tree = build_node_tree(space, w_sequence)
-    validate_node(space, tree)
+    parser = pyparse.PythonParser(space)
+    validate_node(space, tree, parser)
     return W_STType(tree, 'eval')
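
For readers skimming the diff: the heart of the change is that the tokenizer and parser now pass around Token objects instead of bare 5-tuples, so call sites use named attributes (token_type, value, lineno, column, line) rather than positional indexing. The short standalone sketch below illustrates the before/after shape of that API; the Token class mirrors the one added to pypy/interpreter/pyparser/parser.py above, while NAME is a made-up stand-in for the real pygram.tokens.NAME id, used here only so the snippet runs on its own.

    # Standalone sketch of the tuple -> Token refactoring.  The Token class
    # matches the one added in this changeset; NAME is a hypothetical token
    # id standing in for pygram.tokens.NAME.

    class Token(object):
        def __init__(self, token_type, value, lineno, column, line):
            self.token_type = token_type
            self.value = value
            self.lineno = lineno
            # 0-based offset
            self.column = column
            self.line = line

    NAME = 1  # stand-in token id

    # Before: a token was a plain 5-tuple, read by position.
    old_tok = (NAME, "print", 3, 0, "print x\n")
    assert old_tok[0] == NAME and old_tok[1] == "print"

    # After: the same data travels as a Token, read by attribute name.
    new_tok = Token(NAME, "print", 3, 0, "print x\n")
    assert new_tok.token_type == NAME and new_tok.value == "print"
    assert (new_tok.lineno, new_tok.column) == (3, 0)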