Author: Carl Friedrich Bolz-Tereick <cfb...@gmx.de> Branch: py3.6 Changeset: r97745:1b9016af40bd Date: 2019-10-09 13:39 +0200 http://bitbucket.org/pypy/pypy/changeset/1b9016af40bd/
Log: speed up utf8-handling of csv module diff --git a/pypy/module/_csv/interp_reader.py b/pypy/module/_csv/interp_reader.py --- a/pypy/module/_csv/interp_reader.py +++ b/pypy/module/_csv/interp_reader.py @@ -1,5 +1,5 @@ from rpython.rlib.rstring import UnicodeBuilder -from rpython.rlib.rutf8 import Utf8StringIterator +from rpython.rlib.rutf8 import Utf8StringIterator, Utf8StringBuilder from rpython.rlib import objectmodel from pypy.interpreter.baseobjspace import W_Root from pypy.interpreter.error import OperationError @@ -38,16 +38,15 @@ assert field_builder is not None if field_builder.getlength() >= field_limit.limit: raise self.error(u"field larger than field limit") - field_builder.append(c) + field_builder.append_code(c) def save_field(self, field_builder): space = self.space field = field_builder.build() + w_obj = space.newutf8(field, field_builder.getlength()) if self.numeric_field: self.numeric_field = False - w_obj = space.call_function(space.w_float, space.newtext(field)) - else: - w_obj = space.newtext(field) + w_obj = space.call_function(space.w_float, w_obj) self.fields_w.append(w_obj) def next_w(self): @@ -79,13 +78,11 @@ u"(did you open the file in text mode?") line = space.utf8_w(w_line) for c in Utf8StringIterator(line): - # XXX rewrite this to use c (as int) not unichr(c) - c = unichr(c) - if c == '\0': + if c == 0: raise self.error(u"line contains NULL byte") if state == START_RECORD: - if c == b'\n' or c == b'\r': + if c == ord(u'\n') or c == ord(u'\r'): state = EAT_CRNL continue # normal character - handle as START_FIELD @@ -93,23 +90,23 @@ # fall-through to the next case if state == START_FIELD: - field_builder = UnicodeBuilder(64) + field_builder = Utf8StringBuilder(64) # expecting field - if c == u'\n' or c == u'\r': + if c == ord(u'\n') or c == ord(u'\r'): # save empty field self.save_field(field_builder) state = EAT_CRNL - elif (c == dialect.quotechar and + elif (c == ord(dialect.quotechar) and dialect.quoting != QUOTE_NONE): # start quoted field state = IN_QUOTED_FIELD - elif c == dialect.escapechar: + elif c == ord(dialect.escapechar): # possible escaped character state = ESCAPED_CHAR - elif c == u' ' and dialect.skipinitialspace: + elif c == ord(u' ') and dialect.skipinitialspace: # ignore space at start of field pass - elif c == dialect.delimiter: + elif c == ord(dialect.delimiter): # save empty field self.save_field(field_builder) else: @@ -120,7 +117,7 @@ state = IN_FIELD elif state == ESCAPED_CHAR: - if c in '\n\r': + if c == ord(u'\n') or c == ord(u'\r'): self.add_char(field_builder, c) state = AFTER_ESCAPED_CRNL else: @@ -129,14 +126,14 @@ elif state == IN_FIELD or state == AFTER_ESCAPED_CRNL: # in unquoted field - if c == u'\n' or c == u'\r': + if c == ord(u'\n') or c == ord(u'\r'): # end of line self.save_field(field_builder) state = EAT_CRNL - elif c == dialect.escapechar: + elif c == ord(dialect.escapechar): # possible escaped character state = ESCAPED_CHAR - elif c == dialect.delimiter: + elif c == ord(dialect.delimiter): # save field - wait for new field self.save_field(field_builder) state = START_FIELD @@ -146,10 +143,10 @@ elif state == IN_QUOTED_FIELD: # in quoted field - if c == dialect.escapechar: + if c == ord(dialect.escapechar): # Possible escape character state = ESCAPE_IN_QUOTED_FIELD - elif (c == dialect.quotechar and + elif (c == ord(dialect.quotechar) and dialect.quoting != QUOTE_NONE): if dialect.doublequote: # doublequote; " represented by "" @@ -168,15 +165,15 @@ elif state == QUOTE_IN_QUOTED_FIELD: # doublequote - seen a quote in an quoted field if (dialect.quoting != QUOTE_NONE and - c == dialect.quotechar): + c == ord(dialect.quotechar)): # save "" as " self.add_char(field_builder, c) state = IN_QUOTED_FIELD - elif c == dialect.delimiter: + elif c == ord(dialect.delimiter): # save field - wait for new field self.save_field(field_builder) state = START_FIELD - elif c == u'\n' or c == u'\r': + elif c == ord(u'\n') or c == ord(u'\r'): # end of line self.save_field(field_builder) state = EAT_CRNL @@ -189,7 +186,7 @@ dialect.delimiter, dialect.quotechar)) elif state == EAT_CRNL: - if not (c == u'\n' or c == u'\r'): + if not (c == ord(u'\n') or c == ord(u'\r')): raise self.error(u"new-line character seen in unquoted " u"field - do you need to open the file " u"in universal-newline mode?") @@ -198,16 +195,16 @@ self.save_field(field_builder) break elif state == ESCAPED_CHAR: - self.add_char(field_builder, u'\n') + self.add_char(field_builder, ord(u'\n')) state = IN_FIELD elif state == IN_QUOTED_FIELD: pass elif state == ESCAPE_IN_QUOTED_FIELD: - self.add_char(field_builder, u'\n') + self.add_char(field_builder, ord(u'\n')) state = IN_QUOTED_FIELD elif state == START_FIELD: # save empty field - field_builder = UnicodeBuilder(1) + field_builder = Utf8StringBuilder() self.save_field(field_builder) break elif state == AFTER_ESCAPED_CRNL: _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit