Author: fijal
Branch: unicode-utf8
Changeset: r92615:0379d71a32bf
Date: 2017-10-05 18:40 +0200
http://bitbucket.org/pypy/pypy/changeset/0379d71a32bf/

Log: whack whack whack whack. I hate RPython

diff --git a/TODO b/TODO
--- a/TODO
+++ b/TODO
@@ -1,1 +1,2 @@
 * unskip tests in test_unicodeobject.py
+* rutf8.prev_codepoint_pos should use r_uint
\ No newline at end of file
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -70,7 +70,7 @@
     try:
         length = rutf8.check_utf8(string, allow_surrogates=True)
     except rutf8.CheckError as e:
-        XXX
+        raise Exception("foo")
         decode_error_handler(space)('strict', 'utf8', e.msg, string, e.startpos,
                                     e.endpos)
         raise False, "unreachable"
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -37,7 +37,7 @@
         assert length >= 0
         self._utf8 = utf8str
         self._length = length
-        self._index_storage = None
+        self._index_storage = rutf8.null_storage()
         if not we_are_translated():
             assert rutf8.check_utf8(utf8str, allow_surrogates=True) == length
 
@@ -521,6 +521,8 @@
             if keepends:
                 eol = pos
                 lgt += line_end_chars
+            assert eol >= 0
+            assert sol >= 0
             strs_w.append(W_UnicodeObject(value[sol:eol], lgt))
         return space.newlist(strs_w)
 
@@ -636,7 +638,7 @@
     def _getitem_result(self, space, index):
         if index >= self._length:
             raise oefmt(space.w_IndexError, "string index out of range")
-        if self._index_storage is None:
+        if self._index_storage == rutf8.null_storage():
             self._index_storage = rutf8.create_utf8_index_storage(self._utf8,
                 self._length)
         start = rutf8.codepoint_position_at_index(self._utf8,
diff --git a/rpython/rlib/rstring.py b/rpython/rlib/rstring.py
--- a/rpython/rlib/rstring.py
+++ b/rpython/rlib/rstring.py
@@ -7,7 +7,7 @@
 from rpython.rtyper.llannotation import SomePtr
 from rpython.rlib import jit
 from rpython.rlib.objectmodel import newlist_hint, resizelist_hint, specialize, not_rpython
-from rpython.rlib.rarithmetic import ovfcheck, LONG_BIT as BLOOM_WIDTH
+from rpython.rlib.rarithmetic import ovfcheck, LONG_BIT as BLOOM_WIDTH, intmask
 from rpython.rlib.unicodedata import unicodedb_5_2_0 as unicodedb
 from rpython.rtyper.extregistry import ExtRegistryEntry
 from rpython.tool.pairtype import pairtype
@@ -32,7 +32,9 @@
     if isutf8:
         from rpython.rlib.rutf8 import next_codepoint_pos
         assert pos >= 0
-        return next_codepoint_pos(s, pos)
+        r = next_codepoint_pos(s, pos)
+        assert r >= 0
+        return r
     else:
         return pos + 1
 
@@ -42,7 +44,7 @@
         from rpython.rlib.rutf8 import prev_codepoint_pos
         if pos <= 0:
             return -1
-        return prev_codepoint_pos(s, pos)
+        return intmask(prev_codepoint_pos(s, pos))
     else:
         return pos - 1
 
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -18,7 +18,7 @@
 from rpython.rlib.objectmodel import enforceargs
 from rpython.rlib.rstring import StringBuilder
 from rpython.rlib import jit
-from rpython.rlib.rarithmetic import r_uint
+from rpython.rlib.rarithmetic import r_uint, intmask
 from rpython.rtyper.lltypesystem import lltype
 
 
@@ -81,6 +81,7 @@
     Assumes valid utf8. 'pos' must be before the end of the string.
     """
     chr1 = ord(code[pos])
+    assert pos >= 0
     if chr1 <= 0x7F:
         return pos + 1
     if chr1 <= 0xDF:
@@ -93,20 +94,24 @@
     """Gives the position of the previous codepoint.
     'pos' must not be zero.
     """
-    pos = r_uint(pos)
-    pos -= 1
+    pos -= 1 # ruint
     if pos >= len(code):  # for the case where pos - 1 == len(code):
+        assert pos >= 0
         return pos  # assume there is an extra '\x00' character
     chr1 = ord(code[pos])
     if chr1 <= 0x7F:
+        assert pos >= 0
         return pos
     pos -= 1
     if ord(code[pos]) >= 0xC0:
+        assert pos >= 0
         return pos
     pos -= 1
     if ord(code[pos]) >= 0xC0:
+        assert pos >= 0
         return pos
     pos -= 1
+    assert pos >= 0
     return pos
 
 def compute_length_utf8(s):
@@ -375,6 +380,9 @@
     for _j in range(16):
         ASCII_INDEX_STORAGE[_i].ofs[_j] = chr(_j * 4 + 1)
 
+def null_storage():
+    return lltype.nullptr(UTF8_INDEX_STORAGE)
+
 def create_utf8_index_storage(utf8, utf8len):
     """ Create an index storage which stores index of each 4th character
     in utf8 encoded unicode string.
@@ -421,6 +429,7 @@
     if index == 0:
         return prev_codepoint_pos(utf8, bytepos)
     elif index == 1:
+        assert bytepos >= 0
         return bytepos
     elif index == 2:
         return next_codepoint_pos(utf8, bytepos)
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit