Author: fijal
Branch: unicode-utf8
Changeset: r92623:ecf3b7cd79eb
Date: 2017-10-06 12:14 +0200
http://bitbucket.org/pypy/pypy/changeset/ecf3b7cd79eb/
Log: whack whack whack until we get to the point of getitem working diff --git a/TODO b/TODO --- a/TODO +++ b/TODO @@ -1,2 +1,3 @@ * unskip tests in test_unicodeobject.py -* rutf8.prev_codepoint_pos should use r_uint \ No newline at end of file +* rutf8.prev_codepoint_pos should use r_uint +* elidable in rutf8.check_utf8, WTF is wrong with that \ No newline at end of file diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -126,8 +126,7 @@ self.orig = handler def handle(self, errors, encoding, msg, s, pos, endpos): - s, p = self.orig(errors, encoding, msg, s, pos, endpos) - return s.decode("utf8"), p + return self.orig(errors, encoding, msg, s, pos, endpos) class EncodeWrapper(object): def __init__(self, handler): @@ -145,7 +144,8 @@ def str_decode_unicode_escape(s, slen, errors, final, errorhandler, ud_handler): w = DecodeWrapper(errorhandler) - u, pos = runicode.str_decode_unicode_escape(s, slen, errors, final, w.handle, + u, pos = runicode.str_decode_unicode_escape(s, slen, errors, final, + w.handle, ud_handler) return u.encode('utf8'), pos, len(u) @@ -159,7 +159,7 @@ return getattr(runicode, encoder_call_name)(u, len(u), errors, w.handle) def decoder(s, slen, errors, final, errorhandler): - w = DecodeWrapper(errorhandler) + w = DecodeWrapper((errorhandler)) u, pos = getattr(runicode, decoder_name)(s, slen, errors, final, w.handle) return u.encode('utf8'), pos, len(u) encoder.__name__ = encoder_name diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py --- a/pypy/module/_codecs/interp_codecs.py +++ b/pypy/module/_codecs/interp_codecs.py @@ -66,7 +66,7 @@ "position %d from error handler out of bounds", newpos) w_replace = space.convert_to_w_unicode(w_replace) - return w_replace._utf8, newpos + return w_replace._utf8.decode('utf8'), newpos return call_errorhandler def make_decode_errorhandler(self, space): diff --git 
a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py --- a/pypy/objspace/std/unicodeobject.py +++ b/pypy/objspace/std/unicodeobject.py @@ -38,8 +38,8 @@ self._utf8 = utf8str self._length = length self._index_storage = rutf8.null_storage() - if not we_are_translated(): - assert rutf8.check_utf8(utf8str, allow_surrogates=True) == length + #if not we_are_translated(): + # assert rutf8.check_utf8(utf8str, allow_surrogates=True) == length def __repr__(self): """representation for debugging purposes""" diff --git a/rpython/jit/codewriter/effectinfo.py b/rpython/jit/codewriter/effectinfo.py --- a/rpython/jit/codewriter/effectinfo.py +++ b/rpython/jit/codewriter/effectinfo.py @@ -6,6 +6,10 @@ from rpython.tool.algo import bitstring +class UnsupportedFieldExc(Exception): + pass + + class EffectInfo(object): _cache = {} @@ -313,7 +317,10 @@ return if getattr(T.OF, fieldname) is lltype.Void: return - descr = cpu.interiorfielddescrof(T, fieldname) + try: + descr = cpu.interiorfielddescrof(T, fieldname) + except UnsupportedFieldExc: + return descrs_interiorfields.append(descr) # a read or a write to an interiorfield, inside an array of diff --git a/rpython/jit/codewriter/heaptracker.py b/rpython/jit/codewriter/heaptracker.py --- a/rpython/jit/codewriter/heaptracker.py +++ b/rpython/jit/codewriter/heaptracker.py @@ -94,6 +94,7 @@ def all_interiorfielddescrs(gccache, ARRAY, get_field_descr=None): from rpython.jit.backend.llsupport import descr + from rpython.jit.codewriter.effectinfo import UnsupportedFieldExc if get_field_descr is None: get_field_descr = descr.get_field_descr @@ -107,7 +108,7 @@ if name == 'typeptr': continue # dealt otherwise elif isinstance(FIELD, lltype.Struct): - raise Exception("unexpected array(struct(struct))") + raise UnsupportedFieldExc("unexpected array(struct(struct))") res.append(get_field_descr(gccache, ARRAY, name)) return res diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py --- a/rpython/rlib/runicode.py +++ 
b/rpython/rlib/runicode.py @@ -94,18 +94,20 @@ def default_unicode_error_decode(errors, encoding, msg, s, startingpos, endingpos): + assert endingpos >= 0 if errors == 'replace': - return u'\ufffd'.encode('utf8'), endingpos + return u'\ufffd', endingpos if errors == 'ignore': - return '', endingpos + return u'', endingpos raise UnicodeDecodeError(encoding, s, startingpos, endingpos, msg) def default_unicode_error_encode(errors, encoding, msg, u, startingpos, endingpos): + assert endingpos >= 0 if errors == 'replace': - return '?', None, endingpos + return u'?', None, endingpos if errors == 'ignore': - return '', None, endingpos + return u'', None, endingpos raise UnicodeEncodeError(encoding, u, startingpos, endingpos, msg) # ____________________________________________________________ diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py --- a/rpython/rlib/rutf8.py +++ b/rpython/rlib/rutf8.py @@ -191,7 +191,7 @@ def __init__(self, pos): self.pos = pos -@jit.elidable +#@jit.elidable def check_ascii(s): for i in range(len(s)): if ord(s[i]) > 0x7F: @@ -289,12 +289,14 @@ (ordch1 == 0xf4 and ordch2 > 0x8f)) -@jit.elidable +#@jit.elidable def check_utf8(s, allow_surrogates=False): """Check that 's' is a utf-8-encoded byte string. Returns the length (number of chars) or raise CheckError. Note that surrogates are not handled specially here. """ + import pdb + pdb.set_trace() pos = 0 continuation_bytes = 0 while pos < len(s): @@ -416,6 +418,7 @@ break return storage +@jit.dont_look_inside def codepoint_position_at_index(utf8, storage, index): """ Return byte index of a character inside utf8 encoded string, given storage of type UTF8_INDEX_STORAGE. 
The index must be smaller than @@ -436,6 +439,7 @@ else: return next_codepoint_pos(utf8, next_codepoint_pos(utf8, bytepos)) +@jit.dont_look_inside def codepoint_at_index(utf8, storage, index): """ Return codepoint of a character inside utf8 encoded string, given storage of type UTF8_INDEX_STORAGE diff --git a/rpython/rtyper/rstr.py b/rpython/rtyper/rstr.py --- a/rpython/rtyper/rstr.py +++ b/rpython/rtyper/rstr.py @@ -8,7 +8,88 @@ from rpython.rtyper.rint import IntegerRepr from rpython.rtyper.rfloat import FloatRepr from rpython.tool.pairtype import pairtype, pair -from rpython.tool.sourcetools import func_with_new_name + +def str_decode_utf8(s): + from rpython.rlib.rstring import UnicodeBuilder + from rpython.rlib import runicode + + size = len(s) + if size == 0: + return u'' + + result = UnicodeBuilder(size) + pos = 0 + while pos < size: + ordch1 = ord(s[pos]) + # fast path for ASCII + # XXX maybe use a while loop here + if ordch1 < 0x80: + result.append(unichr(ordch1)) + pos += 1 + continue + + n = ord(runicode._utf8_code_length[ordch1 - 0x80]) + if pos + n > size: + raise UnicodeDecodeError('utf8', s, pos, pos + 1, + 'whatever') + if n == 0: + raise UnicodeDecodeError('utf8', s, pos, pos + 1, + 'whatever') + elif n == 1: + assert 0, "ascii should have gone through the fast path" + + elif n == 2: + ordch2 = ord(s[pos+1]) + if runicode._invalid_byte_2_of_2(ordch2): + + raise UnicodeDecodeError('utf8', s, pos, pos + 1, + 'whatever') + # 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz + result.append(unichr(((ordch1 & 0x1F) << 6) + # 0b00011111 + (ordch2 & 0x3F))) # 0b00111111 + pos += 2 + + elif n == 3: + ordch2 = ord(s[pos+1]) + ordch3 = ord(s[pos+2]) + if (runicode._invalid_byte_2_of_3(ordch1, ordch2, True) or + runicode._invalid_byte_3_of_3(ordch3)): + raise UnicodeDecodeError('utf8', s, pos, pos + 1, + 'whatever') + # 1110xxxx 10yyyyyy 10zzzzzz -> 00000000 xxxxyyyy yyzzzzzz + result.append(unichr(((ordch1 & 0x0F) << 12) + # 0b00001111 + ((ordch2 & 0x3F) << 6) 
+ # 0b00111111 + (ordch3 & 0x3F))) # 0b00111111 + pos += 3 + + elif n == 4: + ordch2 = ord(s[pos+1]) + ordch3 = ord(s[pos+2]) + ordch4 = ord(s[pos+3]) + if (runicode._invalid_byte_2_of_4(ordch1, ordch2) or + runicode._invalid_byte_3_of_4(ordch3) or + runicode._invalid_byte_4_of_4(ordch4)): + + raise UnicodeDecodeError('utf8', s, pos, pos + 1, + 'whatever') + # 11110www 10xxxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz + c = (((ordch1 & 0x07) << 18) + # 0b00000111 + ((ordch2 & 0x3F) << 12) + # 0b00111111 + ((ordch3 & 0x3F) << 6) + # 0b00111111 + (ordch4 & 0x3F)) # 0b00111111 + if c <= runicode.MAXUNICODE: + result.append(runicode.UNICHR(c)) + else: + # compute and append the two surrogates: + # translate from 10000..10FFFF to 0..FFFF + c -= 0x10000 + # high surrogate = top 10 bits added to D800 + result.append(unichr(0xD800 + (c >> 10))) + # low surrogate = bottom 10 bits added to DC00 + result.append(unichr(0xDC00 + (c & 0x03FF))) + pos += 4 + + return result.build() class AbstractStringRepr(Repr): @@ -16,13 +97,10 @@ @jit.elidable def ll_decode_utf8(self, llvalue): from rpython.rtyper.annlowlevel import hlstr - from rpython.rlib import runicode value = hlstr(llvalue) assert value is not None - errorhandler = runicode.default_unicode_error_decode # NB. keep the arguments in sync with annotator/unaryop.py - u, pos = runicode.str_decode_utf_8_elidable( - value, len(value), 'strict', True, errorhandler, True) + u = str_decode_utf8(value) # XXX maybe the whole ''.decode('utf-8') should be not RPython. return self.ll.llunicode(u) _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit