Author: fijal Branch: unicode-utf8 Changeset: r93109:86548802b11b Date: 2017-11-21 10:29 +0100 http://bitbucket.org/pypy/pypy/changeset/86548802b11b/
Log: interpreter fixes diff --git a/pypy/interpreter/pyparser/parsestring.py b/pypy/interpreter/pyparser/parsestring.py --- a/pypy/interpreter/pyparser/parsestring.py +++ b/pypy/interpreter/pyparser/parsestring.py @@ -231,11 +231,14 @@ return s[pt:ps] def decode_utf8_recode(space, s, ps, end, recode_encoding): - lgt, flag = unicodehelper.check_utf8_or_raise(space, s, ps, end) - w_v = unicodehelper.encode(space, space.newutf8(s[ps:end], lgt, flag), + p = ps + while p < end and ord(s[p]) & 0x80: + p += 1 + lgt, flag = unicodehelper.check_utf8_or_raise(space, s, ps, p) + w_v = unicodehelper.encode(space, space.newutf8(s[ps:p], lgt, flag), recode_encoding) v = space.bytes_w(w_v) - return v, ps + return v, p def raise_app_valueerror(space, msg): raise OperationError(space.w_ValueError, space.newtext(msg)) diff --git a/pypy/interpreter/pyparser/test/test_parsestring.py b/pypy/interpreter/pyparser/test/test_parsestring.py --- a/pypy/interpreter/pyparser/test/test_parsestring.py +++ b/pypy/interpreter/pyparser/test/test_parsestring.py @@ -10,7 +10,7 @@ assert space.str_w(w_ret) == value elif isinstance(value, unicode): assert space.type(w_ret) == space.w_unicode - assert space.unicode_w(w_ret) == value + assert space.utf8_w(w_ret).decode('utf8') == value else: assert False @@ -102,7 +102,4 @@ def test_decode_unicode_utf8(self): buf = parsestring.decode_unicode_utf8(self.space, 'u"\xf0\x9f\x92\x8b"', 2, 6) - if sys.maxunicode == 65535: - assert buf == r"\U0000d83d\U0000dc8b" - else: - assert buf == r"\U0001f48b" + assert buf == r"\U0001f48b" diff --git a/pypy/interpreter/test/test_objspace.py b/pypy/interpreter/test/test_objspace.py --- a/pypy/interpreter/test/test_objspace.py +++ b/pypy/interpreter/test/test_objspace.py @@ -216,9 +216,7 @@ space = self.space w = space.wrap assert space.text0_w(w("123")) == "123" - exc = space.raises_w(space.w_TypeError, space.text0_w, w("123\x004")) - assert space.unicode0_w(w(u"123")) == u"123" - exc = space.raises_w(space.w_TypeError, space.unicode0_w, w(u"123\x004")) + space.raises_w(space.w_TypeError, space.text0_w, w("123\x004")) def test_getindex_w(self): w_instance1 = self.space.appexec([], """(): diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -351,12 +351,12 @@ try: chr = r_uint(int(s[pos:pos+digits], 16)) except ValueError: - aaaa endinpos = pos while s[endinpos] in hexdigits: endinpos += 1 res, pos = errorhandler(errors, encoding, message, s, pos-2, endinpos) + size, flag = rutf8.check_utf8(res, True) builder.append(res) else: # when we get here, chr is a 32-bit unicode character @@ -1392,7 +1392,7 @@ while pos < size: ch = s[pos] - c = mapping.get(ch, ERROR_CHAR) + c = mapping.get(ord(ch), ERROR_CHAR) if c == ERROR_CHAR: r, pos = errorhandler(errors, "charmap", "character maps to <undefined>", @@ -1407,20 +1407,17 @@ def utf8_encode_charmap(s, errors, errorhandler=None, mapping=None): - YYY + size = len(s) if mapping is None: - return unicode_encode_latin_1(s, size, errors, - errorhandler=errorhandler) - - if errorhandler is None: - errorhandler = default_unicode_error_encode + return utf8_encode_latin_1(s, size, errors, + errorhandler=errorhandler) if size == 0: return '' result = StringBuilder(size) pos = 0 while pos < size: - ch = s[pos] + ch = rutf8.codepoint_at_pos(s, pos) c = mapping.get(ch, '') if len(c) == 0: @@ -1428,9 +1425,10 @@ collend = pos + 1 while collend < size and mapping.get(s[collend], '') == '': collend += 1 - ru, rs, pos = errorhandler(errors, "charmap", - "character maps to <undefined>", - s, pos, collend) + rs, pos = errorhandler(errors, "charmap", + "character maps to <undefined>", + s, pos, collend) + XXXX if rs is not None: # py3k only result.append(rs) @@ -1445,6 +1443,6 @@ result.append(c2) continue result.append(c) - pos += 1 + pos = rutf8.next_codepoint_pos(s, pos) return result.build() diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py --- a/pypy/module/_codecs/interp_codecs.py +++ b/pypy/module/_codecs/interp_codecs.py @@ -551,10 +551,10 @@ # get the character from the mapping if self.mapping_w is not None: - w_ch = self.mapping_w[ord(ch)] + w_ch = self.mapping_w[ch] else: try: - w_ch = space.getitem(self.w_mapping, space.newint(ord(ch))) + w_ch = space.getitem(self.w_mapping, space.newint(ch)) except OperationError as e: if not e.match(space, space.w_LookupError): raise @@ -587,7 +587,7 @@ # get the character from the mapping try: - w_ch = space.getitem(self.w_mapping, space.newint(ord(ch))) + w_ch = space.getitem(self.w_mapping, space.newint(ch)) except OperationError as e: if not e.match(space, space.w_LookupError): raise @@ -633,8 +633,8 @@ return space.newtuple([space.newutf8(result, lgt, flag), space.newint(consumed)]) -@unwrap_spec(utf8='utf8', errors='text_or_none') -def charmap_encode(space, utf8, errors="strict", w_mapping=None): +@unwrap_spec(errors='text_or_none') +def charmap_encode(space, w_unicode, errors="strict", w_mapping=None): from pypy.interpreter import unicodehelper if errors is None: @@ -645,9 +645,10 @@ mapping = Charmap_Encode(space, w_mapping) state = space.fromcache(CodecState) - result = unicodehelper.unicode_encode_charmap( - utf8, errors, state.encode_error_handler, mapping) - return space.newtuple([space.newbytes(result), space.newint(len(uni))]) + w_uni = unicodehelper.convert_arg_to_w_unicode(space, w_unicode) + result = unicodehelper.utf8_encode_charmap( + space.utf8_w(w_uni), errors, state.encode_error_handler, mapping) + return space.newtuple([space.newbytes(result), space.newint(w_uni._len())]) @unwrap_spec(chars='utf8') _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit