Author: Armin Rigo <ar...@tunes.org> Branch: py3.5-newtext Changeset: r89098:e5f85b6b5bbf Date: 2016-12-16 14:51 +0100 http://bitbucket.org/pypy/pypy/changeset/e5f85b6b5bbf/
Log: Start diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py --- a/pypy/interpreter/baseobjspace.py +++ b/pypy/interpreter/baseobjspace.py @@ -6,7 +6,7 @@ from rpython.rlib.buffer import StringBuffer from rpython.rlib.debug import make_sure_not_resized from rpython.rlib.objectmodel import (we_are_translated, newlist_hint, - compute_unique_id, specialize) + compute_unique_id, specialize, not_rpython) from rpython.rlib.signature import signature from rpython.rlib.rarithmetic import r_uint, SHRT_MIN, SHRT_MAX, \ INT_MIN, INT_MAX, UINT_MAX, USHRT_MAX @@ -255,6 +255,9 @@ def identifier_w(self, space): self._typed_unwrap_error(space, "string") + def text_w(self, space): + self._typed_unwrap_error(space, "string") + def bytearray_list_of_chars_w(self, space): self._typed_unwrap_error(space, "bytearray") @@ -1570,18 +1573,20 @@ return None if self.is_none(w_obj) else self.str_w(w_obj) def text_or_None_w(self, w_obj): - return None if self.is_none(w_obj) else self.identifier_w(w_obj) + return None if self.is_none(w_obj) else self.text_w(w_obj) + @not_rpython def str_w(self, w_obj): """ - if w_obj is unicode, call identifier_w() (i.e., return the UTF-8 + if w_obj is unicode, call text_w() (i.e., return the UTF-8-nosg encoded string). Else, call bytes_w(). - Maybe we should kill str_w completely and manually substitute it with - identifier_w/bytes_w at all call sites? + We should kill str_w completely and manually substitute it with + text_w/identifier_w/bytes_w at all call sites. It remains for + now for tests only. """ if self.isinstance_w(w_obj, self.w_unicode): - return w_obj.identifier_w(self) + return w_obj.text_w(self) else: return w_obj.bytes_w(self) @@ -1660,11 +1665,22 @@ raise oefmt(self.w_TypeError, "argument must be a unicode") return self.unicode_w(w_obj) + def text_w(self, w_obj): + """ + Unwrap a unicode object and return a 'utf-8-nosg' byte string + ('no surrogate'). 
This encoding always works and is in one-to- + one correspondence with the unicode. + """ + return w_obj.text_w(self) + def identifier_w(self, w_obj): """ Unwrap an object which is used as an identifier (i.e. names of variables, methods, functions, classes etc.). In py3k, identifiers are unicode strings and are unwrapped as UTF-8 encoded byte strings. + This differs from space.text_w() because it raises an app-level + UnicodeEncodeError if the unicode string contains surrogates. + This corresponds exactly to 'str.encode(obj, "utf-8")' at app-level. """ return w_obj.identifier_w(self) diff --git a/pypy/module/__pypy__/interp_stderrprinter.py b/pypy/module/__pypy__/interp_stderrprinter.py --- a/pypy/module/__pypy__/interp_stderrprinter.py +++ b/pypy/module/__pypy__/interp_stderrprinter.py @@ -34,8 +34,8 @@ return space.wrap(res) def descr_write(self, space, w_data): - # Encode to UTF-8. - data = space.identifier_w(w_data) + # Encode to UTF-8-nosg. + data = space.text_w(w_data) try: n = os.write(self.fd, data) diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py --- a/pypy/objspace/std/test/test_unicodeobject.py +++ b/pypy/objspace/std/test/test_unicodeobject.py @@ -30,6 +30,16 @@ space.w_unicode, "__new__", space.w_unicode, w_uni) assert w_new is w_uni + def test_identifier_or_text_w(self): + space = self.space + w_uni = space.wrap(u'abcd') + assert space.identifier_w(w_uni) == 'abcd' + assert space.text_w(w_uni) == 'abcd' + w_uni = space.wrap(unichr(0xd921) + unichr(0xdddd)) + space.raises_w(space.w_UnicodeEncodeError, space.identifier_w, w_uni) + assert space.text_w(w_uni) == '\xed\xa4\xa1\xed\xb7\x9d' + # ^^^ and not the 4-bytes combined character + class AppTestUnicodeStringStdOnly: def test_compares(self): diff --git a/pypy/objspace/std/typeobject.py b/pypy/objspace/std/typeobject.py --- a/pypy/objspace/std/typeobject.py +++ b/pypy/objspace/std/typeobject.py @@ -1073,7 +1073,7 @@ "__slots__ items must be 
strings, not '%T'", w_name) if not _isidentifier(space.unicode_w(w_name)): raise oefmt(space.w_TypeError, "__slots__ must be identifiers") - return w_name.identifier_w(space) + return w_name.text_w(space) def create_all_slots(w_self, hasoldstylebase, w_bestbase, force_new_layout): from pypy.objspace.std.listobject import StringSort diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py --- a/pypy/objspace/std/unicodeobject.py +++ b/pypy/objspace/std/unicodeobject.py @@ -6,7 +6,9 @@ from rpython.rlib.rstring import StringBuilder, UnicodeBuilder from rpython.rlib.runicode import ( make_unicode_escape_function, str_decode_ascii, str_decode_utf_8, - unicode_encode_ascii, unicode_encode_utf_8, fast_str_decode_ascii) + unicode_encode_ascii, unicode_encode_utf_8, fast_str_decode_ascii, + unicode_encode_utf8sp, unicode_encode_utf8_forbid_surrogates, + SurrogateError) from rpython.rlib import jit from pypy.interpreter import unicodehelper @@ -77,24 +79,35 @@ def unicode_w(self, space): return self._value - def identifier_w(self, space): + def _identifier_or_text_w(self, space, ignore_sg): try: identifier = jit.conditional_call_elidable( self._utf8, g_encode_utf8, self._value) if not jit.isconstant(self): self._utf8 = identifier - except UnicodeEncodeError: - # bah, this is just to get an official app-level - # UnicodeEncodeError + except SurrogateError: + # If 'ignore_sg' is False, this logic is here only + # to get an official app-level UnicodeEncodeError. + # If 'ignore_sg' is True, we encode instead using + # unicode_encode_utf8sp(). 
u = self._value - eh = unicodehelper.rpy_encode_error_handler() - try: - identifier = unicode_encode_utf_8(u, len(u), None, - errorhandler=eh) - except unicodehelper.RUnicodeEncodeError as ue: - raise wrap_encode_error(space, ue) + if ignore_sg: + identifier = unicode_encode_utf8sp(u, len(u)) + else: + eh = unicodehelper.rpy_encode_error_handler() + try: + identifier = unicode_encode_utf_8(u, len(u), None, + errorhandler=eh) + except unicodehelper.RUnicodeEncodeError as ue: + raise wrap_encode_error(space, ue) return identifier + def text_w(self, space): + return self._identifier_or_text_w(space, ignore_sg=True) + + def identifier_w(self, space): + return self._identifier_or_text_w(space, ignore_sg=False) + def listview_unicode(self): return _create_list_from_unicode(self._value) @@ -1279,7 +1292,7 @@ @jit.elidable def g_encode_utf8(value): """This is a global function because of jit.conditional_call_value""" - return value.encode('utf-8') + return unicode_encode_utf8_forbid_surrogates(value, len(value)) _repr_function, _ = make_unicode_escape_function( pass_printable=True, unicode_output=True, quotes=True, prefix='') diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py --- a/rpython/rlib/runicode.py +++ b/rpython/rlib/runicode.py @@ -428,6 +428,37 @@ _encodeUCS4(result, ch) return result.build() +class SurrogateError(Exception): + pass + +def unicode_encode_utf8_forbid_surrogates(s, size): + # Strict surrogate-forbidding utf-8 encoding. Any surrogate character + # raises an interp-level SurrogateError, even on 16-bit hosts. 
+ # --- XXX check in detail what occurs on 16-bit hosts in PyPy 3 --- + assert(size >= 0) + result = StringBuilder(size) + pos = 0 + while pos < size: + ch = ord(s[pos]) + pos += 1 + if ch < 0x80: + # Encode ASCII + result.append(chr(ch)) + elif ch < 0x0800: + # Encode Latin-1 + result.append(chr((0xc0 | (ch >> 6)))) + result.append(chr((0x80 | (ch & 0x3f)))) + elif ch < 0x10000: + if 0xD800 <= ch <= 0xDFFF: + raise SurrogateError + # Encode UCS2 Unicode ordinals + result.append((chr((0xe0 | (ch >> 12))))) + result.append((chr((0x80 | ((ch >> 6) & 0x3f))))) + result.append((chr((0x80 | (ch & 0x3f))))) + else: + _encodeUCS4(result, ch) + return result.build() + # ____________________________________________________________ # utf-16 _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit