Author: Philip Jenvey <[email protected]>
Branch: py3k-refactor-str-types
Changeset: r68869:9d4908e6605a
Date: 2014-01-23 11:07 -0800
http://bitbucket.org/pypy/pypy/changeset/9d4908e6605a/
Log: reintegrate our unicode changes
diff --git a/pypy/objspace/std/bytesobject.py b/pypy/objspace/std/bytesobject.py
--- a/pypy/objspace/std/bytesobject.py
+++ b/pypy/objspace/std/bytesobject.py
@@ -8,7 +8,7 @@
from pypy.objspace.std.formatting import mod_format
from pypy.objspace.std.stdtypedef import StdTypeDef
from pypy.objspace.std.stringmethods import StringMethods
-from pypy.objspace.std.unicodeobject import (unicode_from_string,
+from pypy.objspace.std.unicodeobject import (
decode_object, unicode_from_encoded_object, _get_encoding_and_errors)
from rpython.rlib.jit import we_are_jitted
from rpython.rlib.objectmodel import compute_hash, compute_unique_id, import_from_mixin
diff --git a/pypy/objspace/std/stringmethods.py b/pypy/objspace/std/stringmethods.py
--- a/pypy/objspace/std/stringmethods.py
+++ b/pypy/objspace/std/stringmethods.py
@@ -513,7 +513,14 @@
if self._startswith(space, value, w_prefix, start, end):
return space.w_True
return space.w_False
- return space.newbool(self._startswith(space, value, w_prefix, start, end))
+ try:
+ return space.newbool(self._startswith(space, value, w_prefix, start, end))
+ except OperationError as e:
+ if e.match(space, space.w_TypeError):
+ msg = ("startswith first arg must be str or a tuple of str, "
+ "not %T")
+ raise operationerrfmt(space.w_TypeError, msg, w_prefix)
+ raise
def _startswith(self, space, value, w_prefix, start, end):
return startswith(value, self._op_val(space, w_prefix), start, end)
@@ -527,7 +534,15 @@
if self._endswith(space, value, w_suffix, start, end):
return space.w_True
return space.w_False
- return space.newbool(self._endswith(space, value, w_suffix, start, end))
+ try:
+ return space.newbool(self._endswith(space, value, w_suffix, start,
+ end))
+ except OperationError as e:
+ if e.match(space, space.w_TypeError):
+ msg = ("endswith first arg must be str or a tuple of str, not "
+ "%T")
+ raise operationerrfmt(space.w_TypeError, msg, w_suffix)
+ raise
def _endswith(self, space, value, w_prefix, start, end):
return endswith(value, self._op_val(space, w_prefix), start, end)
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -16,7 +16,7 @@
__all__ = ['W_UnicodeObject', 'wrapunicode', 'plain_str2unicode',
'encode_object', 'decode_object', 'unicode_from_object',
- 'unicode_from_string', 'unicode_to_decimal_w']
+ 'unicode_to_decimal_w']
class W_UnicodeObject(W_Root):
@@ -99,9 +99,9 @@
def _op_val(self, space, w_other):
if isinstance(w_other, W_UnicodeObject):
return w_other._value
- if space.isinstance_w(w_other, space.w_str):
- return unicode_from_string(space, w_other)._value
- return unicode_from_encoded_object(space, w_other, None, "strict")._value
+ raise operationerrfmt(space.w_TypeError,
+ "Can't convert '%T' object to str implicitly",
+ w_other)
def _chr(self, char):
assert len(char) == 1
@@ -155,41 +155,101 @@
return space.newlist_unicode(lst)
@staticmethod
- @unwrap_spec(w_string = WrappedDefault(""))
- def descr_new(space, w_unicodetype, w_string, w_encoding=None,
+ @unwrap_spec(w_object = WrappedDefault(u''))
+ def descr_new(space, w_unicodetype, w_object=None, w_encoding=None,
w_errors=None):
# NB. the default value of w_obj is really a *wrapped* empty string:
# there is gateway magic at work
- w_obj = w_string
+ w_obj = w_object
encoding, errors = _get_encoding_and_errors(space, w_encoding,
w_errors)
- # convoluted logic for the case when unicode subclass has a __unicode__
- # method, we need to call this method
- is_precisely_unicode = space.is_w(space.type(w_obj), space.w_unicode)
- if (is_precisely_unicode or
- (space.isinstance_w(w_obj, space.w_unicode) and
- space.findattr(w_obj, space.wrap('__unicode__')) is None)):
- if encoding is not None or errors is not None:
- raise OperationError(space.w_TypeError, space.wrap(
- 'decoding Unicode is not supported'))
- if (is_precisely_unicode and
- space.is_w(w_unicodetype, space.w_unicode)):
- return w_obj
- w_value = w_obj
+ if encoding is None and errors is None:
+ w_value = unicode_from_object(space, w_obj)
else:
- if encoding is None and errors is None:
- w_value = unicode_from_object(space, w_obj)
- else:
- w_value = unicode_from_encoded_object(space, w_obj,
- encoding, errors)
- if space.is_w(w_unicodetype, space.w_unicode):
- return w_value
+ w_value = unicode_from_encoded_object(space, w_obj,
+ encoding, errors)
+ if space.is_w(w_unicodetype, space.w_unicode):
+ return w_value
assert isinstance(w_value, W_UnicodeObject)
w_newobj = space.allocate_instance(W_UnicodeObject, w_unicodetype)
W_UnicodeObject.__init__(w_newobj, w_value._value)
return w_newobj
+ @staticmethod
+ def descr_maketrans(space, w_type, w_x, w_y=None, w_z=None):
+ if space.is_none(w_y):
+ y = None
+ else:
+ y = space.unicode_w(w_y)
+ if space.is_none(w_z):
+ z = None
+ else:
+ z = space.unicode_w(w_z)
+
+ w_new = space.newdict()
+ if y is not None:
+ # x must be a string too, of equal length
+ ylen = len(y)
+ try:
+ x = space.unicode_w(w_x)
+ except OperationError, e:
+ if not e.match(space, space.w_TypeError):
+ raise
+ raise OperationError(space.w_TypeError, space.wrap(
+ "first maketrans argument must "
+ "be a string if there is a second argument"))
+ if len(x) != ylen:
+ raise OperationError(space.w_ValueError, space.wrap(
+ "the first two maketrans "
+ "arguments must have equal length"))
+ # create entries for translating chars in x to those in y
+ for i in range(len(x)):
+ w_key = space.newint(ord(x[i]))
+ w_value = space.newint(ord(y[i]))
+ space.setitem(w_new, w_key, w_value)
+ # create entries for deleting chars in z
+ if z is not None:
+ for i in range(len(z)):
+ w_key = space.newint(ord(z[i]))
+ space.setitem(w_new, w_key, space.w_None)
+ else:
+ # x must be a dict
+ if not space.is_w(space.type(w_x), space.w_dict):
+ raise OperationError(space.w_TypeError, space.wrap(
+ "if you give only one argument "
+ "to maketrans it must be a dict"))
+ # copy entries into the new dict, converting string keys to int keys
+ w_iter = space.iter(space.call_method(w_x, "items"))
+ while True:
+ try:
+ w_item = space.next(w_iter)
+ except OperationError, e:
+ if not e.match(space, space.w_StopIteration):
+ raise
+ break
+ w_key, w_value = space.unpackiterable(w_item, 2)
+ if space.isinstance_w(w_key, space.w_unicode):
+ # convert string keys to integer keys
+ key = space.unicode_w(w_key)
+ if len(key) != 1:
+ raise OperationError(space.w_ValueError, space.wrap(
+ "string keys in translate "
+ "table must be of length 1"))
+ w_key = space.newint(ord(key[0]))
+ else:
+ # just keep integer keys
+ try:
+ space.int_w(w_key)
+ except OperationError, e:
+ if not e.match(space, space.w_TypeError):
+ raise
+ raise OperationError(space.w_TypeError, space.wrap(
+ "keys in translate table must "
+ "be strings or integers"))
+ space.setitem(w_new, w_key, w_value)
+ return w_new
+
def descr_repr(self, space):
chars = self._value
size = len(chars)
@@ -197,7 +257,10 @@
return space.wrap(s)
def descr_str(self, space):
- return encode_object(space, self, None, None)
+ if space.is_w(space.type(self), space.w_unicode):
+ return self
+ # Subtype -- return genuine unicode string with the same value.
+ return space.wrap(space.unicode_w(self))
def descr_hash(self, space):
x = compute_hash(self._value)
@@ -209,13 +272,6 @@
except OperationError, e:
if e.match(space, space.w_TypeError):
return space.w_NotImplemented
- if (e.match(space, space.w_UnicodeDecodeError) or
- e.match(space, space.w_UnicodeEncodeError)):
- msg = ("Unicode equal comparison failed to convert both "
- "arguments to Unicode - interpreting them as being "
- "unequal")
- space.warn(space.wrap(msg), space.w_UnicodeWarning)
- return space.w_False
raise
def descr_ne(self, space, w_other):
@@ -224,13 +280,6 @@
except OperationError, e:
if e.match(space, space.w_TypeError):
return space.w_NotImplemented
- if (e.match(space, space.w_UnicodeDecodeError) or
- e.match(space, space.w_UnicodeEncodeError)):
- msg = ("Unicode unequal comparison failed to convert both "
- "arguments to Unicode - interpreting them as being "
- "unequal")
- space.warn(space.wrap(msg), space.w_UnicodeWarning)
- return space.w_True
raise
def descr_lt(self, space, w_other):
@@ -274,19 +323,16 @@
return newformat.format_method(space, self, __args__.arguments_w,
w_kwds, True)
+ def descr_format_map(self, space, w_mapping):
+ return newformat.format_method(space, self, None, w_mapping, True)
+
def descr__format__(self, space, w_format_spec):
- """
- if not space.isinstance_w(w_format_spec, space.w_unicode):
- w_format_spec = space.call_function(space.w_unicode, w_format_spec)
- spec = space.unicode_w(w_format_spec)
- formatter = newformat.unicode_formatter(space, spec)
- self2 = unicode_from_object(space, self)
- assert isinstance(self2, W_UnicodeObject)
- return formatter.format_string(self2._value)
- """
return newformat.run_formatter(space, w_format_spec, "format_string",
self)
+ def descr_iter(self, space):
+ return space.newseqiter(self)
+
def descr_mod(self, space, w_values):
return mod_format(space, self, w_values, do_unicode=True)
@@ -334,16 +380,6 @@
return 0
return 1
- def descr_formatter_parser(self, space):
- from pypy.objspace.std.newformat import unicode_template_formatter
- tformat = unicode_template_formatter(space, space.unicode_w(self))
- return tformat.formatter_parser()
-
- def descr_formatter_field_name_split(self, space):
- from pypy.objspace.std.newformat import unicode_template_formatter
- tformat = unicode_template_formatter(space, space.unicode_w(self))
- return tformat.formatter_field_name_split()
-
def descr_isdecimal(self, space):
return self._is_generic(space, '_isdecimal')
@@ -370,6 +406,15 @@
cased = True
return space.newbool(cased)
+ def descr_isidentifier(self, space):
+ return space.newbool(_isidentifier(self._value))
+
+ def descr_isprintable(self, space):
+ for uchar in self._value:
+ if not unicodedb.isprintable(ord(uchar)):
+ return space.w_False
+ return space.w_True
+
def wrapunicode(space, uni):
return W_UnicodeObject(uni)
@@ -390,6 +435,25 @@
space.wrap("ordinal not in range(128)")]))
assert False, "unreachable"
+def _isidentifier(u):
+ if not u:
+ return False
+
+ # PEP 3131 says that the first character must be in XID_Start and
+ # subsequent characters in XID_Continue, and for the ASCII range,
+ # the 2.x rules apply (i.e start with letters and underscore,
+ # continue with letters, digits, underscore). However, given the
+ # current definition of XID_Start and XID_Continue, it is sufficient
+ # to check just for these, except that _ must be allowed as starting
+ # an identifier.
+ first = u[0]
+ if not (unicodedb.isxidstart(ord(first)) or first == u'_'):
+ return False
+
+ for i in range(1, len(u)):
+ if not unicodedb.isxidcontinue(ord(u[i])):
+ return False
+ return True
# stuff imported from bytesobject for interoperability
@@ -420,14 +484,13 @@
if encoding == 'ascii':
u = space.unicode_w(w_object)
eh = unicodehelper.encode_error_handler(space)
- return space.wrap(unicode_encode_ascii(
+ return space.wrapbytes(unicode_encode_ascii(
u, len(u), None, errorhandler=eh))
if encoding == 'utf-8':
u = space.unicode_w(w_object)
eh = unicodehelper.encode_error_handler(space)
- return space.wrap(unicode_encode_utf_8(
- u, len(u), None, errorhandler=eh,
- allow_surrogates=True))
+ return space.wrapbytes(unicode_encode_utf_8(
+ u, len(u), None, errorhandler=eh))
from pypy.module._codecs.interp_codecs import lookup_codec
w_encoder = space.getitem(lookup_codec(space, encoding), space.wrap(0))
if errors is None:
@@ -436,10 +499,9 @@
w_errors = space.wrap(errors)
w_restuple = space.call_function(w_encoder, w_object, w_errors)
w_retval = space.getitem(w_restuple, space.wrap(0))
- if not space.isinstance_w(w_retval, space.w_str):
- raise operationerrfmt(space.w_TypeError,
- "encoder did not return an string object (type '%s')",
- space.type(w_retval).getname(space))
+ if not space.isinstance_w(w_retval, space.w_bytes):
+ msg = "encoder did not return a bytes string (type '%T')"
+ raise operationerrfmt(space.w_TypeError, msg, w_retval)
return w_retval
def decode_object(space, w_obj, encoding, errors):
@@ -456,8 +518,7 @@
s = space.bufferstr_w(w_obj)
eh = unicodehelper.decode_error_handler(space)
return space.wrap(str_decode_utf_8(
- s, len(s), None, final=True, errorhandler=eh,
- allow_surrogates=True)[0])
+ s, len(s), None, final=True, errorhandler=eh)[0])
w_codecs = space.getbuiltinmodule("_codecs")
w_decode = space.getattr(w_codecs, space.wrap("decode"))
if errors is None:
@@ -486,44 +547,29 @@
def unicode_from_object(space, w_obj):
if space.is_w(space.type(w_obj), space.w_unicode):
return w_obj
- elif space.is_w(space.type(w_obj), space.w_str):
- w_res = w_obj
- else:
- w_unicode_method = space.lookup(w_obj, "__unicode__")
- # obscure workaround: for the next two lines see
- # test_unicode_conversion_with__str__
- if w_unicode_method is None:
- if space.isinstance_w(w_obj, space.w_unicode):
- return space.wrap(space.unicode_w(w_obj))
- w_unicode_method = space.lookup(w_obj, "__str__")
- if w_unicode_method is not None:
- w_res = space.get_and_call_function(w_unicode_method, w_obj)
- else:
- w_res = space.str(w_obj)
- if space.isinstance_w(w_res, space.w_unicode):
- return w_res
- return unicode_from_encoded_object(space, w_res, None, "strict")
+ if space.lookup(w_obj, "__str__") is not None:
+ return space.str(w_obj)
+ return space.repr(w_obj)
-def unicode_from_string(space, w_str):
- # this is a performance and bootstrapping hack
- encoding = getdefaultencoding(space)
- if encoding != 'ascii':
- return unicode_from_encoded_object(space, w_str, encoding, "strict")
- s = space.str_w(w_str)
- try:
- return W_UnicodeObject(s.decode("ascii"))
- except UnicodeDecodeError:
- # raising UnicodeDecodeError is messy, "please crash for me"
- return unicode_from_encoded_object(space, w_str, "ascii", "strict")
+def ascii_from_object(space, w_obj):
+ """Implements builtins.ascii()"""
+ # repr is guaranteed to be unicode
+ w_repr = space.repr(w_obj)
+ w_encoded = encode_object(space, w_repr, 'ascii', 'backslashreplace')
+ return decode_object(space, w_encoded, 'ascii', None)
class UnicodeDocstrings:
- """unicode(object='') -> unicode object
- unicode(string[, encoding[, errors]]) -> unicode object
+ """str(object='') -> str
+ str(bytes_or_buffer[, encoding[, errors]]) -> str
- Create a new Unicode object from the given encoded string.
- encoding defaults to the current default string encoding.
- errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.
+ Create a new string object from the given object. If encoding or
+ errors is specified, then the object must expose a data buffer
+ that will be decoded using the given encoding and error handler.
+ Otherwise, returns the result of object.__str__() (if defined)
+ or repr(object).
+ encoding defaults to sys.getdefaultencoding().
+ errors defaults to 'strict'.
"""
@@ -554,18 +600,15 @@
def __getnewargs__():
""""""
- def __getslice__():
- """x.__getslice__(i, j) <==> x[i:j]
-
- Use of negative indices is not supported.
- """
-
def __gt__():
"""x.__gt__(y) <==> x>y"""
def __hash__():
"""x.__hash__() <==> hash(x)"""
+ def __iter__():
+ """x.__iter__() <==> iter(x)"""
+
def __le__():
"""x.__le__(y) <==> x<=y"""
@@ -676,6 +719,14 @@
The substitutions are identified by braces ('{' and '}').
"""
+ def format_map():
+ """S.format_map(mapping) -> str
+
+ Return a formatted version of S, using substitutions from
+ mapping. The substitutions are identified by braces ('{' and
+ '}').
+ """
+
def index():
"""S.index(sub[, start[, end]]) -> int
@@ -710,6 +761,13 @@
and there is at least one character in S, False otherwise.
"""
+ def isidentifier():
+ """S.isidentifier() -> bool
+
+ Return True if S is a valid identifier according to the language
+ definition.
+ """
+
def islower():
"""S.islower() -> bool
@@ -724,6 +782,13 @@
False otherwise.
"""
+ def isprintable():
+ """S.isprintable() -> bool
+
+ Return True if all characters in S are considered printable in
+ repr() or S is empty, False otherwise.
+ """
+
def isspace():
"""S.isspace() -> bool
@@ -775,6 +840,19 @@
If chars is a str, it will be converted to unicode before stripping
"""
+ def maketrans():
+ """str.maketrans(x[, y[, z]]) -> dict (static method)
+
+ Return a translation table usable for str.translate().
+ If there is only one argument, it must be a dictionary mapping Unicode
+ ordinals (integers) or characters to Unicode ordinals, strings or None.
+ Character keys will be then converted to ordinals.
+ If there are two arguments, they must be strings of equal length, and
+ in the resulting dictionary, each character in x will be mapped to the
+ character at the same position in y. If there is a third argument, it
+ must be a string, whose characters will be mapped to None in the result.
+ """
+
def partition():
"""S.partition(sep) -> (head, sep, tail)
@@ -939,6 +1017,8 @@
__ge__ = interp2app(W_UnicodeObject.descr_ge,
doc=UnicodeDocstrings.__ge__.__doc__),
+ __iter__ = interp2app(W_UnicodeObject.descr_iter,
+ doc=UnicodeDocstrings.__iter__.__doc__),
__len__ = interp2app(W_UnicodeObject.descr_len,
doc=UnicodeDocstrings.__len__.__doc__),
__contains__ = interp2app(W_UnicodeObject.descr_contains,
@@ -953,8 +1033,6 @@
__getitem__ = interp2app(W_UnicodeObject.descr_getitem,
doc=UnicodeDocstrings.__getitem__.__doc__),
- __getslice__ = interp2app(W_UnicodeObject.descr_getslice,
- doc=UnicodeDocstrings.__getslice__.__doc__),
capitalize = interp2app(W_UnicodeObject.descr_capitalize,
doc=UnicodeDocstrings.capitalize.__doc__),
@@ -962,8 +1040,6 @@
doc=UnicodeDocstrings.center.__doc__),
count = interp2app(W_UnicodeObject.descr_count,
doc=UnicodeDocstrings.count.__doc__),
- decode = interp2app(W_UnicodeObject.descr_decode,
- doc=UnicodeDocstrings.decode.__doc__),
encode = interp2app(W_UnicodeObject.descr_encode,
doc=UnicodeDocstrings.encode.__doc__),
expandtabs = interp2app(W_UnicodeObject.descr_expandtabs,
@@ -984,10 +1060,14 @@
doc=UnicodeDocstrings.isdecimal.__doc__),
isdigit = interp2app(W_UnicodeObject.descr_isdigit,
doc=UnicodeDocstrings.isdigit.__doc__),
+ isidentifier = interp2app(W_UnicodeObject.descr_isidentifier,
+ doc=UnicodeDocstrings.isidentifier.__doc__),
islower = interp2app(W_UnicodeObject.descr_islower,
doc=UnicodeDocstrings.islower.__doc__),
isnumeric = interp2app(W_UnicodeObject.descr_isnumeric,
doc=UnicodeDocstrings.isnumeric.__doc__),
+ isprintable = interp2app(W_UnicodeObject.descr_isprintable,
+ doc=UnicodeDocstrings.isprintable.__doc__),
isspace = interp2app(W_UnicodeObject.descr_isspace,
doc=UnicodeDocstrings.isspace.__doc__),
istitle = interp2app(W_UnicodeObject.descr_istitle,
@@ -1037,15 +1117,17 @@
format = interp2app(W_UnicodeObject.descr_format,
doc=UnicodeDocstrings.format.__doc__),
+ format_map = interp2app(W_UnicodeObject.descr_format_map,
+ doc=UnicodeDocstrings.format_map.__doc__),
__format__ = interp2app(W_UnicodeObject.descr__format__,
doc=UnicodeDocstrings.__format__.__doc__),
__mod__ = interp2app(W_UnicodeObject.descr_mod,
doc=UnicodeDocstrings.__mod__.__doc__),
__getnewargs__ = interp2app(W_UnicodeObject.descr_getnewargs,
doc=UnicodeDocstrings.__getnewargs__.__doc__),
- _formatter_parser = interp2app(W_UnicodeObject.descr_formatter_parser),
- _formatter_field_name_split =
- interp2app(W_UnicodeObject.descr_formatter_field_name_split),
+ maketrans = interp2app(W_UnicodeObject.descr_maketrans,
+ as_classmethod=True,
+ doc=UnicodeDocstrings.maketrans.__doc__)
)
@@ -1057,7 +1139,15 @@
W_UnicodeObject.EMPTY = W_UnicodeObject(u'')
-# Helper for converting int/long
+# Helper for converting int/long. This is called only from
+# {int,long,float}type.descr__new__: in the default branch this is implemented
+# using the same logic as PyUnicode_EncodeDecimal, as CPython 2.7 does.
+#
+# In CPython3 the call to PyUnicode_EncodeDecimal has been replaced by a call
+# to PyUnicode_TransformDecimalToASCII, which is much simpler. Here, we do the
+# equivalent.
+#
+# Note that, unlike the default branch, we return a *unicode* RPython string
def unicode_to_decimal_w(space, w_unistr):
if not isinstance(w_unistr, W_UnicodeObject):
raise operationerrfmt(space.w_TypeError, "expected unicode, got '%T'",
@@ -1079,4 +1169,4 @@
_repr_function, _ = make_unicode_escape_function(
- pass_printable=False, unicode_output=False, quotes=True, prefix='u')
+ pass_printable=True, unicode_output=True, quotes=True, prefix='')
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit