[pypy-commit] pypy unicode-utf8-py3: Fix for id(unicode)
Author: Armin Rigo Branch: unicode-utf8-py3 Changeset: r95069:f9566e8f8110 Date: 2018-09-02 11:14 +0200 http://bitbucket.org/pypy/pypy/changeset/f9566e8f8110/ Log:Fix for id(unicode) diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py --- a/pypy/objspace/std/unicodeobject.py +++ b/pypy/objspace/std/unicodeobject.py @@ -22,7 +22,7 @@ from pypy.objspace.std.sliceobject import (W_SliceObject, unwrap_start_stop, normalize_simple_slice) from pypy.objspace.std.stringmethods import StringMethods -from pypy.objspace.std.util import IDTAG_SPECIAL, IDTAG_SHIFT +from pypy.objspace.std.util import IDTAG_SPECIAL, IDTAG_SHIFT, IDTAG_ALT_UID __all__ = ['W_UnicodeObject', 'encode_object', 'decode_object', 'unicode_from_object', 'unicode_to_decimal_w'] @@ -68,7 +68,7 @@ return False s1 = space.utf8_w(self) s2 = space.utf8_w(w_other) -if len(s2) > 2: +if self._len() > 1: return s1 is s2 else:# strings of len <= 1 are unique-ified return s1 == s2 @@ -76,14 +76,16 @@ def immutable_unique_id(self, space): if self.user_overridden_class: return None -s = space.utf8_w(self) -if len(s) > 2: -uid = compute_unique_id(s) -else:# strings of len <= 1 are unique-ified -if len(s) == 1: -base = ~ord(s[0]) # negative base values -elif len(s) == 2: -base = ~((ord(s[1]) << 8) | ord(s[0])) +l = self._len() +if l > 1: +# return the uid plus 2, to make sure we don't get +# conflicts with W_BytesObject, whose id() might be +# identical +uid = compute_unique_id(self._utf8) + IDTAG_ALT_UID +else: # strings of len <= 1 are unique-ified +if l == 1: +base = rutf8.codepoint_at_pos(self._utf8, 0) +base = ~base # negative base values else: base = 257 # empty unicode string: base value 257 uid = (base << IDTAG_SHIFT) | IDTAG_SPECIAL diff --git a/pypy/objspace/std/util.py b/pypy/objspace/std/util.py --- a/pypy/objspace/std/util.py +++ b/pypy/objspace/std/util.py @@ -4,6 +4,7 @@ from pypy.interpreter import gateway IDTAG_SHIFT = 4 +IDTAG_ALT_UID = 2 # gives an alternate id() from 
the same real uid IDTAG_INT = 1 IDTAG_LONG= 3 ___ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit
[pypy-commit] pypy default: Issue #2876
Author: Armin Rigo Branch: Changeset: r95070:88a9f1bbf1c8 Date: 2018-09-01 15:04 +0200 http://bitbucket.org/pypy/pypy/changeset/88a9f1bbf1c8/ Log:Issue #2876 Add select.PIPE_BUF. diff --git a/pypy/module/select/__init__.py b/pypy/module/select/__init__.py --- a/pypy/module/select/__init__.py +++ b/pypy/module/select/__init__.py @@ -3,6 +3,7 @@ import sys import os +from select import PIPE_BUF class Module(MixedModule): @@ -11,7 +12,8 @@ interpleveldefs = { 'select': 'interp_select.select', -'error' : 'space.fromcache(interp_select.Cache).w_error' +'error' : 'space.fromcache(interp_select.Cache).w_error', +'PIPE_BUF' : 'space.wrap(%r)' % PIPE_BUF, } if os.name =='posix': diff --git a/pypy/module/select/test/test_select.py b/pypy/module/select/test/test_select.py --- a/pypy/module/select/test/test_select.py +++ b/pypy/module/select/test/test_select.py @@ -245,6 +245,10 @@ raises(OverflowError, pollster.modify, 1, -1) raises(OverflowError, pollster.modify, 1, 1 << 64) +def test_PIPE_BUF(self): +import select +assert isinstance(select.PIPE_BUF, int) + class AppTestSelectWithPipes(_AppTestSelect): "Use a pipe to get pairs of file descriptors" ___ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit
[pypy-commit] pypy default: merge heads
Author: Armin Rigo Branch: Changeset: r95071:942ad6c1866e Date: 2018-09-02 11:15 +0200 http://bitbucket.org/pypy/pypy/changeset/942ad6c1866e/ Log:merge heads diff --git a/rpython/rlib/rstring.py b/rpython/rlib/rstring.py --- a/rpython/rlib/rstring.py +++ b/rpython/rlib/rstring.py @@ -464,6 +464,10 @@ raise InvalidBaseError("%s() base must be >= 2 and <= 36" % fname) self.base = base +# Leading underscores are not allowed +if s.startswith('_'): +self.error() + if base == 16 and (s.startswith('0x') or s.startswith('0X')): s = s[2:] if base == 8 and (s.startswith('0o') or s.startswith('0O')): diff --git a/rpython/rlib/test/test_rarithmetic.py b/rpython/rlib/test/test_rarithmetic.py --- a/rpython/rlib/test/test_rarithmetic.py +++ b/rpython/rlib/test/test_rarithmetic.py @@ -554,50 +554,52 @@ py.test.raises(ParseStringError, string_to_int, '+'+s, base) py.test.raises(ParseStringError, string_to_int, '-'+s, base) -def test_number_underscores(self): -VALID_UNDERSCORE_LITERALS = [ -'0_0_0', -'4_2', -'1__', -'0b1001_0100', -'0xfff_', -'0o5_7_7', -'0b_0', -'0x_f', -'0o_5', -] -INVALID_UNDERSCORE_LITERALS = [ -# Trailing underscores: -'0_', -'42_', -'1.4j_', -'0x_', -'0b1_', -'0xf_', -'0o5_', -# Underscores in the base selector: -'0_b0', -'0_xf', -'0_o5', -# Old-style octal, still disallowed: -'09_99', -# Multiple consecutive underscores: -'4___2', -'0b1001__0100', -'0xfff__', -'0x___', -'0o5__77', -'1e1__0', -] -for x in VALID_UNDERSCORE_LITERALS: -print x -y = string_to_int(x, base=0, allow_underscores=True, - no_implicit_octal=True) -assert y == int(x.replace('_', ''), base=0) -for x in INVALID_UNDERSCORE_LITERALS: -print x -py.test.raises(ParseStringError, string_to_int, x, base=0, - allow_underscores=True) +@py.test.mark.parametrize('s', [ +'0_0_0', +'4_2', +'1__', +'0b1001_0100', +'0xfff_', +'0o5_7_7', +'0b_0', +'0x_f', +'0o_5', +]) +def test_valid_underscores(self, s): +result = string_to_int( +s, base=0, allow_underscores=True, no_implicit_octal=True) +assert result == 
int(s.replace('_', ''), base=0) + +@py.test.mark.parametrize('s', [ +# Leading underscores +'_100', +'_', +'_0b1001_0100', +# Trailing underscores: +'0_', +'42_', +'1.4j_', +'0x_', +'0b1_', +'0xf_', +'0o5_', +# Underscores in the base selector: +'0_b0', +'0_xf', +'0_o5', +# Old-style octal, still disallowed: +'09_99', +# Multiple consecutive underscores: +'4___2', +'0b1001__0100', +'0xfff__', +'0x___', +'0o5__77', +'1e1__0', +]) +def test_invalid_underscores(self, s): +with py.test.raises(ParseStringError): +string_to_int(s, base=0, allow_underscores=True) def test_no_implicit_octal(self): TESTS = ['00', '000', '00_00', '02', '0377', '02_34'] ___ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit
[pypy-commit] pypy unicode-utf8-py3: fix tests
Author: Matti Picus Branch: unicode-utf8-py3 Changeset: r95072:99ad3d85cb7a Date: 2018-09-01 23:46 +0200 http://bitbucket.org/pypy/pypy/changeset/99ad3d85cb7a/ Log:fix tests diff --git a/pypy/objspace/std/test/test_bytesobject.py b/pypy/objspace/std/test/test_bytesobject.py --- a/pypy/objspace/std/test/test_bytesobject.py +++ b/pypy/objspace/std/test/test_bytesobject.py @@ -97,7 +97,7 @@ monkeypatch.setattr(jit, 'isconstant', lambda x: True) space = self.space w_res = space.call_function(space.w_bytes, space.wrap([42])) -assert space.text_w(w_res) == '*' +assert space.bytes_w(w_res) == b'*' class AppTestBytesObject: diff --git a/pypy/objspace/std/test/test_stdobjspace.py b/pypy/objspace/std/test/test_stdobjspace.py --- a/pypy/objspace/std/test/test_stdobjspace.py +++ b/pypy/objspace/std/test/test_stdobjspace.py @@ -93,4 +93,4 @@ from pypy.objspace.std.unicodeobject import W_UnicodeObject w_x = self.space.wrap('foo\xF0') assert isinstance(w_x, W_UnicodeObject) -assert w_x._utf8 == 'foo\uxF0' +assert w_x._utf8 == 'foo\xF0' diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py --- a/pypy/objspace/std/test/test_unicodeobject.py +++ b/pypy/objspace/std/test/test_unicodeobject.py @@ -110,8 +110,8 @@ space = self.space w_uni = space.wrap(u'abcd') assert space.text_w(w_uni) == 'abcd' -# TODO : how to handle this? w_uni = space.wrap(unichr(0xd921) + unichr(0x)) +# Test is from py3.5, should this still fail? 
space.raises_w(space.w_UnicodeEncodeError, space.text_w, w_uni) diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py --- a/pypy/objspace/std/unicodeobject.py +++ b/pypy/objspace/std/unicodeobject.py @@ -463,7 +463,7 @@ try: builder.append_code(codepoint) except ValueError: -raise oefmt(space.w_TypeError, +raise oefmt(space.w_ValueError, "character mapping must be in range(0x11)") return self.from_utf8builder(builder) ___ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit
[pypy-commit] pypy unicode-utf8-py3: revert 58568f219c61
Author: Matti Picus Branch: unicode-utf8-py3 Changeset: r95076:cc3f3f3b7285 Date: 2018-09-02 11:51 +0200 http://bitbucket.org/pypy/pypy/changeset/cc3f3f3b7285/ Log:revert 58568f219c61 diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py --- a/rpython/rlib/rutf8.py +++ b/rpython/rlib/rutf8.py @@ -50,7 +50,6 @@ chr((0x80 | ((code >> 12) & 0x3f))) + chr((0x80 | ((code >> 6) & 0x3f))) + chr((0x80 | (code & 0x3f -return '?' raise ValueError @try_inline ___ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit
[pypy-commit] pypy unicode-utf8-py3: use encode_utf8, str_decode_utf8, and maybe handle surrogates in the latter
Author: Matti Picus Branch: unicode-utf8-py3 Changeset: r95073:b040f44dc71b Date: 2018-09-02 10:18 +0200 http://bitbucket.org/pypy/pypy/changeset/b040f44dc71b/ Log:use encode_utf8, str_decode_utf8, and maybe handle surrogates in the latter diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -50,6 +50,23 @@ return u'', None, 0 return raise_unicode_exception_encode +@specialize.memo() +def encode_unicode_error_handler(space): +# Fast version of the "strict" errors handler. +def raise_unicode_exception_encode(errors, encoding, msg, uni, + startingpos, endingpos): +assert isinstance(uni, unicode) +u_len = len(uni) +utf8 = runicode.unicode_encode_utf8sp(uni, u_len) +raise OperationError(space.w_UnicodeEncodeError, + space.newtuple([space.newtext(encoding), + space.newtext(utf8, u_len), + space.newint(startingpos), + space.newint(endingpos), + space.newtext(msg)])) +return u'', None, 0 +return raise_unicode_exception_encode + def default_error_encode( errors, encoding, msg, u, startingpos, endingpos): """A default handler, for tests""" @@ -322,7 +339,6 @@ valid so we're trying to either raise or pack stuff with error handler. 
The key difference is that this is call_may_force """ -# XXX need to handle allow_surrogates slen = len(s) res = StringBuilder(slen) pos = 0 @@ -377,7 +393,7 @@ ordch2 = ord(s[pos]) ordch3 = ord(s[pos + 1]) -if rutf8._invalid_byte_2_of_3(ordch1, ordch2, True): +if rutf8._invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates): r, pos = errorhandler(errors, "utf8", "invalid continuation byte", s, pos - 1, pos) res.append(r) @@ -994,7 +1010,7 @@ assert isinstance(uni, unicode) return runicode.unicode_encode_utf_8( uni, len(uni), "strict", -errorhandler=encode_error_handler(space), +errorhandler=encode_unicode_error_handler(space), allow_surrogates=allow_surrogates) def encode_utf8sp(space, uni): diff --git a/pypy/objspace/std/stringmethods.py b/pypy/objspace/std/stringmethods.py --- a/pypy/objspace/std/stringmethods.py +++ b/pypy/objspace/std/stringmethods.py @@ -7,6 +7,7 @@ find, rfind, count, endswith, replace, rsplit, split, startswith) from pypy.interpreter.error import OperationError, oefmt from pypy.interpreter.gateway import WrappedDefault, unwrap_spec +from pypy.interpreter.unicodehelper import str_decode_utf8 from pypy.objspace.std.sliceobject import W_SliceObject, unwrap_start_stop @@ -197,6 +198,12 @@ errors = 'strict' if encoding is None: encoding = 'utf8' +if encoding == 'utf8' or encoding == 'utf-8': +from pypy.module._codecs.interp_codecs import CodecState +state = space.fromcache(CodecState) +eh = state.decode_error_handler +s = space.charbuf_w(self) +ret, lgt, pos = str_decode_utf8(s, errors, True, eh) return decode_object(space, self, encoding, errors) @unwrap_spec(tabsize=int) diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py --- a/pypy/objspace/std/unicodeobject.py +++ b/pypy/objspace/std/unicodeobject.py @@ -1898,12 +1898,8 @@ raise oefmt(space.w_TypeError, "expected unicode, got '%T'", w_unistr) value = _rpy_unicode_to_decimal_w(space, w_unistr.utf8_w(space).decode('utf8')) # XXX this is the only place in the 
code that this funcion is called. -# It does not translate, since it uses a pypy-level error handler -# to throw the UnicodeEncodeError not the rpython default handler -#return unicodehelper.encode_utf8(space, value, -# allow_surrogates=allow_surrogates) -assert isinstance(value, unicode) -return value.encode('utf8') +return unicodehelper.encode_utf8(space, value, + allow_surrogates=allow_surrogates) def _rpy_unicode_to_decimal_w(space, unistr): # XXX rewrite this to accept a utf8 string and use a StringBuilder ___ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit
[pypy-commit] pypy unicode-utf8-py3: check class name for valid utf8
Author: Matti Picus Branch: unicode-utf8-py3 Changeset: r95075:f7c1e87b6a3c Date: 2018-09-02 11:25 +0200 http://bitbucket.org/pypy/pypy/changeset/f7c1e87b6a3c/ Log:check class name for valid utf8 diff --git a/pypy/objspace/std/typeobject.py b/pypy/objspace/std/typeobject.py --- a/pypy/objspace/std/typeobject.py +++ b/pypy/objspace/std/typeobject.py @@ -14,6 +14,7 @@ from rpython.rlib.objectmodel import current_object_addr_as_int, compute_hash from rpython.rlib.objectmodel import we_are_translated, not_rpython from rpython.rlib.rarithmetic import intmask, r_uint +from rpython.rlib.rutf8 import CheckError, check_utf8 class MutableCell(W_Root): def unwrap_cell(self, space): @@ -177,6 +178,15 @@ overridetypedef=None, force_new_layout=False, is_heaptype=True): self.space = space +try: +check_utf8(name, False) +except CheckError as e: +raise OperationError(space.w_UnicodeEncodeError, + space.newtuple([space.newtext('utf8'), + space.newtext(name), + space.newint(e.pos), + space.newint(e.pos + 1), + space.newtext('surrogates not allowed')])) self.name = name self.qualname = None self.bases_w = bases_w ___ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit
[pypy-commit] pypy unicode-utf8-py3: expand test
Author: Matti Picus Branch: unicode-utf8-py3 Changeset: r95074:34c6d0d3499f Date: 2018-09-02 10:51 +0200 http://bitbucket.org/pypy/pypy/changeset/34c6d0d3499f/ Log:expand test diff --git a/pypy/objspace/std/test/test_bytesobject.py b/pypy/objspace/std/test/test_bytesobject.py --- a/pypy/objspace/std/test/test_bytesobject.py +++ b/pypy/objspace/std/test/test_bytesobject.py @@ -1031,4 +1031,5 @@ a = b'abcabc' id_b = id(str(a, 'latin1')) id_a = id(a) +assert a is not str(a, 'latin1') assert id_a != id_b ___ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit
[pypy-commit] extradoc extradoc: start a blog post draft about cpyext
Author: Antonio Cuni Branch: extradoc Changeset: r5890:01e42155cfe8 Date: 2018-09-02 16:28 +0200 http://bitbucket.org/pypy/extradoc/changeset/01e42155cfe8/ Log:start a blog post draft about cpyext diff --git a/blog/draft/2018-09-cpyext/cpyext.rst b/blog/draft/2018-09-cpyext/cpyext.rst new file mode 100644 --- /dev/null +++ b/blog/draft/2018-09-cpyext/cpyext.rst @@ -0,0 +1,92 @@ +Inside cpyext: why emulating CPython C API is so hard +== + +cpyext is PyPy's subsystem which is responsible for providing a compatibility +layer to compile and run CPython C extensions inside PyPy. People often ask +why a particular extension doesn't work or is very slow on PyPy, but +usually it is hard to answer without going into technical details: the goal of +this blog post is to explain some of these technical details, so that we can +simply link here instead of explaining again and again :). + +From a 10,000-foot view, cpyext is PyPy's version of `"Python.h"`: every time +you compile an extension which uses that header file, you are using cpyext: +this includes extensions explicitly written in C (such as `numpy`) and +extensions which are generated from other compilers/preprocessors +(e.g. `Cython`). + +At the time of writing, the current status is that most C extensions "just +work": generally speaking, you can simply `pip install` all of them, provided +they use the public, `official C API`_ instead of poking at private +implementation details. + +.. _`official C API`: https://docs.python.org/2/c-api/index.html + +Prologue: the PyPy GC +-- + +To understand some of cpyext's challenges, you need to have at least a rough +idea of how the PyPy GC works. + +Contrary to popular belief, the "Garbage Collector" is not only about +collecting garbage: instead, it is generally responsible for all memory +management, including allocation and deallocation.
+ +CPython uses a very simple memory management scheme: when you create an +object, you allocate a block of memory of the appropriate size on the heap: +depending on the details you might end up calling different allocators, but +for the sake of simplicity, you can think that this ends up being a call to +`malloc()`. Handles to objects have the C type `PyObject *`, which point to +the memory just allocated: this address never changes during the object's +lifetime, and the C code can freely pass it around, store it inside +containers, retrieve it later, etc. + +Memory is managed using reference counting: when you create a new reference to +an object, or you discard a reference you own, you have to increment_ or +decrement_ the reference counter accordingly. When the reference counter goes to +0, it means that the object is no longer used by anyone and can safely be +destroyed. Again, we can simplify and say that this results in a call to +`free()`, which finally releases the memory which was allocated by `malloc()`. + +.. _increment: https://docs.python.org/2/c-api/refcounting.html#c.Py_INCREF +.. _decrement: https://docs.python.org/2/c-api/refcounting.html#c.Py_DECREF + +The PyPy GC is completely different: it is designed assuming that a dynamic +language like Python behaves the following way: + + - you create, either directly or indirectly, lots of objects; + + - most of these objects are temporary and very short-lived: think e.g. of +doing `a + b + c`: you need to allocate an object to hold the temporary +result of `a + b`, but it dies very quickly because you no longer need it +when you do the final `+ c` part; + + - only a small fraction of the objects survive and stay around for a while. + +So, the strategy is: make allocation as fast as possible; make deallocation of +short-lived objects as fast as possible; find a way to handle the remaining +small set of objects which actually survive long enough to be important.
+ +This is done using a **Generational GC**: the basic idea is the following: + + 1. we have a nursery, where we allocate "young objects" very fast; + + 2. when the nursery is full, we start what we call a "minor collection": we + do a quick scan to determine the small set of objects which survived so + far; + + 3. we **move** these objects out of the nursery, and we place them in the + area of memory which contains the "old objects"; since the address of the + objects just changed, we fix all the references to them accordingly; + + 4. now the nursery contains only objects which died young: we can simply + discard all of them very quickly, reset the nursery and use the same area + of memory to allocate new objects from now on. + +In practice, this scheme works very well and it is one of the reasons why PyPy +is much faster than CPython. However, careful readers have surely noticed +that this is a problem for `cpyext`: on the one hand, we have PyPy objects which +can potentially move and change their underlying memory address; on the other +hand, we nee