[pypy-commit] pypy unicode-utf8-py3: Fix for id(unicode)

2018-09-02 Thread arigo
Author: Armin Rigo 
Branch: unicode-utf8-py3
Changeset: r95069:f9566e8f8110
Date: 2018-09-02 11:14 +0200
http://bitbucket.org/pypy/pypy/changeset/f9566e8f8110/

Log:Fix for id(unicode)

diff --git a/pypy/objspace/std/unicodeobject.py 
b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -22,7 +22,7 @@
 from pypy.objspace.std.sliceobject import (W_SliceObject,
 unwrap_start_stop, normalize_simple_slice)
 from pypy.objspace.std.stringmethods import StringMethods
-from pypy.objspace.std.util import IDTAG_SPECIAL, IDTAG_SHIFT
+from pypy.objspace.std.util import IDTAG_SPECIAL, IDTAG_SHIFT, IDTAG_ALT_UID
 
 __all__ = ['W_UnicodeObject', 'encode_object', 'decode_object',
'unicode_from_object', 'unicode_to_decimal_w']
@@ -68,7 +68,7 @@
 return False
 s1 = space.utf8_w(self)
 s2 = space.utf8_w(w_other)
-if len(s2) > 2:
+if self._len() > 1:
 return s1 is s2
 else:# strings of len <= 1 are unique-ified
 return s1 == s2
@@ -76,14 +76,16 @@
 def immutable_unique_id(self, space):
 if self.user_overridden_class:
 return None
-s = space.utf8_w(self)
-if len(s) > 2:
-uid = compute_unique_id(s)
-else:# strings of len <= 1 are unique-ified
-if len(s) == 1:
-base = ~ord(s[0])  # negative base values
-elif len(s) == 2:
-base = ~((ord(s[1]) << 8) | ord(s[0]))
+l = self._len()
+if l > 1:
+# return the uid plus 2, to make sure we don't get
+# conflicts with W_BytesObject, whose id() might be
+# identical
+uid = compute_unique_id(self._utf8) + IDTAG_ALT_UID
+else:   # strings of len <= 1 are unique-ified
+if l == 1:
+base = rutf8.codepoint_at_pos(self._utf8, 0)
+base = ~base # negative base values
 else:
 base = 257   # empty unicode string: base value 257
 uid = (base << IDTAG_SHIFT) | IDTAG_SPECIAL
diff --git a/pypy/objspace/std/util.py b/pypy/objspace/std/util.py
--- a/pypy/objspace/std/util.py
+++ b/pypy/objspace/std/util.py
@@ -4,6 +4,7 @@
 from pypy.interpreter import gateway
 
 IDTAG_SHIFT   = 4
+IDTAG_ALT_UID = 2 # gives an alternate id() from the same real uid
 
 IDTAG_INT = 1
 IDTAG_LONG= 3
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy default: Issue #2876

2018-09-02 Thread arigo
Author: Armin Rigo 
Branch: 
Changeset: r95070:88a9f1bbf1c8
Date: 2018-09-01 15:04 +0200
http://bitbucket.org/pypy/pypy/changeset/88a9f1bbf1c8/

Log:Issue #2876

Add select.PIPE_BUF.

diff --git a/pypy/module/select/__init__.py b/pypy/module/select/__init__.py
--- a/pypy/module/select/__init__.py
+++ b/pypy/module/select/__init__.py
@@ -3,6 +3,7 @@
 
 import sys
 import os
+from select import PIPE_BUF
 
 
 class Module(MixedModule):
@@ -11,7 +12,8 @@
 
 interpleveldefs = {
 'select': 'interp_select.select',
-'error' : 'space.fromcache(interp_select.Cache).w_error'
+'error' : 'space.fromcache(interp_select.Cache).w_error',
+'PIPE_BUF' : 'space.wrap(%r)' % PIPE_BUF,
 }
 
 if os.name =='posix':
diff --git a/pypy/module/select/test/test_select.py 
b/pypy/module/select/test/test_select.py
--- a/pypy/module/select/test/test_select.py
+++ b/pypy/module/select/test/test_select.py
@@ -245,6 +245,10 @@
 raises(OverflowError, pollster.modify, 1, -1)
 raises(OverflowError, pollster.modify, 1, 1 << 64)
 
+def test_PIPE_BUF(self):
+import select
+assert isinstance(select.PIPE_BUF, int)
+
 
 class AppTestSelectWithPipes(_AppTestSelect):
 "Use a pipe to get pairs of file descriptors"
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy default: merge heads

2018-09-02 Thread arigo
Author: Armin Rigo 
Branch: 
Changeset: r95071:942ad6c1866e
Date: 2018-09-02 11:15 +0200
http://bitbucket.org/pypy/pypy/changeset/942ad6c1866e/

Log:merge heads

diff --git a/rpython/rlib/rstring.py b/rpython/rlib/rstring.py
--- a/rpython/rlib/rstring.py
+++ b/rpython/rlib/rstring.py
@@ -464,6 +464,10 @@
 raise InvalidBaseError("%s() base must be >= 2 and <= 36" % fname)
 self.base = base
 
+# Leading underscores are not allowed
+if s.startswith('_'):
+self.error()
+
 if base == 16 and (s.startswith('0x') or s.startswith('0X')):
 s = s[2:]
 if base == 8 and (s.startswith('0o') or s.startswith('0O')):
diff --git a/rpython/rlib/test/test_rarithmetic.py 
b/rpython/rlib/test/test_rarithmetic.py
--- a/rpython/rlib/test/test_rarithmetic.py
+++ b/rpython/rlib/test/test_rarithmetic.py
@@ -554,50 +554,52 @@
 py.test.raises(ParseStringError, string_to_int, '+'+s, base)
 py.test.raises(ParseStringError, string_to_int, '-'+s, base)
 
-def test_number_underscores(self):
-VALID_UNDERSCORE_LITERALS = [
-'0_0_0',
-'4_2',
-'1__',
-'0b1001_0100',
-'0xfff_',
-'0o5_7_7',
-'0b_0',
-'0x_f',
-'0o_5',
-]
-INVALID_UNDERSCORE_LITERALS = [
-# Trailing underscores:
-'0_',
-'42_',
-'1.4j_',
-'0x_',
-'0b1_',
-'0xf_',
-'0o5_',
-# Underscores in the base selector:
-'0_b0',
-'0_xf',
-'0_o5',
-# Old-style octal, still disallowed:
-'09_99',
-# Multiple consecutive underscores:
-'4___2',
-'0b1001__0100',
-'0xfff__',
-'0x___',
-'0o5__77',
-'1e1__0',
-]
-for x in VALID_UNDERSCORE_LITERALS:
-print x
-y = string_to_int(x, base=0, allow_underscores=True,
-  no_implicit_octal=True)
-assert y == int(x.replace('_', ''), base=0)
-for x in INVALID_UNDERSCORE_LITERALS:
-print x
-py.test.raises(ParseStringError, string_to_int, x, base=0,
-   allow_underscores=True)
+@py.test.mark.parametrize('s', [
+'0_0_0',
+'4_2',
+'1__',
+'0b1001_0100',
+'0xfff_',
+'0o5_7_7',
+'0b_0',
+'0x_f',
+'0o_5',
+])
+def test_valid_underscores(self, s):
+result = string_to_int(
+s, base=0, allow_underscores=True, no_implicit_octal=True)
+assert result == int(s.replace('_', ''), base=0)
+
+@py.test.mark.parametrize('s', [
+# Leading underscores
+'_100',
+'_',
+'_0b1001_0100',
+# Trailing underscores:
+'0_',
+'42_',
+'1.4j_',
+'0x_',
+'0b1_',
+'0xf_',
+'0o5_',
+# Underscores in the base selector:
+'0_b0',
+'0_xf',
+'0_o5',
+# Old-style octal, still disallowed:
+'09_99',
+# Multiple consecutive underscores:
+'4___2',
+'0b1001__0100',
+'0xfff__',
+'0x___',
+'0o5__77',
+'1e1__0',
+])
+def test_invalid_underscores(self, s):
+with py.test.raises(ParseStringError):
+string_to_int(s, base=0, allow_underscores=True)
 
 def test_no_implicit_octal(self):
 TESTS = ['00', '000', '00_00', '02', '0377', '02_34']
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy unicode-utf8-py3: fix tests

2018-09-02 Thread mattip
Author: Matti Picus 
Branch: unicode-utf8-py3
Changeset: r95072:99ad3d85cb7a
Date: 2018-09-01 23:46 +0200
http://bitbucket.org/pypy/pypy/changeset/99ad3d85cb7a/

Log:fix tests

diff --git a/pypy/objspace/std/test/test_bytesobject.py 
b/pypy/objspace/std/test/test_bytesobject.py
--- a/pypy/objspace/std/test/test_bytesobject.py
+++ b/pypy/objspace/std/test/test_bytesobject.py
@@ -97,7 +97,7 @@
 monkeypatch.setattr(jit, 'isconstant', lambda x: True)
 space = self.space
 w_res = space.call_function(space.w_bytes, space.wrap([42]))
-assert space.text_w(w_res) == '*'
+assert space.bytes_w(w_res) == b'*'
 
 
 class AppTestBytesObject:
diff --git a/pypy/objspace/std/test/test_stdobjspace.py 
b/pypy/objspace/std/test/test_stdobjspace.py
--- a/pypy/objspace/std/test/test_stdobjspace.py
+++ b/pypy/objspace/std/test/test_stdobjspace.py
@@ -93,4 +93,4 @@
 from pypy.objspace.std.unicodeobject import W_UnicodeObject
 w_x = self.space.wrap('foo\xF0')
 assert isinstance(w_x, W_UnicodeObject)
-assert w_x._utf8 == 'foo\uxF0'
+assert w_x._utf8 == 'foo\xF0'
diff --git a/pypy/objspace/std/test/test_unicodeobject.py 
b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -110,8 +110,8 @@
 space = self.space
 w_uni = space.wrap(u'abcd')
 assert space.text_w(w_uni) == 'abcd'
-# TODO : how to handle this?
 w_uni = space.wrap(unichr(0xd921) + unichr(0x))
+#  Test is from py3.5, should this still fail?
 space.raises_w(space.w_UnicodeEncodeError, space.text_w, w_uni)
 
 
diff --git a/pypy/objspace/std/unicodeobject.py 
b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -463,7 +463,7 @@
 try:
 builder.append_code(codepoint)
 except ValueError:
-raise oefmt(space.w_TypeError,
+raise oefmt(space.w_ValueError,
 "character mapping must be in range(0x11)")
 return self.from_utf8builder(builder)
 
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy unicode-utf8-py3: revert 58568f219c61

2018-09-02 Thread mattip
Author: Matti Picus 
Branch: unicode-utf8-py3
Changeset: r95076:cc3f3f3b7285
Date: 2018-09-02 11:51 +0200
http://bitbucket.org/pypy/pypy/changeset/cc3f3f3b7285/

Log:revert 58568f219c61

diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -50,7 +50,6 @@
 chr((0x80 | ((code >> 12) & 0x3f))) +
 chr((0x80 | ((code >> 6) & 0x3f))) +
 chr((0x80 | (code & 0x3f
-return '?'
 raise ValueError
 
 @try_inline
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy unicode-utf8-py3: use encode_utf8, str_decode_utf8, and maybe handle surrogates in the latter

2018-09-02 Thread mattip
Author: Matti Picus 
Branch: unicode-utf8-py3
Changeset: r95073:b040f44dc71b
Date: 2018-09-02 10:18 +0200
http://bitbucket.org/pypy/pypy/changeset/b040f44dc71b/

Log:use encode_utf8, str_decode_utf8, and maybe handle surrogates in the
latter

diff --git a/pypy/interpreter/unicodehelper.py 
b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -50,6 +50,23 @@
 return u'', None, 0
 return raise_unicode_exception_encode
 
+@specialize.memo()
+def encode_unicode_error_handler(space):
+# Fast version of the "strict" errors handler.
+def raise_unicode_exception_encode(errors, encoding, msg, uni,
+   startingpos, endingpos):
+assert isinstance(uni, unicode)
+u_len = len(uni)
+utf8 = runicode.unicode_encode_utf8sp(uni, u_len)
+raise OperationError(space.w_UnicodeEncodeError,
+ space.newtuple([space.newtext(encoding),
+ space.newtext(utf8, u_len),
+ space.newint(startingpos),
+ space.newint(endingpos),
+ space.newtext(msg)]))
+return u'', None, 0
+return raise_unicode_exception_encode
+
 def default_error_encode(
 errors, encoding, msg, u, startingpos, endingpos):
 """A default handler, for tests"""
@@ -322,7 +339,6 @@
 valid so we're trying to either raise or pack stuff with error handler.
 The key difference is that this is call_may_force
 """
-# XXX need to handle allow_surrogates
 slen = len(s)
 res = StringBuilder(slen)
 pos = 0
@@ -377,7 +393,7 @@
 ordch2 = ord(s[pos])
 ordch3 = ord(s[pos + 1])
 
-if rutf8._invalid_byte_2_of_3(ordch1, ordch2, True):
+if rutf8._invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates):
 r, pos = errorhandler(errors, "utf8", "invalid continuation 
byte",
 s, pos - 1, pos)
 res.append(r)
@@ -994,7 +1010,7 @@
 assert isinstance(uni, unicode)
 return runicode.unicode_encode_utf_8(
 uni, len(uni), "strict",
-errorhandler=encode_error_handler(space),
+errorhandler=encode_unicode_error_handler(space),
 allow_surrogates=allow_surrogates)
 
 def encode_utf8sp(space, uni):
diff --git a/pypy/objspace/std/stringmethods.py 
b/pypy/objspace/std/stringmethods.py
--- a/pypy/objspace/std/stringmethods.py
+++ b/pypy/objspace/std/stringmethods.py
@@ -7,6 +7,7 @@
 find, rfind, count, endswith, replace, rsplit, split, startswith)
 from pypy.interpreter.error import OperationError, oefmt
 from pypy.interpreter.gateway import WrappedDefault, unwrap_spec
+from pypy.interpreter.unicodehelper import str_decode_utf8
 from pypy.objspace.std.sliceobject import W_SliceObject, unwrap_start_stop
 
 
@@ -197,6 +198,12 @@
 errors = 'strict'
 if encoding is None:
 encoding = 'utf8'
+if encoding == 'utf8' or encoding == 'utf-8':
+from pypy.module._codecs.interp_codecs import CodecState
+state = space.fromcache(CodecState)
+eh = state.decode_error_handler
+s = space.charbuf_w(self)
+ret, lgt, pos = str_decode_utf8(s, errors, True, eh)
 return decode_object(space, self, encoding, errors)
 
 @unwrap_spec(tabsize=int)
diff --git a/pypy/objspace/std/unicodeobject.py 
b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -1898,12 +1898,8 @@
 raise oefmt(space.w_TypeError, "expected unicode, got '%T'", w_unistr)
 value = _rpy_unicode_to_decimal_w(space, 
w_unistr.utf8_w(space).decode('utf8'))
 # XXX this is the only place in the code that this funcion is called.
-# It does not translate, since it uses a pypy-level error handler
-# to throw the UnicodeEncodeError not the rpython default handler
-#return unicodehelper.encode_utf8(space, value,
-# allow_surrogates=allow_surrogates)
-assert isinstance(value, unicode)
-return value.encode('utf8')
+return unicodehelper.encode_utf8(space, value,
+ allow_surrogates=allow_surrogates)
 
 def _rpy_unicode_to_decimal_w(space, unistr):
 # XXX rewrite this to accept a utf8 string and use a StringBuilder
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy unicode-utf8-py3: check class name for valid utf8

2018-09-02 Thread mattip
Author: Matti Picus 
Branch: unicode-utf8-py3
Changeset: r95075:f7c1e87b6a3c
Date: 2018-09-02 11:25 +0200
http://bitbucket.org/pypy/pypy/changeset/f7c1e87b6a3c/

Log:check class name for valid utf8

diff --git a/pypy/objspace/std/typeobject.py b/pypy/objspace/std/typeobject.py
--- a/pypy/objspace/std/typeobject.py
+++ b/pypy/objspace/std/typeobject.py
@@ -14,6 +14,7 @@
 from rpython.rlib.objectmodel import current_object_addr_as_int, compute_hash
 from rpython.rlib.objectmodel import we_are_translated, not_rpython
 from rpython.rlib.rarithmetic import intmask, r_uint
+from rpython.rlib.rutf8 import CheckError, check_utf8
 
 class MutableCell(W_Root):
 def unwrap_cell(self, space):
@@ -177,6 +178,15 @@
  overridetypedef=None, force_new_layout=False,
  is_heaptype=True):
 self.space = space
+try:
+check_utf8(name, False)
+except CheckError as e:
+raise OperationError(space.w_UnicodeEncodeError,
+ space.newtuple([space.newtext('utf8'),
+ space.newtext(name),
+ space.newint(e.pos),
+ space.newint(e.pos + 1),
+ space.newtext('surrogates not allowed')]))
 self.name = name
 self.qualname = None
 self.bases_w = bases_w
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy unicode-utf8-py3: expand test

2018-09-02 Thread mattip
Author: Matti Picus 
Branch: unicode-utf8-py3
Changeset: r95074:34c6d0d3499f
Date: 2018-09-02 10:51 +0200
http://bitbucket.org/pypy/pypy/changeset/34c6d0d3499f/

Log:expand test

diff --git a/pypy/objspace/std/test/test_bytesobject.py 
b/pypy/objspace/std/test/test_bytesobject.py
--- a/pypy/objspace/std/test/test_bytesobject.py
+++ b/pypy/objspace/std/test/test_bytesobject.py
@@ -1031,4 +1031,5 @@
 a = b'abcabc'
 id_b = id(str(a, 'latin1'))
 id_a = id(a)
+assert a is not str(a, 'latin1')
 assert id_a != id_b
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] extradoc extradoc: start a blog post draft about cpyext

2018-09-02 Thread antocuni
Author: Antonio Cuni 
Branch: extradoc
Changeset: r5890:01e42155cfe8
Date: 2018-09-02 16:28 +0200
http://bitbucket.org/pypy/extradoc/changeset/01e42155cfe8/

Log:start a blog post draft about cpyext

diff --git a/blog/draft/2018-09-cpyext/cpyext.rst 
b/blog/draft/2018-09-cpyext/cpyext.rst
new file mode 100644
--- /dev/null
+++ b/blog/draft/2018-09-cpyext/cpyext.rst
@@ -0,0 +1,92 @@
+Inside cpyext: why emulating CPython C API is so hard
+=====================================================
+
+cpyext is PyPy's subsystem which is responsible for providing a compatibility
+layer to compile and run CPython C extensions inside PyPy.  Often people ask
+why a particular extension doesn't work or is very slow on PyPy, but
+usually it is hard to answer without going into technical details: the goal of
+this blog post is to explain some of these technical details, so that we can
+simply link here instead of explaining again and again :).
+
+From a 10,000-foot view, cpyext is PyPy's version of `"Python.h"`: every time
+you compile an extension which uses that header file, you are using cpyext:
+this includes extensions explicitly written in C (such as `numpy`) and
+extensions which are generated from other compilers/preprocessors
+(e.g. `Cython`).
+
+At the time of writing, the current status is that most C extensions "just
+work": generally speaking, you can simply `pip install` all of them, provided
+they use the public, `official C API`_ instead of poking at private
+implementation details.
+
+.. _`official C API`: https://docs.python.org/2/c-api/index.html
+
+Prologue: the PyPy GC
+---------------------
+
+To understand some of cpyext's challenges, you need to have at least a rough
+idea of how the PyPy GC works.
+
+Contrary to popular belief, the "Garbage Collector" is not only about
+collecting garbage: instead, it is generally responsible for all memory
+management, including allocation and deallocation.
+
+CPython uses a very simple memory management scheme: when you create an
+object, you allocate a block of memory of the appropriate size on the heap:
+depending on the details you might end up calling different allocators, but
+for the sake of simplicity, you can think that this ends up being a call to
+`malloc()`. Handles to objects have the C type `PyObject *`, which point to
+the memory just allocated: this address never changes during the object
+lifetime, and the C code can freely pass it around, store it inside
+containers, retrieve it later, etc.
+
+Memory is managed using reference counting: when you create a new reference to
+an object, or you discard a reference you own, you have to increment_ or
+decrement_ the reference counter accordingly. When the reference counter goes to
+0, it means that the object is no longer used by anyone and can safely be
+destroyed. Again, we can simplify and say that this results in a call to
+`free()`, which finally releases the memory which was allocated by `malloc()`.
+
+.. _increment: https://docs.python.org/2/c-api/refcounting.html#c.Py_INCREF
+.. _decrement: https://docs.python.org/2/c-api/refcounting.html#c.Py_DECREF
+
+The PyPy GC is completely different: it is designed assuming that a dynamic
+language like Python behaves the following way:
+
+  - you create, either directly or indirectly, lots of objects;
+
+  - most of these objects are temporary and very short-lived: think e.g. of
+doing `a + b + c`: you need to allocate an object to hold the temporary
+result of `a + b`, but it dies very quickly because you no longer need it
+when you do the final `+ c` part;
+
+  - only a small fraction of the objects survive and stay around for a while.
+
+So, the strategy is: make allocation as fast as possible; make deallocation of
+short-lived objects as fast as possible; find a way to handle the remaining
+small set of objects which actually survive long enough to be important.
+
+This is done using a **Generational GC**: the basic idea is the following:
+
+  1. we have a nursery, where we allocate "young objects" very fast;
+
+  2. when the nursery is full, we start what we call a "minor collection": we
+ do a quick scan to determine the small set of objects which survived so
+ far;
+
+  3. we **move** these objects out of the nursery, and we place them in the
+ area of memory which contains the "old objects"; since the address of the
+ objects just changed, we fix all the references to them accordingly;
+
+  4. now the nursery contains only objects which died young: we can simply
+ discard all of them very quickly, reset the nursery and use the same area
+ of memory to allocate new objects from now.
+
+In practice, this scheme works very well and it is one of the reasons why PyPy
+is much faster than CPython.  However, careful readers have surely noticed
+that this is a problem for `cpyext`: on one hand, we have PyPy objects which
+can potentially move and change their underlying memory address; on the other
+hand, we nee