Author: Ronan Lamy <ronan.l...@gmail.com>
Branch: py3.5
Changeset: r93402:da4b6cf751a5
Date: 2017-12-12 23:37 +0000
http://bitbucket.org/pypy/pypy/changeset/da4b6cf751a5/

Log:    hg merge default

diff --git a/pypy/module/test_lib_pypy/test_json_extra.py b/extra_tests/test_json.py
rename from pypy/module/test_lib_pypy/test_json_extra.py
rename to extra_tests/test_json.py
--- a/pypy/module/test_lib_pypy/test_json_extra.py
+++ b/extra_tests/test_json.py
@@ -1,4 +1,6 @@
-import py, json
+import pytest
+import json
+from hypothesis import given, strategies
 
 def is_(x, y):
     return type(x) is type(y) and x == y
@@ -6,12 +8,26 @@
 def test_no_ensure_ascii():
     assert is_(json.dumps(u"\u1234", ensure_ascii=False), u'"\u1234"')
     assert is_(json.dumps("\xc0", ensure_ascii=False), '"\xc0"')
-    e = py.test.raises(UnicodeDecodeError, json.dumps,
-                       (u"\u1234", "\xc0"), ensure_ascii=False)
-    assert str(e.value).startswith("'ascii' codec can't decode byte 0xc0 ")
-    e = py.test.raises(UnicodeDecodeError, json.dumps,
-                       ("\xc0", u"\u1234"), ensure_ascii=False)
-    assert str(e.value).startswith("'ascii' codec can't decode byte 0xc0 ")
+    with pytest.raises(UnicodeDecodeError) as excinfo:
+        json.dumps((u"\u1234", "\xc0"), ensure_ascii=False)
+    assert str(excinfo.value).startswith(
+        "'ascii' codec can't decode byte 0xc0 ")
+    with pytest.raises(UnicodeDecodeError) as excinfo:
+        json.dumps(("\xc0", u"\u1234"), ensure_ascii=False)
+    assert str(excinfo.value).startswith(
+        "'ascii' codec can't decode byte 0xc0 ")
 
 def test_issue2191():
     assert is_(json.dumps(u"xxx", ensure_ascii=False), u'"xxx"')
+
+jsondata = strategies.recursive(
+    strategies.none() |
+    strategies.booleans() |
+    strategies.floats(allow_nan=False) |
+    strategies.text(),
+    lambda children: strategies.lists(children) |
+        strategies.dictionaries(strategies.text(), children))
+
+@given(jsondata)
+def test_roundtrip(d):
+    assert json.loads(json.dumps(d)) == d
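For reference, the ``hypothesis`` strategy above generates arbitrarily nested
JSON-like values (NaN is excluded so equality stays well-defined). A minimal
sketch of exercising the same strategy interactively, outside the pytest
runner::

    import json
    from hypothesis import strategies

    # Same shape as in the test: JSON scalars, nested recursively inside
    # lists and dicts with text keys.
    jsondata = strategies.recursive(
        strategies.none() | strategies.booleans() |
        strategies.floats(allow_nan=False) | strategies.text(),
        lambda children: strategies.lists(children) |
            strategies.dictionaries(strategies.text(), children))

    for _ in range(5):
        sample = jsondata.example()   # draw one concrete example value
        assert json.loads(json.dumps(sample)) == sample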
diff --git a/pypy/doc/build.rst b/pypy/doc/build.rst
--- a/pypy/doc/build.rst
+++ b/pypy/doc/build.rst
@@ -149,7 +149,7 @@
     xz-devel # For lzma on PyPy3.
     (XXX plus the SLES11 version of libgdbm-dev and tk-dev)
 
-On Mac OS X::
+On Mac OS X:
 
 Most of these build-time dependencies are installed alongside
 the Developer Tools. However, note that in order for the installation to
diff --git a/pypy/doc/cpython_differences.rst b/pypy/doc/cpython_differences.rst
--- a/pypy/doc/cpython_differences.rst
+++ b/pypy/doc/cpython_differences.rst
@@ -362,7 +362,11 @@
 containers (as list items or in sets for example), the exact rule of
 equality used is "``if x is y or x == y``" (on both CPython and PyPy);
 as a consequence, because all ``nans`` are identical in PyPy, you
-cannot have several of them in a set, unlike in CPython.  (Issue `#1974`__)
+cannot have several of them in a set, unlike in CPython.  (Issue `#1974`__).
+Another consequence is that ``cmp(float('nan'), float('nan')) == 0``, because
+``cmp`` checks with ``is`` first whether the arguments are identical (there is
+no good value to return from this call to ``cmp``, because ``cmp`` pretends
+that there is a total order on floats, but that is wrong for NaNs).
 
 .. __: https://bitbucket.org/pypy/pypy/issue/1974/different-behaviour-for-collections-of
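For illustration, a small sketch of the behaviour described above (Python 2
syntax, since ``cmp`` only exists there); the set size is the observable
difference between the two implementations::

    x = float('nan')
    y = float('nan')

    # Container equality is "x is y or x == y".  On PyPy any two NaNs are
    # identical, so the set collapses to one element; on CPython x and y
    # are distinct objects and both are kept.
    print len({x, y})      # 1 on PyPy, 2 on CPython

    # cmp() checks identity first, so identical NaNs compare "equal"
    # even though == on them is False.
    print cmp(x, y) == 0   # True on PyPy
    print x == y           # False everywhere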
 
diff --git a/pypy/doc/whatsnew-head.rst b/pypy/doc/whatsnew-head.rst
--- a/pypy/doc/whatsnew-head.rst
+++ b/pypy/doc/whatsnew-head.rst
@@ -1,31 +1,41 @@
-===========================
-What's new in PyPy2.7 5.10+
-===========================
-
-.. this is a revision shortly after release-pypy2.7-v5.9.0
-.. startrev:d56dadcef996
-
-.. branch: cppyy-packaging
-Cleanup and improve cppyy packaging
-
-.. branch: docs-osx-brew-openssl
-
-.. branch: keep-debug-symbols
-Add a smartstrip tool, which can optionally keep the debug symbols in a
-separate file, instead of just stripping them away. Use it in packaging
-
-.. branch: bsd-patches
-Fix failures on FreeBSD, contributed by David Naylor as patches on the issue
-tracker (issues 2694, 2695, 2696, 2697)
-
-.. branch: run-extra-tests
-Run extra_tests/ in buildbot
-
-.. branch: vmprof-0.4.10
-Upgrade the _vmprof backend to vmprof 0.4.10
-
-.. branch: fix-vmprof-stacklet-switch
-Fix a vmprof+continulets (i.e. greenelts, eventlet, gevent, ...)
-
-.. branch: win32-vcvars
-
+===========================
+What's new in PyPy2.7 5.10+
+===========================
+
+.. this is a revision shortly after release-pypy2.7-v5.9.0
+.. startrev:d56dadcef996
+
+
+.. branch: cppyy-packaging
+
+Cleanup and improve cppyy packaging
+
+.. branch: docs-osx-brew-openssl
+
+.. branch: keep-debug-symbols
+
+Add a smartstrip tool, which can optionally keep the debug symbols in a
+separate file, instead of just stripping them away. Use it in packaging
+
+.. branch: bsd-patches
+
+Fix failures on FreeBSD, contributed by David Naylor as patches on the issue
+tracker (issues 2694, 2695, 2696, 2697)
+
+.. branch: run-extra-tests
+
+Run extra_tests/ in buildbot
+
+.. branch: vmprof-0.4.10
+
+Upgrade the _vmprof backend to vmprof 0.4.10
+
+.. branch: fix-vmprof-stacklet-switch
+
+Fix a vmprof+continulets (i.e. greenlets, eventlet, gevent, ...) issue
+
+.. branch: win32-vcvars
+
+.. branch: rdict-fast-hash
+
+Make it possible to declare that the hash function of an r_dict is fast in RPython.
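As a rough sketch of what the rdict-fast-hash change enables (mirroring the
``_pypyjson`` change further down in this diff), RPython code can now pass
``simple_hash_eq=True`` when building an ``r_dict`` with custom eq/hash
functions; the key and hash functions below are made up for illustration::

    from rpython.rlib.objectmodel import r_dict

    def eq_keys(k1, k2):
        return k1 == k2

    def hash_key(k):
        # deliberately trivial, hence "fast" in the sense declared above
        return len(k)

    d = r_dict(eq_keys, hash_key, simple_hash_eq=True)
    d["abc"] = 1
    assert d["abc"] == 1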
diff --git a/pypy/doc/whatsnew-pypy2-5.6.0.rst b/pypy/doc/whatsnew-pypy2-5.6.0.rst
--- a/pypy/doc/whatsnew-pypy2-5.6.0.rst
+++ b/pypy/doc/whatsnew-pypy2-5.6.0.rst
@@ -107,7 +107,7 @@
 
 .. branch: newinitwarn
 
-Match CPython's stricter handling of __new/init__ arguments
+Match CPython's stricter handling of ``__new__``/``__init__`` arguments
 
 .. branch: openssl-1.1
 
diff --git a/pypy/doc/windows.rst b/pypy/doc/windows.rst
--- a/pypy/doc/windows.rst
+++ b/pypy/doc/windows.rst
@@ -11,7 +11,7 @@
 
 To build pypy-c you need a working python environment, and a C compiler.
 It is possible to translate with a CPython 2.6 or later, but this is not
-the preferred way, because it will take a lot longer to run � depending
+the preferred way, because it will take a lot longer to run – depending
 on your architecture, between two and three times as long. So head to
 `our downloads`_ and get the latest stable version.
 
@@ -103,6 +103,7 @@
 must also copy the ``vcvarsall.bat`` file from the ``...\9.0`` directory to the
 ``...\9.0\VC`` directory, and edit it, changing the lines that set
 ``VCINSTALLDIR`` and ``WindowsSdkDir``::
+
     set VCINSTALLDIR=%~dp0\
     set WindowsSdkDir=%~dp0\..\WinSDK\
 
diff --git a/pypy/interpreter/astcompiler/test/test_astbuilder.py b/pypy/interpreter/astcompiler/test/test_astbuilder.py
--- a/pypy/interpreter/astcompiler/test/test_astbuilder.py
+++ b/pypy/interpreter/astcompiler/test/test_astbuilder.py
@@ -1404,3 +1404,7 @@
         exc = py.test.raises(SyntaxError, self.get_ast, input).value
         assert exc.msg == ("(unicode error) 'unicodeescape' codec can't decode"
                            " bytes in position 0-1: truncated \\xXX escape")
+        input = "u'\\x1'"
+        exc = py.test.raises(SyntaxError, self.get_ast, input).value
+        assert exc.msg == ("(unicode error) 'unicodeescape' codec can't decode"
+                           " bytes in position 0-2: truncated \\xXX escape")
diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -1,5 +1,8 @@
 import py
-from pypy.interpreter.unicodehelper import encode_utf8, decode_utf8
+import pytest
+import struct
+from pypy.interpreter.unicodehelper import (
+    encode_utf8, decode_utf8, unicode_encode_utf_32_be)
 from pypy.interpreter.unicodehelper import encode_utf8sp, decode_utf8sp
 
 
@@ -67,3 +70,23 @@
     assert map(ord, got) == [0xd800, 0xdc00]
     got = decode_utf8sp(space, "\xf0\x90\x80\x80")
     assert map(ord, got) == [0x10000]
+
+@pytest.mark.parametrize('unich', [u"\ud800", u"\udc80"])
+def test_utf32_surrogates(unich):
+    assert (unicode_encode_utf_32_be(unich, 1, None) ==
+            struct.pack('>i', ord(unich)))
+    with pytest.raises(UnicodeEncodeError):
+        unicode_encode_utf_32_be(unich, 1, None, allow_surrogates=False)
+
+    def replace_with(ru, rs):
+        def errorhandler(errors, enc, msg, u, startingpos, endingpos):
+            if errors == 'strict':
+                raise UnicodeEncodeError(enc, u, startingpos, endingpos, msg)
+            return ru, rs, endingpos
+        return unicode_encode_utf_32_be(
+            u"<%s>" % unich, 3, None,
+            errorhandler, allow_surrogates=False)
+
+    assert replace_with(u'rep', None) == u'<rep>'.encode('utf-32-be')
+    assert (replace_with(None, '\xca\xfe\xca\xfe') ==
+            '\x00\x00\x00<\xca\xfe\xca\xfe\x00\x00\x00>')
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -1,8 +1,13 @@
 import sys
 from pypy.interpreter.error import OperationError, oefmt
 from rpython.rlib.objectmodel import specialize
+from rpython.rlib.rarithmetic import intmask
+from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
 from rpython.rlib import runicode
-from pypy.module._codecs import interp_codecs
+from rpython.rlib.runicode import (
+    default_unicode_error_encode, default_unicode_error_decode,
+    MAXUNICODE, BYTEORDER, BYTEORDER2, UNICHR)
+
 _WIN32 = sys.platform == 'win32'
 _MACOSX = sys.platform == 'darwin'
 if _WIN32:
@@ -40,6 +45,7 @@
 # ____________________________________________________________
 
 def fsdecode(space, w_string):
+    from pypy.module._codecs import interp_codecs
     state = space.fromcache(interp_codecs.CodecState)
     if _WIN32:
         bytes = space.bytes_w(w_string)
@@ -70,6 +76,7 @@
     return space.newunicode(uni)
 
 def fsencode(space, w_uni):
+    from pypy.module._codecs import interp_codecs
     state = space.fromcache(interp_codecs.CodecState)
     if _WIN32:
         uni = space.unicode_w(w_uni)
@@ -107,6 +114,7 @@
 
 # These functions take and return unwrapped rpython strings and unicodes
 def decode_unicode_escape(space, string):
+    from pypy.module._codecs import interp_codecs
     state = space.fromcache(interp_codecs.CodecState)
     unicodedata_handler = state.get_unicodedata_handler(space)
     result, consumed = runicode.str_decode_unicode_escape(
@@ -157,3 +165,196 @@
     # encoding error, it should always be reversible, and the reverse is
     # encode_utf8sp().
     return decode_utf8(space, string, allow_surrogates=True)
+
+# ____________________________________________________________
+# utf-32
+
+def str_decode_utf_32(s, size, errors, final=True,
+                           errorhandler=None):
+    result, length, byteorder = str_decode_utf_32_helper(
+        s, size, errors, final, errorhandler, "native", 'utf-32-' + BYTEORDER2)
+    return result, length
+
+def str_decode_utf_32_be(s, size, errors, final=True,
+                              errorhandler=None):
+    result, length, byteorder = str_decode_utf_32_helper(
+        s, size, errors, final, errorhandler, "big", 'utf-32-be')
+    return result, length
+
+def str_decode_utf_32_le(s, size, errors, final=True,
+                              errorhandler=None):
+    result, length, byteorder = str_decode_utf_32_helper(
+        s, size, errors, final, errorhandler, "little", 'utf-32-le')
+    return result, length
+
+BOM32_DIRECT = intmask(0x0000FEFF)
+BOM32_REVERSE = intmask(0xFFFE0000)
+
+def str_decode_utf_32_helper(s, size, errors, final=True,
+                             errorhandler=None,
+                             byteorder="native",
+                             public_encoding_name='utf32'):
+    if errorhandler is None:
+        errorhandler = default_unicode_error_decode
+    bo = 0
+
+    if BYTEORDER == 'little':
+        iorder = [0, 1, 2, 3]
+    else:
+        iorder = [3, 2, 1, 0]
+
+    #  Check for BOM marks (U+FEFF) in the input and adjust current
+    #  byte order setting accordingly. In native mode, the leading BOM
+    #  mark is skipped, in all other modes, it is copied to the output
+    #  stream as-is (giving a ZWNBSP character).
+    pos = 0
+    if byteorder == 'native':
+        if size >= 4:
+            bom = intmask(
+                (ord(s[iorder[3]]) << 24) | (ord(s[iorder[2]]) << 16) |
+                (ord(s[iorder[1]]) << 8) | ord(s[iorder[0]]))
+            if BYTEORDER == 'little':
+                if bom == BOM32_DIRECT:
+                    pos += 4
+                    bo = -1
+                elif bom == BOM32_REVERSE:
+                    pos += 4
+                    bo = 1
+            else:
+                if bom == BOM32_DIRECT:
+                    pos += 4
+                    bo = 1
+                elif bom == BOM32_REVERSE:
+                    pos += 4
+                    bo = -1
+    elif byteorder == 'little':
+        bo = -1
+    else:
+        bo = 1
+    if size == 0:
+        return u'', 0, bo
+    if bo == -1:
+        # force little endian
+        iorder = [0, 1, 2, 3]
+    elif bo == 1:
+        # force big endian
+        iorder = [3, 2, 1, 0]
+
+    result = UnicodeBuilder(size // 4)
+
+    while pos < size:
+        # remaining bytes at the end? (size should be divisible by 4)
+        if len(s) - pos < 4:
+            if not final:
+                break
+            r, pos = errorhandler(errors, public_encoding_name,
+                                  "truncated data",
+                                  s, pos, len(s))
+            result.append(r)
+            if len(s) - pos < 4:
+                break
+            continue
+        ch = ((ord(s[pos + iorder[3]]) << 24) | (ord(s[pos + iorder[2]]) << 16) |
+            (ord(s[pos + iorder[1]]) << 8) | ord(s[pos + iorder[0]]))
+        if ch >= 0x110000:
+            r, pos = errorhandler(errors, public_encoding_name,
+                                  "codepoint not in range(0x110000)",
+                                  s, pos, len(s))
+            result.append(r)
+            continue
+
+        if MAXUNICODE < 65536 and ch >= 0x10000:
+            ch -= 0x10000L
+            result.append(unichr(0xD800 + (ch >> 10)))
+            result.append(unichr(0xDC00 + (ch & 0x03FF)))
+        else:
+            result.append(UNICHR(ch))
+        pos += 4
+    return result.build(), pos, bo
+
+def _STORECHAR32(result, CH, byteorder):
+    c0 = chr(((CH) >> 24) & 0xff)
+    c1 = chr(((CH) >> 16) & 0xff)
+    c2 = chr(((CH) >> 8) & 0xff)
+    c3 = chr((CH) & 0xff)
+    if byteorder == 'little':
+        result.append(c3)
+        result.append(c2)
+        result.append(c1)
+        result.append(c0)
+    else:
+        result.append(c0)
+        result.append(c1)
+        result.append(c2)
+        result.append(c3)
+
+def unicode_encode_utf_32_helper(s, size, errors,
+                                 errorhandler=None,
+                                 allow_surrogates=True,
+                                 byteorder='little',
+                                 public_encoding_name='utf32'):
+    if errorhandler is None:
+        errorhandler = default_unicode_error_encode
+    if size == 0:
+        if byteorder == 'native':
+            result = StringBuilder(4)
+            _STORECHAR32(result, 0xFEFF, BYTEORDER)
+            return result.build()
+        return ""
+
+    result = StringBuilder(size * 4 + 4)
+    if byteorder == 'native':
+        _STORECHAR32(result, 0xFEFF, BYTEORDER)
+        byteorder = BYTEORDER
+
+    pos = 0
+    while pos < size:
+        ch = ord(s[pos])
+        pos += 1
+        ch2 = 0
+        if not allow_surrogates and 0xD800 <= ch < 0xE000:
+            ru, rs, pos = errorhandler(
+                errors, public_encoding_name, 'surrogates not allowed',
+                s, pos - 1, pos)
+            if rs is not None:
+                # py3k only
+                if len(rs) % 4 != 0:
+                    errorhandler(
+                        'strict', public_encoding_name, 'surrogates not allowed',
+                        s, pos - 1, pos)
+                result.append(rs)
+                continue
+            for ch in ru:
+                if ord(ch) < 0xD800:
+                    _STORECHAR32(result, ord(ch), byteorder)
+                else:
+                    errorhandler(
+                        'strict', public_encoding_name,
+                        'surrogates not allowed', s, pos - 1, pos)
+            continue
+        if 0xD800 <= ch < 0xDC00 and MAXUNICODE < 65536 and pos < size:
+            ch2 = ord(s[pos])
+            if 0xDC00 <= ch2 < 0xE000:
+                ch = (((ch & 0x3FF) << 10) | (ch2 & 0x3FF)) + 0x10000
+                pos += 1
+        _STORECHAR32(result, ch, byteorder)
+
+    return result.build()
+
+def unicode_encode_utf_32(s, size, errors,
+                               errorhandler=None, allow_surrogates=True):
+    return unicode_encode_utf_32_helper(s, size, errors, errorhandler,
+                                        allow_surrogates, "native",
+                                        'utf-32-' + BYTEORDER2)
+
+def unicode_encode_utf_32_be(s, size, errors,
+                                  errorhandler=None, allow_surrogates=True):
+    return unicode_encode_utf_32_helper(s, size, errors, errorhandler,
+                                        allow_surrogates, "big",
+                                        'utf-32-be')
+
+def unicode_encode_utf_32_le(s, size, errors,
+                                  errorhandler=None, allow_surrogates=True):
+    return unicode_encode_utf_32_helper(s, size, errors, errorhandler,
+                                        allow_surrogates, "little",
+                                        'utf-32-le')
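A minimal usage sketch of the new UTF-32 helpers (run under the Python 2
interpreter that the PyPy sources target), mirroring test_unicodehelper.py
above::

    from pypy.interpreter.unicodehelper import (
        str_decode_utf_32_be, unicode_encode_utf_32_be)

    data = '\x00\x00\x12\x34'              # U+1234 in big-endian UTF-32
    uni, length = str_decode_utf_32_be(data, len(data), 'strict')
    assert uni == u'\u1234' and length == 4

    # Encode it back; surrogates are allowed by default, as exercised by
    # test_utf32_surrogates above.
    assert unicode_encode_utf_32_be(uni, len(uni), None) == data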
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -2,12 +2,14 @@
 from rpython.rlib import jit
 from rpython.rlib.objectmodel import we_are_translated, not_rpython
 from rpython.rlib.rstring import UnicodeBuilder, StringBuilder
+from rpython.rlib import runicode
 from rpython.rlib.runicode import (
     code_to_unichr, MAXUNICODE,
     raw_unicode_escape_helper_unicode)
 
 from pypy.interpreter.error import OperationError, oefmt
 from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
+from pypy.interpreter import unicodehelper
 from pypy.module.unicodedata import unicodedb
 
 
@@ -244,7 +246,8 @@
 def xmlcharrefreplace_errors(space, w_exc):
     check_exception(space, w_exc)
     if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
-        obj = space.realunicode_w(space.getattr(w_exc, space.newtext('object')))
+        w_obj = space.getattr(w_exc, space.newtext('object'))
+        obj = space.realunicode_w(w_obj)
         start = space.int_w(space.getattr(w_exc, space.newtext('start')))
         w_end = space.getattr(w_exc, space.newtext('end'))
         end = space.int_w(w_end)
@@ -301,7 +304,8 @@
 def namereplace_errors(space, w_exc):
     check_exception(space, w_exc)
     if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
-        obj = space.realunicode_w(space.getattr(w_exc, space.newtext('object')))
+        w_obj = space.getattr(w_exc, space.newtext('object'))
+        obj = space.realunicode_w(w_obj)
         start = space.int_w(space.getattr(w_exc, space.newtext('start')))
         w_end = space.getattr(w_exc, space.newtext('end'))
         end = space.int_w(w_end)
@@ -611,48 +615,47 @@
     return _call_codec(space, w_decoder, w_obj, "decoding", encoding, errors)
 
 # ____________________________________________________________
-# delegation to runicode
+# delegation to runicode/unicodehelper
 
-from rpython.rlib import runicode
+def _find_implementation(impl_name):
+    try:
+        func = getattr(unicodehelper, impl_name)
+    except AttributeError:
+        if hasattr(runicode, 'py3k_' + impl_name):
+            impl_name = 'py3k_' + impl_name
+        func = getattr(runicode, impl_name)
+    return func
 
 def make_encoder_wrapper(name):
     rname = "unicode_encode_%s" % (name.replace("_encode", ""), )
-    assert hasattr(runicode, rname)
-    if hasattr(runicode, 'py3k_' + rname):
-        rname = 'py3k_' + rname
+    func = _find_implementation(rname)
     @unwrap_spec(uni=unicode, errors='text_or_none')
     def wrap_encoder(space, uni, errors="strict"):
         if errors is None:
             errors = 'strict'
         state = space.fromcache(CodecState)
-        func = getattr(runicode, rname)
         result = func(uni, len(uni), errors, state.encode_error_handler)
         return space.newtuple([space.newbytes(result), space.newint(len(uni))])
-    wrap_encoder.func_name = rname
+    wrap_encoder.__name__ = func.__name__
     globals()[name] = wrap_encoder
 
 def make_utf_encoder_wrapper(name):
     rname = "unicode_encode_%s" % (name.replace("_encode", ""), )
-    assert hasattr(runicode, rname)
-    if hasattr(runicode, 'py3k_' + rname):
-        rname = 'py3k_' + rname
+    func = _find_implementation(rname)
     @unwrap_spec(uni=unicode, errors='text_or_none')
     def wrap_encoder(space, uni, errors="strict"):
         if errors is None:
             errors = 'strict'
         state = space.fromcache(CodecState)
-        func = getattr(runicode, rname)
         result = func(uni, len(uni), errors, state.encode_error_handler,
                       allow_surrogates=False)
         return space.newtuple([space.newbytes(result), space.newint(len(uni))])
-    wrap_encoder.func_name = rname
+    wrap_encoder.__name__ = func.__name__
     globals()[name] = wrap_encoder
 
 def make_decoder_wrapper(name):
     rname = "str_decode_%s" % (name.replace("_decode", ""), )
-    assert hasattr(runicode, rname)
-    if hasattr(runicode, 'py3k_' + rname):
-        rname = 'py3k_' + rname
+    func = _find_implementation(rname)
     @unwrap_spec(string='bufferstr', errors='text_or_none',
                  w_final=WrappedDefault(False))
     def wrap_decoder(space, string, errors="strict", w_final=None):
@@ -660,11 +663,10 @@
             errors = 'strict'
         final = space.is_true(w_final)
         state = space.fromcache(CodecState)
-        func = getattr(runicode, rname)
         result, consumed = func(string, len(string), errors,
                                 final, state.decode_error_handler)
         return space.newtuple([space.newunicode(result), space.newint(consumed)])
-    wrap_decoder.func_name = rname
+    wrap_decoder.__name__ = func.__name__
     globals()[name] = wrap_decoder
 
 for encoder in [
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -116,10 +116,10 @@
         raises(TypeError, charmap_decode, b'\xff', "strict",  {0xff: 0x110000})
         assert (charmap_decode(b"\x00\x01\x02", "strict",
                                {0: 0x10FFFF, 1: ord('b'), 2: ord('c')}) ==
-                u"\U0010FFFFbc", 3)
+                (u"\U0010FFFFbc", 3))
         assert (charmap_decode(b"\x00\x01\x02", "strict",
                                {0: u'\U0010FFFF', 1: u'b', 2: u'c'}) ==
-                u"\U0010FFFFbc", 3)
+                (u"\U0010FFFFbc", 3))
 
     def test_escape_decode_errors(self):
         from _codecs import escape_decode as decode
@@ -590,6 +590,12 @@
 
     def test_backslashreplace(self):
         import codecs
+        sin = u"a\xac\u1234\u20ac\u8000\U0010ffff"
+        expected = b"a\\xac\\u1234\\u20ac\\u8000\\U0010ffff"
+        assert sin.encode('ascii', 'backslashreplace') == expected
+        expected = b"a\xac\\u1234\xa4\\u8000\\U0010ffff"
+        assert sin.encode("iso-8859-15", "backslashreplace") == expected
+
         assert 'a\xac\u1234\u20ac\u8000'.encode('ascii', 'backslashreplace') == b'a\\xac\u1234\u20ac\u8000'
         assert b'\x00\x60\x80'.decode(
             'ascii', 'backslashreplace') == u'\x00\x60\\x80'
@@ -732,7 +738,7 @@
         def handler_unicodeinternal(exc):
             if not isinstance(exc, UnicodeDecodeError):
                 raise TypeError("don't know how to handle %r" % exc)
-            return ("\x01", 1)
+            return (u"\x01", 1)
         codecs.register_error("test.hui", handler_unicodeinternal)
         res = b"\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui")
         if sys.maxunicode > 65535:
@@ -939,3 +945,31 @@
             assert len(w) == 1
             assert str(w[0].message) == warning_msg
             assert w[0].category == DeprecationWarning
+
+    def test_xmlcharrefreplace(self):
+        r = u'\u1234\u0080\u2345\u0079\u00AB'.encode('latin1', 'xmlcharrefreplace')
+        assert r == b'&#4660;\x80&#9029;y\xab'
+        r = u'\u1234\u0080\u2345\u0079\u00AB'.encode('ascii', 'xmlcharrefreplace')
+        assert r == b'&#4660;&#128;&#9029;y&#171;'
+
+    def test_errorhandler_collection(self):
+        import _codecs
+        errors = []
+        def record_error(exc):
+            if not isinstance(exc, UnicodeEncodeError):
+                raise TypeError("don't know how to handle %r" % exc)
+            errors.append(exc.object[exc.start:exc.end])
+            return (u'', exc.end)
+        _codecs.register_error("test.record", record_error)
+
+        sin = u"\xac\u1234\u1234\u20ac\u8000"
+        assert sin.encode("ascii", "test.record") == b""
+        assert errors == [sin]
+
+        errors = []
+        assert sin.encode("latin-1", "test.record") == b"\xac"
+        assert errors == [u'\u1234\u1234\u20ac\u8000']
+
+        errors = []
+        assert sin.encode("iso-8859-15", "test.record") == b"\xac\xa4"
+        assert errors == [u'\u1234\u1234', u'\u8000']
diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -429,6 +429,7 @@
     if not space.isinstance_w(w_decoded, space.w_unicode):
         msg = "decoder should return a string result, not '%T'"
         raise oefmt(space.w_TypeError, msg, w_decoded)
+    return w_decoded
 
 
 class W_TextIOWrapper(W_TextIOBase):
@@ -997,12 +998,13 @@
 
             w_decoded = space.call_method(self.w_decoder, "decode",
                                           w_chunk, space.newbool(bool(cookie.need_eof)))
-            self.decoded.set(space, w_decoded)
+            w_decoded = check_decoded(space, w_decoded)
 
             # Skip chars_to_skip of the decoded characters
-            if len(self.decoded.text) < cookie.chars_to_skip:
+            if space.len_w(w_decoded) < cookie.chars_to_skip:
                 raise oefmt(space.w_IOError,
                             "can't restore logical file position")
+            self.decoded.set(space, w_decoded)
             self.decoded.pos = cookie.chars_to_skip
         else:
             self.snapshot = PositionSnapshot(cookie.dec_flags, "")
@@ -1015,11 +1017,9 @@
 
     def tell_w(self, space):
         self._check_closed(space)
-
         if not self.seekable:
             self._unsupportedoperation(space,
                                        "underlying stream is not seekable")
-
         if not self.telling:
             raise oefmt(space.w_IOError,
                         "telling position disabled by next() call")
@@ -1089,14 +1089,14 @@
                 # We didn't get enough decoded data; signal EOF to get more.
                 w_decoded = space.call_method(self.w_decoder, "decode",
                                               space.newbytes(""),
-                                              space.newint(1)) # final=1
+                                              space.newint(1))  # final=1
                 check_decoded(space, w_decoded)
-                chars_decoded += len(space.unicode_w(w_decoded))
+                chars_decoded += space.len_w(w_decoded)
                 cookie.need_eof = 1
 
                 if chars_decoded < chars_to_skip:
                     raise oefmt(space.w_IOError,
-                                "can't reconstruct logical file position")
+                        "can't reconstruct logical file position")
         finally:
             space.call_method(self.w_decoder, "setstate", w_saved_state)
 
diff --git a/pypy/module/_io/test/test_interp_textio.py b/pypy/module/_io/test/test_interp_textio.py
--- a/pypy/module/_io/test/test_interp_textio.py
+++ b/pypy/module/_io/test/test_interp_textio.py
@@ -40,7 +40,8 @@
         w_newline=space.newtext(mode))
     lines = []
     for limit in limits:
-        line = space.unicode_w(w_textio.readline_w(space, space.newint(limit)))
+        w_line = w_textio.readline_w(space, space.newint(limit))
+        line = space.unicode_w(w_line)
         if limit >= 0:
             assert len(line) <= limit
         if line:
diff --git a/pypy/module/_pypyjson/interp_decoder.py b/pypy/module/_pypyjson/interp_decoder.py
--- a/pypy/module/_pypyjson/interp_decoder.py
+++ b/pypy/module/_pypyjson/interp_decoder.py
@@ -76,7 +76,7 @@
         self.ll_chars = rffi.str2charp(s)
         self.end_ptr = lltype.malloc(rffi.CCHARPP.TO, 1, flavor='raw')
         self.pos = 0
-        self.cache = r_dict(slice_eq, slice_hash)
+        self.cache = r_dict(slice_eq, slice_hash, simple_hash_eq=True)
 
     def close(self):
         rffi.free_charp(self.ll_chars)
diff --git a/pypy/module/_pypyjson/interp_encoder.py b/pypy/module/_pypyjson/interp_encoder.py
--- a/pypy/module/_pypyjson/interp_encoder.py
+++ b/pypy/module/_pypyjson/interp_encoder.py
@@ -49,24 +49,24 @@
         first = 0
 
     for i in range(first, len(u)):
-        c = u[i]
-        if c <= u'~':
-            if c == u'"' or c == u'\\':
+        c = ord(u[i])
+        if c <= ord('~'):
+            if c == ord('"') or c == ord('\\'):
                 sb.append('\\')
-            elif c < u' ':
-                sb.append(ESCAPE_BEFORE_SPACE[ord(c)])
+            elif c < ord(' '):
+                sb.append(ESCAPE_BEFORE_SPACE[c])
                 continue
-            sb.append(chr(ord(c)))
+            sb.append(chr(c))
         else:
-            if c <= u'\uffff':
+            if c <= ord(u'\uffff'):
                 sb.append('\\u')
-                sb.append(HEX[ord(c) >> 12])
-                sb.append(HEX[(ord(c) >> 8) & 0x0f])
-                sb.append(HEX[(ord(c) >> 4) & 0x0f])
-                sb.append(HEX[ord(c) & 0x0f])
+                sb.append(HEX[c >> 12])
+                sb.append(HEX[(c >> 8) & 0x0f])
+                sb.append(HEX[(c >> 4) & 0x0f])
+                sb.append(HEX[c & 0x0f])
             else:
                 # surrogate pair
-                n = ord(c) - 0x10000
+                n = c - 0x10000
                 s1 = 0xd800 | ((n >> 10) & 0x3ff)
                 sb.append('\\ud')
                 sb.append(HEX[(s1 >> 8) & 0x0f])
diff --git a/pypy/module/_rawffi/alt/type_converter.py b/pypy/module/_rawffi/alt/type_converter.py
--- a/pypy/module/_rawffi/alt/type_converter.py
+++ b/pypy/module/_rawffi/alt/type_converter.py
@@ -128,7 +128,7 @@
         intval: lltype.Signed
         """
         self.error(w_ffitype, w_obj)
-        
+
     def handle_unichar(self, w_ffitype, w_obj, intval):
         """
         intval: lltype.Signed
@@ -174,7 +174,7 @@
     def handle_struct_rawffi(self, w_ffitype, w_structinstance):
         """
         This method should be killed as soon as we remove support for _rawffi structures
-        
+
         w_structinstance: W_StructureInstance
         """
         self.error(w_ffitype, w_structinstance)
@@ -349,7 +349,7 @@
     def get_struct_rawffi(self, w_ffitype, w_structdescr):
         """
         This should be killed as soon as we kill support for _rawffi structures
-        
+
         Return type: lltype.Unsigned
         (the address of the structure)
         """
diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -580,11 +580,13 @@
 
     @unwrap_spec(w_groupnum=WrappedDefault(0))
     def start_w(self, w_groupnum):
-        return self.space.newint(self.do_span(w_groupnum)[0])
+        start, end = self.do_span(w_groupnum)
+        return self.space.newint(start)
 
     @unwrap_spec(w_groupnum=WrappedDefault(0))
     def end_w(self, w_groupnum):
-        return self.space.newint(self.do_span(w_groupnum)[1])
+        start, end = self.do_span(w_groupnum)
+        return self.space.newint(end)
 
     @unwrap_spec(w_groupnum=WrappedDefault(0))
     def span_w(self, w_groupnum):
diff --git a/pypy/module/_sre/test/test_app_sre.py b/pypy/module/_sre/test/test_app_sre.py
--- a/pypy/module/_sre/test/test_app_sre.py
+++ b/pypy/module/_sre/test/test_app_sre.py
@@ -94,6 +94,14 @@
         assert [("a", "l"), ("u", "s")] == re.findall("b(.)(.)", "abalbus")
         assert [("a", ""), ("s", "s")] == re.findall("b(a|(s))", "babs")
 
+    def test_findall_unicode(self):
+        import re
+        assert [u"\u1234"] == re.findall(u"\u1234", u"\u1000\u1234\u2000")
+        assert ["a", "u"] == re.findall("b(.)", "abalbus")
+        assert [("a", "l"), ("u", "s")] == re.findall("b(.)(.)", "abalbus")
+        assert [("a", ""), ("s", "s")] == re.findall("b(a|(s))", "babs")
+        assert [u"xyz"] == re.findall(u".*yz", u"xyz")
+
     def test_finditer(self):
         import re
         it = re.finditer("b(.)", "brabbel")
@@ -1046,3 +1054,14 @@
         import re
         raises(ValueError, re.split, '', '')
         re.split("a*", '')    # -> warning
+
+class AppTestUnicodeExtra:
+    def test_string_attribute(self):
+        import re
+        match = re.search(u"\u1234", u"\u1233\u1234\u1235")
+        assert match.string == u"\u1233\u1234\u1235"
+
+    def test_match_start(self):
+        import re
+        match = re.search(u"\u1234", u"\u1233\u1234\u1235")
+        assert match.start() == 1
diff --git a/pypy/module/cpyext/unicodeobject.py b/pypy/module/cpyext/unicodeobject.py
--- a/pypy/module/cpyext/unicodeobject.py
+++ b/pypy/module/cpyext/unicodeobject.py
@@ -15,6 +15,7 @@
 from pypy.module.cpyext.bytesobject import PyBytes_Check, PyBytes_FromObject
 from pypy.module._codecs.interp_codecs import (
     CodecState, latin_1_decode, utf_16_decode, utf_32_decode)
+from pypy.interpreter import unicodehelper
 from pypy.objspace.std import unicodeobject
 from rpython.rlib import rstring, runicode
 from rpython.tool.sourcetools import func_renamer
@@ -869,7 +870,7 @@
     else:
         errors = None
 
-    result, length, byteorder = runicode.str_decode_utf_32_helper(
+    result, length, byteorder = unicodehelper.str_decode_utf_32_helper(
         string, size, errors,
         True, # final ? false for multiple passes?
         None, # errorhandler
diff --git a/pypy/module/time/interp_time.py b/pypy/module/time/interp_time.py
--- a/pypy/module/time/interp_time.py
+++ b/pypy/module/time/interp_time.py
@@ -245,7 +245,7 @@
     LPDWORD = rwin32.LPDWORD
     _GetSystemTimeAdjustment = rwin32.winexternal(
                                             'GetSystemTimeAdjustment',
-                                            [LPDWORD, LPDWORD, rwin32.LPBOOL], 
+                                            [LPDWORD, LPDWORD, rwin32.LPBOOL],
                                             rffi.INT)
     def gettimeofday(space, w_info=None):
         with lltype.scoped_alloc(rwin32.FILETIME) as system_time:
@@ -270,7 +270,7 @@
                      lltype.scoped_alloc(rwin32.LPBOOL.TO, 1) as is_time_adjustment_disabled:
                     _GetSystemTimeAdjustment(time_adjustment, time_increment,
                                              is_time_adjustment_disabled)
-                    
+
                     _setinfo(space, w_info, "GetSystemTimeAsFileTime()",
                              time_increment[0] * 1e-7, False, True)
             return space.newfloat(tv_sec + tv_usec * 1e-6)
@@ -303,7 +303,7 @@
                           widen(t.c_millitm) * 0.001)
                 if w_info is not None:
                     _setinfo(space, w_info, "ftime()", 1e-3,
-                             False, True) 
+                             False, True)
             return space.newfloat(result)
         else:
             if w_info:
@@ -955,7 +955,7 @@
                                                  [rffi.CArrayPtr(lltype.SignedLongLong)],
                                                  rwin32.DWORD)
     QueryPerformanceFrequency = rwin32.winexternal(
-        'QueryPerformanceFrequency', [rffi.CArrayPtr(lltype.SignedLongLong)], 
+        'QueryPerformanceFrequency', [rffi.CArrayPtr(lltype.SignedLongLong)],
         rffi.INT)
     def win_perf_counter(space, w_info=None):
         with lltype.scoped_alloc(rffi.CArray(rffi.lltype.SignedLongLong), 1) as a:
diff --git a/pypy/module/time/test/test_time.py b/pypy/module/time/test/test_time.py
--- a/pypy/module/time/test/test_time.py
+++ b/pypy/module/time/test/test_time.py
@@ -19,6 +19,8 @@
         raises(TypeError, time.sleep, "foo")
         time.sleep(0.12345)
         raises(ValueError, time.sleep, -1.0)
+        raises(ValueError, time.sleep, float('nan'))
+        raises(OverflowError, time.sleep, float('inf'))
 
     def test_clock(self):
         import time
diff --git a/pypy/module/unicodedata/interp_ucd.py b/pypy/module/unicodedata/interp_ucd.py
--- a/pypy/module/unicodedata/interp_ucd.py
+++ b/pypy/module/unicodedata/interp_ucd.py
@@ -268,10 +268,10 @@
                 result[0] = ch
 
         if not composed: # If decomposed normalization we are done
-            return space.newunicode(u''.join([unichr(i) for i in result[:j]]))
+            return self.build(space, result, stop=j)
 
         if j <= 1:
-            return space.newunicode(u''.join([unichr(i) for i in result[:j]]))
+            return self.build(space, result, stop=j)
 
         current = result[0]
         starter_pos = 0
@@ -319,7 +319,10 @@
 
         result[starter_pos] = current
 
-        return space.newunicode(u''.join([unichr(i) for i in result[:next_insert]]))
+        return self.build(space, result, stop=next_insert)
+
+    def build(self, space, r, stop):
+        return space.newunicode(u''.join([unichr(i) for i in r[:stop]]))
 
 
 methods = {}
diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -57,6 +57,11 @@
         assert 'a' + 'b' == 'ab'
         raises(TypeError, operator.add, b'a', 'b')
 
+    def test_getitem(self):
+        assert u'abc'[2] == 'c'
+        raises(IndexError, u'abc'.__getitem__, 15)
+        assert u'g\u0105\u015b\u0107'[2] == u'\u015b'
+
     def test_join(self):
         def check(a, b):
             assert a == b
@@ -82,6 +87,8 @@
         assert '\n\n'.splitlines() == ['', '']
         assert 'a\nb\nc'.splitlines(1) == ['a\n', 'b\n', 'c']
         assert '\na\nb\n'.splitlines(1) == ['\n', 'a\n', 'b\n']
+        assert ((u'a' + b'\xc2\x85'.decode('utf8') + u'b\n').splitlines() ==
+                ['a', 'b'])
 
     def test_zfill(self):
         assert '123'.zfill(2) == '123'
@@ -128,55 +135,57 @@
         raises(ValueError, 'abc'.split, '')
         raises(ValueError, 'abc'.split, '')
         assert '   a b c d'.split(None, 0) == ['a b c d']
+        assert u'a\nb\u1680c'.split() == [u'a', u'b', u'c']
 
     def test_rsplit(self):
-        assert "".rsplit() == []
-        assert " ".rsplit() == []
-        assert "a".rsplit() == ['a']
-        assert "a".rsplit("a", 1) == ['', '']
-        assert " ".rsplit(" ", 1) == ['', '']
-        assert "aa".rsplit("a", 2) == ['', '', '']
-        assert " a ".rsplit() == ['a']
-        assert "a b c".rsplit() == ['a','b','c']
-        assert 'this is the rsplit function'.rsplit() == ['this', 'is', 'the', 'rsplit', 'function']
-        assert 'a|b|c|d'.rsplit('|') == ['a', 'b', 'c', 'd']
-        assert 'a|b|c|d'.rsplit('|') == ['a', 'b', 'c', 'd']
-        assert 'a|b|c|d'.rsplit('|') == ['a', 'b', 'c', 'd']
-        assert 'a|b|c|d'.rsplit('|', 2) == ['a|b', 'c', 'd']
-        assert 'a b c d'.rsplit(None, 1) == ['a b c', 'd']
-        assert 'a b c d'.rsplit(None, 2) == ['a b', 'c', 'd']
-        assert 'a b c d'.rsplit(None, 3) == ['a', 'b', 'c', 'd']
-        assert 'a b c d'.rsplit(None, 4) == ['a', 'b', 'c', 'd']
-        assert 'a b c d'.rsplit(None, 0) == ['a b c d']
-        assert 'a  b  c  d'.rsplit(None, 2) == ['a  b', 'c', 'd']
-        assert 'a b c d '.rsplit() == ['a', 'b', 'c', 'd']
-        assert 'a//b//c//d'.rsplit('//') == ['a', 'b', 'c', 'd']
-        assert 'endcase test'.rsplit('test') == ['endcase ', '']
-        raises(ValueError, 'abc'.rsplit, '')
-        raises(ValueError, 'abc'.rsplit, '')
-        raises(ValueError, 'abc'.rsplit, '')
-        assert '  a b c  '.rsplit(None, 0) == ['  a b c']
-        assert ''.rsplit('aaa') == ['']
+        assert u"".rsplit() == []
+        assert u" ".rsplit() == []
+        assert u"a".rsplit() == [u'a']
+        assert u"a".rsplit(u"a", 1) == [u'', u'']
+        assert u" ".rsplit(u" ", 1) == [u'', u'']
+        assert u"aa".rsplit(u"a", 2) == [u'', u'', u'']
+        assert u" a ".rsplit() == [u'a']
+        assert u"a b c".rsplit() == [u'a',u'b',u'c']
+        assert u'this is the rsplit function'.rsplit() == [u'this', u'is', u'the', u'rsplit', u'function']
+        assert u'a|b|c|d'.rsplit(u'|') == [u'a', u'b', u'c', u'd']
+        assert u'a|b|c|d'.rsplit('|') == [u'a', u'b', u'c', u'd']
+        assert 'a|b|c|d'.rsplit(u'|') == [u'a', u'b', u'c', u'd']
+        assert u'a|b|c|d'.rsplit(u'|', 2) == [u'a|b', u'c', u'd']
+        assert u'a b c d'.rsplit(None, 1) == [u'a b c', u'd']
+        assert u'a b c d'.rsplit(None, 2) == [u'a b', u'c', u'd']
+        assert u'a b c d'.rsplit(None, 3) == [u'a', u'b', u'c', u'd']
+        assert u'a b c d'.rsplit(None, 4) == [u'a', u'b', u'c', u'd']
+        assert u'a b c d'.rsplit(None, 0) == [u'a b c d']
+        assert u'a  b  c  d'.rsplit(None, 2) == [u'a  b', u'c', u'd']
+        assert u'a b c d '.rsplit() == [u'a', u'b', u'c', u'd']
+        assert u'a//b//c//d'.rsplit(u'//') == [u'a', u'b', u'c', u'd']
+        assert u'endcase test'.rsplit(u'test') == [u'endcase ', u'']
+        raises(ValueError, u'abc'.rsplit, u'')
+        raises(ValueError, u'abc'.rsplit, '')
+        raises(ValueError, 'abc'.rsplit, u'')
+        assert u'  a b c  '.rsplit(None, 0) == [u'  a b c']
+        assert u''.rsplit('aaa') == [u'']
+        assert u'a\nb\u1680c'.rsplit() == [u'a', u'b', u'c']
 
     def test_center(self):
-        s="a b"
-        assert s.center(0) == "a b"
-        assert s.center(1) == "a b"
-        assert s.center(2) == "a b"
-        assert s.center(3) == "a b"
-        assert s.center(4) == "a b "
-        assert s.center(5) == " a b "
-        assert s.center(6) == " a b  "
-        assert s.center(7) == "  a b  "
-        assert s.center(8) == "  a b   "
-        assert s.center(9) == "   a b   "
-        assert 'abc'.center(10) == '   abc    '
-        assert 'abc'.center(6) == ' abc  '
-        assert 'abc'.center(3) == 'abc'
-        assert 'abc'.center(2) == 'abc'
-        assert 'abc'.center(5, '*') == '*abc*'    # Python 2.4
-        assert 'abc'.center(5, '*') == '*abc*'     # Python 2.4
-        raises(TypeError, 'abc'.center, 4, 'cba')
+        s=u"a b"
+        assert s.center(0) == u"a b"
+        assert s.center(1) == u"a b"
+        assert s.center(2) == u"a b"
+        assert s.center(3) == u"a b"
+        assert s.center(4) == u"a b "
+        assert s.center(5) == u" a b "
+        assert s.center(6) == u" a b  "
+        assert s.center(7) == u"  a b  "
+        assert s.center(8) == u"  a b   "
+        assert s.center(9) == u"   a b   "
+        assert u'abc'.center(10) == u'   abc    '
+        assert u'abc'.center(6) == u' abc  '
+        assert u'abc'.center(3) == u'abc'
+        assert u'abc'.center(2) == u'abc'
+        assert u'abc'.center(5, u'*') == u'*abc*'    # Python 2.4
+        assert u'abc'.center(5, '*') == u'*abc*'     # Python 2.4
+        raises(TypeError, u'abc'.center, 4, u'cba')
 
     def test_title(self):
         assert "brown fox".title() == "Brown Fox"
@@ -186,23 +195,25 @@
         assert "bro!wn fox".title() == "Bro!Wn Fox"
         assert u'A\u03a3 \u1fa1xy'.title() == u'A\u03c2 \u1fa9xy'
         assert u'A\u03a3A'.title() == u'A\u03c3a'
+        assert u"brow\u4321n fox".title() == u"Brow\u4321N Fox"
+        assert u'\ud800'.title() == u'\ud800'
 
     def test_istitle(self):
-        assert "".istitle() == False
-        assert "!".istitle() == False
-        assert "!!".istitle() == False
-        assert "brown fox".istitle() == False
-        assert "!brown fox".istitle() == False
-        assert "bROWN fOX".istitle() == False
-        assert "Brown Fox".istitle() == True
-        assert "bro!wn fox".istitle() == False
-        assert "Bro!wn fox".istitle() == False
-        assert "!brown Fox".istitle() == False
-        assert "!Brown Fox".istitle() == True
-        assert "Brow&&&&N Fox".istitle() == True
-        assert "!Brow&&&&n Fox".istitle() == False
-        assert '\u1FFc'.istitle()
-        assert 'Greek \u1FFcitlecases ...'.istitle()
+        assert u"".istitle() == False
+        assert u"!".istitle() == False
+        assert u"!!".istitle() == False
+        assert u"brown fox".istitle() == False
+        assert u"!brown fox".istitle() == False
+        assert u"bROWN fOX".istitle() == False
+        assert u"Brown Fox".istitle() == True
+        assert u"bro!wn fox".istitle() == False
+        assert u"Bro!wn fox".istitle() == False
+        assert u"!brown Fox".istitle() == False
+        assert u"!Brown Fox".istitle() == True
+        assert u"Brow&&&&N Fox".istitle() == True
+        assert u"!Brow&&&&n Fox".istitle() == False
+        assert u'\u1FFc'.istitle()
+        assert u'Greek \u1FFcitlecases ...'.istitle()
 
     def test_islower_isupper_with_titlecase(self):
         # \u01c5 is a char which is neither lowercase nor uppercase, but
@@ -220,24 +231,36 @@
         assert "_!var".isidentifier() is False
         assert "3abc".isidentifier() is False
 
+    def test_lower_upper(self):
+        assert u'a'.lower() == u'a'
+        assert u'A'.lower() == u'a'
+        assert u'\u0105'.lower() == u'\u0105'
+        assert u'\u0104'.lower() == u'\u0105'
+        assert u'\ud800'.lower() == u'\ud800'
+        assert u'a'.upper() == u'A'
+        assert u'A'.upper() == u'A'
+        assert u'\u0105'.upper() == u'\u0104'
+        assert u'\u0104'.upper() == u'\u0104'
+        assert u'\ud800'.upper() == u'\ud800'
+
     def test_capitalize(self):
-        assert "brown fox".capitalize() == "Brown fox"
-        assert ' hello '.capitalize() == ' hello '
-        assert 'Hello '.capitalize() == 'Hello '
-        assert 'hello '.capitalize() == 'Hello '
-        assert 'aaaa'.capitalize() == 'Aaaa'
-        assert 'AaAa'.capitalize() == 'Aaaa'
+        assert u"brown fox".capitalize() == u"Brown fox"
+        assert u' hello '.capitalize() == u' hello '
+        assert u'Hello '.capitalize() == u'Hello '
+        assert u'hello '.capitalize() == u'Hello '
+        assert u'aaaa'.capitalize() == u'Aaaa'
+        assert u'AaAa'.capitalize() == u'Aaaa'
         # check that titlecased chars are lowered correctly
         # \u1ffc is the titlecased char
-        assert ('\u1ff3\u1ff3\u1ffc\u1ffc'.capitalize() ==
-                '\u03a9\u0399\u1ff3\u1ff3\u1ff3')
+        assert (u'\u1ff3\u1ff3\u1ffc\u1ffc'.capitalize() ==
+                u'\u03a9\u0399\u1ff3\u1ff3\u1ff3')
         # check with cased non-letter chars
-        assert ('\u24c5\u24ce\u24c9\u24bd\u24c4\u24c3'.capitalize() ==
-                '\u24c5\u24e8\u24e3\u24d7\u24de\u24dd')
-        assert ('\u24df\u24e8\u24e3\u24d7\u24de\u24dd'.capitalize() ==
-                '\u24c5\u24e8\u24e3\u24d7\u24de\u24dd')
-        assert '\u2160\u2161\u2162'.capitalize() == '\u2160\u2171\u2172'
-        assert '\u2170\u2171\u2172'.capitalize() == '\u2160\u2171\u2172'
+        assert (u'\u24c5\u24ce\u24c9\u24bd\u24c4\u24c3'.capitalize() ==
+                u'\u24c5\u24e8\u24e3\u24d7\u24de\u24dd')
+        assert (u'\u24df\u24e8\u24e3\u24d7\u24de\u24dd'.capitalize() ==
+                u'\u24c5\u24e8\u24e3\u24d7\u24de\u24dd')
+        assert u'\u2160\u2161\u2162'.capitalize() == u'\u2160\u2171\u2172'
+        assert u'\u2170\u2171\u2172'.capitalize() == u'\u2160\u2171\u2172'
         # check with Ll chars with no upper - nothing changes here
         assert ('\u019b\u1d00\u1d86\u0221\u1fb7'.capitalize() ==
                 '\u019b\u1d00\u1d86\u0221\u1fb7')
@@ -261,34 +284,36 @@
     def test_isprintable_wide(self):
         assert '\U0001F46F'.isprintable()  # Since unicode 6.0
         assert not '\U000E0020'.isprintable()
+        assert u'\ud800'.capitalize() == u'\ud800'
+        assert u'xx\ud800'.capitalize() == u'Xx\ud800'
 
     def test_rjust(self):
-        s = "abc"
+        s = u"abc"
         assert s.rjust(2) == s
         assert s.rjust(3) == s
-        assert s.rjust(4) == " " + s
-        assert s.rjust(5) == "  " + s
-        assert 'abc'.rjust(10) == '       abc'
-        assert 'abc'.rjust(6) == '   abc'
-        assert 'abc'.rjust(3) == 'abc'
-        assert 'abc'.rjust(2) == 'abc'
-        assert 'abc'.rjust(5, '*') == '**abc'    # Python 2.4
-        assert 'abc'.rjust(5, '*') == '**abc'     # Python 2.4
-        raises(TypeError, 'abc'.rjust, 5, 'xx')
+        assert s.rjust(4) == u" " + s
+        assert s.rjust(5) == u"  " + s
+        assert u'abc'.rjust(10) == u'       abc'
+        assert u'abc'.rjust(6) == u'   abc'
+        assert u'abc'.rjust(3) == u'abc'
+        assert u'abc'.rjust(2) == u'abc'
+        assert u'abc'.rjust(5, u'*') == u'**abc'    # Python 2.4
+        assert u'abc'.rjust(5, '*') == u'**abc'     # Python 2.4
+        raises(TypeError, u'abc'.rjust, 5, u'xx')
 
     def test_ljust(self):
-        s = "abc"
+        s = u"abc"
         assert s.ljust(2) == s
         assert s.ljust(3) == s
-        assert s.ljust(4) == s + " "
-        assert s.ljust(5) == s + "  "
-        assert 'abc'.ljust(10) == 'abc       '
-        assert 'abc'.ljust(6) == 'abc   '
-        assert 'abc'.ljust(3) == 'abc'
-        assert 'abc'.ljust(2) == 'abc'
-        assert 'abc'.ljust(5, '*') == 'abc**'    # Python 2.4
-        assert 'abc'.ljust(5, '*') == 'abc**'     # Python 2.4
-        raises(TypeError, 'abc'.ljust, 6, '')
+        assert s.ljust(4) == s + u" "
+        assert s.ljust(5) == s + u"  "
+        assert u'abc'.ljust(10) == u'abc       '
+        assert u'abc'.ljust(6) == u'abc   '
+        assert u'abc'.ljust(3) == u'abc'
+        assert u'abc'.ljust(2) == u'abc'
+        assert u'abc'.ljust(5, u'*') == u'abc**'    # Python 2.4
+        assert u'abc'.ljust(5, '*') == u'abc**'     # Python 2.4
+        raises(TypeError, u'abc'.ljust, 6, u'')
 
     def test_replace(self):
         assert 'one!two!three!'.replace('!', '@', 1) == 'one@two!three!'
@@ -300,6 +325,16 @@
         assert 'one!two!three!'.replace('!', '@') == 'one@two@three@'
         assert 'one!two!three!'.replace('x', '@') == 'one!two!three!'
         assert 'one!two!three!'.replace('x', '@', 2) == 'one!two!three!'
+        assert u'\u1234'.replace(u'', '-') == u'-\u1234-'
+        assert u'\u0234\u5678'.replace('', u'-') == u'-\u0234-\u5678-'
+        assert u'\u0234\u5678'.replace('', u'-', 0) == u'\u0234\u5678'
+        assert u'\u0234\u5678'.replace('', u'-', 1) == u'-\u0234\u5678'
+        assert u'\u0234\u5678'.replace('', u'-', 2) == u'-\u0234-\u5678'
+        assert u'\u0234\u5678'.replace('', u'-', 3) == u'-\u0234-\u5678-'
+        assert u'\u0234\u5678'.replace('', u'-', 4) == u'-\u0234-\u5678-'
+        assert u'\u0234\u5678'.replace('', u'-', 700) == u'-\u0234-\u5678-'
+        assert u'\u0234\u5678'.replace('', u'-', -1) == u'-\u0234-\u5678-'
+        assert u'\u0234\u5678'.replace('', u'-', -42) == u'-\u0234-\u5678-'
         assert 'abc'.replace('', '-') == '-a-b-c-'
         assert 'abc'.replace('', '-', 3) == '-a-b-c'
         assert 'abc'.replace('', '-', 0) == 'abc'
@@ -387,6 +422,9 @@
         assert ''.startswith('a') is False
         assert 'x'.startswith('xx') is False
         assert 'y'.startswith('xx') is False
+        assert u'\u1234\u5678\u4321'.startswith(u'\u1234') is True
+        assert u'\u1234\u5678\u4321'.startswith(u'\u1234\u4321') is False
+        assert u'\u1234'.startswith(u'') is True
 
     def test_startswith_more(self):
         assert 'ab'.startswith('a', 0) is True
@@ -533,7 +571,7 @@
         raises(TypeError, 'hello'.translate)
         raises(TypeError, 'abababc'.translate, 'abc', 'xyz')
 
-    def test_unicode_form_encoded_object(self):
+    def test_unicode_from_encoded_object(self):
         assert str(b'x', 'utf-8') == 'x'
         assert str(b'x', 'utf-8', 'strict') == 'x'
 
@@ -659,31 +697,31 @@
 
 
     def test_partition(self):
-        assert ('this is the par', 'ti', 'tion method') == \
-            'this is the partition method'.partition('ti')
+        assert (u'this is the par', u'ti', u'tion method') == \
+            u'this is the partition method'.partition(u'ti')
 
         # from raymond's original specification
-        S = 'http://www.python.org'
-        assert ('http', '://', 'www.python.org') == S.partition('://')
-        assert ('http://www.python.org', '', '') == S.partition('?')
-        assert ('', 'http://', 'www.python.org') == S.partition('http://')
-        assert ('http://www.python.', 'org', '') == S.partition('org')
+        S = u'http://www.python.org'
+        assert (u'http', u'://', u'www.python.org') == S.partition(u'://')
+        assert (u'http://www.python.org', u'', u'') == S.partition(u'?')
+        assert (u'', u'http://', u'www.python.org') == S.partition(u'http://')
+        assert (u'http://www.python.', u'org', u'') == S.partition(u'org')
 
-        raises(ValueError, S.partition, '')
+        raises(ValueError, S.partition, u'')
         raises(TypeError, S.partition, None)
 
     def test_rpartition(self):
-        assert ('this is the rparti', 'ti', 'on method') == \
-            'this is the rpartition method'.rpartition('ti')
+        assert (u'this is the rparti', u'ti', u'on method') == \
+            u'this is the rpartition method'.rpartition(u'ti')
 
         # from raymond's original specification
-        S = 'http://www.python.org'
-        assert ('http', '://', 'www.python.org') == S.rpartition('://')
-        assert ('', '', 'http://www.python.org') == S.rpartition('?')
-        assert ('', 'http://', 'www.python.org') == S.rpartition('http://')
-        assert ('http://www.python.', 'org', '') == S.rpartition('org')
+        S = u'http://www.python.org'
+        assert (u'http', u'://', u'www.python.org') == S.rpartition(u'://')
+        assert (u'', u'', u'http://www.python.org') == S.rpartition(u'?')
+        assert (u'', u'http://', u'www.python.org') == S.rpartition(u'http://')
+        assert (u'http://www.python.', u'org', u'') == S.rpartition(u'org')
 
-        raises(ValueError, S.rpartition, '')
+        raises(ValueError, S.rpartition, u'')
         raises(TypeError, S.rpartition, None)
 
     def test_mul(self):
@@ -706,6 +744,7 @@
     def test_index(self):
         assert "rrarrrrrrrrra".index('a', 4, None) == 12
         assert "rrarrrrrrrrra".index('a', None, 6) == 2
+        assert u"\u1234\u4321\u5678".index(u'\u5678', 1) == 2
 
     def test_rindex(self):
         from sys import maxsize
@@ -715,6 +754,7 @@
         assert 'abcdefghiabc'.rindex('abc', 0, -1) == 0
         assert 'abcdefghiabc'.rindex('abc', -4*maxsize, 4*maxsize) == 9
         assert 'rrarrrrrrrrra'.rindex('a', 4, None) == 12
+        assert u"\u1234\u5678".rindex(u'\u5678') == 1
 
         raises(ValueError, 'abcdefghiabc'.rindex, 'hib')
         raises(ValueError, 'defghiabc'.rindex, 'def', 1)
@@ -729,6 +769,7 @@
         assert 'abcdefghiabc'.rfind('') == 12
         assert 'abcdefghiabc'.rfind('abcd') == 0
         assert 'abcdefghiabc'.rfind('abcz') == -1
+        assert u"\u1234\u5678".rfind(u'\u5678') == 1
 
     def test_rfind_corner_case(self):
         assert 'abc'.rfind('', 4) == -1
@@ -802,17 +843,31 @@
         assert str(Y()).__class__ is X
 
     def test_getslice(self):
-        assert '123456'[1:5] == '2345'
-        s = "abc"
-        assert s[:] == "abc"
-        assert s[1:] == "bc"
-        assert s[:2] == "ab"
-        assert s[1:2] == "b"
-        assert s[-2:] == "bc"
-        assert s[:-1] == "ab"
-        assert s[-2:2] == "b"
-        assert s[1:-1] == "b"
-        assert s[-2:-1] == "b"
+        s = u"\u0105b\u0107"
+        assert s[:] == u"\u0105b\u0107"
+        assert s[1:] == u"b\u0107"
+        assert s[:2] == u"\u0105b"
+        assert s[1:2] == u"b"
+        assert s[-2:] == u"b\u0107"
+        assert s[:-1] == u"\u0105b"
+        assert s[-2:2] == u"b"
+        assert s[1:-1] == u"b"
+        assert s[-2:-1] == u"b"
+
+    def test_getitem_slice(self):
+        assert u'123456'.__getitem__(slice(1, 5)) == u'2345'
+        s = u"\u0105b\u0107"
+        assert s[slice(3)] == u"\u0105b\u0107"
+        assert s[slice(1, 3)] == u"b\u0107"
+        assert s[slice(2)] == u"\u0105b"
+        assert s[slice(1, 2)] == u"b"
+        assert s[slice(-2, 3)] == u"b\u0107"
+        assert s[slice(-1)] == u"\u0105b"
+        assert s[slice(-2, 2)] == u"b"
+        assert s[slice(1, -1)] == u"b"
+        assert s[slice(-2, -1)] == u"b"
+        assert u"abcde"[::2] == u"ace"
+        assert u"\u0105\u0106\u0107abcd"[::2] == u"\u0105\u0107bd"
 
     def test_iter(self):
         foo = "\u1111\u2222\u3333"
@@ -898,7 +953,7 @@
     def test_formatting_unicode__str__2(self):
         class A:
             def __str__(self):
-                return 'baz'
+                return u'baz'
 
         class B:
             def __str__(self):
@@ -913,12 +968,12 @@
         # "bah" is all I can say
         class X(object):
             def __repr__(self):
-                return '\u1234'
+                return u'\u1234'
         '%s' % X()
         #
         class X(object):
             def __str__(self):
-                return '\u1234'
+                return u'\u1234'
         '%s' % X()
 
     def test_formatting_unicode__repr__(self):
diff --git a/rpython/annotator/bookkeeper.py b/rpython/annotator/bookkeeper.py
--- a/rpython/annotator/bookkeeper.py
+++ b/rpython/annotator/bookkeeper.py
@@ -194,13 +194,14 @@
             listdef.generalize_range_step(flags['range_step'])
         return SomeList(listdef)
 
-    def getdictdef(self, is_r_dict=False, force_non_null=False):
+    def getdictdef(self, is_r_dict=False, force_non_null=False, simple_hash_eq=False):
         """Get the DictDef associated with the current position."""
         try:
             dictdef = self.dictdefs[self.position_key]
         except KeyError:
             dictdef = DictDef(self, is_r_dict=is_r_dict,
-                              force_non_null=force_non_null)
+                              force_non_null=force_non_null,
+                              simple_hash_eq=simple_hash_eq)
             self.dictdefs[self.position_key] = dictdef
         return dictdef
 
diff --git a/rpython/annotator/builtin.py b/rpython/annotator/builtin.py
--- a/rpython/annotator/builtin.py
+++ b/rpython/annotator/builtin.py
@@ -237,22 +237,30 @@
     return SomeInstance(clsdef)
 
 @analyzer_for(rpython.rlib.objectmodel.r_dict)
-def robjmodel_r_dict(s_eqfn, s_hashfn, s_force_non_null=None):
+def robjmodel_r_dict(s_eqfn, s_hashfn, s_force_non_null=None, s_simple_hash_eq=None):
+    return _r_dict_helper(SomeDict, s_eqfn, s_hashfn, s_force_non_null, s_simple_hash_eq)
+
+@analyzer_for(rpython.rlib.objectmodel.r_ordereddict)
+def robjmodel_r_ordereddict(s_eqfn, s_hashfn, s_force_non_null=None, s_simple_hash_eq=None):
+    return _r_dict_helper(SomeOrderedDict, s_eqfn, s_hashfn,
+                          s_force_non_null, s_simple_hash_eq)
+
+def _r_dict_helper(cls, s_eqfn, s_hashfn, s_force_non_null, s_simple_hash_eq):
     if s_force_non_null is None:
         force_non_null = False
     else:
         assert s_force_non_null.is_constant()
         force_non_null = s_force_non_null.const
+    if s_simple_hash_eq is None:
+        simple_hash_eq = False
+    else:
+        assert s_simple_hash_eq.is_constant()
+        simple_hash_eq = s_simple_hash_eq.const
     dictdef = getbookkeeper().getdictdef(is_r_dict=True,
-                                         force_non_null=force_non_null)
+                                         force_non_null=force_non_null,
+                                         simple_hash_eq=simple_hash_eq)
     dictdef.dictkey.update_rdict_annotations(s_eqfn, s_hashfn)
-    return SomeDict(dictdef)
-
-@analyzer_for(rpython.rlib.objectmodel.r_ordereddict)
-def robjmodel_r_ordereddict(s_eqfn, s_hashfn):
-    dictdef = getbookkeeper().getdictdef(is_r_dict=True)
-    dictdef.dictkey.update_rdict_annotations(s_eqfn, s_hashfn)
-    return SomeOrderedDict(dictdef)
+    return cls(dictdef)
 
 @analyzer_for(rpython.rlib.objectmodel.hlinvoke)
 def robjmodel_hlinvoke(s_repr, s_llcallable, *args_s):
diff --git a/rpython/annotator/dictdef.py b/rpython/annotator/dictdef.py
--- a/rpython/annotator/dictdef.py
+++ b/rpython/annotator/dictdef.py
@@ -81,12 +81,14 @@
     def __init__(self, bookkeeper, s_key = s_ImpossibleValue,
                                  s_value = s_ImpossibleValue,
                                is_r_dict = False,
-                           force_non_null = False):
+                           force_non_null = False,
+                           simple_hash_eq = False):
         self.dictkey = DictKey(bookkeeper, s_key, is_r_dict)
         self.dictkey.itemof[self] = True
         self.dictvalue = DictValue(bookkeeper, s_value)
         self.dictvalue.itemof[self] = True
         self.force_non_null = force_non_null
+        self.simple_hash_eq = simple_hash_eq
 
     def read_key(self, position_key):
         self.dictkey.read_locations.add(position_key)
diff --git a/rpython/jit/metainterp/typesystem.py b/rpython/jit/metainterp/typesystem.py
--- a/rpython/jit/metainterp/typesystem.py
+++ b/rpython/jit/metainterp/typesystem.py
@@ -106,11 +106,11 @@
     # It is an r_dict on lltype.  Two copies, to avoid conflicts with
     # the value type.  Note that NULL is not allowed as a key.
     def new_ref_dict(self):
-        return r_dict(rd_eq, rd_hash)
+        return r_dict(rd_eq, rd_hash, simple_hash_eq=True)
     def new_ref_dict_2(self):
-        return r_dict(rd_eq, rd_hash)
+        return r_dict(rd_eq, rd_hash, simple_hash_eq=True)
     def new_ref_dict_3(self):
-        return r_dict(rd_eq, rd_hash)
+        return r_dict(rd_eq, rd_hash, simple_hash_eq=True)
 
     def cast_vtable_to_hashable(self, cpu, ptr):
         adr = llmemory.cast_ptr_to_adr(ptr)
diff --git a/rpython/rlib/debug.py b/rpython/rlib/debug.py
--- a/rpython/rlib/debug.py
+++ b/rpython/rlib/debug.py
@@ -288,6 +288,9 @@
 def mark_dict_non_null(d):
     """ Mark dictionary as having non-null keys and values. A warning would
     be emitted (not an error!) in case annotation disagrees.
+
+    This doesn't work for r_dicts. For them, pass
+    r_dict(..., force_non_null=True) to the constructor.
     """
     assert isinstance(d, dict)
     return d
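
To illustrate the distinction the docstring above draws, here is a minimal untranslated sketch (the eq/hash helpers are invented for illustration and are not part of this changeset): ordinary dicts get the non-null hint via mark_dict_non_null(), while r_dicts get it via the constructor flag.

    from rpython.rlib.debug import mark_dict_non_null
    from rpython.rlib.objectmodel import r_dict

    def int_eq(a, b):      # illustrative helpers, not from this changeset
        return a == b

    def int_hash(a):
        return a

    plain = {}
    mark_dict_non_null(plain)             # hint for a plain dict
    custom = r_dict(int_eq, int_hash,
                    force_non_null=True)  # the r_dict spelling of the same hint
    plain[1] = "one"
    custom[2] = "two"
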
diff --git a/rpython/rlib/objectmodel.py b/rpython/rlib/objectmodel.py
--- a/rpython/rlib/objectmodel.py
+++ b/rpython/rlib/objectmodel.py
@@ -748,11 +748,19 @@
     def _newdict(self):
         return {}
 
-    def __init__(self, key_eq, key_hash, force_non_null=False):
+    def __init__(self, key_eq, key_hash, force_non_null=False, simple_hash_eq=False):
+        """ force_non_null=True means that the key can never be None (even if
+        the annotator things it could be)
+
+        simple_hash_eq=True means that the hash function is very fast, meaning 
it's
+        efficient enough that the dict does not have to store the hash per key.
+        It also implies that neither the hash nor the eq function will mutate
+        the dictionary. """
         self._dict = self._newdict()
         self.key_eq = key_eq
         self.key_hash = key_hash
         self.force_non_null = force_non_null
+        self.simple_hash_eq = simple_hash_eq
 
     def __getitem__(self, key):
         return self._dict[_r_dictkey(self, key)]
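
As a usage sketch of the new keyword (the string-keyed helpers below are invented for illustration; untranslated, r_dict behaves like a plain wrapper and the flag only matters once the rtyper lays out the dict):

    from rpython.rlib.objectmodel import r_dict

    def eq_first_char(s1, s2):   # cheap, side-effect-free comparison
        return s1[0] == s2[0]

    def hash_first_char(s):      # cheap, side-effect-free hash
        return ord(s[0])

    d = r_dict(eq_first_char, hash_first_char, simple_hash_eq=True)
    d["apple"] = 1
    assert d["avocado"] == 1     # equal under the custom eq/hash

When translated, the flag lets rordereddict.py drop the per-entry f_hash field, which is what the new test_r_dict_can_be_fast further down checks.
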
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -710,7 +710,7 @@
 
 
 # ____________________________________________________________
-# utf-32
+# utf-32 (not used in PyPy any more)
 
 def str_decode_utf_32(s, size, errors, final=True,
                       errorhandler=None):
diff --git a/rpython/rlib/test/test_objectmodel.py b/rpython/rlib/test/test_objectmodel.py
--- a/rpython/rlib/test/test_objectmodel.py
+++ b/rpython/rlib/test/test_objectmodel.py
@@ -330,6 +330,13 @@
         res = self.interpret(g, [3])
         assert res == 77
 
+    def test_r_dict_fast_functions(self):
+        def fn():
+            d1 = r_dict(strange_key_eq, strange_key_hash, simple_hash_eq=True)
+            return play_with_r_dict(d1)
+        res = self.interpret(fn, [])
+        assert res
+
     def test_prepare_dict_update(self):
         def g(n):
             d = {}
diff --git a/rpython/rtyper/lltypesystem/rdict.py b/rpython/rtyper/lltypesystem/rdict.py
--- a/rpython/rtyper/lltypesystem/rdict.py
+++ b/rpython/rtyper/lltypesystem/rdict.py
@@ -42,7 +42,8 @@
 class DictRepr(AbstractDictRepr):
 
     def __init__(self, rtyper, key_repr, value_repr, dictkey, dictvalue,
-                 custom_eq_hash=None, force_non_null=False):
+                 custom_eq_hash=None, force_non_null=False, fast_hash=False):
+        # fast_hash is ignored (only implemented in rordereddict.py)
         self.rtyper = rtyper
         self.DICT = lltype.GcForwardReference()
         self.lowleveltype = lltype.Ptr(self.DICT)
diff --git a/rpython/rtyper/lltypesystem/rordereddict.py b/rpython/rtyper/lltypesystem/rordereddict.py
--- a/rpython/rtyper/lltypesystem/rordereddict.py
+++ b/rpython/rtyper/lltypesystem/rordereddict.py
@@ -66,7 +66,7 @@
 
 def get_ll_dict(DICTKEY, DICTVALUE, get_custom_eq_hash=None, DICT=None,
                 ll_fasthash_function=None, ll_hash_function=None,
-                ll_eq_function=None, method_cache={},
+                ll_eq_function=None, method_cache={}, simple_hash_eq=False,
                 dummykeyobj=None, dummyvalueobj=None, rtyper=None):
     # get the actual DICT type. if DICT is None, it's created, otherwise
     # forward reference is becoming DICT
@@ -114,11 +114,14 @@
     # * the value
     entryfields.append(("value", DICTVALUE))
 
-    if ll_fasthash_function is None:
+    if simple_hash_eq:
+        assert get_custom_eq_hash is not None
+        entrymeths['entry_hash'] = ll_hash_custom_fast
+    elif ll_fasthash_function is None:
         entryfields.append(("f_hash", lltype.Signed))
-        entrymeths['hash'] = ll_hash_from_cache
+        entrymeths['entry_hash'] = ll_hash_from_cache
     else:
-        entrymeths['hash'] = ll_hash_recomputed
+        entrymeths['entry_hash'] = ll_hash_recomputed
         entrymeths['fasthashfn'] = ll_fasthash_function
 
     # Build the lltype data structures
@@ -140,7 +143,7 @@
             'keyeq':          ll_keyeq_custom,
             'r_rdict_eqfn':   r_rdict_eqfn,
             'r_rdict_hashfn': r_rdict_hashfn,
-            'paranoia':       True,
+            'paranoia':       not simple_hash_eq,
             }
     else:
         # figure out which functions must be used to hash and compare
@@ -167,13 +170,14 @@
 class OrderedDictRepr(AbstractDictRepr):
 
     def __init__(self, rtyper, key_repr, value_repr, dictkey, dictvalue,
-                 custom_eq_hash=None, force_non_null=False):
+                 custom_eq_hash=None, force_non_null=False, simple_hash_eq=False):
         #assert not force_non_null
         self.rtyper = rtyper
         self.finalized = False
         self.DICT = lltype.GcForwardReference()
         self.lowleveltype = lltype.Ptr(self.DICT)
         self.custom_eq_hash = custom_eq_hash is not None
+        self.simple_hash_eq = simple_hash_eq
         if not isinstance(key_repr, rmodel.Repr):  # not computed yet, done by setup()
             assert callable(key_repr)
             self._key_repr_computer = key_repr
@@ -211,6 +215,7 @@
                 self.r_rdict_eqfn, self.r_rdict_hashfn = (
                     self._custom_eq_hash_repr())
                 kwd['get_custom_eq_hash'] = self._custom_eq_hash_repr
+                kwd['simple_hash_eq'] = self.simple_hash_eq
             else:
                 kwd['ll_hash_function'] = self.key_repr.get_ll_hash_function()
                 kwd['ll_eq_function'] = self.key_repr.get_ll_eq_function()
@@ -600,15 +605,21 @@
     dummy = ENTRIES.dummy_obj.ll_dummy_value
     entries[i].value = dummy
 
-@signature(types.any(), types.int(), returns=types.any())
-def ll_hash_from_cache(entries, i):
+@signature(types.any(), types.any(), types.int(), returns=types.any())
+def ll_hash_from_cache(entries, d, i):
     return entries[i].f_hash
 
-@signature(types.any(), types.int(), returns=types.any())
-def ll_hash_recomputed(entries, i):
+@signature(types.any(), types.any(), types.int(), returns=types.any())
+def ll_hash_recomputed(entries, d, i):
     ENTRIES = lltype.typeOf(entries).TO
     return ENTRIES.fasthashfn(entries[i].key)
 
+@signature(types.any(), types.any(), types.int(), returns=types.any())
+def ll_hash_custom_fast(entries, d, i):
+    DICT = lltype.typeOf(d).TO
+    key = entries[i].key
+    return objectmodel.hlinvoke(DICT.r_rdict_hashfn, d.fnkeyhash, key)
+
 def ll_keyhash_custom(d, key):
     DICT = lltype.typeOf(d).TO
     return objectmodel.hlinvoke(DICT.r_rdict_hashfn, d.fnkeyhash, key)
@@ -962,22 +973,22 @@
     if fun == FUNC_BYTE:
         while i < ibound:
             if entries.valid(i):
-                ll_dict_store_clean(d, entries.hash(i), i, TYPE_BYTE)
+                ll_dict_store_clean(d, entries.entry_hash(d, i), i, TYPE_BYTE)
             i += 1
     elif fun == FUNC_SHORT:
         while i < ibound:
             if entries.valid(i):
-                ll_dict_store_clean(d, entries.hash(i), i, TYPE_SHORT)
+                ll_dict_store_clean(d, entries.entry_hash(d, i), i, TYPE_SHORT)
             i += 1
     elif IS_64BIT and fun == FUNC_INT:
         while i < ibound:
             if entries.valid(i):
-                ll_dict_store_clean(d, entries.hash(i), i, TYPE_INT)
+                ll_dict_store_clean(d, entries.entry_hash(d, i), i, TYPE_INT)
             i += 1
     elif fun == FUNC_LONG:
         while i < ibound:
             if entries.valid(i):
-                ll_dict_store_clean(d, entries.hash(i), i, TYPE_LONG)
+                ll_dict_store_clean(d, entries.entry_hash(d, i), i, TYPE_LONG)
             i += 1
     else:
         assert False
@@ -1015,7 +1026,7 @@
         checkingkey = entries[index - VALID_OFFSET].key
         if direct_compare and checkingkey == key:
             return index - VALID_OFFSET   # found the entry
-        if d.keyeq is not None and entries.hash(index - VALID_OFFSET) == hash:
+        if d.keyeq is not None and entries.entry_hash(d, index - VALID_OFFSET) == hash:
             # correct hash, maybe the key is e.g. a different pointer to
             # an equal object
             found = d.keyeq(checkingkey, key)
@@ -1056,7 +1067,7 @@
             checkingkey = entries[index - VALID_OFFSET].key
             if direct_compare and checkingkey == key:
                 return index - VALID_OFFSET   # found the entry
-            if d.keyeq is not None and entries.hash(index - VALID_OFFSET) == hash:
+            if d.keyeq is not None and entries.entry_hash(d, index - VALID_OFFSET) == hash:
                 # correct hash, maybe the key is e.g. a different pointer to
                 # an equal object
                 found = d.keyeq(checkingkey, key)
@@ -1305,14 +1316,14 @@
 def ll_dict_update(dic1, dic2):
     if dic1 == dic2:
         return
-    ll_ensure_indexes(dic2)    # needed for entries.hash() below
+    ll_ensure_indexes(dic2)    # needed for entries.entry_hash() below
     ll_prepare_dict_update(dic1, dic2.num_live_items)
     i = 0
     while i < dic2.num_ever_used_items:
         entries = dic2.entries
         if entries.valid(i):
             entry = entries[i]
-            hash = entries.hash(i)
+            hash = entries.entry_hash(dic2, i)
             key = entry.key
             value = entry.value
             index = dic1.lookup_function(dic1, key, hash, FLAG_STORE)
@@ -1413,7 +1424,7 @@
     r = lltype.malloc(ELEM.TO)
     r.item0 = recast(ELEM.TO.item0, entry.key)
     r.item1 = recast(ELEM.TO.item1, entry.value)
-    _ll_dict_del(dic, dic.entries.hash(i), i)
+    _ll_dict_del(dic, dic.entries.entry_hash(dic, i), i)
     return r
 
 def ll_dict_pop(dic, key):
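
The mechanical part of the rordereddict.py change is that entries.hash(i) becomes entries.entry_hash(d, i): the new fast-custom variant needs the dict itself in order to reach the user-supplied hash function, while the cached and recomputed variants simply ignore the extra argument. Roughly, ignoring lltype and hlinvoke details (the class names below are invented for this sketch; the fnkeyhash field name is taken from the code above):

    class _Entry(object):
        def __init__(self, key, f_hash=None):
            self.key = key
            self.f_hash = f_hash                  # only filled in the "cached" layout

    class _Dict(object):
        def __init__(self, fnkeyhash=None):
            self.fnkeyhash = fnkeyhash            # custom hash function, if any

    def entry_hash_from_cache(entries, d, i):
        return entries[i].f_hash                  # hash stored per entry; d unused

    def entry_hash_recomputed(entries, d, i):
        return hash(entries[i].key)               # key type hashes cheaply; d unused

    def entry_hash_custom_fast(entries, d, i):
        return d.fnkeyhash(entries[i].key)        # new case: hash fn lives on the dict

    entries = [_Entry("k", f_hash=123)]
    d = _Dict(fnkeyhash=lambda k: ord(k[0]))
    assert entry_hash_from_cache(entries, d, 0) == 123
    assert entry_hash_custom_fast(entries, d, 0) == ord("k")
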
diff --git a/rpython/rtyper/rbuiltin.py b/rpython/rtyper/rbuiltin.py
--- a/rpython/rtyper/rbuiltin.py
+++ b/rpython/rtyper/rbuiltin.py
@@ -717,9 +717,9 @@
 @typer_for(OrderedDict)
 @typer_for(objectmodel.r_dict)
 @typer_for(objectmodel.r_ordereddict)
-def rtype_dict_constructor(hop, i_force_non_null=None):
-    # 'i_force_non_null' is ignored here; if it has any effect, it
-    # has already been applied to 'hop.r_result'
+def rtype_dict_constructor(hop, i_force_non_null=None, i_simple_hash_eq=None):
+    # 'i_force_non_null' and 'i_simple_hash_eq' are ignored here; if they have any
+    # effect, it has already been applied to 'hop.r_result'
     hop.exception_cannot_occur()
     r_dict = hop.r_result
     cDICT = hop.inputconst(lltype.Void, r_dict.DICT)
diff --git a/rpython/rtyper/rdict.py b/rpython/rtyper/rdict.py
--- a/rpython/rtyper/rdict.py
+++ b/rpython/rtyper/rdict.py
@@ -15,6 +15,7 @@
         s_key = dictkey.s_value
         s_value = dictvalue.s_value
         force_non_null = self.dictdef.force_non_null
+        simple_hash_eq = self.dictdef.simple_hash_eq
         if dictkey.custom_eq_hash:
             custom_eq_hash = lambda: (rtyper.getrepr(dictkey.s_rdict_eqfn),
                                       rtyper.getrepr(dictkey.s_rdict_hashfn))
@@ -22,7 +23,7 @@
             custom_eq_hash = None
         return self.get_dict_repr()(rtyper, lambda: rtyper.getrepr(s_key),
                         lambda: rtyper.getrepr(s_value), dictkey, dictvalue,
-                        custom_eq_hash, force_non_null)
+                        custom_eq_hash, force_non_null, simple_hash_eq)
 
     def rtyper_makekey(self):
         self.dictdef.dictkey  .dont_change_any_more = True
@@ -89,7 +90,7 @@
                               resulttype=ENTRIES)
         # call the correct variant_*() method
         method = getattr(self, 'variant_' + self.variant)
-        return method(hop, ENTRIES, v_entries, v_index)
+        return method(hop, ENTRIES, v_entries, v_dict, v_index)
 
     def get_tuple_result(self, hop, items_v):
         # this allocates the tuple for the result, directly in the function
@@ -109,7 +110,7 @@
             hop.genop('setfield', [v_result, c_item, v_item])
         return v_result
 
-    def variant_keys(self, hop, ENTRIES, v_entries, v_index):
+    def variant_keys(self, hop, ENTRIES, v_entries, v_dict, v_index):
         KEY = ENTRIES.TO.OF.key
         c_key = hop.inputconst(lltype.Void, 'key')
         v_key = hop.genop('getinteriorfield', [v_entries, v_index, c_key],
@@ -118,30 +119,30 @@
 
     variant_reversed = variant_keys
 
-    def variant_values(self, hop, ENTRIES, v_entries, v_index):
+    def variant_values(self, hop, ENTRIES, v_entries, v_dict, v_index):
         VALUE = ENTRIES.TO.OF.value
         c_value = hop.inputconst(lltype.Void, 'value')
         v_value = hop.genop('getinteriorfield', [v_entries,v_index,c_value],
                             resulttype=VALUE)
         return self.r_dict.recast_value(hop.llops, v_value)
 
-    def variant_items(self, hop, ENTRIES, v_entries, v_index):
-        v_key = self.variant_keys(hop, ENTRIES, v_entries, v_index)
-        v_value = self.variant_values(hop, ENTRIES, v_entries, v_index)
+    def variant_items(self, hop, ENTRIES, v_entries, v_dict, v_index):
+        v_key = self.variant_keys(hop, ENTRIES, v_entries, v_dict, v_index)
+        v_value = self.variant_values(hop, ENTRIES, v_entries, v_dict, v_index)
         return self.get_tuple_result(hop, (v_key, v_value))
 
-    def variant_hashes(self, hop, ENTRIES, v_entries, v_index):
+    def variant_hashes(self, hop, ENTRIES, v_entries, v_dict, v_index):
         # there is not really a variant 'hashes', but this method is
         # convenient for the following variants
-        return hop.gendirectcall(ENTRIES.TO.hash, v_entries, v_index)
+        return hop.gendirectcall(ENTRIES.TO.entry_hash, v_entries, v_dict, v_index)
 
-    def variant_keys_with_hash(self, hop, ENTRIES, v_entries, v_index):
-        v_key = self.variant_keys(hop, ENTRIES, v_entries, v_index)
-        v_hash = self.variant_hashes(hop, ENTRIES, v_entries, v_index)
+    def variant_keys_with_hash(self, hop, ENTRIES, v_entries, v_dict, v_index):
+        v_key = self.variant_keys(hop, ENTRIES, v_entries, v_dict, v_index)
+        v_hash = self.variant_hashes(hop, ENTRIES, v_entries, v_dict, v_index)
         return self.get_tuple_result(hop, (v_key, v_hash))
 
-    def variant_items_with_hash(self, hop, ENTRIES, v_entries, v_index):
-        v_key = self.variant_keys(hop, ENTRIES, v_entries, v_index)
-        v_value = self.variant_values(hop, ENTRIES, v_entries, v_index)
-        v_hash = self.variant_hashes(hop, ENTRIES, v_entries, v_index)
+    def variant_items_with_hash(self, hop, ENTRIES, v_entries, v_dict, v_index):
+        v_key = self.variant_keys(hop, ENTRIES, v_entries, v_dict, v_index)
+        v_value = self.variant_values(hop, ENTRIES, v_entries, v_dict, v_index)
+        v_hash = self.variant_hashes(hop, ENTRIES, v_entries, v_dict, v_index)
         return self.get_tuple_result(hop, (v_key, v_value, v_hash))
diff --git a/rpython/rtyper/test/test_rdict.py b/rpython/rtyper/test/test_rdict.py
--- a/rpython/rtyper/test/test_rdict.py
+++ b/rpython/rtyper/test/test_rdict.py
@@ -538,6 +538,25 @@
         r_dict = rtyper.getrepr(s)
         assert not hasattr(r_dict.lowleveltype.TO.entries.TO.OF, "f_hash")
 
+    def test_r_dict_can_be_fast(self):
+        def myeq(n, m):
+            return n == m
+        def myhash(n):
+            return ~n
+        def f():
+            d = self.new_r_dict(myeq, myhash, simple_hash_eq=True)
+            d[5] = 7
+            d[12] = 19
+            return d
+
+        t = TranslationContext()
+        s = t.buildannotator().build_types(f, [])
+        rtyper = t.buildrtyper()
+        rtyper.specialize()
+
+        r_dict = rtyper.getrepr(s)
+        assert not hasattr(r_dict.lowleveltype.TO.entries.TO.OF, "f_hash")
+
     def test_tuple_dict(self):
         def f(i):
             d = self.newdict()
@@ -1000,8 +1019,8 @@
         return {}
 
     @staticmethod
-    def new_r_dict(myeq, myhash):
-        return r_dict(myeq, myhash)
+    def new_r_dict(myeq, myhash, force_non_null=False, simple_hash_eq=False):
+        return r_dict(myeq, myhash, force_non_null=force_non_null, simple_hash_eq=simple_hash_eq)
 
     def test_two_dicts_with_different_value_types(self):
         def func(i):
diff --git a/rpython/rtyper/test/test_rordereddict.py b/rpython/rtyper/test/test_rordereddict.py
--- a/rpython/rtyper/test/test_rordereddict.py
+++ b/rpython/rtyper/test/test_rordereddict.py
@@ -386,8 +386,10 @@
         return OrderedDict()
 
     @staticmethod
-    def new_r_dict(myeq, myhash):
-        return objectmodel.r_ordereddict(myeq, myhash)
+    def new_r_dict(myeq, myhash, force_non_null=False, simple_hash_eq=False):
+        return objectmodel.r_ordereddict(
+            myeq, myhash, force_non_null=force_non_null,
+            simple_hash_eq=simple_hash_eq)
 
     def test_two_dicts_with_different_value_types(self):
         def func(i):