Author: Maciej Fijalkowski <fij...@gmail.com>
Branch: 
Changeset: r60527:53723bf32fd5
Date: 2013-01-27 20:02 +0200
http://bitbucket.org/pypy/pypy/changeset/53723bf32fd5/

Log:    Help with unicode issues

diff --git a/pypy/module/unicodedata/interp_ucd.py b/pypy/module/unicodedata/interp_ucd.py
--- a/pypy/module/unicodedata/interp_ucd.py
+++ b/pypy/module/unicodedata/interp_ucd.py
@@ -9,7 +9,7 @@
 from rpython.rlib.objectmodel import we_are_translated
 from rpython.rlib.runicode import MAXUNICODE
 from rpython.rlib.unicodedata import unicodedb_5_2_0, unicodedb_3_2_0
-from rpython.rlib.runicode import code_to_unichr, ORD
+from rpython.rlib.runicode import code_to_unichr, ord_accepts_surrogate
 import sys
 
 
@@ -28,8 +28,6 @@
 # handling: on narrow unicode builds, a surrogate pair is considered as one
 # unicode code point.
 
-# The functions below are subtly different from the ones in runicode.py.
-# When PyPy implements Python 3 they should be merged.
 
 if MAXUNICODE > 0xFFFF:
     # Target is wide build
@@ -41,7 +39,7 @@
         if not we_are_translated() and sys.maxunicode == 0xFFFF:
             # Host CPython is narrow build, accept surrogates
             try:
-                return ORD(space.unicode_w(w_unichr))
+                return ord_accepts_surrogate(space.unicode_w(w_unichr))
             except ValueError:
                 raise OperationError(space.w_TypeError, space.wrap(
                     'need a single Unicode character as parameter'))
@@ -68,7 +66,7 @@
         else:
             # Accept surrogates
             try:
-                return ORD(space.unicode_w(w_unichr))
+                return ord_accepts_surrogate(space.unicode_w(w_unichr))
             except ValueError:
                 raise OperationError(space.w_TypeError, space.wrap(
                     'need a single Unicode character as parameter'))
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -13,6 +13,26 @@
 
 BYTEORDER = sys.byteorder
 
+# Python 2.7 has a preview of py3k behavior, so these functions
+# are used either when we're testing a wide PyPy on a narrow CPython
+# or in PyPy's unicodedata module.
+
+def unichr_returns_surrogate(c):
+    if c <= sys.maxunicode or c > MAXUNICODE:
+        return unichr(c)
+    else:
+        c -= 0x10000
+        return (unichr(0xD800 + (c >> 10)) +
+                unichr(0xDC00 + (c & 0x03FF)))
+
+def ord_accepts_surrogate(u):
+    if isinstance(u, unicode) and len(u) == 2:
+        ch1 = ord(u[0])
+        ch2 = ord(u[1])
+        if 0xD800 <= ch1 <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF:
+            return (((ch1 - 0xD800) << 10) | (ch2 - 0xDC00)) + 0x10000
+    return ord(u)
+
 if MAXUNICODE > sys.maxunicode:
     # A version of unichr which allows codes outside the BMP
     # even on narrow unicode builds.
@@ -21,12 +41,7 @@
     # Note that Python3 uses a similar implementation.
     def UNICHR(c):
         assert not we_are_translated()
-        if c <= sys.maxunicode or c > MAXUNICODE:
-            return unichr(c)
-        else:
-            c -= 0x10000
-            return (unichr(0xD800 + (c >> 10)) +
-                    unichr(0xDC00 + (c & 0x03FF)))
+        return unichr_returns_surrogate(c)
     UNICHR._flowspace_rewrite_directly_as_ = unichr
     # ^^^ NB.: for translation, it's essential to use this hack instead
     # of calling unichr() from UNICHR(), because unichr() detects if there
@@ -34,12 +49,7 @@
 
     def ORD(u):
         assert not we_are_translated()
-        if isinstance(u, unicode) and len(u) == 2:
-            ch1 = ord(u[0])
-            ch2 = ord(u[1])
-            if 0xD800 <= ch1 <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF:
-                return (((ch1 - 0xD800) << 10) | (ch2 - 0xDC00)) + 0x10000
-        return ord(u)
+        return ord_accepts_surrogate(u)
     ORD._flowspace_rewrite_directly_as_ = ord
 
 else:
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
http://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to