Author: Maciej Fijalkowski <fij...@gmail.com>
Branch:
Changeset: r60527:53723bf32fd5
Date: 2013-01-27 20:02 +0200
http://bitbucket.org/pypy/pypy/changeset/53723bf32fd5/
Log: Help with unicode issues diff --git a/pypy/module/unicodedata/interp_ucd.py b/pypy/module/unicodedata/interp_ucd.py --- a/pypy/module/unicodedata/interp_ucd.py +++ b/pypy/module/unicodedata/interp_ucd.py @@ -9,7 +9,7 @@ from rpython.rlib.objectmodel import we_are_translated from rpython.rlib.runicode import MAXUNICODE from rpython.rlib.unicodedata import unicodedb_5_2_0, unicodedb_3_2_0 -from rpython.rlib.runicode import code_to_unichr, ORD +from rpython.rlib.runicode import code_to_unichr, ord_accepts_surrogate import sys @@ -28,8 +28,6 @@ # handling: on narrow unicode builds, a surrogate pair is considered as one # unicode code point. -# The functions below are subtly different from the ones in runicode.py. -# When PyPy implements Python 3 they should be merged. if MAXUNICODE > 0xFFFF: # Target is wide build @@ -41,7 +39,7 @@ if not we_are_translated() and sys.maxunicode == 0xFFFF: # Host CPython is narrow build, accept surrogates try: - return ORD(space.unicode_w(w_unichr)) + return ord_accepts_surrogate(space.unicode_w(w_unichr)) except ValueError: raise OperationError(space.w_TypeError, space.wrap( 'need a single Unicode character as parameter')) @@ -68,7 +66,7 @@ else: # Accept surrogates try: - return ORD(space.unicode_w(w_unichr)) + return ord_accepts_surrogate(space.unicode_w(w_unichr)) except ValueError: raise OperationError(space.w_TypeError, space.wrap( 'need a single Unicode character as parameter')) diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py --- a/rpython/rlib/runicode.py +++ b/rpython/rlib/runicode.py @@ -13,6 +13,26 @@ BYTEORDER = sys.byteorder +# python 2.7 has a preview of py3k behavior, so those functions +# are used either when we're testing wide pypy on narrow cpython +# or in unicodedata in pypy + +def unichr_returns_surrogate(c): + if c <= sys.maxunicode or c > MAXUNICODE: + return unichr(c) + else: + c -= 0x10000 + return (unichr(0xD800 + (c >> 10)) + + unichr(0xDC00 + (c & 0x03FF))) + +def 
ord_accepts_surrogate(u): + if isinstance(u, unicode) and len(u) == 2: + ch1 = ord(u[0]) + ch2 = ord(u[1]) + if 0xD800 <= ch1 <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF: + return (((ch1 - 0xD800) << 10) | (ch2 - 0xDC00)) + 0x10000 + return ord(u) + if MAXUNICODE > sys.maxunicode: # A version of unichr which allows codes outside the BMP # even on narrow unicode builds. @@ -21,12 +41,7 @@ # Note that Python3 uses a similar implementation. def UNICHR(c): assert not we_are_translated() - if c <= sys.maxunicode or c > MAXUNICODE: - return unichr(c) - else: - c -= 0x10000 - return (unichr(0xD800 + (c >> 10)) + - unichr(0xDC00 + (c & 0x03FF))) + return unichr_returns_surrogate(c) UNICHR._flowspace_rewrite_directly_as_ = unichr # ^^^ NB.: for translation, it's essential to use this hack instead # of calling unichr() from UNICHR(), because unichr() detects if there @@ -34,12 +49,7 @@ def ORD(u): assert not we_are_translated() - if isinstance(u, unicode) and len(u) == 2: - ch1 = ord(u[0]) - ch2 = ord(u[1]) - if 0xD800 <= ch1 <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF: - return (((ch1 - 0xD800) << 10) | (ch2 - 0xDC00)) + 0x10000 - return ord(u) + return ord_accepts_surrogate(u) ORD._flowspace_rewrite_directly_as_ = ord else: _______________________________________________ pypy-commit mailing list pypy-commit@python.org http://mail.python.org/mailman/listinfo/pypy-commit