Author: Amaury Forgeot d'Arc <[email protected]>
Branch: py3k
Changeset: r60256:4b63836b7e97
Date: 2013-01-20 20:27 +0100
http://bitbucket.org/pypy/pypy/changeset/4b63836b7e97/
Log: hg merge default
diff --git a/pypy/module/unicodedata/interp_ucd.py b/pypy/module/unicodedata/interp_ucd.py
--- a/pypy/module/unicodedata/interp_ucd.py
+++ b/pypy/module/unicodedata/interp_ucd.py
@@ -9,6 +9,7 @@
from rpython.rlib.objectmodel import we_are_translated
from rpython.rlib.runicode import MAXUNICODE
from rpython.rlib.unicodedata import unicodedb_5_2_0, unicodedb_3_2_0
+from rpython.rlib.runicode import code_to_unichr, ORD
import sys
@@ -30,25 +31,6 @@
# The functions below are subtly different from the ones in runicode.py.
# When PyPy implements Python 3 they should be merged.
-def UNICHR(c):
- if c <= sys.maxunicode and c <= MAXUNICODE:
- return unichr(c)
- else:
- c -= 0x10000
- return (unichr(0xD800 + (c >> 10)) +
- unichr(0xDC00 + (c & 0x03FF)))
-
-def ORD(u):
- assert isinstance(u, unicode)
- if len(u) == 1:
- return ord(u[0])
- elif len(u) == 2:
- ch1 = ord(u[0])
- ch2 = ord(u[1])
- if 0xD800 <= ch1 <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF:
- return (((ch1 - 0xD800) << 10) | (ch2 - 0xDC00)) + 0x10000
- raise ValueError
-
if MAXUNICODE > 0xFFFF:
# Target is wide build
def unichr_to_code_w(space, w_unichr):
@@ -69,12 +51,6 @@
'need a single Unicode character as parameter'))
return space.int_w(space.ord(w_unichr))
- def code_to_unichr(code):
- if not we_are_translated() and sys.maxunicode == 0xFFFF:
- # Host CPython is narrow build, generate surrogates
- return UNICHR(code)
- else:
- return unichr(code)
else:
# Target is narrow build
def unichr_to_code_w(space, w_unichr):
@@ -97,10 +73,6 @@
raise OperationError(space.w_TypeError, space.wrap(
'need a single Unicode character as parameter'))
- def code_to_unichr(code):
- # generate surrogates for large codes
- return UNICHR(code)
-
class UCD(Wrappable):
def __init__(self, unicodedb):
diff --git a/pypy/module/unicodedata/test_unicodedata.py b/pypy/module/unicodedata/test_unicodedata.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/unicodedata/test_unicodedata.py
@@ -0,0 +1,103 @@
+
+class AppTestUnicodeData:
+ spaceconfig = dict(usemodules=('unicodedata',))
+
+ def test_hangul_syllables(self):
+ import unicodedata
+ # Test all leading, vowel and trailing jamo
+ # but not every combination of them.
+ for code, name in ((0xAC00, 'HANGUL SYLLABLE GA'),
+ (0xAE69, 'HANGUL SYLLABLE GGAEG'),
+ (0xB0D2, 'HANGUL SYLLABLE NYAGG'),
+ (0xB33B, 'HANGUL SYLLABLE DYAEGS'),
+ (0xB5A4, 'HANGUL SYLLABLE DDEON'),
+ (0xB80D, 'HANGUL SYLLABLE RENJ'),
+ (0xBA76, 'HANGUL SYLLABLE MYEONH'),
+ (0xBCDF, 'HANGUL SYLLABLE BYED'),
+ (0xBF48, 'HANGUL SYLLABLE BBOL'),
+ (0xC1B1, 'HANGUL SYLLABLE SWALG'),
+ (0xC41A, 'HANGUL SYLLABLE SSWAELM'),
+ (0xC683, 'HANGUL SYLLABLE OELB'),
+ (0xC8EC, 'HANGUL SYLLABLE JYOLS'),
+ (0xCB55, 'HANGUL SYLLABLE JJULT'),
+ (0xCDBE, 'HANGUL SYLLABLE CWEOLP'),
+ (0xD027, 'HANGUL SYLLABLE KWELH'),
+ (0xD290, 'HANGUL SYLLABLE TWIM'),
+ (0xD4F9, 'HANGUL SYLLABLE PYUB'),
+ (0xD762, 'HANGUL SYLLABLE HEUBS'),
+ (0xAE27, 'HANGUL SYLLABLE GYIS'),
+ (0xB090, 'HANGUL SYLLABLE GGISS'),
+ (0xB0AD, 'HANGUL SYLLABLE NANG'),
+ (0xB316, 'HANGUL SYLLABLE DAEJ'),
+ (0xB57F, 'HANGUL SYLLABLE DDYAC'),
+ (0xB7E8, 'HANGUL SYLLABLE RYAEK'),
+ (0xBA51, 'HANGUL SYLLABLE MEOT'),
+ (0xBCBA, 'HANGUL SYLLABLE BEP'),
+ (0xBF23, 'HANGUL SYLLABLE BBYEOH'),
+ (0xD7A3, 'HANGUL SYLLABLE HIH')):
+ assert unicodedata.name(chr(code)) == name
+ assert unicodedata.lookup(name) == chr(code)
+ # Test outside the range
+ raises(ValueError, unicodedata.name, chr(0xAC00 - 1))
+ raises(ValueError, unicodedata.name, chr(0xD7A3 + 1))
+
+ def test_cjk(self):
+ import sys
+ import unicodedata
+ cases = ((0x3400, 0x4DB5),
+ (0x4E00, 0x9FA5))
+ if unicodedata.unidata_version >= "5": # don't know the exact limit
+ cases = ((0x3400, 0x4DB5),
+ (0x4E00, 0x9FCB),
+ (0x20000, 0x2A6D6),
+ (0x2A700, 0x2B734))
+ elif unicodedata.unidata_version >= "4.1":
+ cases = ((0x3400, 0x4DB5),
+ (0x4E00, 0x9FBB),
+ (0x20000, 0x2A6D6))
+ for first, last in cases:
+ # Test at and inside the boundary
+ for i in (first, first + 1, last - 1, last):
+ charname = 'CJK UNIFIED IDEOGRAPH-%X'%i
+ char = chr(i)
+ assert unicodedata.name(char) == charname
+ assert unicodedata.lookup(charname) == char
+ # Test outside the boundary
+ for i in first - 1, last + 1:
+ charname = 'CJK UNIFIED IDEOGRAPH-%X'%i
+ char = chr(i)
+ try:
+ unicodedata.name(char)
+ except ValueError as e:
+ assert e.message == 'no such name'
+ raises(KeyError, unicodedata.lookup, charname)
+
+ def test_bug_1704793(self): # from CPython
+ import unicodedata
+ assert unicodedata.lookup("GOTHIC LETTER FAIHU") == '\U00010346'
+
+ def test_normalize(self):
+ import unicodedata
+ raises(TypeError, unicodedata.normalize, 'x')
+
+ def test_normalize_wide(self):
+ import sys, unicodedata
+ if sys.maxunicode < 0x10ffff:
+ skip("requires a 'wide' python build.")
+ assert unicodedata.normalize('NFC', '\U000110a5\U000110ba') == '\U000110ab'
+
+ def test_linebreaks(self):
+ linebreaks = (0x0a, 0x0b, 0x0c, 0x0d, 0x85,
+ 0x1c, 0x1d, 0x1e, 0x2028, 0x2029)
+ for i in linebreaks:
+ for j in range(-2, 3):
+ lines = (chr(i + j) + 'A').splitlines()
+ if i + j in linebreaks:
+ assert len(lines) == 2
+ else:
+ assert len(lines) == 1
+
+ def test_mirrored(self):
+ import unicodedata
+ # For no reason, unicodedata.mirrored() returns an int, not a bool
+ assert repr(unicodedata.mirrored(' ')) == '0'
diff --git a/rpython/bin/rpython b/rpython/bin/rpython
--- a/rpython/bin/rpython
+++ b/rpython/bin/rpython
@@ -7,7 +7,8 @@
run with --help for more information
"""
-import sys
+import sys, os
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
from rpython.translator.goal.translate import main
# no implicit targets
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -1,15 +1,16 @@
import sys
-from rpython.rlib.bitmanipulation import splitter
-from rpython.rtyper.lltypesystem import lltype, rffi
-from rpython.rlib.objectmodel import we_are_translated, specialize, enforceargs
+from rpython.rlib.objectmodel import specialize, we_are_translated
from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
from rpython.rlib.rarithmetic import r_uint, intmask
from rpython.rlib.unicodedata import unicodedb
+from rpython.rtyper.lltypesystem import lltype, rffi
+
if rffi.sizeof(lltype.UniChar) == 4:
MAXUNICODE = 0x10ffff
else:
MAXUNICODE = 0xffff
+
BYTEORDER = sys.byteorder
if MAXUNICODE > sys.maxunicode:
@@ -45,6 +46,27 @@
UNICHR = unichr
ORD = ord
+if MAXUNICODE > 0xFFFF:
+ def code_to_unichr(code):
+ if not we_are_translated() and sys.maxunicode == 0xFFFF:
+ # Host CPython is narrow build, generate surrogates
+ return UNICHR(code)
+ else:
+ return unichr(code)
+else:
+ def code_to_unichr(code):
+ # generate surrogates for large codes
+ return UNICHR(code)
+
+def _STORECHAR(result, CH, byteorder):
+ hi = chr(((CH) >> 8) & 0xff)
+ lo = chr((CH) & 0xff)
+ if byteorder == 'little':
+ result.append(lo)
+ result.append(hi)
+ else:
+ result.append(hi)
+ result.append(lo)
def default_unicode_error_decode(errors, encoding, msg, s,
startingpos, endingpos):
@@ -446,16 +468,6 @@
result.append(r)
return result.build(), pos, bo
-def _STORECHAR(result, CH, byteorder):
- hi = chr(((CH) >> 8) & 0xff)
- lo = chr((CH) & 0xff)
- if byteorder == 'little':
- result.append(lo)
- result.append(hi)
- else:
- result.append(hi)
- result.append(lo)
-
def unicode_encode_utf_16_helper(s, size, errors,
errorhandler=None,
byteorder='little'):
diff --git a/pypy/module/unicodedata/test_interp_ucd.py b/rpython/rlib/unicodedata/test/test_ucd.py
rename from pypy/module/unicodedata/test_interp_ucd.py
rename to rpython/rlib/unicodedata/test/test_ucd.py
--- a/pypy/module/unicodedata/test_interp_ucd.py
+++ b/rpython/rlib/unicodedata/test/test_ucd.py
@@ -1,6 +1,6 @@
from rpython.rtyper.test.tool import BaseRtypingTest, LLRtypeMixin
from rpython.rlib.unicodedata import unicodedb_5_2_0
-from pypy.module.unicodedata.interp_ucd import code_to_unichr
+from rpython.rlib.unicodedata.ucd import code_to_unichr
class TestTranslated(BaseRtypingTest, LLRtypeMixin):
diff --git a/rpython/rlib/unicodedata/test/test_unicodedata.py b/rpython/rlib/unicodedata/test/test_unicodedata.py
--- a/rpython/rlib/unicodedata/test/test_unicodedata.py
+++ b/rpython/rlib/unicodedata/test/test_unicodedata.py
@@ -1,109 +1,6 @@
import py
from rpython.rlib.unicodedata import unicodedb_3_2_0, unicodedb_5_2_0
-class AppTestUnicodeData:
- spaceconfig = dict(usemodules=('unicodedata',))
-
- def test_hangul_syllables(self):
- import unicodedata
- # Test all leading, vowel and trailing jamo
- # but not every combination of them.
- for code, name in ((0xAC00, 'HANGUL SYLLABLE GA'),
- (0xAE69, 'HANGUL SYLLABLE GGAEG'),
- (0xB0D2, 'HANGUL SYLLABLE NYAGG'),
- (0xB33B, 'HANGUL SYLLABLE DYAEGS'),
- (0xB5A4, 'HANGUL SYLLABLE DDEON'),
- (0xB80D, 'HANGUL SYLLABLE RENJ'),
- (0xBA76, 'HANGUL SYLLABLE MYEONH'),
- (0xBCDF, 'HANGUL SYLLABLE BYED'),
- (0xBF48, 'HANGUL SYLLABLE BBOL'),
- (0xC1B1, 'HANGUL SYLLABLE SWALG'),
- (0xC41A, 'HANGUL SYLLABLE SSWAELM'),
- (0xC683, 'HANGUL SYLLABLE OELB'),
- (0xC8EC, 'HANGUL SYLLABLE JYOLS'),
- (0xCB55, 'HANGUL SYLLABLE JJULT'),
- (0xCDBE, 'HANGUL SYLLABLE CWEOLP'),
- (0xD027, 'HANGUL SYLLABLE KWELH'),
- (0xD290, 'HANGUL SYLLABLE TWIM'),
- (0xD4F9, 'HANGUL SYLLABLE PYUB'),
- (0xD762, 'HANGUL SYLLABLE HEUBS'),
- (0xAE27, 'HANGUL SYLLABLE GYIS'),
- (0xB090, 'HANGUL SYLLABLE GGISS'),
- (0xB0AD, 'HANGUL SYLLABLE NANG'),
- (0xB316, 'HANGUL SYLLABLE DAEJ'),
- (0xB57F, 'HANGUL SYLLABLE DDYAC'),
- (0xB7E8, 'HANGUL SYLLABLE RYAEK'),
- (0xBA51, 'HANGUL SYLLABLE MEOT'),
- (0xBCBA, 'HANGUL SYLLABLE BEP'),
- (0xBF23, 'HANGUL SYLLABLE BBYEOH'),
- (0xD7A3, 'HANGUL SYLLABLE HIH')):
- assert unicodedata.name(chr(code)) == name
- assert unicodedata.lookup(name) == chr(code)
- # Test outside the range
- py.test.raises(ValueError, unicodedata.name, chr(0xAC00 - 1))
- py.test.raises(ValueError, unicodedata.name, chr(0xD7A3 + 1))
-
- def test_cjk(self):
- import sys
- import unicodedata
- cases = ((0x3400, 0x4DB5),
- (0x4E00, 0x9FA5))
- if unicodedata.unidata_version >= "5": # don't know the exact limit
- cases = ((0x3400, 0x4DB5),
- (0x4E00, 0x9FCB),
- (0x20000, 0x2A6D6),
- (0x2A700, 0x2B734))
- elif unicodedata.unidata_version >= "4.1":
- cases = ((0x3400, 0x4DB5),
- (0x4E00, 0x9FBB),
- (0x20000, 0x2A6D6))
- for first, last in cases:
- # Test at and inside the boundary
- for i in (first, first + 1, last - 1, last):
- charname = 'CJK UNIFIED IDEOGRAPH-%X'%i
- char = chr(i)
- assert unicodedata.name(char) == charname
- assert unicodedata.lookup(charname) == char
- # Test outside the boundary
- for i in first - 1, last + 1:
- charname = 'CJK UNIFIED IDEOGRAPH-%X'%i
- char = chr(i)
- try:
- unicodedata.name(char)
- except ValueError as e:
- assert e.message == 'no such name'
- py.test.raises(KeyError, unicodedata.lookup, charname)
-
- def test_bug_1704793(self): # from CPython
- import unicodedata
- assert unicodedata.lookup("GOTHIC LETTER FAIHU") == '\U00010346'
-
- def test_normalize(self):
- import unicodedata
- py.test.raises(TypeError, unicodedata.normalize, 'x')
-
- def test_normalize_wide(self):
- import sys, unicodedata
- if sys.maxunicode < 0x10ffff:
- skip("requires a 'wide' python build.")
- assert unicodedata.normalize('NFC', '\U000110a5\U000110ba') == '\U000110ab'
-
- def test_linebreaks(self):
- linebreaks = (0x0a, 0x0b, 0x0c, 0x0d, 0x85,
- 0x1c, 0x1d, 0x1e, 0x2028, 0x2029)
- for i in linebreaks:
- for j in range(-2, 3):
- lines = (chr(i + j) + 'A').splitlines()
- if i + j in linebreaks:
- assert len(lines) == 2
- else:
- assert len(lines) == 1
-
- def test_mirrored(self):
- import unicodedata
- # For no reason, unicodedata.mirrored() returns an int, not a bool
- assert repr(unicodedata.mirrored(' ')) == '0'
-
class TestUnicodeData(object):
def setup_class(cls):
import random, unicodedata
_______________________________________________
pypy-commit mailing list
[email protected]
http://mail.python.org/mailman/listinfo/pypy-commit