Author: Tyler Wade <way...@gmail.com>
Branch: utf8-unicode2
Changeset: r72468:e70f582fd5dc
Date: 2014-07-17 01:43 -0500
http://bitbucket.org/pypy/pypy/changeset/e70f582fd5dc/

Log:    Fix _multibytecodec

diff --git a/pypy/interpreter/utf8.py b/pypy/interpreter/utf8.py
--- a/pypy/interpreter/utf8.py
+++ b/pypy/interpreter/utf8.py
@@ -2,9 +2,8 @@
 from rpython.rlib.objectmodel import specialize
 from rpython.rlib.runicode import utf8_code_length
 from rpython.rlib.unicodedata import unicodedb_5_2_0 as unicodedb
-from rpython.rlib.rarithmetic import r_uint
-from rpython.rtyper.lltypesystem import rffi
-from rpython.rtyper.lltypesystem import lltype
+from rpython.rlib.rarithmetic import r_uint, intmask
+from rpython.rtyper.lltypesystem import rffi, lltype
 
 wchar_rint = rffi.r_uint
 WCHAR_INTP = rffi.UINTP
@@ -464,7 +463,7 @@
             if rffi.sizeof(rffi.WCHAR_T) == 2:
                 if 0xD800 <= c <= 0xDBFF:
                     i += 1
-                    c2 = int(array[i])
+                    c2 = intmask(array[i])
                     if c2 == 0:
                         builder.append(c)
                         break
@@ -485,7 +484,7 @@
         builder = Utf8Builder()
         i = 0;
         while i < size:
-            c = int(array[i])
+            c = intmask(array[i])
             if c == 0:
                 break
 
@@ -513,7 +512,7 @@
         builder = Utf8Builder()
         i = 0;
         while i < size:
-            c = int(array[i])
+            c = intmask(array[i])
 
             if rffi.sizeof(rffi.WCHAR_T) == 2:
                 if i != size - 1 and 0xD800 <= c <= 0xDBFF:
diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -1,8 +1,9 @@
 import py
 from rpython.rtyper.lltypesystem import lltype, rffi
 from rpython.translator.tool.cbuild import ExternalCompilationInfo
+from pypy.interpreter.utf8 import Utf8Str
 
-UNICODE_REPLACEMENT_CHARACTER = u'\uFFFD'
+UNICODE_REPLACEMENT_CHARACTER = Utf8Str.from_unicode(u'\uFFFD')
 
 
 class EncodeDecodeError(Exception):
@@ -139,7 +140,7 @@
                                     errorcb, namecb, stringdata)
         src = pypy_cjk_dec_outbuf(decodebuf)
         length = pypy_cjk_dec_outlen(decodebuf)
-        return rffi.wcharpsize2unicode(src, length)
+        return Utf8Str.from_wcharpsize(src, length)
     #
     finally:
         rffi.free_nonmovingbuffer(stringdata, inbuf)
@@ -164,18 +165,18 @@
     if errors == "strict":
         raise EncodeDecodeError(start, end, reason)
     elif errors == "ignore":
-        replace = u""
+        replace = Utf8Str("")
     elif errors == "replace":
         replace = UNICODE_REPLACEMENT_CHARACTER
     else:
         assert errorcb
         replace, end = errorcb(errors, namecb, reason,
                                stringdata, start, end)
-    inbuf = rffi.get_nonmoving_unicodebuffer(replace)
+    inbuf = replace.copy_to_wcharp()
     try:
         r = pypy_cjk_dec_replace_on_error(decodebuf, inbuf, len(replace), end)
     finally:
-        rffi.free_nonmoving_unicodebuffer(replace, inbuf)
+        rffi.free_wcharp(inbuf)
     if r == MBERR_NOMEMORY:
         raise MemoryError
 
@@ -222,7 +223,7 @@
 def encodeex(encodebuf, unicodedata, errors="strict", errorcb=None,
              namecb=None, ignore_error=0):
     inleft = len(unicodedata)
-    inbuf = rffi.get_nonmoving_unicodebuffer(unicodedata)
+    inbuf = unicodedata.copy_to_wcharp()
     try:
         if pypy_cjk_enc_init(encodebuf, inbuf, inleft) < 0:
             raise MemoryError
@@ -247,7 +248,7 @@
         return rffi.charpsize2str(src, length)
     #
     finally:
-        rffi.free_nonmoving_unicodebuffer(unicodedata, inbuf)
+        rffi.free_wcharp(inbuf)
 
 def multibytecodec_encerror(encodebuf, e, errors,
                             errorcb, namecb, unicodedata):
@@ -273,7 +274,7 @@
     elif errors == "replace":
         codec = pypy_cjk_enc_getcodec(encodebuf)
         try:
-            replace = encode(codec, u"?")
+            replace = encode(codec, Utf8Str("?"))
         except EncodeDecodeError:
             replace = "?"
     else:
diff --git a/pypy/module/_multibytecodec/test/test_c_codecs.py b/pypy/module/_multibytecodec/test/test_c_codecs.py
--- a/pypy/module/_multibytecodec/test/test_c_codecs.py
+++ b/pypy/module/_multibytecodec/test/test_c_codecs.py
@@ -1,4 +1,5 @@
 import py
+from pypy.interpreter.utf8 import Utf8Str
 from pypy.module._multibytecodec.c_codecs import getcodec, codecs
 from pypy.module._multibytecodec.c_codecs import decode, encode
 from pypy.module._multibytecodec.c_codecs import EncodeDecodeError
@@ -95,37 +96,38 @@
 
 def test_encode_hz():
     c = getcodec("hz")
-    s = encode(c, u'foobar')
+    s = encode(c, Utf8Str('foobar'))
     assert s == 'foobar' and type(s) is str
-    s = encode(c, u'\u5f95\u6cef')
+    s = encode(c, Utf8Str.from_unicode(u'\u5f95\u6cef'))
     assert s == '~{abc}~}'
 
 def test_encode_hz_error():
     # error
     c = getcodec("hz")
-    e = py.test.raises(EncodeDecodeError, encode, c, u'abc\u1234def').value
+    e = py.test.raises(EncodeDecodeError, encode, c,
+                       Utf8Str.from_unicode(u'abc\u1234def')).value
     assert e.start == 3
     assert e.end == 4
     assert e.reason == "illegal multibyte sequence"
 
 def test_encode_hz_ignore():
     c = getcodec("hz")
-    s = encode(c, u'abc\u1234def', 'ignore')
+    s = encode(c, Utf8Str.from_unicode(u'abc\u1234def'), 'ignore')
     assert s == 'abcdef'
 
 def test_encode_hz_replace():
     c = getcodec("hz")
-    s = encode(c, u'abc\u1234def', 'replace')
+    s = encode(c, Utf8Str.from_unicode(u'abc\u1234def'), 'replace')
     assert s == 'abc?def'
 
 def test_encode_jisx0208():
     c = getcodec('iso2022_jp')
-    s = encode(c, u'\u83ca\u5730\u6642\u592b')
+    s = encode(c, Utf8Str.from_unicode(u'\u83ca\u5730\u6642\u592b'))
     assert s == '\x1b$B5FCO;~IW\x1b(B' and type(s) is str
 
 def test_encode_custom_error_handler_bytes():
     c = getcodec("hz")
     def errorhandler(errors, enc, msg, t, startingpos, endingpos):
         return None, '\xc3', endingpos
-    s = encode(c, u'abc\u1234def', 'foo', errorhandler)
+    s = encode(c, Utf8Str.from_unicode(u'abc\u1234def'), 'foo', errorhandler)
     assert '\xc3' in s
diff --git a/pypy/module/sys/vm.py b/pypy/module/sys/vm.py
--- a/pypy/module/sys/vm.py
+++ b/pypy/module/sys/vm.py
@@ -3,11 +3,11 @@
 """
 
 from rpython.rlib import jit
-from rpython.rlib.runicode import MAXUNICODE
 
 from pypy.interpreter import gateway
 from pypy.interpreter.error import OperationError
 from pypy.interpreter.gateway import unwrap_spec
+from pypy.interpreter.utf8_codecs import MAXUNICODE
 
 
 # ____________________________________________________________
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to