Author: Armin Rigo <[email protected]>
Branch:
Changeset: r44707:4ad72b733e1f
Date: 2011-06-05 10:52 +0200
http://bitbucket.org/pypy/pypy/changeset/4ad72b733e1f/
Log: decode(errors="ignore") at the C level
diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -103,8 +103,10 @@
[DECODEBUF_P], rffi.SSIZE_T)
pypy_cjk_dec_inbuf_consumed = llexternal('pypy_cjk_dec_inbuf_consumed',
[DECODEBUF_P], rffi.SSIZE_T)
+pypy_cjk_dec_inbuf_add = llexternal('pypy_cjk_dec_inbuf_add',
+ [DECODEBUF_P, rffi.SSIZE_T], lltype.Void)
-def decode(codec, stringdata):
+def decode(codec, stringdata, errors="strict"):
inleft = len(stringdata)
inbuf = rffi.get_nonmovingbuffer(stringdata)
try:
@@ -112,10 +114,11 @@
if not decodebuf:
raise MemoryError
try:
- r = pypy_cjk_dec_chunk(decodebuf)
- if r != 0:
- multibytecodec_decerror(decodebuf, r)
- assert False
+ while True:
+ r = pypy_cjk_dec_chunk(decodebuf)
+ if r == 0:
+ break
+ multibytecodec_decerror(decodebuf, r, errors)
src = pypy_cjk_dec_outbuf(decodebuf)
length = pypy_cjk_dec_outlen(decodebuf)
return rffi.wcharpsize2unicode(src, length)
@@ -126,7 +129,7 @@
finally:
rffi.free_nonmovingbuffer(stringdata, inbuf)
-def multibytecodec_decerror(decodebuf, e):
+def multibytecodec_decerror(decodebuf, e, errors):
if e > 0:
reason = "illegal multibyte sequence"
esize = e
@@ -139,7 +142,9 @@
raise RuntimeError
#
# if errors == ERROR_REPLACE:...
- # if errors == ERROR_IGNORE or errors == ERROR_REPLACE:...
+ if errors == "ignore": # or errors == ERROR_REPLACE
+ pypy_cjk_dec_inbuf_add(decodebuf, esize)
+ return # continue decoding
start = pypy_cjk_dec_inbuf_consumed(decodebuf)
end = start + esize
if 1: # errors == ERROR_STRICT:
diff --git a/pypy/module/_multibytecodec/test/test_c_codecs.py b/pypy/module/_multibytecodec/test/test_c_codecs.py
--- a/pypy/module/_multibytecodec/test/test_c_codecs.py
+++ b/pypy/module/_multibytecodec/test/test_c_codecs.py
@@ -36,6 +36,11 @@
assert e.end == 4
assert e.reason == "illegal multibyte sequence"
+def test_decode_hz_ignore():
+ c = getcodec("hz")
+ u = decode(c, 'def~{}abc', 'ignore')
+ assert u == u'def\u5fcf'
+
def test_encode_hz():
c = getcodec("hz")
s = encode(c, u'foobar')
diff --git a/pypy/translator/c/src/cjkcodecs/multibytecodec.c b/pypy/translator/c/src/cjkcodecs/multibytecodec.c
--- a/pypy/translator/c/src/cjkcodecs/multibytecodec.c
+++ b/pypy/translator/c/src/cjkcodecs/multibytecodec.c
@@ -93,6 +93,11 @@
return d->inbuf - d->inbuf_start;
}
+void pypy_cjk_dec_inbuf_add(struct pypy_cjk_dec_s* d, Py_ssize_t skip)
+{
+ d->inbuf += skip;
+}
+
/************************************************************/
struct pypy_cjk_enc_s *pypy_cjk_enc_init(const MultibyteCodec *codec,
diff --git a/pypy/translator/c/src/cjkcodecs/multibytecodec.h b/pypy/translator/c/src/cjkcodecs/multibytecodec.h
--- a/pypy/translator/c/src/cjkcodecs/multibytecodec.h
+++ b/pypy/translator/c/src/cjkcodecs/multibytecodec.h
@@ -102,6 +102,7 @@
Py_ssize_t pypy_cjk_dec_outlen(struct pypy_cjk_dec_s *);
Py_ssize_t pypy_cjk_dec_inbuf_remaining(struct pypy_cjk_dec_s *d);
Py_ssize_t pypy_cjk_dec_inbuf_consumed(struct pypy_cjk_dec_s* d);
+void pypy_cjk_dec_inbuf_add(struct pypy_cjk_dec_s*, Py_ssize_t);
struct pypy_cjk_enc_s {
const MultibyteCodec *codec;
_______________________________________________
pypy-commit mailing list
[email protected]
http://mail.python.org/mailman/listinfo/pypy-commit