Author: Armin Rigo <[email protected]>
Branch:
Changeset: r44722:32f1f17883f4
Date: 2011-06-05 17:22 +0200
http://bitbucket.org/pypy/pypy/changeset/32f1f17883f4/
Log: Custom decode error handlers.
diff --git a/pypy/module/_multibytecodec/c_codecs.py
b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -3,6 +3,8 @@
from pypy.translator.tool.cbuild import ExternalCompilationInfo
from pypy.tool.autopath import pypydir
+UNICODE_REPLACEMENT_CHARACTER = u'\uFFFD'
+
class EncodeDecodeError(Exception):
def __init__(self, start, end, reason):
@@ -103,11 +105,12 @@
[DECODEBUF_P], rffi.SSIZE_T)
pypy_cjk_dec_inbuf_consumed = llexternal('pypy_cjk_dec_inbuf_consumed',
[DECODEBUF_P], rffi.SSIZE_T)
-pypy_cjk_dec_inbuf_add = llexternal('pypy_cjk_dec_inbuf_add',
- [DECODEBUF_P, rffi.SSIZE_T, rffi.INT],
- rffi.INT)
+pypy_cjk_dec_replace_on_error = llexternal('pypy_cjk_dec_replace_on_error',
+ [DECODEBUF_P, rffi.CWCHARP,
+ rffi.SSIZE_T, rffi.SSIZE_T],
+ rffi.SSIZE_T)
-def decode(codec, stringdata, errors="strict"):
+def decode(codec, stringdata, errors="strict", errorcb=None, namecb=None):
inleft = len(stringdata)
inbuf = rffi.get_nonmovingbuffer(stringdata)
try:
@@ -119,7 +122,8 @@
r = pypy_cjk_dec_chunk(decodebuf)
if r == 0:
break
- multibytecodec_decerror(decodebuf, r, errors)
+ multibytecodec_decerror(decodebuf, r, errors,
+ errorcb, namecb, stringdata)
src = pypy_cjk_dec_outbuf(decodebuf)
length = pypy_cjk_dec_outlen(decodebuf)
return rffi.wcharpsize2unicode(src, length)
@@ -130,7 +134,8 @@
finally:
rffi.free_nonmovingbuffer(stringdata, inbuf)
-def multibytecodec_decerror(decodebuf, e, errors):
+def multibytecodec_decerror(decodebuf, e, errors,
+ errorcb, namecb, stringdata):
if e > 0:
reason = "illegal multibyte sequence"
esize = e
@@ -142,19 +147,27 @@
else:
raise RuntimeError
#
- if errors == "ignore":
- pypy_cjk_dec_inbuf_add(decodebuf, esize, rffi.cast(rffi.INT, 0))
- return # continue decoding
- if errors == "replace":
- e = pypy_cjk_dec_inbuf_add(decodebuf, esize, rffi.cast(rffi.INT, 1))
- if rffi.cast(lltype.Signed, e) == MBERR_NOMEMORY:
- raise MemoryError
- return # continue decoding
+ # compute the unicode to use as a replacement -> 'replace', and
+ # the current position in the input 'stringdata' -> 'end'
start = pypy_cjk_dec_inbuf_consumed(decodebuf)
end = start + esize
- if errors != "strict":
- reason = "not implemented: custom error handlers" # XXX implement me
- raise EncodeDecodeError(start, end, reason)
+ if errors == "strict":
+ raise EncodeDecodeError(start, end, reason)
+ elif errors == "ignore":
+ replace = u""
+ elif errors == "replace":
+ replace = UNICODE_REPLACEMENT_CHARACTER
+ else:
+ assert errorcb != None
+ replace, end = errorcb(errors, namecb, reason,
+ stringdata, start, end)
+ inbuf = rffi.get_nonmoving_unicodebuffer(replace)
+ try:
+ r = pypy_cjk_dec_replace_on_error(decodebuf, inbuf, len(replace), end)
+ finally:
+ rffi.free_nonmoving_unicodebuffer(replace, inbuf)
+ if r == MBERR_NOMEMORY:
+ raise MemoryError
# ____________________________________________________________
# Encoding
diff --git a/pypy/module/_multibytecodec/interp_multibytecodec.py
b/pypy/module/_multibytecodec/interp_multibytecodec.py
--- a/pypy/module/_multibytecodec/interp_multibytecodec.py
+++ b/pypy/module/_multibytecodec/interp_multibytecodec.py
@@ -16,9 +16,11 @@
def decode(self, space, input, errors=None):
if errors is None:
errors = 'strict'
+ state = space.fromcache(CodecState)
#
try:
- output = c_codecs.decode(self.codec, input, errors)
+ output = c_codecs.decode(self.codec, input, errors,
+ state.decode_error_handler, self.name)
except c_codecs.EncodeDecodeError, e:
raise OperationError(
space.w_UnicodeDecodeError,
diff --git a/pypy/module/_multibytecodec/test/test_app_codecs.py
b/pypy/module/_multibytecodec/test/test_app_codecs.py
--- a/pypy/module/_multibytecodec/test/test_app_codecs.py
+++ b/pypy/module/_multibytecodec/test/test_app_codecs.py
@@ -52,6 +52,13 @@
r = codec.decode("def~{}abc", 'replace')
assert r == (u'def\ufffd\u5fcf', 9)
+ def test_decode_custom_error_handler(self):
+ import codecs
+ codecs.register_error("test.decode_custom_error_handler",
+ lambda e: (u'\u1234\u5678', e.end))
+ u = "abc\xDD".decode("hz", "test.decode_custom_error_handler")
+ assert u == u'abc\u1234\u5678'
+
def test_encode_hz(self):
import _codecs_cn
codec = _codecs_cn.getcodec("hz")
diff --git a/pypy/translator/c/src/cjkcodecs/multibytecodec.c
b/pypy/translator/c/src/cjkcodecs/multibytecodec.c
--- a/pypy/translator/c/src/cjkcodecs/multibytecodec.c
+++ b/pypy/translator/c/src/cjkcodecs/multibytecodec.c
@@ -1,8 +1,7 @@
#include <stdlib.h>
+#include <string.h>
#include "src/cjkcodecs/multibytecodec.h"
-#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)
-
struct pypy_cjk_dec_s *pypy_cjk_dec_init(const MultibyteCodec *codec,
char *inbuf, Py_ssize_t inlen)
@@ -95,17 +94,19 @@
return d->inbuf - d->inbuf_start;
}
-int pypy_cjk_dec_inbuf_add(struct pypy_cjk_dec_s* d, Py_ssize_t skip,
- int add_replacement_character)
+Py_ssize_t pypy_cjk_dec_replace_on_error(struct pypy_cjk_dec_s* d,
+ Py_UNICODE *newbuf, Py_ssize_t newlen,
+ Py_ssize_t in_offset)
{
- if (add_replacement_character)
+ if (newlen > 0)
{
- if (d->outbuf >= d->outbuf_end)
- if (expand_decodebuffer(d, 1) == -1)
+ if (d->outbuf + newlen > d->outbuf_end)
+ if (expand_decodebuffer(d, newlen) == -1)
return MBERR_NOMEMORY;
- *d->outbuf++ = Py_UNICODE_REPLACEMENT_CHARACTER;
+ memcpy(d->outbuf, newbuf, newlen * sizeof(Py_UNICODE));
+ d->outbuf += newlen;
}
- d->inbuf += skip;
+ d->inbuf = d->inbuf_start + in_offset;
return 0;
}
diff --git a/pypy/translator/c/src/cjkcodecs/multibytecodec.h
b/pypy/translator/c/src/cjkcodecs/multibytecodec.h
--- a/pypy/translator/c/src/cjkcodecs/multibytecodec.h
+++ b/pypy/translator/c/src/cjkcodecs/multibytecodec.h
@@ -102,7 +102,8 @@
Py_ssize_t pypy_cjk_dec_outlen(struct pypy_cjk_dec_s *);
Py_ssize_t pypy_cjk_dec_inbuf_remaining(struct pypy_cjk_dec_s *d);
Py_ssize_t pypy_cjk_dec_inbuf_consumed(struct pypy_cjk_dec_s* d);
-int pypy_cjk_dec_inbuf_add(struct pypy_cjk_dec_s*, Py_ssize_t, int);
+Py_ssize_t pypy_cjk_dec_replace_on_error(struct pypy_cjk_dec_s* d,
+ Py_UNICODE *, Py_ssize_t, Py_ssize_t);
struct pypy_cjk_enc_s {
const MultibyteCodec *codec;
_______________________________________________
pypy-commit mailing list
[email protected]
http://mail.python.org/mailman/listinfo/pypy-commit