Author: Philip Jenvey <[email protected]>
Branch: py3k
Changeset: r58424:6e0ce8d44be5
Date: 2012-10-25 18:03 -0700
http://bitbucket.org/pypy/pypy/changeset/6e0ce8d44be5/
Log: add the surrogatepass error handler
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -248,6 +248,50 @@
raise operationerrfmt(space.w_TypeError,
"don't know how to handle %s in error callback", typename)
+def surrogatepass_errors(space, w_exc):
+    """Error callback registered under the name 'surrogatepass'.
+
+    For a UnicodeEncodeError, every character in object[start:end] must
+    be a surrogate (U+D800..U+DFFF); each is emitted as its three-byte
+    UTF-8-style sequence and the result is the tuple (bytes, end).
+    For a UnicodeDecodeError, the three bytes at `start` are decoded as
+    one surrogate and the result is the tuple (char, start + 3).
+
+    The original exception is re-raised when the offending data is not a
+    surrogate; any other exception type raises TypeError.
+    """
+    check_exception(space, w_exc)
+    if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
+        obj = space.realunicode_w(space.getattr(w_exc, space.wrap('object')))
+        start = space.int_w(space.getattr(w_exc, space.wrap('start')))
+        w_end = space.getattr(w_exc, space.wrap('end'))
+        end = space.int_w(w_end)
+        res = ''
+        pos = start
+        while pos < end:
+            ch = ord(obj[pos])
+            pos += 1
+            if ch < 0xd800 or ch > 0xdfff:
+                # Not a surrogate, fail with original exception
+                raise OperationError(space.type(w_exc), w_exc)
+            # Three-byte layout: 1110xxxx 10xxxxxx 10xxxxxx
+            res += chr(0xe0 | (ch >> 12))
+            res += chr(0x80 | ((ch >> 6) & 0x3f))
+            # The last byte carries the low six bits of the code point
+            res += chr(0x80 | (ch & 0x3f))
+        return space.newtuple([space.wrapbytes(res), w_end])
+    elif space.isinstance_w(w_exc, space.w_UnicodeDecodeError):
+        start = space.int_w(space.getattr(w_exc, space.wrap('start')))
+        obj = space.bytes_w(space.getattr(w_exc, space.wrap('object')))
+        ch = 0
+        # Try decoding a single surrogate character. If there are more,
+        # let the codec call us again
+        if len(obj) - start >= 3:
+            # Only index the three bytes once we know they all exist
+            ch0 = ord(obj[start + 0])
+            ch1 = ord(obj[start + 1])
+            ch2 = ord(obj[start + 2])
+            # All three conditions must hold: a 1110xxxx lead byte
+            # followed by two 10xxxxxx continuation bytes
+            if (ch0 & 0xf0 == 0xe0 and
+                ch1 & 0xc0 == 0x80 and
+                ch2 & 0xc0 == 0x80):
+                # it's a three-byte code
+                ch = (((ch0 & 0x0f) << 12) + ((ch1 & 0x3f) << 6) +
+                      (ch2 & 0x3f))
+                if ch < 0xd800 or ch > 0xdfff:
+                    # it's not a surrogate - fail
+                    ch = 0
+        if ch == 0:
+            raise OperationError(space.type(w_exc), w_exc)
+        return space.newtuple([space.wrap(unichr(ch)), space.wrap(start + 3)])
+    else:
+        typename = space.type(w_exc).getname(space)
+        raise operationerrfmt(space.w_TypeError,
+            "don't know how to handle %s in error callback", typename)
+
def surrogateescape_errors(space, w_exc):
check_exception(space, w_exc)
if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
@@ -292,7 +336,7 @@
"NOT_RPYTHON"
state = space.fromcache(CodecState)
for error in ("strict", "ignore", "replace", "xmlcharrefreplace",
- "backslashreplace", "surrogateescape"):
+ "backslashreplace", "surrogateescape", "surrogatepass"):
name = error + "_errors"
state.codec_error_registry[error] =
space.wrap(interp2app(globals()[name]))
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -492,6 +492,14 @@
assert b'a\x80b'.decode('utf-8', 'surrogateescape') == 'a\udc80b'
assert 'a\udc80b'.encode('utf-8', 'surrogateescape') == b'a\x80b'
+    def test_surrogatepass_handler(self):
+        # Round-trip lone surrogates through UTF-8 with 'surrogatepass'.
+        import _codecs
+        assert _codecs.lookup_error("surrogatepass")
+        assert ("abc\ud800def".encode("utf-8", "surrogatepass") ==
+                b"abc\xed\xa0\x80def")
+        assert (b"abc\xed\xa0\x80def".decode("utf-8", "surrogatepass") ==
+                "abc\ud800def")
+        # U+D800 has all-zero low bits; U+DFFF also exercises the low six
+        # bits carried by the final byte of the sequence.
+        assert ("abc\udfffdef".encode("utf-8", "surrogatepass") ==
+                b"abc\xed\xbf\xbfdef")
+        assert (b"abc\xed\xbf\xbfdef".decode("utf-8", "surrogatepass") ==
+                "abc\udfffdef")
+
def test_badhandler(self):
import codecs
results = ( 42, "foo", (1,2,3), ("foo", 1, 3), ("foo", None),
("foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
_______________________________________________
pypy-commit mailing list
[email protected]
http://mail.python.org/mailman/listinfo/pypy-commit