Author: Ronan Lamy <[email protected]>
Branch: py3.6
Changeset: r97322:7ace73ce896a
Date: 2019-08-28 15:41 +0100
http://bitbucket.org/pypy/pypy/changeset/7ace73ce896a/
Log: Fix error handler calls when encoding to utf-16
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -1215,21 +1215,22 @@
while pos < size:
try:
cp = rutf8.codepoint_at_pos(s, pos)
+ pos = rutf8.next_codepoint_pos(s, pos)
except IndexError:
# malformed codepoint, blindly use ch
pos += 1
if errorhandler:
r, newindex, rettype = errorhandler(
errors, public_encoding_name, 'malformed unicode',
- s, pos - 1, pos)
+ s, index, index + 1)
if rettype == 'u':
for cp in rutf8.Utf8StringIterator(r):
if cp < 0xD800:
_STORECHAR(result, cp, byteorder)
else:
- errorhandler('strict', public_encoding_name,
- 'malformed unicode',
- s, pos-1, pos)
+ errorhandler(
+ 'strict', public_encoding_name,
+ 'malformed unicode', s, index, index + 1)
else:
for ch in r:
cp = ord(ch)
@@ -1238,7 +1239,7 @@
else:
errorhandler('strict', public_encoding_name,
'malformed unicode',
- s, pos-1, pos)
+ s, index, index + 1)
else:
cp = ord(s[pos])
_STORECHAR(result, cp, byteorder)
@@ -1253,7 +1254,7 @@
else:
r, newindex, rettype = errorhandler(
errors, public_encoding_name, 'surrogates not allowed',
- s, pos, pos+1)
+ s, index, index+1)
if rettype == 'u':
for cp in rutf8.Utf8StringIterator(r):
if cp < 0xD800 or allow_surrogates:
@@ -1261,7 +1262,7 @@
else:
errorhandler('strict', public_encoding_name,
'surrogates not allowed',
- s, pos, pos+1)
+ s, index, index+1)
else:
for ch in r:
cp = ord(ch)
@@ -1270,13 +1271,11 @@
else:
errorhandler('strict', public_encoding_name,
'surrogates not allowed',
- s, pos, pos+1)
+ s, index, index+1)
if index != newindex: # Should be uncommon
index = newindex
pos = rutf8._pos_at_index(s, newindex)
continue
-
- pos = rutf8.next_codepoint_pos(s, pos)
index += 1
return result.build()
@@ -1516,19 +1515,19 @@
return result.build()
def utf8_encode_utf_32(s, errors,
- errorhandler=None, allow_surrogates=True):
+ errorhandler=None, allow_surrogates=True):
return utf8_encode_utf_32_helper(s, errors, errorhandler,
allow_surrogates, "native",
'utf-32-' + BYTEORDER2)
def utf8_encode_utf_32_be(s, errors,
- errorhandler=None, allow_surrogates=True):
+ errorhandler=None, allow_surrogates=True):
return utf8_encode_utf_32_helper(s, errors, errorhandler,
allow_surrogates, "big",
'utf-32-be')
def utf8_encode_utf_32_le(s, errors,
- errorhandler=None, allow_surrogates=True):
+ errorhandler=None, allow_surrogates=True):
return utf8_encode_utf_32_helper(s, errors, errorhandler,
allow_surrogates, "little",
'utf-32-le')
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -271,10 +271,10 @@
assert 'unexpected end of data' in str(exc.value)
useq = bseq.decode('utf-8', 'replace')
assert useq == u'\ufffd', (bseq, useq)
- assert ((b'aaaa' + bseq + b'bbbb').decode('utf-8', 'replace') ==
+ assert ((b'aaaa' + bseq + b'bbbb').decode('utf-8', 'replace') ==
u'aaaa\ufffdbbbb')
assert bseq.decode('utf-8', 'ignore') == ''
- assert ((b'aaaa' + bseq + b'bbbb').decode('utf-8', 'ignore') ==
+ assert ((b'aaaa' + bseq + b'bbbb').decode('utf-8', 'ignore') ==
u'aaaabbbb')
def test_invalid_cb_for_3bytes_seq(self):
@@ -337,7 +337,7 @@
exc = raises(UnicodeDecodeError, seq.decode, 'utf-8')
assert err in str(exc.value)
assert seq.decode('utf-8', 'replace') == res
- assert ((b'aaaa' + seq + b'bbbb').decode('utf-8', 'replace') ==
+ assert ((b'aaaa' + seq + b'bbbb').decode('utf-8', 'replace') ==
'aaaa' + res + 'bbbb')
res = res.replace('\ufffd', '')
assert seq.decode('utf-8', 'ignore') == res
@@ -425,7 +425,7 @@
exc = raises(UnicodeDecodeError, seq.decode, 'utf-8')
assert err in str(exc.value)
assert seq.decode('utf-8', 'replace') == res
- assert ((b'aaaa' + seq + b'bbbb').decode('utf-8', 'replace') ==
+ assert ((b'aaaa' + seq + b'bbbb').decode('utf-8', 'replace') ==
'aaaa' + res + 'bbbb')
res = res.replace('\ufffd', '')
assert seq.decode('utf-8', 'ignore') == res
@@ -1148,6 +1148,11 @@
'[]'.encode(encoding))
assert (u'[\udc80]'.encode(encoding, "replace") ==
'[?]'.encode(encoding))
+ # surrogate sequences
+ assert (u'[\ud800\udc80]'.encode(encoding, "ignore") ==
+ '[]'.encode(encoding))
+ assert (u'[\ud800\udc80]'.encode(encoding, "replace") ==
+ '[??]'.encode(encoding))
for encoding, ill_surrogate in [('utf-8', b'\xed\xb2\x80'),
('utf-16-le', b'\x80\xdc'),
('utf-16-be', b'\xdc\x80'),
@@ -1167,7 +1172,7 @@
assert test_string.encode(encoding, 'surrogatepass') ==
test_sequence
assert test_sequence.decode(encoding, 'surrogatepass') ==
test_string
assert test_sequence.decode(encoding, 'ignore') == before +
after
- assert test_sequence.decode(encoding, 'replace') == (before +
+ assert test_sequence.decode(encoding, 'replace') == (before +
ill_formed_sequence_replace +
after), str(
(encoding, test_sequence, before + ill_formed_sequence_replace
+ after))
backslashreplace = ''.join('\\x%02x' % b for b in
ill_surrogate)
@@ -1388,7 +1393,7 @@
# from stdlib tests, bad byte: \xa5 is unmapped in iso-8859-3
assert (b"foo\xa5bar".decode("iso-8859-3", "surrogateescape") ==
"foo\udca5bar")
- assert ("foo\udca5bar".encode("iso-8859-3", "surrogateescape") ==
+ assert ("foo\udca5bar".encode("iso-8859-3", "surrogateescape") ==
b"foo\xa5bar")
def test_warn_escape_decode(self):
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit