Author: Matti Picus <[email protected]>
Branch: unicode-utf8-py3
Changeset: r95700:ee5c96ad9ed6
Date: 2019-01-23 11:30 +0200
http://bitbucket.org/pypy/pypy/changeset/ee5c96ad9ed6/
Log: test, refactor logic around invalid continuation byte
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -328,7 +328,7 @@
res = rutf8.utf8_encode_mbcs(s, errors, errorhandler,
force_replace=False)
return res
-
+
def str_decode_mbcs(s, errors, final, errorhandler, force_ignore=True):
slen = len(s)
res, size = runicode.str_decode_mbcs(s, slen, errors, final=final,
@@ -345,21 +345,18 @@
res = StringBuilder(slen)
pos = 0
end = len(s)
- suppressing = False # we are in a chain of "bad" unicode, only emit one fix
while pos < end:
ordch1 = ord(s[pos])
# fast path for ASCII
if ordch1 <= 0x7F:
pos += 1
res.append(chr(ordch1))
- suppressing = False
continue
if ordch1 <= 0xC1:
r, pos, rettype = errorhandler(errors, "utf8", "invalid start
byte",
s, pos, pos + 1)
- if not suppressing:
- res.append(r)
+ res.append(r)
continue
pos += 1
@@ -371,16 +368,14 @@
break
r, pos, rettype = errorhandler(errors, "utf8", "unexpected end
of data",
s, pos - 1, pos)
- if not suppressing:
- res.append(r)
+ res.append(r)
continue
ordch2 = ord(s[pos])
if rutf8._invalid_byte_2_of_2(ordch2):
r, pos, rettype = errorhandler(errors, "utf8", "invalid
continuation byte",
s, pos - 1, pos)
- if not suppressing:
- res.append(r)
+ res.append(r)
continue
# 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz
pos += 1
@@ -396,10 +391,13 @@
if (pos) < end and rutf8._invalid_byte_2_of_3(ordch1,
ord(s[pos]), allow_surrogates):
msg = "invalid continuation byte"
+ r, pos, rettype = errorhandler(errors, "utf8", msg, s,
+ pos - 1, pos)
else:
msg = "unexpected end of data"
- suppressing = True
- r, pos, rettype = errorhandler(errors, "utf8", msg, s, pos -
1, pos)
+ r, pos, rettype = errorhandler(errors, "utf8", msg, s,
+ pos - 1, pos)
+ pos = end
res.append(r)
continue
ordch2 = ord(s[pos])
@@ -408,14 +406,12 @@
if rutf8._invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates):
r, pos, rettype = errorhandler(errors, "utf8", "invalid
continuation byte",
s, pos - 1, pos)
- if not suppressing:
- res.append(r)
+ res.append(r)
continue
elif rutf8._invalid_byte_3_of_3(ordch3):
r, pos, rettype = errorhandler(errors, "utf8", "invalid
continuation byte",
s, pos - 1, pos + 1)
- if not suppressing:
- res.append(r)
+ res.append(r)
continue
pos += 2
@@ -423,7 +419,6 @@
res.append(chr(ordch1))
res.append(chr(ordch2))
res.append(chr(ordch3))
- suppressing = False
continue
if ordch1 <= 0xF4:
@@ -433,23 +428,27 @@
break
if pos < end and rutf8._invalid_byte_2_of_4(ordch1,
ord(s[pos])):
msg = "invalid continuation byte"
+ r, pos, rettype = errorhandler(errors, "utf8", msg, s,
+ pos - 1, pos)
elif pos + 1 < end and rutf8._invalid_byte_3_of_4(ord(s[pos +
1])):
msg = "invalid continuation byte"
+ pos += 1
+ r, pos, rettype = errorhandler(errors, "utf8", msg, s,
+ pos - 2, pos)
else:
msg = "unexpected end of data"
- suppressing = True
- r, pos, rettype = errorhandler(errors, "utf8", msg, s, pos -
1, pos)
+ r, pos, rettype = errorhandler(errors, "utf8", msg, s,
+ pos - 1, pos)
+ pos = end
res.append(r)
continue
ordch2 = ord(s[pos])
ordch3 = ord(s[pos + 1])
ordch4 = ord(s[pos + 2])
-
if rutf8._invalid_byte_2_of_4(ordch1, ordch2):
r, pos, rettype = errorhandler(errors, "utf8", "invalid
continuation byte",
s, pos - 1, pos)
- if not suppressing:
- res.append(r)
+ res.append(r)
continue
elif rutf8._invalid_byte_3_of_4(ordch3):
r, pos, rettype = errorhandler(errors, "utf8", "invalid
continuation byte",
@@ -459,8 +458,7 @@
elif rutf8._invalid_byte_4_of_4(ordch4):
r, pos, rettype = errorhandler(errors, "utf8", "invalid
continuation byte",
s, pos - 1, pos + 2)
- if not suppressing:
- res.append(r)
+ res.append(r)
continue
pos += 3
@@ -469,13 +467,11 @@
res.append(chr(ordch2))
res.append(chr(ordch3))
res.append(chr(ordch4))
- suppressing = False
continue
r, pos, rettype = errorhandler(errors, "utf8", "invalid start byte",
s, pos - 1, pos)
- if not suppressing:
- res.append(r)
+ res.append(r)
r = res.build()
return r, rutf8.check_utf8(r, True), pos
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -230,7 +230,7 @@
(b'\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
'\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
]
- for n, (seq, res) in enumerate(sequences):
+ for (seq, res) in sequences:
raises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
uni = seq.decode('utf-8', 'replace')
assert uni == res
@@ -329,7 +329,6 @@
err = 'invalid continuation byte'
for s, res in sequences:
seq = bytes(int(c, 16) for c in s.split())
- print(seq, [hex(ord(c)) for c in res])
exc = raises(UnicodeDecodeError, seq.decode, 'utf-8')
assert err in str(exc.value)
assert seq.decode('utf-8', 'replace') == res
@@ -340,6 +339,95 @@
assert((b'aaaa' + seq + b'bbbb').decode('utf-8', 'ignore') ==
'aaaa' + res + 'bbbb')
+ def test_invalid_cb_for_4bytes_seq(self):
+ """
+ Test that an 'invalid continuation byte' error is raised when the
+ continuation byte(s) of a 4-bytes sequence are invalid. When
+ errors='replace', the start byte and all the following valid
+ continuation bytes are replaced with a single U+FFFD, and all the bytes
+ starting from the first invalid continuation byte (included) are
+ handled separately.
+ E.g. in the sequence <F1 80 41>, F1 is the start byte of a 4-bytes
+ sequence, 80 is a valid continuation byte, but 41 is not a valid cb
+ because it's the ASCII letter 'A'.
+ Note: when the start byte is F0 or F4, the valid ranges for the first
+ continuation byte are limited to 90..BF and 80..8F respectively
+ (the 3-byte analogues are E0 -> A0..BF and ED -> 80..9F; Python 2
+ considers all of 80..BF valid after ED, which is fixed in Python 3).
+ """
+ FFFD = '\ufffd'
+ FFFDx2 = FFFD * 2
+ sequences = [
+ ('F0 00', FFFD+'\x00'), ('F0 7F', FFFD+'\x7f'), ('F0 80', FFFDx2),
+ ('F0 8F', FFFDx2), ('F0 C0', FFFDx2), ('F0 FF', FFFDx2),
+ ('F0 90 00', FFFD+'\x00'), ('F0 90 7F', FFFD+'\x7f'),
+ ('F0 90 C0', FFFDx2), ('F0 90 FF', FFFDx2),
+ ('F0 BF 00', FFFD+'\x00'), ('F0 BF 7F', FFFD+'\x7f'),
+ ('F0 BF C0', FFFDx2), ('F0 BF FF', FFFDx2),
+ ('F0 90 80 00', FFFD+'\x00'), ('F0 90 80 7F', FFFD+'\x7f'),
+ ('F0 90 80 C0', FFFDx2), ('F0 90 80 FF', FFFDx2),
+ ('F0 90 BF 00', FFFD+'\x00'), ('F0 90 BF 7F', FFFD+'\x7f'),
+ ('F0 90 BF C0', FFFDx2), ('F0 90 BF FF', FFFDx2),
+ ('F0 BF 80 00', FFFD+'\x00'), ('F0 BF 80 7F', FFFD+'\x7f'),
+ ('F0 BF 80 C0', FFFDx2), ('F0 BF 80 FF', FFFDx2),
+ ('F0 BF BF 00', FFFD+'\x00'), ('F0 BF BF 7F', FFFD+'\x7f'),
+ ('F0 BF BF C0', FFFDx2), ('F0 BF BF FF', FFFDx2),
+ ('F1 00', FFFD+'\x00'), ('F1 7F', FFFD+'\x7f'), ('F1 C0', FFFDx2),
+ ('F1 FF', FFFDx2), ('F1 80 00', FFFD+'\x00'),
+ ('F1 80 7F', FFFD+'\x7f'), ('F1 80 C0', FFFDx2),
+ ('F1 80 FF', FFFDx2), ('F1 BF 00', FFFD+'\x00'),
+ ('F1 BF 7F', FFFD+'\x7f'), ('F1 BF C0', FFFDx2),
+ ('F1 BF FF', FFFDx2), ('F1 80 80 00', FFFD+'\x00'),
+ ('F1 80 80 7F', FFFD+'\x7f'), ('F1 80 80 C0', FFFDx2),
+ ('F1 80 80 FF', FFFDx2), ('F1 80 BF 00', FFFD+'\x00'),
+ ('F1 80 BF 7F', FFFD+'\x7f'), ('F1 80 BF C0', FFFDx2),
+ ('F1 80 BF FF', FFFDx2), ('F1 BF 80 00', FFFD+'\x00'),
+ ('F1 BF 80 7F', FFFD+'\x7f'), ('F1 BF 80 C0', FFFDx2),
+ ('F1 BF 80 FF', FFFDx2), ('F1 BF BF 00', FFFD+'\x00'),
+ ('F1 BF BF 7F', FFFD+'\x7f'), ('F1 BF BF C0', FFFDx2),
+ ('F1 BF BF FF', FFFDx2), ('F3 00', FFFD+'\x00'),
+ ('F3 7F', FFFD+'\x7f'), ('F3 C0', FFFDx2), ('F3 FF', FFFDx2),
+ ('F3 80 00', FFFD+'\x00'), ('F3 80 7F', FFFD+'\x7f'),
+ ('F3 80 C0', FFFDx2), ('F3 80 FF', FFFDx2),
+ ('F3 BF 00', FFFD+'\x00'), ('F3 BF 7F', FFFD+'\x7f'),
+ ('F3 BF C0', FFFDx2), ('F3 BF FF', FFFDx2),
+ ('F3 80 80 00', FFFD+'\x00'), ('F3 80 80 7F', FFFD+'\x7f'),
+ ('F3 80 80 C0', FFFDx2), ('F3 80 80 FF', FFFDx2),
+ ('F3 80 BF 00', FFFD+'\x00'), ('F3 80 BF 7F', FFFD+'\x7f'),
+ ('F3 80 BF C0', FFFDx2), ('F3 80 BF FF', FFFDx2),
+ ('F3 BF 80 00', FFFD+'\x00'), ('F3 BF 80 7F', FFFD+'\x7f'),
+ ('F3 BF 80 C0', FFFDx2), ('F3 BF 80 FF', FFFDx2),
+ ('F3 BF BF 00', FFFD+'\x00'), ('F3 BF BF 7F', FFFD+'\x7f'),
+ ('F3 BF BF C0', FFFDx2), ('F3 BF BF FF', FFFDx2),
+ ('F4 00', FFFD+'\x00'), ('F4 7F', FFFD+'\x7f'), ('F4 90', FFFDx2),
+ ('F4 BF', FFFDx2), ('F4 C0', FFFDx2), ('F4 FF', FFFDx2),
+ ('F4 80 00', FFFD+'\x00'), ('F4 80 7F', FFFD+'\x7f'),
+ ('F4 80 C0', FFFDx2), ('F4 80 FF', FFFDx2),
+ ('F4 8F 00', FFFD+'\x00'), ('F4 8F 7F', FFFD+'\x7f'),
+ ('F4 8F C0', FFFDx2), ('F4 8F FF', FFFDx2),
+ ('F4 80 80 00', FFFD+'\x00'), ('F4 80 80 7F', FFFD+'\x7f'),
+ ('F4 80 80 C0', FFFDx2), ('F4 80 80 FF', FFFDx2),
+ ('F4 80 BF 00', FFFD+'\x00'), ('F4 80 BF 7F', FFFD+'\x7f'),
+ ('F4 80 BF C0', FFFDx2), ('F4 80 BF FF', FFFDx2),
+ ('F4 8F 80 00', FFFD+'\x00'), ('F4 8F 80 7F', FFFD+'\x7f'),
+ ('F4 8F 80 C0', FFFDx2), ('F4 8F 80 FF', FFFDx2),
+ ('F4 8F BF 00', FFFD+'\x00'), ('F4 8F BF 7F', FFFD+'\x7f'),
+ ('F4 8F BF C0', FFFDx2), ('F4 8F BF FF', FFFDx2)
+ ]
+ err = 'invalid continuation byte'
+ for s, res in sequences:
+ seq = bytes(int(c, 16) for c in s.split())
+ exc = raises(UnicodeDecodeError, seq.decode, 'utf-8')
+ assert err in str(exc.value)
+ assert seq.decode('utf-8', 'replace') == res
+ assert ((b'aaaa' + seq + b'bbbb').decode('utf-8', 'replace') ==
+ 'aaaa' + res + 'bbbb')
+ res = res.replace('\ufffd', '')
+ assert seq.decode('utf-8', 'ignore') == res
+ assert((b'aaaa' + seq + b'bbbb').decode('utf-8', 'ignore') ==
+ 'aaaa' + res + 'bbbb')
+
+
class AppTestPartialEvaluation:
spaceconfig = dict(usemodules=['array',])
@@ -612,7 +700,6 @@
exc = raises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')
exc = raises(UnicodeDecodeError, codecs.decode, b'\xe0\x00', 'utf-8')
- print(dir(exc.value))
assert 'invalid continuation byte' in exc.value.reason
def test_bad_errorhandler_return(self):
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit