Author: fijal
Branch: unicode-utf8
Changeset: r93091:4668380f4c79
Date: 2017-11-20 13:56 +0100
http://bitbucket.org/pypy/pypy/changeset/4668380f4c79/
Log: * Improve ascii/utf8 codecs and unicode escape
* Raise instead of looping infinitely when errorhandler returns nonsense
diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -33,25 +33,33 @@
assert lst == [("??", "ascii", input, 0, 2),
("??", "ascii", input, 5, 7)]
+@given(strategies.text())
+def test_utf8_encode_ascii_2(u):
+ def eh(errors, encoding, reason, p, start, end):
+ return "?" * (end - start), end
+
+ assert utf8_encode_ascii(u.encode("utf8"), "replace", eh) == u.encode("ascii", "replace")
+
def test_str_decode_ascii():
- assert str_decode_ascii("abc", 3, "??", True, "??") == ("abc", 3, 3)
+ assert str_decode_ascii("abc", "??", True, "??") == ("abc", 3, 3, rutf8.FLAG_ASCII)
def eh(errors, encoding, reason, p, start, end):
lst.append((errors, encoding, p, start, end))
- return u"\u1234\u5678", end
+ return u"\u1234\u5678".encode("utf8"), end
lst = []
input = "\xe8"
exp = u"\u1234\u5678".encode("utf8")
- assert str_decode_ascii(input, 1, "??", True, eh) == (exp, 1, 2)
+ assert str_decode_ascii(input, "??", True, eh) == (exp, 1, 2, rutf8.FLAG_REGULAR)
assert lst == [("??", "ascii", input, 0, 1)]
lst = []
input = "\xe8\xe9abc\xea\xeb"
- assert str_decode_ascii(input, 7, "??", True, eh) == (
- exp + exp + "abc" + exp + exp, 7, 11)
+ assert str_decode_ascii(input, "??", True, eh) == (
+ exp + exp + "abc" + exp + exp, 7, 11, rutf8.FLAG_REGULAR)
assert lst == [("??", "ascii", input, 0, 1),
("??", "ascii", input, 1, 2),
("??", "ascii", input, 5, 6),
("??", "ascii", input, 6, 7)]
-@given(strategies.binary())
-def test_unicode_raw_escape(s):
- uh.utf8_encode_raw_unicode_escape(s, 'strict')
+@given(strategies.text())
+def test_unicode_raw_escape(u):
+ r = uh.utf8_encode_raw_unicode_escape(u.encode("utf8"), 'strict')
+ assert r == u.encode("raw-unicode-escape")
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -158,6 +158,7 @@
res.append(chr(oc))
i += 1
else:
+ XXX
r, pos = errorhandler(errors, 'latin1',
'ordinal not in range(256)', s, cur,
cur + 1)
@@ -179,10 +180,15 @@
pos = 0
while i < len(utf8):
ch = rutf8.codepoint_at_pos(utf8, i)
- if ch >= 0x7F:
+ if ch > 0x7F:
+ endpos = pos + 1
+ end_i = rutf8.next_codepoint_pos(utf8, i)
+ while end_i < len(utf8) and rutf8.codepoint_at_pos(utf8, end_i) > 0x7F:
+ endpos += 1
+ end_i = rutf8.next_codepoint_pos(utf8, end_i)
msg = "ordinal not in range(128)"
r, newpos = errorhandler(errors, 'ascii', msg, utf8,
- pos, pos + 1)
+ pos, endpos)
for _ in range(newpos - pos):
i = rutf8.next_codepoint_pos(utf8, i)
pos = newpos
@@ -603,13 +609,13 @@
result = StringBuilder(size)
pos = 0
while pos < size:
- oc = ord(s[pos])
+ oc = rutf8.codepoint_at_pos(s, pos)
if oc < 0x100:
result.append(chr(oc))
else:
raw_unicode_escape_helper(result, oc)
- pos += 1
+ pos = rutf8.next_codepoint_pos(s, pos)
return result.build()
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -71,6 +71,9 @@
raise oefmt(space.w_IndexError,
"position %d from error handler out of bounds",
newpos)
+ if newpos < startpos:
+ raise oefmt(space.w_IndexError,
+ "position %d from error handler did not progress", newpos)
w_replace = space.convert_to_w_unicode(w_replace)
return w_replace._utf8, newpos
return call_errorhandler
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit