Author: fijal Branch: unicode-utf8 Changeset: r93112:fd1b64ce9b80 Date: 2017-11-21 15:19 +0100 http://bitbucket.org/pypy/pypy/changeset/fd1b64ce9b80/
Log: some improvements for xmlcharrefreplace diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -164,26 +164,31 @@ while i < size: if ord(s[i]) <= 0x7F: res.append(s[i]) + i += 1 + cur += 1 else: oc = rutf8.codepoint_at_pos(s, i) if oc <= 0xFF: res.append(chr(oc)) - i += 1 + cur += 1 + i = rutf8.next_codepoint_pos(s, i) else: r, pos = errorhandler(errors, 'latin1', 'ordinal not in range(256)', s, cur, cur + 1) for j in range(pos - cur): + i = rutf8.next_codepoint_pos(s, i) + + j = 0 + while j < len(r): c = rutf8.codepoint_at_pos(r, j) if c > 0xFF: errorhandler("strict", 'latin1', 'ordinal not in range(256)', s, cur, cur + 1) + j = rutf8.next_codepoint_pos(r, j) res.append(chr(c)) - i = rutf8.next_codepoint_pos(s, i) cur = pos - cur += 1 - i += 1 r = res.build() return r diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py --- a/pypy/module/_codecs/interp_codecs.py +++ b/pypy/module/_codecs/interp_codecs.py @@ -215,27 +215,30 @@ "don't know how to handle %T in error callback", w_exc) def xmlcharrefreplace_errors(space, w_exc): + from pypy.interpreter import unicodehelper + check_exception(space, w_exc) if space.isinstance_w(w_exc, space.w_UnicodeEncodeError): - obj = space.realunicode_w(space.getattr(w_exc, space.newtext('object'))) + w_obj = space.getattr(w_exc, space.newtext('object')) + space.realutf8_w(w_obj) # weeoes + w_obj = unicodehelper.convert_arg_to_w_unicode(space, w_obj) start = space.int_w(space.getattr(w_exc, space.newtext('start'))) w_end = space.getattr(w_exc, space.newtext('end')) end = space.int_w(w_end) - builder = UnicodeBuilder() + start = w_obj._index_to_byte(start) + end = w_obj._index_to_byte(end) + builder = StringBuilder() pos = start + obj = w_obj._utf8 while pos < end: - code = ord(obj[pos]) - if (MAXUNICODE == 0xffff and 0xD800 <= code <= 0xDBFF and - pos + 1 < end and 0xDC00 <= 
ord(obj[pos+1]) <= 0xDFFF): - code = (code & 0x03FF) << 10 - code |= ord(obj[pos+1]) & 0x03FF - code += 0x10000 - pos += 1 - builder.append(u"&#") - builder.append(unicode(str(code))) - builder.append(u";") - pos += 1 - return space.newtuple([space.newunicode(builder.build()), w_end]) + code = rutf8.codepoint_at_pos(obj, pos) + builder.append("&#") + builder.append(str(code)) + builder.append(";") + pos = rutf8.next_codepoint_pos(obj, pos) + r = builder.build() + lgt, flag = rutf8.check_utf8(r, True) + return space.newtuple([space.newutf8(r, lgt, flag), w_end]) else: raise oefmt(space.w_TypeError, "don't know how to handle %T in error callback", w_exc) diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py --- a/pypy/module/_codecs/test/test_codecs.py +++ b/pypy/module/_codecs/test/test_codecs.py @@ -750,3 +750,9 @@ assert _codecs.unicode_escape_decode(b) == (u'', 0) assert _codecs.raw_unicode_escape_decode(b) == (u'', 0) assert _codecs.unicode_internal_decode(b) == (u'', 0) + + def test_xmlcharrefreplace(self): + r = u'\u1234\u0080\u2345\u0079\u00AB'.encode('latin1', 'xmlcharrefreplace') + assert r == '&#4660;\x80&#9029;y\xab' + r = u'\u1234\u0080\u2345\u0079\u00AB'.encode('ascii', 'xmlcharrefreplace') + assert r == '&#4660;&#128;&#9029;y&#171;' _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit