Author: Armin Rigo <[email protected]>
Branch: py3.5
Changeset: r88998:894e8d2f5df8
Date: 2016-12-10 15:58 +0100
http://bitbucket.org/pypy/pypy/changeset/894e8d2f5df8/
Log: hg merge default
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -327,6 +327,16 @@
def unicode_encode_utf_8(s, size, errors, errorhandler=None,
allow_surrogates=allow_surrogate_by_default):
+ # In this function, allow_surrogates can be:
+ #
+ # * True: surrogates are always allowed. A valid surrogate pair
+ # is replaced with the non-BMP unicode char it stands for,
+ # which is then encoded as 4 bytes.
+ #
+ # * False: surrogates are always forbidden.
+ #
+ # See also unicode_encode_utf8sp().
+ #
if errorhandler is None:
errorhandler = default_unicode_error_encode
return unicode_encode_utf_8_impl(s, size, errors, errorhandler,
@@ -391,6 +401,33 @@
_encodeUCS4(result, ch)
return result.build()
+def unicode_encode_utf8sp(s, size):
+ # Surrogate-preserving utf-8 encoding. Any surrogate character
+ # turns into its 3-bytes encoding, whether it is paired or not.
+ # This should always be reversible, and the reverse is the regular
+ # str_decode_utf_8() with allow_surrogates=True.
+ assert(size >= 0)
+ result = StringBuilder(size)
+ pos = 0
+ while pos < size:
+ ch = ord(s[pos])
+ pos += 1
+ if ch < 0x80:
+ # Encode ASCII
+ result.append(chr(ch))
+ elif ch < 0x0800:
+ # Encode Latin-1
+ result.append(chr((0xc0 | (ch >> 6))))
+ result.append(chr((0x80 | (ch & 0x3f))))
+ elif ch < 0x10000:
+ # Encode UCS2 Unicode ordinals, and surrogates
+ result.append((chr((0xe0 | (ch >> 12)))))
+ result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
+ result.append((chr((0x80 | (ch & 0x3f)))))
+ else:
+ _encodeUCS4(result, ch)
+ return result.build()
+
# ____________________________________________________________
# utf-16
diff --git a/rpython/rlib/test/test_runicode.py b/rpython/rlib/test/test_runicode.py
--- a/rpython/rlib/test/test_runicode.py
+++ b/rpython/rlib/test/test_runicode.py
@@ -812,6 +812,21 @@
py.test.raises(UnicodeEncodeError, encoder, u' 12, \u1234 ', 7, None)
assert encoder(u'u\u1234', 2, 'replace') == 'u?'
+ def test_encode_utf8sp(self):
+ # for the following test, go to lengths to avoid CPython's optimizer
+ # and .pyc file storage, which collapse the two surrogates into one
+ c = u"\udc00"
+ for input, expected in [
+ (u"", ""),
+ (u"abc", "abc"),
+ (u"\u1234", "\xe1\x88\xb4"),
+ (u"\ud800", "\xed\xa0\x80"),
+ (u"\udc00", "\xed\xb0\x80"),
+ (u"\ud800" + c, "\xed\xa0\x80\xed\xb0\x80"),
+ ]:
+ got = runicode.unicode_encode_utf8sp(input, len(input))
+ assert got == expected
+
class TestTranslation(object):
def setup_class(cls):
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit