Author: Armin Rigo <ar...@tunes.org>
Branch:
Changeset: r88995:e8b1d9913039
Date: 2016-12-10 15:39 +0100
http://bitbucket.org/pypy/pypy/changeset/e8b1d9913039/

Log: expand the comments

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -51,6 +51,10 @@
     return result
 
 def decode_utf8(space, string):
+    # Surrogates are accepted and not treated specially at all.
+    # If there happen to be two 3-bytes encoding a pair of surrogates,
+    # you still get two surrogate unicode characters in the result.
+    # These are the Python2 rules; Python3 differs.
     result, consumed = runicode.str_decode_utf_8(
         string, len(string), "strict", final=True,
         errorhandler=decode_error_handler(space),
@@ -59,10 +63,9 @@
 
 def encode_utf8(space, uni):
     # Note that this function never raises UnicodeEncodeError,
-    # since surrogate pairs are allowed.
-    # This is not the case with Python3.
-    # Also, note that the two characters \d800\dc00 are considered as
-    # a paired surrogate, and turn into a single 4-byte utf8 char.
+    # since surrogates are allowed, either paired or lone.
+    # A paired surrogate is considered like the non-BMP character
+    # it stands for. These are the Python2 rules; Python3 differs.
     return runicode.unicode_encode_utf_8(
         uni, len(uni), "strict",
         errorhandler=raise_unicode_exception_encode,
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit