Author: Ronan Lamy <[email protected]>
Branch: py3.5
Changeset: r94591:2396fb397495
Date: 2018-05-14 22:34 +0100
http://bitbucket.org/pypy/pypy/changeset/2396fb397495/
Log: hg merge default
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -168,6 +168,222 @@
return decode_utf8(space, string, allow_surrogates=True)
# ____________________________________________________________
+# utf-16
+
def str_decode_utf_16(s, size, errors, final=True,
                      errorhandler=None):
    """Decode UTF-16 bytes using the "native" byte-order mode.

    Delegates to str_decode_utf_16_helper, reporting errors under the
    encoding name 'utf-16-' + BYTEORDER2, and returns only the first two
    items of its result; the byte order reported by the helper is dropped.
    """
    encoding_name = 'utf-16-' + BYTEORDER2
    decoded, consumed, _ = str_decode_utf_16_helper(
        s, size, errors, final, errorhandler, "native", encoding_name)
    return decoded, consumed
+
def str_decode_utf_16_be(s, size, errors, final=True,
                         errorhandler=None):
    """Decode UTF-16 bytes with the byte order forced to "big".

    Delegates to str_decode_utf_16_helper under the encoding name
    'utf-16-be' and returns only the first two items of its result;
    the byte order reported by the helper is dropped.
    """
    decoded, consumed, _ = str_decode_utf_16_helper(
        s, size, errors, final, errorhandler, "big", 'utf-16-be')
    return decoded, consumed
+
def str_decode_utf_16_le(s, size, errors, final=True,
                         errorhandler=None):
    """Decode UTF-16 bytes with the byte order forced to "little".

    Delegates to str_decode_utf_16_helper under the encoding name
    'utf-16-le' and returns only the first two items of its result;
    the byte order reported by the helper is dropped.
    """
    decoded, consumed, _ = str_decode_utf_16_helper(
        s, size, errors, final, errorhandler, "little", 'utf-16-le')
    return decoded, consumed
+
def str_decode_utf_16_helper(s, size, errors, final=True,
                             errorhandler=None,
                             byteorder="native",
                             public_encoding_name='utf16'):
    """Decode the UTF-16 byte string `s` into a unicode string.

    `byteorder` selects the mode:
      * "native": start from the platform BYTEORDER; if the first two
        bytes form a BOM it is skipped and the byte order it indicates
        is used.
      * "little": force little endian.
      * anything else: force big endian.
    In the forced modes a leading BOM is not skipped, so it is decoded
    like any other code unit.

    `final`: when false, decoding stops silently at an incomplete
    trailing unit or an incomplete surrogate pair instead of calling
    the error handler.

    `errorhandler` is called as
    errorhandler(errors, public_encoding_name, message, s, start, end)
    and must return (replacement, new_pos).  It defaults to
    default_unicode_error_decode.

    Returns (decoded, pos, bo): the built unicode string, the index at
    which decoding stopped, and the byte order in effect (-1 little
    endian, 1 big endian, 0 when native mode found no BOM).
    """
    if errorhandler is None:
        errorhandler = default_unicode_error_decode

    native_is_little = BYTEORDER == 'little'

    # Offsets of the high and low byte inside each 2-byte unit,
    # initially following the platform's native byte order.
    if native_is_little:
        ihi = 1
        ilo = 0
    else:
        ihi = 0
        ilo = 1

    bo = 0
    pos = 0
    if byteorder == 'native':
        if size >= 2:
            # Read the first unit in native order and look for a BOM.
            bom = (ord(s[ihi]) << 8) | ord(s[ilo])
            if bom == 0xFEFF:
                # BOM reads correctly in native order: keep that order.
                pos = 2
                if native_is_little:
                    bo = -1
                else:
                    bo = 1
            elif bom == 0xFFFE:
                # Byte-swapped BOM: the data uses the opposite order.
                pos = 2
                if native_is_little:
                    bo = 1
                else:
                    bo = -1
    elif byteorder == 'little':
        bo = -1
    else:
        bo = 1

    if size == 0:
        return u'', 0, bo

    if bo == -1:
        # little endian requested or detected
        ihi = 1
        ilo = 0
    elif bo == 1:
        # big endian requested or detected
        ihi = 0
        ilo = 1

    result = UnicodeBuilder(size // 2)
    slen = len(s)

    # XXX (note kept from the original author): the error handling in
    # this loop is suspected of not being entirely correct.
    while pos < size:
        # Fewer than two bytes left: a truncated code unit.
        if slen - pos < 2:
            if not final:
                break
            replacement, pos = errorhandler(errors, public_encoding_name,
                                            "truncated data",
                                            s, pos, slen)
            result.append(replacement)
            if slen - pos < 2:
                break

        unit = (ord(s[pos + ihi]) << 8) | ord(s[pos + ilo])
        pos += 2

        if not (0xD800 <= unit <= 0xDFFF):
            # Ordinary BMP code unit.
            result.append(unichr(unit))
            continue

        # From here on `unit` lies in the surrogate range.
        if slen - pos < 2:
            # No room for a second unit: rewind to the start of this
            # one so the caller (or the handler) sees it again.
            pos -= 2
            if not final:
                break
            replacement, pos = errorhandler(errors, public_encoding_name,
                                            "unexpected end of data",
                                            s, pos, slen)
            result.append(replacement)
            if slen - pos < 2:
                break
        elif 0xD800 <= unit <= 0xDBFF:
            # High surrogate: a low surrogate must follow.
            trail = (ord(s[pos + ihi]) << 8) | ord(s[pos + ilo])
            pos += 2
            if 0xDC00 <= trail <= 0xDFFF:
                if MAXUNICODE < 65536:
                    # Narrow unicode: keep the pair as two code units.
                    result.append(unichr(unit))
                    result.append(unichr(trail))
                else:
                    code = (((unit & 0x3FF) << 10) |
                            (trail & 0x3FF)) + 0x10000
                    result.append(UNICHR(code))
            else:
                # The error span covers only the high surrogate.
                replacement, pos = errorhandler(errors,
                                                public_encoding_name,
                                                "illegal UTF-16 surrogate",
                                                s, pos - 4, pos - 2)
                result.append(replacement)
        else:
            # Low surrogate with no preceding high surrogate.
            replacement, pos = errorhandler(errors, public_encoding_name,
                                            "illegal encoding",
                                            s, pos - 2, pos)
            result.append(replacement)

    return result.build(), pos, bo
+
+def _STORECHAR(result, CH, byteorder):
+ hi = chr(((CH) >> 8) & 0xff)
+ lo = chr((CH) & 0xff)
+ if byteorder == 'little':
+ result.append(lo)
+ result.append(hi)
+ else:
+ result.append(hi)
+ result.append(lo)
+
def unicode_encode_utf_16_helper(s, size, errors,
                                 errorhandler=None,
                                 allow_surrogates=True,
                                 byteorder='little',
                                 public_encoding_name='utf16'):
    """Encode the first `size` characters of `s` as UTF-16 bytes.

    `byteorder` of 'native' writes a BOM first and then encodes in the
    platform BYTEORDER; otherwise no BOM is written and units are
    stored via _STORECHAR with the given byte order.  An empty input
    yields just the BOM in native mode and "" otherwise.

    Code points >= 0x10000 are written as a surrogate pair.  A code
    point in 0xD800-0xDFFF is written as-is when `allow_surrogates` is
    true; otherwise `errorhandler` is called as
    errorhandler(errors, public_encoding_name, message, s, start, end)
    and must return (unicode_replacement, bytes_replacement, new_pos).
    A non-None bytes replacement is appended verbatim (its length must
    be even); otherwise each replacement character must be below
    0xD800.  Violations are reported by calling the handler again with
    'strict'.

    `errorhandler` defaults to default_unicode_error_encode.
    Returns the built byte string.
    """
    if errorhandler is None:
        errorhandler = default_unicode_error_encode

    with_bom = byteorder == 'native'

    if size == 0:
        if not with_bom:
            return ""
        bom_only = StringBuilder(2)
        _STORECHAR(bom_only, 0xFEFF, BYTEORDER)
        return bom_only.build()

    # Two bytes per input character plus room for a BOM.
    result = StringBuilder(size * 2 + 2)
    if with_bom:
        _STORECHAR(result, 0xFEFF, BYTEORDER)
        byteorder = BYTEORDER

    pos = 0
    while pos < size:
        code = ord(s[pos])
        pos += 1

        if code >= 0x10000:
            # Outside the BMP: split into high and low surrogate.
            offset = code - 0x10000
            _STORECHAR(result, 0xD800 | (offset >> 10), byteorder)
            _STORECHAR(result, 0xDC00 | (offset & 0x3FF), byteorder)
            continue

        if code < 0xD800 or code >= 0xE000 or allow_surrogates:
            _STORECHAR(result, code, byteorder)
            continue

        # Lone surrogate that is not allowed: ask the error handler.
        # Note that `pos` is rebound to the position it returns.
        repl_unicode, repl_bytes, pos = errorhandler(
            errors, public_encoding_name,
            'surrogates not allowed',
            s, pos - 1, pos)

        if repl_bytes is not None:
            # py3k only
            if len(repl_bytes) & 1:
                # An odd number of bytes cannot be whole UTF-16 units.
                errorhandler('strict', public_encoding_name,
                             'surrogates not allowed',
                             s, pos - 1, pos)
            result.append(repl_bytes)
            continue

        for repl_char in repl_unicode:
            repl_code = ord(repl_char)
            if repl_code < 0xD800:
                _STORECHAR(result, repl_code, byteorder)
            else:
                errorhandler('strict', public_encoding_name,
                             'surrogates not allowed',
                             s, pos - 1, pos)

    return result.build()
+
def unicode_encode_utf_16(s, size, errors,
                          errorhandler=None,
                          allow_surrogates=True):
    """Encode `s` as UTF-16 using the "native" byte-order mode.

    Delegates to unicode_encode_utf_16_helper, reporting errors under
    the encoding name 'utf-16-' + BYTEORDER2.
    """
    encoding_name = 'utf-16-' + BYTEORDER2
    return unicode_encode_utf_16_helper(
        s, size, errors, errorhandler, allow_surrogates,
        "native", encoding_name)
+
def unicode_encode_utf_16_be(s, size, errors,
                             errorhandler=None,
                             allow_surrogates=True):
    """Encode `s` as UTF-16 with the byte order set to "big".

    Delegates to unicode_encode_utf_16_helper, reporting errors under
    the encoding name 'utf-16-be'.
    """
    return unicode_encode_utf_16_helper(
        s, size, errors, errorhandler, allow_surrogates,
        "big", 'utf-16-be')
+
def unicode_encode_utf_16_le(s, size, errors,
                             errorhandler=None,
                             allow_surrogates=True):
    """Encode `s` as UTF-16 with the byte order set to "little".

    Delegates to unicode_encode_utf_16_helper, reporting errors under
    the encoding name 'utf-16-le'.
    """
    return unicode_encode_utf_16_helper(
        s, size, errors, errorhandler, allow_surrogates,
        "little", 'utf-16-le')
+
+
+# ____________________________________________________________
# utf-32
def str_decode_utf_32(s, size, errors, final=True,
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit