Author: Armin Rigo <[email protected]>
Branch:
Changeset: r90569:9ca0089bd94f
Date: 2017-03-05 16:34 +0100
http://bitbucket.org/pypy/pypy/changeset/9ca0089bd94f/
Log: Checked in by mistake in the py3.5 branch. Copy it to default
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -432,6 +432,39 @@
_encodeUCS4(result, ch)
return result.build()
class SurrogateError(Exception):
    """Signals that a surrogate code point was met while encoding.

    The constructor stores its two arguments unchanged:

    char  -- as raised by unicode_encode_utf8_forbid_surrogates() in this
             file, the integer ordinal of the offending character.
    index -- as raised by that same function, the position just *after*
             the offending character (the loop counter has already been
             advanced when the exception is built).
    """

    def __init__(self, char, index):
        # Exception.__init__ is not called; only the two attributes
        # below are set on the instance.
        self.char = char
        self.index = index

def unicode_encode_utf8_forbid_surrogates(s, size):
    """Encode the first `size` items of `s` as UTF-8, refusing surrogates.

    s    -- indexable sequence of characters; each item is passed to ord().
    size -- number of leading items of `s` to encode; must be >= 0.

    Returns whatever ``StringBuilder.build()`` returns for the accumulated
    output (presumably the encoded byte string -- StringBuilder is defined
    outside this block).

    Raises SurrogateError(ch, pos) on the first ordinal in the range
    0xD800..0xDFFF.  Note that ``pos`` has already been incremented at
    that point, so the reported index is the offset of the surrogate
    plus one.
    """
    # Strict surrogate-forbidding utf-8 encoding.  Any surrogate character
    # raises an interp-level SurrogateError, even on 16-bit hosts.
    # --- XXX check in detail what occurs on 16-bit hosts in PyPy 3 ---
    assert size >= 0
    # `size` is handed to the builder's constructor (presumably as an
    # initial capacity hint -- confirm against StringBuilder).
    result = StringBuilder(size)
    pos = 0
    while pos < size:
        ch = ord(s[pos])
        pos += 1
        if ch < 0x80:
            # One byte: plain ASCII.
            result.append(chr(ch))
        elif ch < 0x0800:
            # Two bytes: U+0080..U+07FF (110xxxxx 10xxxxxx).
            result.append(chr(0xc0 | (ch >> 6)))
            result.append(chr(0x80 | (ch & 0x3f)))
        elif ch < 0x10000:
            # Surrogates live inside the three-byte range, so they must
            # be rejected before anything is appended for this character.
            if 0xD800 <= ch <= 0xDFFF:
                # `pos` already points past the surrogate here.
                raise SurrogateError(ch, pos)
            # Three bytes: the rest of the BMP (1110xxxx 10xxxxxx 10xxxxxx).
            result.append(chr(0xe0 | (ch >> 12)))
            result.append(chr(0x80 | ((ch >> 6) & 0x3f)))
            result.append(chr(0x80 | (ch & 0x3f)))
        else:
            # Ordinals >= 0x10000 are delegated to the module's helper.
            _encodeUCS4(result, ch)
    return result.build()

# ____________________________________________________________
# utf-16
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit