Author: Armin Rigo <[email protected]>
Branch: 
Changeset: r90569:9ca0089bd94f
Date: 2017-03-05 16:34 +0100
http://bitbucket.org/pypy/pypy/changeset/9ca0089bd94f/

Log:    Checked in by mistake in the py3.5 branch. Copy it to default

diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -432,6 +432,39 @@
             _encodeUCS4(result, ch)
     return result.build()
 
+class SurrogateError(Exception):  # raised by the strict utf-8 encoder
+    def __init__(self, char, index):
+        self.char = char    # ordinal (int) of the rejected surrogate
+        self.index = index  # the utf-8 encoder passes its position + 1
+
+def unicode_encode_utf8_forbid_surrogates(s, size):
+    # Strict surrogate-forbidding utf-8 encoding of s[0:size].  Any surrogate
+    # (0xD800-0xDFFF) raises an interp-level SurrogateError, even on 16-bit hosts.
+    # --- XXX check in detail what occurs on 16-bit hosts in PyPy 3 ---
+    assert(size >= 0)
+    result = StringBuilder(size)  # presumably a capacity hint -- TODO confirm
+    pos = 0
+    while pos < size:
+        ch = ord(s[pos])
+        pos += 1  # from here on, pos is one past the character in 'ch'
+        if ch < 0x80:
+            # Encode ASCII: a single byte
+            result.append(chr(ch))
+        elif ch < 0x0800:
+            # Encode 0x80-0x7FF (Latin-1 and beyond): two bytes
+            result.append(chr((0xc0 | (ch >> 6))))
+            result.append(chr((0x80 | (ch & 0x3f))))
+        elif ch < 0x10000:
+            if 0xD800 <= ch <= 0xDFFF:
+                raise SurrogateError(ch, pos)  # pos: index just past 'ch'
+            # Encode the other UCS2 ordinals (0x800-0xFFFF): three bytes
+            result.append((chr((0xe0 | (ch >> 12)))))
+            result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
+            result.append((chr((0x80 | (ch & 0x3f)))))
+        else:
+            _encodeUCS4(result, ch)  # ch >= 0x10000; helper not shown here
+    return result.build()
+
 # ____________________________________________________________
 # utf-16
 
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to