Author: Philip Jenvey <[email protected]>
Branch: py3k
Changeset: r60646:1b48d48dc26c
Date: 2013-01-28 17:08 -0800
http://bitbucket.org/pypy/pypy/changeset/1b48d48dc26c/

Log:    cpython issue3297: fix parsing of surrogates w/ wide builds

diff --git a/pypy/interpreter/pyparser/parsestring.py b/pypy/interpreter/pyparser/parsestring.py
--- a/pypy/interpreter/pyparser/parsestring.py
+++ b/pypy/interpreter/pyparser/parsestring.py
@@ -1,3 +1,4 @@
+# coding: utf-8
 from pypy.interpreter.error import OperationError
 from pypy.interpreter import unicodehelper
 from rpython.rlib.rstring import StringBuilder
@@ -58,7 +59,10 @@
             # latin-1; So multibyte sequences must be escaped.
             lis = [] # using a list to assemble the value
             end = q
-            # Worst case: "\XX" may become "\u005c\uHHLL" (12 bytes)
+            # Worst case:
+            # "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
+            # "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes),
+            # or ~1:6
             while ps < end:
                 if s[ps] == '\\':
                     lis.append(s[ps])
@@ -70,13 +74,15 @@
                         # instead.
                         lis.append("u005c")
                 if ord(s[ps]) & 0x80: # XXX inefficient
-                    w, ps = decode_utf8(space, s, ps, end, "utf-16-be")
+                    w, ps = decode_utf8(space, s, ps, end, "utf-32-be")
                     rn = len(w)
-                    assert rn % 2 == 0
-                    for i in range(0, rn, 2):
-                        lis.append('\\u')
+                    assert rn % 4 == 0
+                    for i in range(0, rn, 4):
+                        lis.append('\\U')
                         lis.append(hexbyte(ord(w[i])))
                         lis.append(hexbyte(ord(w[i+1])))
+                        lis.append(hexbyte(ord(w[i+2])))
+                        lis.append(hexbyte(ord(w[i+3])))
                 else:
                     lis.append(s[ps])
                     ps += 1
diff --git a/pypy/interpreter/test/test_exec.py b/pypy/interpreter/test/test_exec.py
--- a/pypy/interpreter/test/test_exec.py
+++ b/pypy/interpreter/test/test_exec.py
@@ -199,3 +199,11 @@
         x = ns['x']
         assert len(x) == 6
         assert ord(x[0]) == 0x0439
+
+    def test_issue3297(self):
+        c = compile("a, b = '\U0001010F', '\\U0001010F'", "dummy", "exec")
+        d = {}
+        exec(c, d)
+        assert d['a'] == d['b']
+        assert len(d['a']) == len(d['b'])
+        assert ascii(d['a']) == ascii(d['b'])
_______________________________________________
pypy-commit mailing list
[email protected]
http://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to