Author: Richard Plangger <[email protected]>
Branch: py3.5-text-utf8
Changeset: r90389:08bd2beb79dc
Date: 2017-02-27 12:39 +0100
http://bitbucket.org/pypy/pypy/changeset/08bd2beb79dc/
Log: (mjacob, plan_rich) forbid surrogates while encoding to utf-8 in
W_UnicodeObject.text_w
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -8,7 +8,7 @@
from rpython.rlib.runicode import (
make_unicode_escape_function, str_decode_ascii, str_decode_utf_8,
unicode_encode_ascii, unicode_encode_utf_8, fast_str_decode_ascii,
- unicode_encode_utf8sp)
+ unicode_encode_utf8_forbid_surrogates, SurrogateError)
from rpython.rlib import jit
from pypy.interpreter import unicodehelper
@@ -82,7 +82,8 @@
def text_w(self, space):
identifier = jit.conditional_call_elidable(
- self._utf8, g_encode_utf8, self._value)
+ self._utf8, g_encode_utf8, space, self,
+ self._value)
if not jit.isconstant(self):
self._utf8 = identifier
return identifier
@@ -1255,9 +1256,17 @@
return u''.join(result)
@jit.elidable
-def g_encode_utf8(value):
+def g_encode_utf8(space, w_value, value):
"""This is a global function because of jit.conditional_call_value"""
- return unicode_encode_utf8sp(value, len(value))
+ try:
+ return unicode_encode_utf8_forbid_surrogates(value, len(value))
+ except SurrogateError as e:
+ raise OperationError(space.w_UnicodeEncodeError,
+ space.newtuple([space.newtext('utf-8'),
+ w_value,
+ space.newint(e.index-1),
+ space.newint(e.index),
+ space.newtext("surrogates not allowed")]))
_repr_function, _ = make_unicode_escape_function(
pass_printable=True, unicode_output=True, quotes=True, prefix='')
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -433,7 +433,9 @@
return result.build()
class SurrogateError(Exception):
- pass
+ def __init__(self, char, index):
+ self.char = char
+ self.index = index
def unicode_encode_utf8_forbid_surrogates(s, size):
# Strict surrogate-forbidding utf-8 encoding. Any surrogate character
@@ -454,7 +456,7 @@
result.append(chr((0x80 | (ch & 0x3f))))
elif ch < 0x10000:
if 0xD800 <= ch <= 0xDFFF:
- raise SurrogateError
+ raise SurrogateError(ch, pos)
# Encode UCS2 Unicode ordinals
result.append((chr((0xe0 | (ch >> 12)))))
result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit