Author: fijal
Branch: unicode-utf8
Changeset: r92604:f06c4111345a
Date: 2017-10-05 10:28 +0200
http://bitbucket.org/pypy/pypy/changeset/f06c4111345a/
Log: merge
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -95,6 +95,8 @@
"""
pos = r_uint(pos)
pos -= 1
+ if pos >= len(code): # for the case where pos - 1 == len(code):
+ return pos # assume there is an extra '\x00' character
chr1 = ord(code[pos])
if chr1 <= 0x7F:
return pos
@@ -347,6 +349,16 @@
assert pos == len(s)
return pos - continuation_bytes
+@jit.elidable
+def surrogate_in_utf8(value):
+ """Check if the UTF-8 byte string 'value' contains a surrogate.
+ The 'value' argument must be otherwise correctly formed for UTF-8.
+ """
+ for i in range(len(value) - 2):
+ if value[i] == '\xed' and value[i + 1] >= '\xa0':
+ return True
+ return False
+
UTF8_INDEX_STORAGE = lltype.GcArray(lltype.Struct(
'utf8_loc',
@@ -367,9 +379,9 @@
""" Create an index storage which stores index of each 4th character
in utf8 encoded unicode string.
"""
- if len(utf8) == utf8len <= ASCII_INDEX_STORAGE_BLOCKS * 64:
+ if len(utf8) == utf8len < ASCII_INDEX_STORAGE_BLOCKS * 64:
return ASCII_INDEX_STORAGE
- arraysize = (utf8len + 63) // 64
+ arraysize = utf8len // 64 + 1
storage = lltype.malloc(UTF8_INDEX_STORAGE, arraysize)
baseindex = 0
current = 0
@@ -377,10 +389,14 @@
storage[current].baseindex = baseindex
next = baseindex
for i in range(16):
- next = next_codepoint_pos(utf8, next)
+ if utf8len == 0:
+ next += 1 # assume there is an extra '\x00' character
+ else:
+ next = next_codepoint_pos(utf8, next)
storage[current].ofs[i] = chr(next - baseindex)
utf8len -= 4
- if utf8len <= 0:
+ if utf8len < 0:
+ assert current + 1 == len(storage)
break
next = next_codepoint_pos(utf8, next)
next = next_codepoint_pos(utf8, next)
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -93,8 +93,17 @@
ord(item))
@given(strategies.text())
+@example(u'x' * 64 * 5)
+@example(u'x' * (64 * 5 - 1))
def test_codepoint_position_at_index(u):
index = rutf8.create_utf8_index_storage(u.encode('utf8'), len(u))
- for i in range(len(u)):
+ for i in range(len(u) + 1):
assert (rutf8.codepoint_position_at_index(u.encode('utf8'), index, i)
==
len(u[:i].encode('utf8')))
+
+@given(strategies.lists(strategies.characters()))
+def test_surrogate_in_utf8(unichars):
+ uni = u''.join(unichars).encode('utf-8')
+ result = rutf8.surrogate_in_utf8(uni)
+ expected = any(uch for uch in unichars if u'\ud800' <= uch <= u'\udfff')
+ assert result == expected
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit