Author: fijal
Branch: unicode-utf8
Changeset: r93138:9ede67aee27e
Date: 2017-11-23 15:49 +0100
http://bitbucket.org/pypy/pypy/changeset/9ede67aee27e/
Log: Utf8StringBuilder
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -16,9 +16,11 @@
"""
import sys
-from rpython.rlib.objectmodel import enforceargs, we_are_translated
+from rpython.rlib.objectmodel import enforceargs, we_are_translated, specialize
from rpython.rlib.rstring import StringBuilder
from rpython.rlib import jit
+from rpython.rlib.signature import signature
+from rpython.rlib.types import char, none
from rpython.rlib.rarithmetic import r_uint
from rpython.rlib.unicodedata import unicodedb
from rpython.rtyper.lltypesystem import lltype, rffi
@@ -316,6 +318,11 @@
return res, flag
raise CheckError(~res)
+def get_utf8_length_flag(s):
+ """ Get the length and flag out of valid utf8.
+
+ For now this just calls check_utf8(s, True) and returns its result
+ unchanged; callers unpack that result as a (length, flag) pair.
+ NOTE(review): the True argument is presumably allow_surrogates
+ (compare the parameter list of _check_utf8) -- confirm.
+ """
+ return check_utf8(s, True)
+
@jit.elidable
def _check_utf8(s, allow_surrogates, start, stop):
pos = start
@@ -655,6 +662,53 @@
return unicode_escape #, char_escape_helper
+class Utf8StringBuilder(object):
+ # Builder for utf8 strings: wraps a StringBuilder and keeps a running
+ # length (_lgt) and a combined flag (_flag) for everything appended,
+ # so get_length() and get_flag() simply return the stored values.
+ def __init__(self, size=0):
+ # size is handed unchanged to StringBuilder
+ self._s = StringBuilder(size)
+ self._lgt = 0
+ # nothing appended yet, so the flag starts out as FLAG_ASCII
+ self._flag = FLAG_ASCII
+
+ def append(self, s):
+ # for strings
+ # The length and flag of s come from get_utf8_length_flag(), whose
+ # docstring asks for valid utf8; they are folded into the running
+ # totals with += and combine_flags().
+ self._s.append(s)
+ newlgt, newflag = get_utf8_length_flag(s)
+ self._lgt += newlgt
+ self._flag = combine_flags(self._flag, newflag)
+
+ # The signature decorator declares one char argument and a None result.
+ @signature(char(), returns=none())
+ def append_char(self, s):
+ # for characters, ascii
+ # Only the length is bumped; _flag is left untouched, so a
+ # non-ascii byte passed here would not be reflected in get_flag().
+ self._lgt += 1
+ self._s.append(s)
+
+ def append_code(self, code):
+ # Append a single code point given as an integer: merge its flag
+ # (get_flag_from_code) into _flag, count it as length 1 and let
+ # unichr_as_utf8_append() write its encoding into the builder.
+ # NOTE(review): the trailing True is presumably allow_surrogates --
+ # confirm against unichr_as_utf8_append.
+ self._flag = combine_flags(self._flag, get_flag_from_code(code))
+ self._lgt += 1
+ unichr_as_utf8_append(self._s, code, True)
+
+ def build(self):
+ # Return whatever the underlying StringBuilder has accumulated.
+ return self._s.build()
+
+ def get_flag(self):
+ # Combined flag of everything appended so far.
+ return self._flag
+
+ def get_length(self):
+ # Accumulated length of everything appended so far.
+ return self._lgt
+
+class Utf8StringIterator(object):
+ # Steps through a utf8 string one code point at a time, tracking a
+ # position (_pos) into the string. Usage is done()/next() polling;
+ # no __iter__ is defined in this class.
+ def __init__(self, utf8s):
+ self._utf8 = utf8s
+ # end position is len() of the string passed in
+ self._end = len(utf8s)
+ self._pos = 0
+
+ def done(self):
+ # True once the position has reached the end of the string.
+ return self._pos == self._end
+
+ def next(self):
+ # Return the code point at the current position, then advance
+ # the position with next_codepoint_pos().
+ # There is no end-of-string check here: callers are expected to
+ # test done() first. NOTE(review): what happens past the end
+ # depends on codepoint_at_pos -- confirm.
+ ret = codepoint_at_pos(self._utf8, self._pos)
+ self._pos = next_codepoint_pos(self._utf8, self._pos)
+ return ret
+
def decode_latin_1(s):
if len(s) == 0:
return s
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -139,3 +139,39 @@
result = rutf8.surrogate_in_utf8(uni)
expected = any(uch for uch in unichars if u'\ud800' <= uch <= u'\udfff')
assert result == expected
+
+# Property test: for text generated by strategies.text(),
+# get_utf8_length_flag() on the utf8 encoding must report len(u) as the
+# length, and FLAG_REGULAR if any character is above 0x7F, else FLAG_ASCII.
+# NOTE(review): assumes strategies.text() never yields surrogates, since
+# only FLAG_ASCII / FLAG_REGULAR are expected here -- confirm.
+@given(strategies.text())
+def test_get_utf8_length_flag(u):
+ exp_lgt = len(u)
+ exp_flag = rutf8.FLAG_ASCII
+ # any character outside the 7-bit range switches the expected flag
+ for c in u:
+ if ord(c) > 0x7F:
+ exp_flag = rutf8.FLAG_REGULAR
+ lgt, flag = rutf8.get_utf8_length_flag(u.encode('utf8'))
+ assert lgt == exp_lgt
+ assert flag == exp_flag
+
+def test_utf8_string_builder():
+ # Exercises Utf8StringBuilder: append / append_char / append_code and
+ # the length and flag they accumulate.
+ s = rutf8.Utf8StringBuilder()
+ # ascii only: flag stays FLAG_ASCII, length is 3 + 1
+ s.append("foo")
+ s.append_char("x")
+ assert s.get_flag() == rutf8.FLAG_ASCII
+ assert s.get_length() == 4
+ assert s.build() == "foox"
+ # one non-ascii character: length grows by 1, flag becomes FLAG_REGULAR
+ s.append(u"\u1234".encode("utf8"))
+ assert s.get_flag() == rutf8.FLAG_REGULAR
+ assert s.get_length() == 5
+ assert s.build().decode("utf8") == u"foox\u1234"
+ # further ascii appends (after build() was already called) keep
+ # the FLAG_REGULAR flag
+ s.append("foo")
+ s.append_char("x")
+ assert s.get_flag() == rutf8.FLAG_REGULAR
+ assert s.get_length() == 9
+ assert s.build().decode("utf8") == u"foox\u1234foox"
+ # fresh builder: append_code with an integer code point
+ s = rutf8.Utf8StringBuilder()
+ s.append_code(0x1234)
+ assert s.build().decode("utf8") == u"\u1234"
+ assert s.get_flag() == rutf8.FLAG_REGULAR
+ assert s.get_length() == 1
+ # 0xD800 is accepted by append_code and is expected to switch the
+ # flag to FLAG_HAS_SURROGATES
+ s.append_code(0xD800)
+ assert s.get_flag() == rutf8.FLAG_HAS_SURROGATES
+ assert s.get_length() == 2
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit