[pypy-commit] pypy utf8-io: Adapt DecodeBuffer to utf8

rlamy Sat, 25 Nov 2017 18:41:26 -0800

Author: Ronan Lamy <[email protected]>
Branch: utf8-io
Changeset: r93179:e509ec2ccea2
Date: 2017-11-26 01:51 +0000
http://bitbucket.org/pypy/pypy/changeset/e509ec2ccea2/


Log:    Adapt DecodeBuffer to utf8

diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -11,7 +11,7 @@
 from rpython.rlib.rarithmetic import intmask, r_uint, r_ulonglong
 from rpython.rlib.rbigint import rbigint
 from rpython.rlib.rstring import StringBuilder
-from rpython.rlib.rutf8 import FLAG_ASCII, check_utf8
+from rpython.rlib.rutf8 import FLAG_ASCII, check_utf8, next_codepoint_pos
 
 
 STATE_ZERO, STATE_OK, STATE_DETACHED = range(3)
@@ -303,7 +303,7 @@
 
     def set(self, space, w_decoded):
         check_decoded(space, w_decoded)
-        self.text = space.unicode_w(w_decoded)
+        self.text = space.utf8_w(w_decoded)
         self.pos = 0
 
     def reset(self):
@@ -312,7 +312,7 @@
 
     def get_chars(self, size):
         if self.text is None:
-            return u""
+            return ""
 
         available = len(self.text) - self.pos
         if size < 0 or size > available:
@@ -341,7 +341,7 @@
         if self.exhausted():
             raise StopIteration
         ch = self.text[self.pos]
-        self.pos += 1
+        self.pos = next_codepoint_pos(self.text, self.pos)
         return ch
 
     def peek_char(self):
@@ -362,16 +362,16 @@
                 ch = self.next_char()
             except StopIteration:
                 return False
-            if ch == u'\n':
+            if ch == '\n':
                 return True
-            if ch == u'\r':
+            if ch == '\r':
                 if scanned >= limit:
                     return False
                 try:
                     ch = self.peek_char()
                 except StopIteration:
                     return False
-                if ch == u'\n':
+                if ch == '\n':
                     self.next_char()
                     return True
                 else:
@@ -388,11 +388,11 @@
             except StopIteration:
                 return False
             scanned += 1
-            if ch == u'\r':
+            if ch == '\r':
                 if scanned >= limit:
                     return False
                 try:
-                    if self.peek_char() == u'\n':
+                    if self.peek_char() == '\n':
                         self.next_char()
                         return True
                 except StopIteration:
@@ -705,11 +705,11 @@
         else:
             if self.readtranslate:
                 # Newlines are already translated, only search for \n
-                newline = u'\n'
+                newline = '\n'
             else:
                 # Non-universal mode.
                 newline = self.readnl
-            if newline == u'\r\n':
+            if newline == '\r\n':
                 return self.decoded.find_crlf(limit)
             else:
                 return self.decoded.find_char(newline[0], limit)
diff --git a/pypy/module/_io/test/test_interp_textio.py b/pypy/module/_io/test/test_interp_textio.py
--- a/pypy/module/_io/test/test_interp_textio.py
+++ b/pypy/module/_io/test/test_interp_textio.py
@@ -38,31 +38,27 @@
 
 @given(st.text())
 def test_read_buffer(text):
-    buf = DecodeBuffer(text)
-    assert buf.get_chars(-1) == text
+    buf = DecodeBuffer(text.encode('utf-8'))
+    assert buf.get_chars(-1) == text.encode('utf-8')
     assert buf.exhausted()
 
 @given(st.text(), st.lists(st.integers(min_value=0)))
 def test_readn_buffer(text, sizes):
-    buf = DecodeBuffer(text)
+    buf = DecodeBuffer(text.encode('utf-8'))
     strings = []
     for n in sizes:
         s = buf.get_chars(n)
         if not buf.exhausted():
-            assert len(s) == n
+            assert len(s.decode('utf-8')) == n
         else:
-            assert len(s) <= n
+            assert len(s.decode('utf-8')) <= n
         strings.append(s)
-    assert ''.join(strings) == text[:sum(sizes)]
+    assert ''.join(strings) == text[:sum(sizes)].encode('utf-8')
 
 @given(st.text())
 def test_next_char(text):
-    buf = DecodeBuffer(text)
-    chars = []
-    try:
-        while True:
-            chars.append(buf.next_char())
-    except StopIteration:
-        pass
+    buf = DecodeBuffer(text.encode('utf-8'))
+    for i in range(len(text)):
+        ch = buf.next_char()
+        assert ch == text[i].encode('utf-8')[0]
     assert buf.exhausted()
-    assert u''.join(chars) == text
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy utf8-io: Adapt DecodeBuffer to utf8

Reply via email to