Author: Ronan Lamy <[email protected]>
Branch: utf8-io
Changeset: r93179:e509ec2ccea2
Date: 2017-11-26 01:51 +0000
http://bitbucket.org/pypy/pypy/changeset/e509ec2ccea2/
Log: Adapt DecodeBuffer to utf8
diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -11,7 +11,7 @@
from rpython.rlib.rarithmetic import intmask, r_uint, r_ulonglong
from rpython.rlib.rbigint import rbigint
from rpython.rlib.rstring import StringBuilder
-from rpython.rlib.rutf8 import FLAG_ASCII, check_utf8
+from rpython.rlib.rutf8 import FLAG_ASCII, check_utf8, next_codepoint_pos
STATE_ZERO, STATE_OK, STATE_DETACHED = range(3)
@@ -303,7 +303,7 @@
def set(self, space, w_decoded):
check_decoded(space, w_decoded)
- self.text = space.unicode_w(w_decoded)
+ self.text = space.utf8_w(w_decoded)
self.pos = 0
def reset(self):
@@ -312,7 +312,7 @@
def get_chars(self, size):
if self.text is None:
- return u""
+ return ""
available = len(self.text) - self.pos
if size < 0 or size > available:
@@ -341,7 +341,7 @@
if self.exhausted():
raise StopIteration
ch = self.text[self.pos]
- self.pos += 1
+ self.pos = next_codepoint_pos(self.text, self.pos)
return ch
def peek_char(self):
@@ -362,16 +362,16 @@
ch = self.next_char()
except StopIteration:
return False
- if ch == u'\n':
+ if ch == '\n':
return True
- if ch == u'\r':
+ if ch == '\r':
if scanned >= limit:
return False
try:
ch = self.peek_char()
except StopIteration:
return False
- if ch == u'\n':
+ if ch == '\n':
self.next_char()
return True
else:
@@ -388,11 +388,11 @@
except StopIteration:
return False
scanned += 1
- if ch == u'\r':
+ if ch == '\r':
if scanned >= limit:
return False
try:
- if self.peek_char() == u'\n':
+ if self.peek_char() == '\n':
self.next_char()
return True
except StopIteration:
@@ -705,11 +705,11 @@
else:
if self.readtranslate:
# Newlines are already translated, only search for \n
- newline = u'\n'
+ newline = '\n'
else:
# Non-universal mode.
newline = self.readnl
- if newline == u'\r\n':
+ if newline == '\r\n':
return self.decoded.find_crlf(limit)
else:
return self.decoded.find_char(newline[0], limit)
diff --git a/pypy/module/_io/test/test_interp_textio.py b/pypy/module/_io/test/test_interp_textio.py
--- a/pypy/module/_io/test/test_interp_textio.py
+++ b/pypy/module/_io/test/test_interp_textio.py
@@ -38,31 +38,27 @@
@given(st.text())
def test_read_buffer(text):
- buf = DecodeBuffer(text)
- assert buf.get_chars(-1) == text
+ buf = DecodeBuffer(text.encode('utf-8'))
+ assert buf.get_chars(-1) == text.encode('utf-8')
assert buf.exhausted()
@given(st.text(), st.lists(st.integers(min_value=0)))
def test_readn_buffer(text, sizes):
- buf = DecodeBuffer(text)
+ buf = DecodeBuffer(text.encode('utf-8'))
strings = []
for n in sizes:
s = buf.get_chars(n)
if not buf.exhausted():
- assert len(s) == n
+ assert len(s.decode('utf-8')) == n
else:
- assert len(s) <= n
+ assert len(s.decode('utf-8')) <= n
strings.append(s)
- assert ''.join(strings) == text[:sum(sizes)]
+ assert ''.join(strings) == text[:sum(sizes)].encode('utf-8')
@given(st.text())
def test_next_char(text):
- buf = DecodeBuffer(text)
- chars = []
- try:
- while True:
- chars.append(buf.next_char())
- except StopIteration:
- pass
+ buf = DecodeBuffer(text.encode('utf-8'))
+ for i in range(len(text)):
+ ch = buf.next_char()
+ assert ch == text[i].encode('utf-8')[0]
assert buf.exhausted()
- assert u''.join(chars) == text
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit