[pypy-commit] pypy unicode-utf8: fix rsplit

fijal Sat, 04 Mar 2017 13:20:52 -0800

Author: fijal
Branch: unicode-utf8
Changeset: r90542:e30fd70a9177
Date: 2017-03-04 21:23 +0100
http://bitbucket.org/pypy/pypy/changeset/e30fd70a9177/


Log:    fix rsplit

diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -123,6 +123,8 @@
         return rutf8.compute_length_utf8(self._utf8)
 
     def _val(self, space):
+        import pdb
+        pdb.set_trace()
         return self._utf8.decode('utf8')
 
     @staticmethod
diff --git a/rpython/rlib/rstring.py b/rpython/rlib/rstring.py
--- a/rpython/rlib/rstring.py
+++ b/rpython/rlib/rstring.py
@@ -35,6 +35,8 @@
     from rpython.rlib.rutf8 import next_codepoint_pos
 
     if isutf8:
+        if pos == -1:
+            return 0
         return next_codepoint_pos(s, pos)
     else:
         return pos + 1
@@ -44,6 +46,8 @@
     from rpython.rlib.rutf8 import prev_codepoint_pos
 
     if isutf8:
+        if pos == 0:
+            return -1
         return prev_codepoint_pos(s, pos)
     else:
         return pos - 1
@@ -139,7 +143,7 @@
         while True:
             # starting from the end, find the end of the next word
             while i >= 0:
-                if not _isspace(value, i):
+                if not _isspace(value, i, isutf8):
                     break   # found
                 i = _decr(value, i, isutf8)
             else:
@@ -151,17 +155,17 @@
                 j = -1   # take all the rest of the string
             else:
                 j = _decr(value, i, isutf8)
-                while j >= 0 and not _isspace(value, j):
+                while j >= 0 and not _isspace(value, j, isutf8):
                     j = _decr(value, j, isutf8)
                 maxsplit -= 1   # NB. if it's already < 0, it stays < 0
 
             # the word is value[j+1:i+1]
+            j1 = _incr(value, j, isutf8)
+            assert j1 >= 0
+            i1 = _incr(value, i, isutf8)
+            res.append(value[j1:i1])
             if j < 0:
-                j1 = 0
-            else:
-                j1 = _incr(value, j, isutf8)
-            assert j1 >= 0
-            res.append(value[j1:i+1])
+                break
 
             # continue to look from the character before the space before the word
             i = _decr(value, j, isutf8)
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -77,10 +77,11 @@
 def prev_codepoint_pos(code, pos):
     """ Gives the position of the previous codepoint
     """
+    pos -= 1
     chr1 = ord(code[pos])
     if chr1 < 0x80:
-        return pos - 1
-    while ord(code[pos]) & 0xC0 == 0xC0:
+        return pos
+    while ord(code[pos]) & 0xC0 == 0x80:
         pos -= 1
     return pos
 
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8: fix rsplit

Reply via email to