On Wed, May 25, 2011 at 22:39, Ivan Krasilnikov <inf...@gmail.com> wrote:
> Hi, here's my patch for mbyte.c and a few testcases.
>
> I've eliminated those return -1's by doing a bytewise comparison of
> strings after the first corrupted character. This should make the
> comparisons transitive at least.
>

My previous patch had a bug: it checked for utf_ptr2char()'s failure
incorrectly. A fixed patch and more tests in Vim script, suitable for
src/testdir/, are attached.

-- 
You received this message from the "vim_use" maillist.
Do not top-post! Type your reply below the text you are replying to.
For more information, visit http://www.vim.org/maillist.php
diff --git a/src/mbyte.c b/src/mbyte.c
index ce0c897..7ae7e78 100644
--- a/src/mbyte.c
+++ b/src/mbyte.c
@@ -1701,6 +1701,70 @@ utf_ptr2char(p)
 }
 
 /*
+ * Converts a UTF-8 byte sequence to a wide character.
+ * String is assumed to be terminated by NUL or after n bytes,
+ * whichever comes first.
+ * The function is safe in the sense that it never accesses
+ * memory beyond the first n bytes of s.
+ *
+ * On success, returns decoded codepoint, advances s to
+ * the beginning of next character and decreases n accordingly.
+ *
+ * If end of string was reached, returns 0 and, if n > 0,
+ * advances s past the NUL byte and decreases n by one.
+ *
+ * If byte sequence is illegal or incomplete, returns -1 and
+ * leaves both s and n unchanged.
+ */
+    static int
+utf_safe_read_char(s, n)
+    char_u      **s;        /* in/out: current read position */
+    size_t      *n;         /* in/out: number of bytes left at *s */
+{
+    int         c, k;       /* decoded character; length from the table */
+
+    if (*n == 0)
+    {
+        /* no bytes left in the buffer: report end of string */
+        return 0;
+    }
+
+    k = utf8len_tab_zero[**s];
+
+    if (k == 1)
+    {
+        /* single byte (ASCII or NUL): consume it and return its value */
+        (*n)--;
+        return *(*s)++;
+    }
+
+    if (k <= *n)
+    {
+        /* Either the whole k-byte sequence lies within the remaining n
+         * bytes, so utf_ptr2char() is safe to use, or the first byte is
+         * illegal (k=0), and it's also safe to use utf_ptr2char(). */
+
+        c = utf_ptr2char(*s);
+
+        /* utf_ptr2char() returns the first byte on failure, so a different
+         * value means success. U+00C3 (UTF-8: 0xC3 0x83) is the one non-ASCII
+         * character equal to its own first byte, hence the extra test;
+         * (*s)[1] is readable there, else we would have k=2 > n.
+         * NOTE(review): a decoded 0 would look like end of string - confirm. */
+        if (c != (int)(**s) || (c == 0xC3 && (*s)[1] == 0x83))
+        {
+            /* decoded successfully: step over the k bytes just used */
+            *s += k;
+            *n -= k;
+            return c;
+        }
+    }
+
+    /* incomplete (k > n) or illegal sequence: *s and *n are left untouched */
+    return -1;
+}
+
+/*
  * Get character at **pp and advance *pp to the next character.
  * Note: composing characters are skipped!
  */
@@ -3079,6 +3143,80 @@ utf_isupper(a)
     return (utf_tolower(a) != a);
 }
 
+    static int
+utf_strnicmp(s1, s2, n1, n2)
+    char_u      *s1, *s2;       /* strings to compare */
+    size_t      n1, n2;         /* byte limits for s1 and s2 */
+{
+    int         c1, c2, cdiff;
+    char_u      buffer[6];      /* UTF-8 bytes of one folded character */
+
+    for (;;)
+    {
+        c1 = utf_safe_read_char(&s1, &n1);
+        c2 = utf_safe_read_char(&s2, &n2);
+
+        if (c1 <= 0 || c2 <= 0)
+            break;              /* 0: string ended, -1: bad byte sequence */
+
+        if (c1 == c2)
+            continue;           /* identical characters need no folding */
+
+        cdiff = utf_fold(c1) - utf_fold(c2);
+        if (cdiff != 0)
+            return cdiff;       /* difference of the folded characters */
+    }
+
+    /* Here c1 or c2 is 0 (string ended) or -1 (incomplete/illegal bytes). */
+
+    if (c1 == 0 || c2 == 0)
+    {
+        /* At least one string ended: equal if both did, else shorter first. */
+        if (c1 == 0 && c2 == 0)
+            return 0;
+        return c1 == 0 ? -1 : 1;
+    }
+
+    /* Neither string ended, but at least one has an incomplete/illegal
+     * sequence.  Fall back to a bytewise comparison without case folding;
+     * this is intended to keep comparisons using this function transitive.
+     *
+     * If only one string failed, the other's character was already consumed:
+     * use its folded UTF-8 bytes; one character is enough to decide. */
+
+    if (c1 != -1 && c2 == -1)
+    {
+        n1 = utf_char2bytes(utf_fold(c1), buffer);
+        s1 = buffer;            /* the rest of the original s1 is not used */
+    }
+    else if (c2 != -1 && c1 == -1)
+    {
+        n2 = utf_char2bytes(utf_fold(c2), buffer);
+        s2 = buffer;            /* the rest of the original s2 is not used */
+    }
+
+    while (n1 > 0 && n2 > 0 && *s1 != NUL && *s2 != NUL)
+    {
+        cdiff = (int)(*s1) - (int)(*s2);    /* raw byte difference */
+        if (cdiff != 0)
+            return cdiff;
+
+        s1++;
+        s2++;
+        n1--;
+        n2--;
+    }
+
+    if (n1 > 0 && *s1 == NUL)
+        n1 = 0;                 /* a NUL ends s1 before its byte limit */
+    if (n2 > 0 && *s2 == NUL)
+        n2 = 0;                 /* a NUL ends s2 before its byte limit */
+
+    if (n1 == 0 && n2 == 0)
+        return 0;               /* both ran out together: equal */
+    return n1 == 0 ? -1 : 1;    /* the string that ran out first is smaller */
+}
+
 /*
  * Version of strnicmp() that handles multi-byte characters.
  * Needed for Big5, Sjift-JIS and UTF-8 encoding.  Other DBCS encodings can
@@ -3097,44 +3235,17 @@ mb_strnicmp(s1, s2, nn)
     int		incomplete = FALSE;
     int		n = (int)nn;
 
-    for (i = 0; i < n; i += l)
+    if (enc_utf8)
     {
-	if (s1[i] == NUL && s2[i] == NUL)   /* both strings end */
-	    return 0;
-	if (enc_utf8)
-	{
-	    l = utf_byte2len(s1[i]);
-	    if (l > n - i)
-	    {
-		l = n - i;		    /* incomplete character */
-		incomplete = TRUE;
-	    }
-	    /* Check directly first, it's faster. */
-	    for (j = 0; j < l; ++j)
-	    {
-		if (s1[i + j] != s2[i + j])
-		    break;
-		if (s1[i + j] == 0)
-		    /* Both stings have the same bytes but are incomplete or
-		     * have illegal bytes, accept them as equal. */
-		    l = j;
-	    }
-	    if (j < l)
-	    {
-		/* If one of the two characters is incomplete return -1. */
-		if (incomplete || i + utf_byte2len(s2[i]) > n)
-		    return -1;
-		/* Don't case-fold illegal bytes or truncated characters. */
-		if (utf_ptr2len(s1 + i) < l || utf_ptr2len(s2 + i) < l)
-		    return -1;
-		cdiff = utf_fold(utf_ptr2char(s1 + i))
-					     - utf_fold(utf_ptr2char(s2 + i));
-		if (cdiff != 0)
-		    return cdiff;
-	    }
-	}
-	else
-	{
+        return utf_strnicmp(s1, s2, nn, nn);
+    }
+    else
+    {
+        for (i = 0; i < n; i += l)
+        {
+	    if (s1[i] == NUL && s2[i] == NUL)   /* both strings end */
+	        return 0;
+
 	    l = (*mb_ptr2len)(s1 + i);
 	    if (l <= 1)
 	    {
diff --git a/src/testdir/testXX.in b/src/testdir/testXX.in
new file mode 100644
index 0000000..63a81e7
--- /dev/null
+++ b/src/testdir/testXX.in
@@ -0,0 +1,93 @@
+Tests for case-insensitive UTF-8 comparisons (utf_strnicmp() in mbyte.c)
+
+STARTTEST
+:so small.vim
+:if !has("multi_byte")
+: e! test.ok
+: w! test.out
+: qa!
+:endif
+:set enc=utf8
+ggdG
+:
+:function! Ch(a, op, b, expected)
+:  if eval(printf('"%s" %s "%s"', a:a, a:op, a:b)) != a:expected
+:    call append(line('$'), printf('"%s" %s "%s" should return %d', a:a, a:op, a:b, a:expected))
+:  else
+:    let b:passed += 1
+:  endif
+:endfunction
+:
+:function! Chk(a, b, result)
+:  if a:result == 0
+:    call Ch(a:a, '==?', a:b, 1)
+:    call Ch(a:a, '!=?', a:b, 0)
+:    call Ch(a:a, '<=?', a:b, 1)
+:    call Ch(a:a, '>=?', a:b, 1)
+:    call Ch(a:a, '<?', a:b, 0)
+:    call Ch(a:a, '>?', a:b, 0)
+:  elseif a:result > 0
+:    call Ch(a:a, '==?', a:b, 0)
+:    call Ch(a:a, '!=?', a:b, 1)
+:    call Ch(a:a, '<=?', a:b, 0)
+:    call Ch(a:a, '>=?', a:b, 1)
+:    call Ch(a:a, '<?', a:b, 0)
+:    call Ch(a:a, '>?', a:b, 1)
+:  else
+:    call Ch(a:a, '==?', a:b, 0)
+:    call Ch(a:a, '!=?', a:b, 1)
+:    call Ch(a:a, '<=?', a:b, 1)
+:    call Ch(a:a, '>=?', a:b, 0)
+:    call Ch(a:a, '<?', a:b, 1)
+:    call Ch(a:a, '>?', a:b, 0)
+:  endif
+:endfunction
+:
+:function! Check(a, b, result)
+:  call Chk(a:a, a:b, a:result)
+:  call Chk(a:b, a:a, -a:result)
+:endfunction
+:
+:function! LT(a, b)
+:  call Check(a:a, a:b, -1)
+:endfunction
+:
+:function! GT(a, b)
+:  call Check(a:a, a:b, 1)
+:endfunction
+:
+:function! EQ(a, b)
+:  call Check(a:a, a:b, 0)
+:endfunction
+:
+:let b:passed=0
+:call EQ('', '')
+:call LT('', 'a')
+:call EQ('abc', 'abc')
+:call EQ('Abc', 'abC')
+:call LT('ab', 'abc')
+:call LT('AB', 'abc')
+:call LT('ab', 'aBc')
+:call EQ('\xd0\xb9\xd1\x86\xd1\x83\xd0\xba\xd0\xb5\xd0\xbd', '\xd0\xb9\xd0\xa6\xd0\xa3\xd0\xba\xd0\x95\xd0\xbd')
+:call LT('\xd0\xb9\xd1\x86\xd1\x83\xd0\xba\xd0\xb5\xd0\xbd', '\xd0\xaf\xd1\x86\xd1\x83\xd0\xba\xd0\xb5\xd0\xbd')
+:call EQ('\xe2\x84\xaa', 'k')
+:call LT('\xe2\x84\xaa', 'kkkkkk')
+:call EQ('\xe2\x84\xaa\xe2\x84\xaa\xe2\x84\xaa', 'kkk')
+:call LT('kk', '\xe2\x84\xaa\xe2\x84\xaa\xe2\x84\xaa')
+:call EQ('\xe2\x84\xaa\xe2\x84\xa6k\xe2\x84\xaak\xcf\x89', 'k\xcf\x89\xe2\x84\xaakk\xe2\x84\xa6')
+:call EQ('Abc\x80', 'AbC\x80')
+:call LT('Abc\x80', 'AbC\x81')
+:call LT('Abc', 'AbC\x80')
+:call LT('abc\x80DEF', 'abc\x80def')  " case folding stops at the first bad character
+:call LT('\xc3XYZ', '\xc3xyz')
+:call EQ('\xef\xbc\xba', '\xef\xbd\x9a')  " FF3A (upper), FF5A (lower)
+:call GT('\xef\xbc\xba', '\xef\xbc\xff')  " first string is ok and equals \xef\xbd\x9a after folding, second string is illegal and was left unchanged, then the strings were bytewise compared
+:call LT('\xc3', '\xc3\x83')
+:call EQ('\xc3\xa3xYz', '\xc3\x83XyZ')
+:for n in range(0x60, 0xFF) | call LT(printf('xYz\x%.2X', n-1), printf('XyZ\x%.2X', n)) | endfor
+:for n in range(0x80, 0xBF) | call EQ(printf('xYz\xc2\x%.2XUvW', n), printf('XyZ\xc2\x%.2XuVw', n)) | endfor
+:for n in range(0xC0, 0xFF) | call LT(printf('xYz\xc2\x%.2XUvW', n), printf('XyZ\xc2\x%.2XuVw', n)) | endfor
+:call append(0, printf('%d checks passed', b:passed))
+:wq! test.out
+ENDTEST
+

Reply via email to