On Wed, May 25, 2011 at 14:09, Bram Moolenaar <b...@moolenaar.net> wrote:
> Yes, this code just returns -1, no matter if the first or second string
> is bigger.
>
> Your other remark about difference in byte length of a character is
> right, but it's not so easy to fix.  Can you suggest a patch?
> Preferably with a test.

Hi, here's my patch for mbyte.c and a few testcases.

I've eliminated those return -1's by doing a bytewise comparison of
strings after the first corrupted character. This should make the
comparisons transitive at least.

-- 
You received this message from the "vim_use" maillist.
Do not top-post! Type your reply below the text you are replying to.
For more information, visit http://www.vim.org/maillist.php
diff --git a/src/mbyte.c b/src/mbyte.c
index ce0c897..1e9d253 100644
--- a/src/mbyte.c
+++ b/src/mbyte.c
@@ -3079,6 +3079,143 @@ utf_isupper(a)
     return (utf_tolower(a) != a);
 }
 
+    static int
+utf_strnicmp(s1, s2, n1, n2)
+    char_u      *s1, *s2;
+    size_t      n1, n2;
+{
+    int         c1, c2, k;
+    char_u      buffer[6];
+
+    for (;;)
+    {
+        /* Try to decode next utf8 character sequence from s1,
+         * put its codepoint into c1, and advance s1.
+         * On errors sets c1=-1, on end of string sets c1=0. */
+        if (n1 != 0)
+        {
+            k = utf8len_tab_zero[*s1];
+            if (k == 1)
+            {
+                c1 = *s1++;             /* ascii, possibly NUL */
+                n1--;
+            }
+            else if (k <= n1)
+            {
+                c1 = utf_ptr2char(s1);
+                if (c1 >= 256)
+                {
+                    s1 += k;            /* success */
+                    n1 -= k;
+                }
+                else
+                {
+                    c1 = -1;            /* incomplete or illegal */
+                }
+            }
+            else
+            {
+                c1 = -1;                /* incomplete */
+            }
+        }
+        else
+        {
+            c1 = 0;                     /* end of string */
+        }
+
+        /* Do the same for the other string */
+        if (n2 != 0)
+        {
+            k = utf8len_tab_zero[*s2];
+            if (k == 1)
+            {
+                c2 = *s2++;             /* ascii, possibly NUL */
+                n2--;
+            }
+            else if (k <= n2)
+            {
+                c2 = utf_ptr2char(s2);
+                if (c2 >= 256)
+                {
+                    s2 += k;            /* success */
+                    n2 -= k;
+                }
+                else
+                {
+                    c2 = -1;            /* incomplete or illegal */
+                }
+            }
+            else
+            {
+                c2 = -1;                /* incomplete */
+            }
+        }
+        else
+        {
+            c2 = 0;                     /* end of string */
+        }
+
+        if (c1 > 0 && c2 > 0)
+        {
+            /* both characters were successfully decoded */
+            k = utf_fold(c1) - utf_fold(c2);
+            if (k != 0)
+                return k;
+            continue;
+        }
+
+        break;
+    }
+
+    if (c1 == 0 || c2 == 0)
+    {
+        /* one of the strings ended. shorter string is always smaller */
+        if (c1 == 0 && c2 == 0)
+            return 0;
+        return c1 == 0 ? -1 : 1;
+    }
+
+    if (!(c1 == -1 && c2 == -1))
+    {
+        /* One of the characters is good and the other is incomplete
+         * or contains an illegal byte. Fold and encode the good one
+         * for bytewise comparison. */
+        if (c1 == -1)
+        {
+            n2 = utf_char2bytes(utf_fold(c2), buffer);
+            s2 = buffer;
+        }
+        else
+        {
+            n1 = utf_char2bytes(utf_fold(c1), buffer);
+            s1 = buffer;
+        }
+    }
+
+    /* do bytewise comparison */
+
+    while (n1 > 0 && n2 > 0 && *s1 != NUL && *s2 != NUL)
+    {
+        k = (int)(*s1) - (int)(*s2);
+        if (k != 0)
+            return k;
+
+        s1++;
+        s2++;
+        n1--;
+        n2--;
+    }
+
+    if (n1 > 0 && *s1 == NUL)
+        n1 = 0;
+    if (n2 > 0 && *s2 == NUL)
+        n2 = 0;
+
+    if (n1 == 0 && n2 == 0)
+        return 0;
+    return n1 == 0 ? -1 : 1;
+}
+
 /*
  * Version of strnicmp() that handles multi-byte characters.
  * Needed for Big5, Sjift-JIS and UTF-8 encoding.  Other DBCS encodings can
@@ -3097,44 +3234,17 @@ mb_strnicmp(s1, s2, nn)
     int		incomplete = FALSE;
     int		n = (int)nn;
 
-    for (i = 0; i < n; i += l)
+    if (enc_utf8)
     {
-	if (s1[i] == NUL && s2[i] == NUL)   /* both strings end */
-	    return 0;
-	if (enc_utf8)
-	{
-	    l = utf_byte2len(s1[i]);
-	    if (l > n - i)
-	    {
-		l = n - i;		    /* incomplete character */
-		incomplete = TRUE;
-	    }
-	    /* Check directly first, it's faster. */
-	    for (j = 0; j < l; ++j)
-	    {
-		if (s1[i + j] != s2[i + j])
-		    break;
-		if (s1[i + j] == 0)
-		    /* Both stings have the same bytes but are incomplete or
-		     * have illegal bytes, accept them as equal. */
-		    l = j;
-	    }
-	    if (j < l)
-	    {
-		/* If one of the two characters is incomplete return -1. */
-		if (incomplete || i + utf_byte2len(s2[i]) > n)
-		    return -1;
-		/* Don't case-fold illegal bytes or truncated characters. */
-		if (utf_ptr2len(s1 + i) < l || utf_ptr2len(s2 + i) < l)
-		    return -1;
-		cdiff = utf_fold(utf_ptr2char(s1 + i))
-					     - utf_fold(utf_ptr2char(s2 + i));
-		if (cdiff != 0)
-		    return cdiff;
-	    }
-	}
-	else
-	{
+        return utf_strnicmp(s1, s2, nn, nn);
+    }
+    else
+    {
+        for (i = 0; i < n; i += l)
+        {
+	    if (s1[i] == NUL && s2[i] == NUL)   /* both strings end */
+	        return 0;
+
 	    l = (*mb_ptr2len)(s1 + i);
 	    if (l <= 1)
 	    {

Attachment: strnicmp.test.vim
Description: Binary data

Reply via email to