On Wed, May 25, 2011 at 14:09, Bram Moolenaar <b...@moolenaar.net> wrote:
> Yes, this code just returns -1, no matter if the first or second string
> is bigger.
>
> Your other remark about difference in byte length of a character is
> right, but it's not so easy to fix. Can you suggest a patch?
> Preferably with a test.
Hi, here's my patch for mbyte.c and a few test cases. I've eliminated those `return -1` statements by doing a bytewise comparison of the strings after the first corrupted character. This should at least make the comparisons transitive.
-- 
You received this message from the "vim_use" maillist. Do not top-post! Type your reply below the text you are replying to. For more information, visit http://www.vim.org/maillist.php
diff --git a/src/mbyte.c b/src/mbyte.c index ce0c897..1e9d253 100644 --- a/src/mbyte.c +++ b/src/mbyte.c @@ -3079,6 +3079,143 @@ utf_isupper(a) return (utf_tolower(a) != a); } + static int +utf_strnicmp(s1, s2, n1, n2) + char_u *s1, *s2; + size_t n1, n2; +{ + int c1, c2, k; + char_u buffer[6]; + + for (;;) + { + /* Try to decode next utf8 character sequence from s1, + * put its codepoint into c1, and advance s1. + * On errors sets c1=-1, on end of string sets c1=0. */ + if (n1 != 0) + { + k = utf8len_tab_zero[*s1]; + if (k == 1) + { + c1 = *s1++; /* ascii, possibly NUL */ + n1--; + } + else if (k <= n1) + { + c1 = utf_ptr2char(s1); + if (c1 >= 256) + { + s1 += k; /* success */ + n1 -= k; + } + else + { + c1 = -1; /* incomplete or illegal */ + } + } + else + { + c1 = -1; /* incomplete */ + } + } + else + { + c1 = 0; /* end of string */ + } + + /* Do the same for the other string */ + if (n2 != 0) + { + k = utf8len_tab_zero[*s2]; + if (k == 1) + { + c2 = *s2++; /* ascii, possibly NUL */ + n2--; + } + else if (k <= n2) + { + c2 = utf_ptr2char(s2); + if (c2 >= 256) + { + s2 += k; /* success */ + n2 -= k; + } + else + { + c2 = -1; /* incomplete or illegal */ + } + } + else + { + c2 = -1; /* incomplete */ + } + } + else + { + c2 = 0; /* end of string */ + } + + if (c1 > 0 && c2 > 0) + { + /* both characters were successfully decoded */ + k = utf_fold(c1) - utf_fold(c2); + if (k != 0) + return k; + continue; + } + + break; + } + + if (c1 == 0 || c2 == 0) + { + /* one of the strings ended. shorter string is always smaller */ + if (c1 == 0 && c2 == 0) + return 0; + return c1 == 0 ? -1 : 1; + } + + if (!(c1 == -1 && c2 == -1)) + { + /* One of the characters is good and the other is incomplete + * or contains an illegal byte. Fold and encode the good one + * for bytewise comparison. 
*/ + if (c1 == -1) + { + n2 = utf_char2bytes(utf_fold(c2), buffer); + s2 = buffer; + } + else + { + n1 = utf_char2bytes(utf_fold(c1), buffer); + s1 = buffer; + } + } + + /* do bytewise comparison */ + + while (n1 > 0 && n2 > 0 && *s1 != NUL && *s2 != NUL) + { + k = (int)(*s1) - (int)(*s2); + if (k != 0) + return k; + + s1++; + s2++; + n1--; + n2--; + } + + if (n1 > 0 && *s1 == NUL) + n1 = 0; + if (n2 > 0 && *s2 == NUL) + n2 = 0; + + if (n1 == 0 && n2 == 0) + return 0; + return n1 == 0 ? -1 : 1; +} + /* * Version of strnicmp() that handles multi-byte characters. * Needed for Big5, Sjift-JIS and UTF-8 encoding. Other DBCS encodings can @@ -3097,44 +3234,17 @@ mb_strnicmp(s1, s2, nn) int incomplete = FALSE; int n = (int)nn; - for (i = 0; i < n; i += l) + if (enc_utf8) { - if (s1[i] == NUL && s2[i] == NUL) /* both strings end */ - return 0; - if (enc_utf8) - { - l = utf_byte2len(s1[i]); - if (l > n - i) - { - l = n - i; /* incomplete character */ - incomplete = TRUE; - } - /* Check directly first, it's faster. */ - for (j = 0; j < l; ++j) - { - if (s1[i + j] != s2[i + j]) - break; - if (s1[i + j] == 0) - /* Both stings have the same bytes but are incomplete or - * have illegal bytes, accept them as equal. */ - l = j; - } - if (j < l) - { - /* If one of the two characters is incomplete return -1. */ - if (incomplete || i + utf_byte2len(s2[i]) > n) - return -1; - /* Don't case-fold illegal bytes or truncated characters. */ - if (utf_ptr2len(s1 + i) < l || utf_ptr2len(s2 + i) < l) - return -1; - cdiff = utf_fold(utf_ptr2char(s1 + i)) - - utf_fold(utf_ptr2char(s2 + i)); - if (cdiff != 0) - return cdiff; - } - } - else - { + return utf_strnicmp(s1, s2, nn, nn); + } + else + { + for (i = 0; i < n; i += l) + { + if (s1[i] == NUL && s2[i] == NUL) /* both strings end */ + return 0; + l = (*mb_ptr2len)(s1 + i); if (l <= 1) {
strnicmp.test.vim
Description: Binary data