On Wed, May 25, 2011 at 22:39, Ivan Krasilnikov <inf...@gmail.com> wrote: > Hi, here's my patch for mbyte.c and a few testcases. > > I've eliminated those return -1's by doing a bytewise comparison of > strings after the first corrupted character. This should make the > comparisons transitive at least. >
The previous patch had a bug: it checked incorrectly for the failure of utf_ptr2char(). Attached are the fixed patch and more tests, written in Vim script and suitable for src/testdir/. -- You received this message from the "vim_use" maillist. Do not top-post! Type your reply below the text you are replying to. For more information, visit http://www.vim.org/maillist.php
diff --git a/src/mbyte.c b/src/mbyte.c index ce0c897..7ae7e78 100644 --- a/src/mbyte.c +++ b/src/mbyte.c @@ -1701,6 +1701,70 @@ utf_ptr2char(p) } /* + * Converts a UTF-8 byte sequence to a wide character. + * String is assumed to be terminated by NUL or after n bytes, + * whichever comes first. + * The function is safe in the sense that it never accesses + * memory beyond the first n bytes of s. + * + * On success, returns decoded codepoint, advances s to + * the beginning of next character and decreases n accordingly. + * + * If end of string was reached, returns 0 and, if n > 0, + * advances s past NUL byte. + * + * If byte sequence is illegal or incomplete, returns -1 and + * does not advance s. + */ + static int +utf_safe_read_char(s, n) + char_u **s; + size_t *n; +{ + int c, k; + + if (*n == 0) + { + /* end of buffer */ + return 0; + } + + k = utf8len_tab_zero[**s]; + + if (k == 1) + { + /* ASCII character or NUL */ + (*n)--; + return *(*s)++; + } + + if (k <= *n) + { + /* We have a multibyte sequence and it isn't truncated by buffer + * limits so utf_ptr2char() is safe to use. Or the first byte is + * illegal (k=0), and it's also safe to use utf_ptr2char(). */ + + c = utf_ptr2char(*s); + + /* On failure, utf_ptr2char() returns the first byte, so here we + * check equality with the first byte. The only non-ASCII character + * which equals the first byte of its own UTF-8 representation is + * U+00C3 (UTF-8: 0xC3 0x83), so need to check that special case too. + * It's safe even if n=1, else we would have k=2 > n. */ + if (c != (int)(**s) || (c == 0xC3 && (*s)[1] == 0x83)) + { + /* byte sequence was successfully decoded */ + *s += k; + *n -= k; + return c; + } + } + + /* byte sequence is incomplete or illegal */ + return -1; +} + +/* * Get character at **pp and advance *pp to the next character. * Note: composing characters are skipped! 
*/ @@ -3079,6 +3143,80 @@ utf_isupper(a) return (utf_tolower(a) != a); } + static int +utf_strnicmp(s1, s2, n1, n2) + char_u *s1, *s2; + size_t n1, n2; +{ + int c1, c2, cdiff; + char_u buffer[6]; + + for (;;) + { + c1 = utf_safe_read_char(&s1, &n1); + c2 = utf_safe_read_char(&s2, &n2); + + if (c1 <= 0 || c2 <= 0) + break; + + if (c1 == c2) + continue; + + cdiff = utf_fold(c1) - utf_fold(c2); + if (cdiff != 0) + return cdiff; + } + + /* some string ended or has an incomplete/illegal character sequence */ + + if (c1 == 0 || c2 == 0) + { + /* some string ended. shorter string is smaller */ + if (c1 == 0 && c2 == 0) + return 0; + return c1 == 0 ? -1 : 1; + } + + /* Continue with bytewise comparison to produce some result that + * would make comparison operations involving this function transitive. + * + * If only one string had an error, comparison should be made with + * folded version of the other string. In this case it is enough + * to fold just one character to determine the result of comparison. */ + + if (c1 != -1 && c2 == -1) + { + n1 = utf_char2bytes(utf_fold(c1), buffer); + s1 = buffer; + } + else if (c2 != -1 && c1 == -1) + { + n2 = utf_char2bytes(utf_fold(c2), buffer); + s2 = buffer; + } + + while (n1 > 0 && n2 > 0 && *s1 != NUL && *s2 != NUL) + { + cdiff = (int)(*s1) - (int)(*s2); + if (cdiff != 0) + return cdiff; + + s1++; + s2++; + n1--; + n2--; + } + + if (n1 > 0 && *s1 == NUL) + n1 = 0; + if (n2 > 0 && *s2 == NUL) + n2 = 0; + + if (n1 == 0 && n2 == 0) + return 0; + return n1 == 0 ? -1 : 1; +} + /* * Version of strnicmp() that handles multi-byte characters. * Needed for Big5, Sjift-JIS and UTF-8 encoding. 
Other DBCS encodings can @@ -3097,44 +3235,17 @@ mb_strnicmp(s1, s2, nn) int incomplete = FALSE; int n = (int)nn; - for (i = 0; i < n; i += l) + if (enc_utf8) { - if (s1[i] == NUL && s2[i] == NUL) /* both strings end */ - return 0; - if (enc_utf8) - { - l = utf_byte2len(s1[i]); - if (l > n - i) - { - l = n - i; /* incomplete character */ - incomplete = TRUE; - } - /* Check directly first, it's faster. */ - for (j = 0; j < l; ++j) - { - if (s1[i + j] != s2[i + j]) - break; - if (s1[i + j] == 0) - /* Both stings have the same bytes but are incomplete or - * have illegal bytes, accept them as equal. */ - l = j; - } - if (j < l) - { - /* If one of the two characters is incomplete return -1. */ - if (incomplete || i + utf_byte2len(s2[i]) > n) - return -1; - /* Don't case-fold illegal bytes or truncated characters. */ - if (utf_ptr2len(s1 + i) < l || utf_ptr2len(s2 + i) < l) - return -1; - cdiff = utf_fold(utf_ptr2char(s1 + i)) - - utf_fold(utf_ptr2char(s2 + i)); - if (cdiff != 0) - return cdiff; - } - } - else - { + return utf_strnicmp(s1, s2, nn, nn); + } + else + { + for (i = 0; i < n; i += l) + { + if (s1[i] == NUL && s2[i] == NUL) /* both strings end */ + return 0; + l = (*mb_ptr2len)(s1 + i); if (l <= 1) { diff --git a/src/testdir/testXX.in b/src/testdir/testXX.in new file mode 100644 index 0000000..63a81e7 --- /dev/null +++ b/src/testdir/testXX.in @@ -0,0 +1,93 @@ +Tests for case-insensitive UTF-8 comparisons (utf_strnicmp() in mbyte.c) + +STARTTEST +:so small.vim +:if !has("multi_byte") +: e! test.ok +: w! test.out +: qa! +:endif +:set enc=utf8 +ggdG +: +:function! Ch(a, op, b, expected) +: if eval(printf('"%s" %s "%s"', a:a, a:op, a:b)) != a:expected +: call append(line('$'), printf('"%s" %s "%s" should return %d', a:a, a:op, a:b, a:expected)) +: else +: let b:passed += 1 +: endif +:endfunction +: +:function! 
Chk(a, b, result) +: if a:result == 0 +: call Ch(a:a, '==?', a:b, 1) +: call Ch(a:a, '!=?', a:b, 0) +: call Ch(a:a, '<=?', a:b, 1) +: call Ch(a:a, '>=?', a:b, 1) +: call Ch(a:a, '<?', a:b, 0) +: call Ch(a:a, '>?', a:b, 0) +: elseif a:result > 0 +: call Ch(a:a, '==?', a:b, 0) +: call Ch(a:a, '!=?', a:b, 1) +: call Ch(a:a, '<=?', a:b, 0) +: call Ch(a:a, '>=?', a:b, 1) +: call Ch(a:a, '<?', a:b, 0) +: call Ch(a:a, '>?', a:b, 1) +: else +: call Ch(a:a, '==?', a:b, 0) +: call Ch(a:a, '!=?', a:b, 1) +: call Ch(a:a, '<=?', a:b, 1) +: call Ch(a:a, '>=?', a:b, 0) +: call Ch(a:a, '<?', a:b, 1) +: call Ch(a:a, '>?', a:b, 0) +: endif +:endfunction +: +:function! Check(a, b, result) +: call Chk(a:a, a:b, a:result) +: call Chk(a:b, a:a, -a:result) +:endfunction +: +:function! LT(a, b) +: call Check(a:a, a:b, -1) +:endfunction +: +:function! GT(a, b) +: call Check(a:a, a:b, 1) +:endfunction +: +:function! EQ(a, b) +: call Check(a:a, a:b, 0) +:endfunction +: +:let b:passed=0 +:call EQ('', '') +:call LT('', 'a') +:call EQ('abc', 'abc') +:call EQ('Abc', 'abC') +:call LT('ab', 'abc') +:call LT('AB', 'abc') +:call LT('ab', 'aBc') +:call EQ('\xd0\xb9\xd1\x86\xd1\x83\xd0\xba\xd0\xb5\xd0\xbd', '\xd0\xb9\xd0\xa6\xd0\xa3\xd0\xba\xd0\x95\xd0\xbd') +:call LT('\xd0\xb9\xd1\x86\xd1\x83\xd0\xba\xd0\xb5\xd0\xbd', '\xd0\xaf\xd1\x86\xd1\x83\xd0\xba\xd0\xb5\xd0\xbd') +:call EQ('\xe2\x84\xaa', 'k') +:call LT('\xe2\x84\xaa', 'kkkkkk') +:call EQ('\xe2\x84\xaa\xe2\x84\xaa\xe2\x84\xaa', 'kkk') +:call LT('kk', '\xe2\x84\xaa\xe2\x84\xaa\xe2\x84\xaa') +:call EQ('\xe2\x84\xaa\xe2\x84\xa6k\xe2\x84\xaak\xcf\x89', 'k\xcf\x89\xe2\x84\xaakk\xe2\x84\xa6') +:call EQ('Abc\x80', 'AbC\x80') +:call LT('Abc\x80', 'AbC\x81') +:call LT('Abc', 'AbC\x80') +:call LT('abc\x80DEF', 'abc\x80def') " case folding stops at the first bad character +:call LT('\xc3XYZ', '\xc3xyz') +:call EQ('\xef\xbc\xba', '\xef\xbd\x9a') " FF3A (upper), FF5A (lower) +:call GT('\xef\xbc\xba', '\xef\xbc\xff') " first string is ok and equals 
\xef\xbd\x9a after folding, second string is illegal and was left unchanged, then the strings were bytewise compared +:call LT('\xc3', '\xc3\x83') +:call EQ('\xc3\xa3xYz', '\xc3\x83XyZ') +:for n in range(0x60, 0xFF) | call LT(printf('xYz\x%.2X', n-1), printf('XyZ\x%.2X', n)) | endfor +:for n in range(0x80, 0xBF) | call EQ(printf('xYz\xc2\x%.2XUvW', n), printf('XyZ\xc2\x%.2XuVw', n)) | endfor +:for n in range(0xC0, 0xFF) | call LT(printf('xYz\xc2\x%.2XUvW', n), printf('XyZ\xc2\x%.2XuVw', n)) | endfor +:call append(0, printf('%d checks passed', b:passed)) +:wq! test.out +ENDTEST +