* src/dfa.c (setbit_case_fold): Remove, replace with... (setbit_wc, setbit_c, setbit_case_fold_c): ... these. (parse_bracket_exp): Use setbit_case_fold_c when iterating over single-byte sequences. Use setbit_wc for multi-byte character sets, and setbit_case_fold_c for single-byte character sets. (lex): Use setbit_case_fold_c for single-byte character sets. --- > At first I was going to say this: > > You are using ru_RU.KOI8-R, which is a uni-byte locale, yet your > inputs (both stdin and the grep regexp) use the two-byte > representation \xd0\9f for П, instead of the uni-byte \360. > > But it fails even with the single-byte version. > So it is indeed a bug in grep, but at least this time > it affects relatively few locales. > > Here's the fix I expect to use and a test case to exercise it.
The bug affects all single-byte locales except ISO-8859-1 ones. It is quite serious---the logic to map wide characters back to bytes makes no sense. The attached patch fixes it and does not regress high-bit-range, while removing the debatable uses of wctob and checks for EOF. Ok to apply together with your testcase? --- src/dfa.c | 102 ++++++++++++++++++++++++++++++++++--------------------------- 1 files changed, 57 insertions(+), 45 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index b41cbb6..6602ae8 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -536,51 +536,67 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char eol) eolbyte = eol; } -/* Like setbit, but if case is folded, set both cases of a letter. - For MB_CUR_MAX > 1, one or both of the two cases may not be set, - so the resulting charset may only be used as an optimization. */ -static void -setbit_case_fold ( +/* Set a bit in the charclass for the given wchar_t. Do nothing if WC + is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1, + this may happen when folding case in weird Turkish locales where + dotless i/dotted I are not included in the chosen character set. + Return whether a bit was set in the charclass. */ #if MBS_SUPPORT - wint_t b, +static bool +setbit_wc (wint_t wc, charclass c) +{ + int b = wctob (wc); + if (b != EOF) + { + setbit (b, c); + return true; + } + else + return false; +} + +/* Set a bit in the charclass for the given single byte character, + if it is valid in the current character set. */ +static void +setbit_c (int b, charclass c) +{ + /* Do nothing if b is invalid in this character set. */ + if (MB_CUR_MAX > 1 && btowc (b) == EOF) + return; + setbit (b, c); +} #else - unsigned int b, +#define setbit_c setbit #endif - charclass c) + +/* Like setbit_c, but if case is folded, set both cases of a letter. For + MB_CUR_MAX > 1, the resulting charset is only used as an optimization, + and the caller takes care of setting the appropriate field of struct + mb_char_classes. */ +static void +setbit_case_fold_c (int b, charclass c) { - if (case_fold) - { #if MBS_SUPPORT - if (MB_CUR_MAX > 1) - { - wint_t b1 = iswupper(b) ? towlower(b) : b; - wint_t b2 = iswlower(b) ? towupper(b) : b; - if (wctob ((unsigned char)b1) == b1) - setbit (b1, c); - if (b2 != b1 && wctob ((unsigned char)b2) == b2) - setbit (b2, c); - } - else -#endif - { - unsigned char b1 = isupper(b) ? tolower(b) : b; - unsigned char b2 = islower(b) ? toupper(b) : b; - setbit (b1, c); - if (b2 != b1) - setbit (b2, c); - } + if (MB_CUR_MAX > 1) + { + wint_t wc = btowc (b); + if (wc == EOF) + return; + setbit (b, c); + if (case_fold && iswalpha (wc)) + setbit_wc (iswupper (wc) ? towlower (wc) : towupper (wc), c); } else - { -#if MBS_SUPPORT - int b2 = wctob ((unsigned char) b); - if (b2 == EOF || b2 == b) #endif - setbit (b, c); + { + setbit (b, c); + if (case_fold && isalpha (b)) + setbit_c (isupper (b) ? tolower (b) : toupper (b), c); } } + /* UTF-8 encoding allows some optimizations that we can't otherwise assume in a multibyte encoding. */ static inline int @@ -859,7 +875,7 @@ parse_bracket_exp (void) for (c2 = 0; c2 < NOTCHAR; ++c2) if (pred->func(c2)) - setbit_case_fold (c2, ccl); + setbit_case_fold_c (c2, ccl); } #if MBS_SUPPORT @@ -970,7 +986,7 @@ parse_bracket_exp (void) } if (!hard_LC_COLLATE) for (c = c1; c <= c2; c++) - setbit_case_fold (c, ccl); + setbit_case_fold_c (c, ccl); else { /* Defer to the system regex library about the meaning @@ -984,7 +1000,7 @@ parse_bracket_exp (void) subject[0] = c; if (!(case_fold && isupper (c)) && regexec (&re, subject, 0, NULL, 0) != REG_NOMATCH) - setbit_case_fold (c, ccl); + setbit_case_fold_c (c, ccl); } regfree (&re); } @@ -998,15 +1014,12 @@ parse_bracket_exp (void) colon_warning_state |= (c == ':') ? 2 : 4; #if MBS_SUPPORT - /* Build normal characters. */ - setbit_case_fold (wc, ccl); if (MB_CUR_MAX > 1) { if (case_fold && iswalpha(wc)) { wc = towlower(wc); - c = wctob(wc); - if (c == EOF || (wint_t)c != (wint_t)wc) + if (!setbit_wc (wc, ccl)) { REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al, work_mbc->nchars + 1); @@ -1016,19 +1029,18 @@ parse_bracket_exp (void) continue; #else wc = towupper(wc); - c = wctob(wc); #endif } - if (c == EOF || (wint_t)c != (wint_t)wc) + if (!setbit_wc (wc, ccl)) { REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al, work_mbc->nchars + 1); work_mbc->chars[work_mbc->nchars++] = wc; } } -#else - setbit_case_fold (c, ccl); + else #endif + setbit_case_fold_c (c, ccl); } while (( #if MBS_SUPPORT @@ -1382,7 +1394,7 @@ lex (void) if (case_fold && isalpha(c)) { zeroset(ccl); - setbit_case_fold (c, ccl); + setbit_case_fold_c (c, ccl); return lasttok = CSET + charclass_index(ccl); } -- 1.7.4.4 -- To UNSUBSCRIBE, email to debian-bugs-dist-requ...@lists.debian.org with a subject of "unsubscribe". Trouble? Contact listmas...@lists.debian.org