bug#16919: [PATCH] fix mismatch between dfa and regex for treatment of titlecase

Paul Eggert Fri, 07 Mar 2014 18:34:28 -0800

Jim Meyering wrote:

That is my preference, too.

OK, thanks, I installed the attached patch, which should do just that,and I'm marking this bug as done. I'll CC: this to Aharon since itmight affect Gawk.


The code still needs cleanup but so far I'm just trying to fix bugs.

From df950021df03d33f06e54505e3e68be5989d72a6 Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Fri, 7 Mar 2014 18:27:28 -0800
Subject: [PATCH] grep: fix case-fold mismatches between DFA and regex

The DFA code and the regex code didn't use the same semantics for
case-folding.  The regex code says that the data char d matches
the pattern char p if uc (d) == uc (p).  POSIX is unclear in this
area; the simplest fix for now is to change the DFA code to agree
with the regex code.  See <http://bugs.gnu.org/16919>.
* src/dfa.c (static_assert): New macro, if not already defined.
(setbit_case_fold_c): Assume MB_CUR_MAX is 1 and that case_fold
is nonzero; all callers changed.
(setbit_case_fold_c, parse_bracket_exp, lex, atom):
Case-fold like the regex code does.
(lonesome_lower): New constant.
(case_folded_counterparts): New function.
(parse_bracket_exp): Prefer plain setbit when case-folding is
not needed.
* src/dfa.h (CASE_FOLDED_BUFSIZE): New constant.
(case_folded_counterparts): New function decl.
* src/main.c (trivial_case_ignore): Case-fold like the regex code does.
(main): Try to improve comment re trivial_case_ignore.
* tests/case-fold-titlecase: Add lots more test cases.
---
 src/dfa.c                 | 143 ++++++++++++++++++++++++---------------
 src/dfa.h                 |   8 +++
 src/main.c                |  57 ++++++++--------
 tests/case-fold-titlecase | 167 +++++++++++++++++++++++++++++++++++++++++++---
 4 files changed, 282 insertions(+), 93 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index 585a599..5910268 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -34,6 +34,12 @@
 #include <locale.h>
 #include <stdbool.h>
 
+/* Gawk doesn't use Gnulib, so don't assume static_assert is present.  */
+#ifndef static_assert
+# define static_assert(cond, diagnostic) \
+    extern int (*foo (void)) [!!sizeof (struct { int foo: (cond) ? 8 : -1; })]
+#endif
+
 #define STREQ(a, b) (strcmp (a, b) == 0)
 
 /* ISASCIIDIGIT differs from isdigit, as follows:
@@ -710,34 +716,16 @@ setbit_wc (wint_t wc, charclass c)
 #endif
 }
 
-/* Set a bit for B in the charclass C, if B is a valid single byte
-   character in the current character set.  If case is folded, set B's
-   lower and upper case variants similarly.  If MB_CUR_MAX > 1, the
-   resulting charset is used only as an optimization, and the caller
-   should set the appropriate field of struct mb_char_classes.  */
+/* Set a bit for B and its case variants in the charclass C.
+   MB_CUR_MAX must be 1.  */
 static void
 setbit_case_fold_c (int b, charclass c)
 {
-  if (MB_CUR_MAX > 1)
-    {
-      wint_t wc = btowc (b);
-      if (wc == WEOF)
-        return;
-      if (case_fold)
-        {
-          setbit_wc (towlower (wc), c);
-          setbit_wc (towupper (wc), c);
-        }
-    }
-  else
-    {
-      if (case_fold)
-        {
-          setbit (tolower (b), c);
-          setbit (toupper (b), c);
-        }
-    }
-  setbit (b, c);
+  int ub = toupper (b);
+  int i;
+  for (i = 0; i < NOTCHAR; i++)
+    if (toupper (i) == ub)
+      setbit (i, c);
 }
 
 
@@ -898,6 +886,50 @@ static unsigned char const *buf_end;    /* reference to 
end in dfaexec.  */
 # define MIN(a,b) ((a) < (b) ? (a) : (b))
 #endif
 
+/* The set of wchar_t values C such that there's a useful locale
+   somewhere where C != towupper (C) && C != towlower (towupper (C)).
+   For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
+   towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
+   towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU).  */
+static short const lonesome_lower[] =
+  {
+    0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
+    0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
+
+    /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
+       counterpart in locales predating Unicode 4.0.0 (April 2003).  */
+    0x03F2,
+
+    0x03F5, 0x1E9B, 0x1FBE,
+  };
+
+static_assert ((sizeof lonesome_lower / sizeof *lonesome_lower + 2
+                == CASE_FOLDED_BUFSIZE),
+               "CASE_FOLDED_BUFSIZE is wrong");
+
+/* Find the characters equal to C after case-folding, other than C
+   itself, and store them into FOLDED.  Return the number of characters
+   stored.  */
+int
+case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
+{
+  int i;
+  int n = 0;
+  wint_t uc = towupper (c);
+  wint_t lc = towlower (uc);
+  if (uc != c)
+    folded[n++] = uc;
+  if (lc != uc && lc != c && towupper (lc) == uc)
+    folded[n++] = lc;
+  for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
+    {
+      wint_t li = lonesome_lower[i];
+      if (li != lc && li != uc && li != c && towupper (li) == uc)
+        folded[n++] = li;
+    }
+  return n;
+}
+
 typedef int predicate (int);
 
 /* The following list maps the names of the Posix named character classes
@@ -1058,7 +1090,7 @@ parse_bracket_exp (void)
 
                   for (c2 = 0; c2 < NOTCHAR; ++c2)
                     if (pred->func (c2))
-                      setbit_case_fold_c (c2, ccl);
+                      setbit (c2, ccl);
                 }
               else
                 known_bracket_exp = false;
@@ -1125,8 +1157,21 @@ parse_bracket_exp (void)
                     }
                 }
               else if (using_simple_locale ())
-                for (; c <= c2; c++)
-                  setbit_case_fold_c (c, ccl);
+               {
+                  for (c1 = c; c1 <= c2; c1++)
+                    setbit (c1, ccl);
+                  if (case_fold)
+                    {
+                      int uc = toupper (c);
+                      int uc2 = toupper (c2);
+                      for (c1 = 0; c1 < NOTCHAR; c1++)
+                        {
+                          int uc1 = toupper (c1);
+                          if (uc <= uc1 && uc1 <= uc2)
+                            setbit (c1, ccl);
+                        }
+                    }
+                }
               else
                 known_bracket_exp = false;
 
@@ -1145,26 +1190,22 @@ parse_bracket_exp (void)
 
       if (MB_CUR_MAX == 1)
         {
-          setbit_case_fold_c (c, ccl);
+          if (case_fold)
+            setbit_case_fold_c (c, ccl);
+          else
+            setbit (c, ccl);
           continue;
         }
 
       if (case_fold)
         {
-          wint_t folded = towlower (wc);
-          if (folded != wc && !setbit_wc (folded, ccl))
-            {
-              REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
-                                    work_mbc->nchars + 1);
-              work_mbc->chars[work_mbc->nchars++] = folded;
-            }
-          folded = towupper (wc);
-          if (folded != wc && !setbit_wc (folded, ccl))
-            {
-              REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
-                                    work_mbc->nchars + 1);
-              work_mbc->chars[work_mbc->nchars++] = folded;
-            }
+          wchar_t folded[CASE_FOLDED_BUFSIZE];
+          int i, n = case_folded_counterparts (wc, folded);
+          REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
+                                work_mbc->nchars + n);
+          for (i = 0; i < n; i++)
+            if (!setbit_wc (folded[i], ccl))
+              work_mbc->chars[work_mbc->nchars++] = folded[i];
         }
       if (!setbit_wc (wc, ccl))
         {
@@ -1510,7 +1551,7 @@ lex (void)
           if (MB_CUR_MAX > 1)
             return lasttok = WCHAR;
 
-          if (case_fold && (tolower (c) != c || toupper (c) != c))
+          if (case_fold && isalpha (c))
             {
               zeroset (ccl);
               setbit_case_fold_c (c, ccl);
@@ -1757,18 +1798,14 @@ atom (void)
   if (MBS_SUPPORT && tok == WCHAR)
     {
       addtok_wc (wctok);
+
       if (case_fold)
         {
-          wint_t folded = towlower (wctok);
-          if (folded != wctok)
-            {
-              addtok_wc (folded);
-              addtok (OR);
-            }
-          folded = towupper (wctok);
-          if (folded != wctok)
+          wchar_t folded[CASE_FOLDED_BUFSIZE];
+          int i, n = case_folded_counterparts (wctok, folded);
+          for (i = 0; i < n; i++)
             {
-              addtok_wc (folded);
+              addtok_wc (folded[i]);
               addtok (OR);
             }
         }
diff --git a/src/dfa.h b/src/dfa.h
index 7e0674f..ad2b854 100644
--- a/src/dfa.h
+++ b/src/dfa.h
@@ -101,3 +101,11 @@ extern void dfawarn (const char *);
 extern _Noreturn void dfaerror (const char *);
 
 extern int using_utf8 (void);
+
+/* Maximum number of characters that can be the case-folded
+   counterparts of a single character, not counting the character
+   itself.  This is 1 for towupper, 1 for towlower, and 1 for each
+   entry in LONESOME_LOWER; see dfa.c.  */
+enum { CASE_FOLDED_BUFSIZE = 1 + 1 + 19 };
+
+int case_folded_counterparts (wchar_t, wchar_t[CASE_FOLDED_BUFSIZE]);
diff --git a/src/main.c b/src/main.c
index 808e47a..4f8f1ef 100644
--- a/src/main.c
+++ b/src/main.c
@@ -1892,11 +1892,12 @@ trivial_case_ignore (size_t len, char const *keys,
     return false;
 
   /* Worst case is that each byte B of KEYS is ASCII alphabetic and
-     the two other_case(B) characters, C and D, each occupies
-     MB_CUR_MAX bytes, so each B maps to [BCD], which requires 2 *
-     MB_CUR_MAX + 3 bytes; this is bounded above by the constant
-     expression 2 * MB_LEN_MAX + 3.  */
-  *new_keys = xnmalloc (len + 1, 2 * MB_LEN_MAX + 3);
+     CASE_FOLDED_BUFSIZE other_case(B) characters, C through Z, each
+     occupying MB_CUR_MAX bytes, so each B maps to [BC...Z], which
+     requires CASE_FOLDED_BUFSIZE * MB_CUR_MAX + 3 bytes; this is
+     bounded above by the constant expression CASE_FOLDED_BUFSIZE *
+     MB_LEN_MAX + 3.  */
+  *new_keys = xnmalloc (len + 1, CASE_FOLDED_BUFSIZE * MB_LEN_MAX + 3);
   char *p = *new_keys;
 
   mbstate_t mb_state = { 0 };
@@ -1918,9 +1919,9 @@ trivial_case_ignore (size_t len, char const *keys,
       keys += n;
       len -= n;
 
-      wint_t lc = towlower (wc);
-      wint_t uc = towupper (wc);
-      if (lc == wc && uc == wc)
+      wchar_t folded[CASE_FOLDED_BUFSIZE];
+      int nfolded = case_folded_counterparts (wc, folded);
+      if (nfolded <= 0)
         {
           memcpy (p, orig, n);
           p += n;
@@ -1933,20 +1934,18 @@ trivial_case_ignore (size_t len, char const *keys,
           memcpy (p, orig, n);
           p += n;
 
-          if (lc != wc)
-            {
-              size_t lcbytes = WCRTOMB (p, lc, &mb_state);
-              if (lcbytes == (size_t) -1)
-                goto skip_case_ignore_optimization;
-              p += lcbytes;
-            }
-          if (uc != wc)
+          int i = 0;
+          do
             {
-              size_t ucbytes = WCRTOMB (p, uc, &mb_state);
-              if (ucbytes == (size_t) -1 || ! mbsinit (&mb_state))
+              size_t nbytes = WCRTOMB (p, folded[i], &mb_state);
+              if (nbytes == (size_t) -1)
                 goto skip_case_ignore_optimization;
-              p += ucbytes;
+              p += nbytes;
             }
+          while (++i < nfolded);
+
+          if (! mbsinit (&mb_state))
+            goto skip_case_ignore_optimization;
 
           *p++ = ']';
         }
@@ -2351,16 +2350,16 @@ main (int argc, char **argv)
   else
     usage (EXIT_TROUBLE);
 
-  /* As currently implemented, case-insensitive matching is expensive in
-     multi-byte locales because of a few outlier locales in which some
-     characters change size when converted to upper or lower case.  To
-     accommodate those, we revert to searching the input one line at a
-     time, rather than using the much more efficient buffer search.
-     However, if we have a regular expression, /foo/i, we can convert
-     it to an equivalent case-insensitive /[fF][oO][oO]/, and thus
-     avoid the expensive read-and-process-a-line-at-a-time requirement.
-     Optimize-away the "-i" option, when possible, converting each
-     candidate alpha, C, in the regexp to [Cc].  */
+  /* Case-insensitive matching is expensive in multibyte locales
+     because a few characters may change size when converted to upper
+     or lower case.  To accommodate those, search the input one line
+     at a time, rather than using the much more efficient buffer search.
+
+     Try to convert a regular expression 'foo' (ignoring case) to an
+     equivalent regular expression '[fF][oO][oO]' (where case matters).
+     Not only does this avoid the expensive requirement to read and
+     process a line at a time, it also allows use of the kwset engine,
+     a win in non-UTF-8 multibyte locales.  */
   if (match_icase)
     {
       size_t new_keycc;
diff --git a/tests/case-fold-titlecase b/tests/case-fold-titlecase
index 0ece5c8..f16022b 100755
--- a/tests/case-fold-titlecase
+++ b/tests/case-fold-titlecase
@@ -1,5 +1,5 @@
 #!/bin/sh
-# Check that case folding works even with titlecase characters.
+# Check that case folding works even with titlecase and similarly odd chars.
 
 # Copyright 2014 Free Software Foundation, Inc.
 
@@ -25,17 +25,162 @@ export LC_ALL
 
 fail=0
 
-LJ='\307\207' # U+01C7 LATIN CAPITAL LETTER LJ
-Lj='\307\210' # U+01C8 LATIN CAPITAL LETTER L WITH SMALL LETTER J
-lj='\307\211' # U+01C9 LATIN SMALL LETTER LJ
-pattern=$(printf "$Lj\n") || framework_failure_
-printf "$lj$lj\n$Lj$Lj\n$LJ$LJ\n" >in || framework_failure_
+for testcase in \
+  0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
+do
+  case $testcase in
+    0)
+      a='\302\265'     # U+00B5
+      b='\316\234'     # U+039C
+      c='\316\274'     # U+03BC
+      ;;
+    1)
+      a='\111'         # U+0049
+      b='\151'         # U+0069
+      c='\304\260'     # U+0130
+      ;;
+    2)
+      a='\111'         # U+0049
+      b='\151'         # U+0069
+      c='\304\261'     # U+0131
+      ;;
+    3)
+      a='\123'         # U+0053
+      b='\163'         # U+0073
+      c='\305\277'     # U+017F
+      ;;
+    4)
+      a='\307\204'     # U+01C4
+      b='\307\205'     # U+01C5
+      c='\307\206'     # U+01C6
+      ;;
+    5)
+      a='\307\207'     # U+01C7
+      b='\307\210'     # U+01C8
+      c='\307\211'     # U+01C9
+      ;;
+    6)
+      a='\307\212'     # U+01CA
+      b='\307\213'     # U+01CB
+      c='\307\214'     # U+01CC
+      ;;
+    7)
+      a='\307\261'     # U+01F1
+      b='\307\262'     # U+01F2
+      c='\307\263'     # U+01F3
+      ;;
+    8)
+      a='\315\205'     # U+0345
+      b='\316\231'     # U+0399
+      c='\316\271'     # U+03B9
+      ;;
+    9)
+      a='\316\243'     # U+03A3
+      b='\317\202'     # U+03C2
+      c='\317\203'     # U+03C3
+      ;;
+    10)
+      a='\316\222'     # U+0392
+      b='\316\262'     # U+03B2
+      c='\317\220'     # U+03D0
+      ;;
+    11)
+      a='\316\230'     # U+0398
+      b='\316\270'     # U+03B8
+      c='\317\221'     # U+03D1
+      ;;
+    12)
+      a='\316\246'     # U+03A6
+      b='\317\206'     # U+03C6
+      c='\317\225'     # U+03D5
+      ;;
+    13)
+      a='\316\240'     # U+03A0
+      b='\317\200'     # U+03C0
+      c='\317\226'     # U+03D6
+      ;;
+    14)
+      a='\316\232'     # U+039A
+      b='\316\272'     # U+03BA
+      c='\317\260'     # U+03F0
+      ;;
+    15)
+      a='\316\241'     # U+03A1
+      b='\317\201'     # U+03C1
+      c='\317\261'     # U+03F1
+      ;;
+    16)
+      a='\316\230'     # U+0398
+      b='\316\270'     # U+03B8
+      c='\317\264'     # U+03F4
+      ;;
+    17)
+      a='\316\225'     # U+0395
+      b='\316\265'     # U+03B5
+      c='\317\265'     # U+03F5
+      ;;
+    18)
+      a='\341\271\240' # U+1E60
+      b='\341\271\241' # U+1E61
+      c='\341\272\233' # U+1E9B
+      ;;
+    19)
+      a='\303\237'     # U+00DF
+      b='\303\237'     # U+00DF
+      c='\341\272\236' # U+1E9E
+      ;;
+    20)
+      a='\316\231'     # U+0399
+      b='\316\271'     # U+03B9
+      c='\341\276\276' # U+1FBE
+      ;;
+    21)
+      a='\316\251'     # U+03A9
+      b='\317\211'     # U+03C9
+      c='\342\204\246' # U+2126
+      ;;
+    22)
+      a='\113'         # U+004B
+      b='\153'         # U+006B
+      c='\342\204\252' # U+212A
+      ;;
+    23)
+      a='\303\205'     # U+00C5
+      b='\303\245'     # U+00E5
+      c='\342\204\253' # U+212B
+      ;;
+    24)
+      a='\316\243'     # U+03A3
+      b='\317\203'     # U+03C3
+      c='\317\262'     # U+03F2
+      ;;
+  esac
 
-grep -i "$pattern" in >out || fail=1
-compare in out || fail=1
+  printf "$a\\n$b\\n$c\\n" >in || framework_failure_
+  for pattern in "$a" "$b" "$c"; do
+     pat=$(printf "$pattern\\n") || framework_failure_
+     grep -i "\\(\\)\\1$pat" in >out-regex || fail=1
+     grep -i "$pat" in >out-dfa || fail=1
+     compare_ out-regex out-dfa || fail=1
+  done
+done
 
-pattern="($pattern)\\1"
-grep -Ei "$pattern" in >out || fail=1
-compare in out || fail=1
+# Try a unibyte test with ISO 8859-7, if available.
+if test "$(get-mb-cur-max el_GR.iso88597)" -eq 1; then
+  LC_ALL=el_GR.iso88597
+  export LC_ALL
+
+  a='\323' # SIGMA
+  b='\362' # stigma
+  c='\363' # sigma
+
+  printf "$a\\n$b\\n$c\\n" >in || framework_failure_
+  for pattern in "$a" "$b" "$c"; do
+     pat=$(printf "$pattern\\n") || framework_failure_
+     grep -i "\\(\\)\\1$pat" in >out-regex || fail=1
+     grep -i "$pat" in >out-dfa || fail=1
+     compare_ out-regex out-dfa || fail=1
+  done
+fi
 
 Exit $fail
-- 
1.8.5.3

bug#16919: [PATCH] fix mismatch between dfa and regex for treatment of titlecase

Reply via email to