I like that as far as it goes, but it pulls loose a thread that has been nagging me for a while. How about the attached instead? It includes somewhat more simplification, entailing more-efficient handling of caseless letters when ignoring case.
From 85efede266be9d2cda8d229c012828b6ae4574c5 Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Tue, 28 Jan 2014 13:47:47 -0800
Subject: [PATCH] Simplify handling of letter case.

* src/dfa.c (setbit_wc, setbit_case_fold_c, atom): Simplify.
(setbit_case_fold_c, parse_bracket_exp, lex, atom): Invoke tolower
and toupper instead of isalpha followed by one or the other, and
similarly for towlower, towupper, iswalpha.  This should lead to
more-efficient handling of caseless letters, and it simplifies
the code.
---
 src/dfa.c | 93 ++++++++++++++++++++++++++++++---------------------------------
 1 file changed, 44 insertions(+), 49 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index b79c604..72beed0 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -693,39 +693,24 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
    this may happen when folding case in weird Turkish locales where
    dotless i/dotted I are not included in the chosen character set.
    Return whether a bit was set in the charclass.  */
-#if MBS_SUPPORT
 static bool
 setbit_wc (wint_t wc, charclass c)
 {
+#if MBS_SUPPORT
   int b = wctob (wc);
   if (b == EOF)
     return false;
 
   setbit (b, c);
   return true;
-}
-
-/* Set a bit in the charclass for the given single byte character,
-   if it is valid in the current character set.  */
-static void
-setbit_c (int b, charclass c)
-{
-  /* Do nothing if b is invalid in this character set.  */
-  if (MB_CUR_MAX > 1 && btowc (b) == WEOF)
-    return;
-  setbit (b, c);
-}
 #else
-# define setbit_c setbit
-static inline bool
-setbit_wc (wint_t wc, charclass c)
-{
   abort ();
    /*NOTREACHED*/ return false;
-}
 #endif
+}
 
-/* Like setbit_c, but if case is folded, set both cases of a letter.  For
+/* Like setbit_wc but for a single-byte character B; and if case is
+   folded, set both cases of a letter.  For
    MB_CUR_MAX > 1, the resulting charset is only used as an optimization,
    and the caller takes care of setting the appropriate field of struct
    mb_char_classes.  */
@@ -737,16 +722,16 @@ setbit_case_fold_c (int b, charclass c)
       wint_t wc = btowc (b);
       if (wc == WEOF)
         return;
-      setbit (b, c);
-      if (case_fold && iswalpha (wc))
-        setbit_wc (iswupper (wc) ? towlower (wc) : towupper (wc), c);
+      if (case_fold)
+        setbit_wc (wc ^ towlower (wc) ^ towupper (wc), c);
     }
   else
     {
-      setbit (b, c);
-      if (case_fold && isalpha (b))
-        setbit_c (isupper (b) ? tolower (b) : toupper (b), c);
+      if (case_fold)
+        setbit (b ^ tolower (b) ^ toupper (b), c);
     }
+
+  setbit (b, c);
 }
 
 
@@ -1085,23 +1070,30 @@ parse_bracket_exp (void)
             {
               /* When case folding map a range, say [m-z] (or even [M-z])
                  to the pair of ranges, [m-z] [M-Z].  */
+              wchar_t lo1 = wc, hi1 = wc2, lo2 = wc, hi2 = wc2;
+              if (case_fold)
+                {
+                  lo1 = towlower (lo1);
+                  hi1 = towlower (hi1);
+                  lo2 = towupper (lo2);
+                  hi2 = towupper (hi2);
+                }
+
               REALLOC_IF_NECESSARY (work_mbc->range_sts,
                                     range_sts_al, work_mbc->nranges + 1);
               REALLOC_IF_NECESSARY (work_mbc->range_ends,
                                     range_ends_al, work_mbc->nranges + 1);
-              work_mbc->range_sts[work_mbc->nranges] =
-                case_fold ? towlower (wc) : (wchar_t) wc;
-              work_mbc->range_ends[work_mbc->nranges++] =
-                case_fold ? towlower (wc2) : (wchar_t) wc2;
+              work_mbc->range_sts[work_mbc->nranges] = lo1;
+              work_mbc->range_ends[work_mbc->nranges++] = hi1;
 
-              if (case_fold && (iswalpha (wc) || iswalpha (wc2)))
+              if (lo1 != lo2 || hi1 != hi2)
                 {
                   REALLOC_IF_NECESSARY (work_mbc->range_sts,
                                         range_sts_al, work_mbc->nranges + 1);
-                  work_mbc->range_sts[work_mbc->nranges] = towupper (wc);
+                  work_mbc->range_sts[work_mbc->nranges] = lo2;
                   REALLOC_IF_NECESSARY (work_mbc->range_ends,
                                         range_ends_al, work_mbc->nranges + 1);
-                  work_mbc->range_ends[work_mbc->nranges++] = towupper (wc2);
+                  work_mbc->range_ends[work_mbc->nranges++] = hi2;
                 }
             }
           else
@@ -1129,16 +1121,18 @@ parse_bracket_exp (void)
           continue;
         }
 
-      if (case_fold && iswalpha (wc))
+      if (case_fold)
         {
-          wc = towlower (wc);
-          if (!setbit_wc (wc, ccl))
+          wchar_t diff = towlower (wc) ^ towupper (wc);
+          if (diff)
             {
-              REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
-                                    work_mbc->nchars + 1);
-              work_mbc->chars[work_mbc->nchars++] = wc;
+              if (!setbit_wc (wc ^ diff, ccl))
+                {
+                  REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
+                                        work_mbc->nchars + 1);
+                  work_mbc->chars[work_mbc->nchars++] = wc ^ diff;
+                }
             }
-          wc = towupper (wc);
         }
       if (!setbit_wc (wc, ccl))
         {
@@ -1481,7 +1475,7 @@ lex (void)
           if (MB_CUR_MAX > 1)
             return lasttok = WCHAR;
 
-          if (case_fold && isalpha (c))
+          if (case_fold && tolower (c) != toupper (c))
             {
               zeroset (ccl);
               setbit_case_fold_c (c, ccl);
@@ -1725,17 +1719,18 @@ add_utf8_anychar (void)
 static void
 atom (void)
 {
-  if (0)
-    {
-      /* empty */
-    }
-  else if (MBS_SUPPORT && tok == WCHAR)
+  if (MBS_SUPPORT && tok == WCHAR)
     {
-      addtok_wc (case_fold ? towlower (wctok) : wctok);
-      if (case_fold && iswalpha (wctok))
+      wchar_t wc = wctok;
+      addtok_wc (wc);
+      if (case_fold)
         {
-          addtok_wc (towupper (wctok));
-          addtok (OR);
+          wchar_t diff = towlower (wc) ^ towupper (wc);
+          if (diff)
+            {
+              addtok_wc (wc ^ diff);
+              addtok (OR);
+            }
         }
 
       tok = lex ();
-- 
1.8.5.3

Reply via email to