On Mon, Jan 13, 2014 at 8:43 AM, Jim Meyering <[email protected]> wrote:
> Thank you for the patch.  I will take a look in the next day or two.

Sorry about the delay.
I have divided your patch into two separate commits: one modifies
dfa.c and the other modifies dfasearch.c.  I've included 5 commits
below.  The first two are those.  The next one is "also remove call to
mb_case_map_apply", which I will merge into your dfasearch.c commit.
I left it separate solely to ease review.  The fourth one adds a
little more test coverage, and will also be merged into your
dfasearch.c commit.  The 5th one is incidental, and just happened to
be on this branch.  Please inspect the modified comments and commit
log messages on your two commits and let me know if you would like to
make any changes before I push.

Thanks again,
Jim
From f804d7afbf75f08fd209ad08358bf76c18cc1d56 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <[email protected]>
Date: Sat, 25 Jan 2014 16:19:37 -0800
Subject: [PATCH 1/5] dfa: remove GREP-ifdef'd code in favor of code used by
 gawk

For many years, gawk and grep have used different #ifdef'd bits of
code relating to how the DFA matcher matches multibyte characters.
Remove the GREP-specific code in favor of the code gawk uses.  This
permits us to avoid still more cases in which grep must resort to
the expensive process of copying/case-converting each input line
before matching against a case-converted regexp.
* src/dfa.c (parse_bracket_exp, atom): As above.
---
 src/dfa.c | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index 5e3140d..b79c604 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -1094,7 +1094,6 @@ parse_bracket_exp (void)
               work_mbc->range_ends[work_mbc->nranges++] =
                 case_fold ? towlower (wc2) : (wchar_t) wc2;

-#ifndef GREP
               if (case_fold && (iswalpha (wc) || iswalpha (wc2)))
                 {
                   REALLOC_IF_NECESSARY (work_mbc->range_sts,
@@ -1104,7 +1103,6 @@ parse_bracket_exp (void)
                                         range_ends_al, work_mbc->nranges + 1);
                   work_mbc->range_ends[work_mbc->nranges++] = towupper (wc2);
                 }
-#endif
             }
           else
             {
@@ -1140,11 +1138,7 @@ parse_bracket_exp (void)
                                     work_mbc->nchars + 1);
               work_mbc->chars[work_mbc->nchars++] = wc;
             }
-#ifdef GREP
-          continue;
-#else
           wc = towupper (wc);
-#endif
         }
       if (!setbit_wc (wc, ccl))
         {
@@ -1738,13 +1732,11 @@ atom (void)
   else if (MBS_SUPPORT && tok == WCHAR)
     {
       addtok_wc (case_fold ? towlower (wctok) : wctok);
-#ifndef GREP
       if (case_fold && iswalpha (wctok))
         {
           addtok_wc (towupper (wctok));
           addtok (OR);
         }
-#endif

       tok = lex ();
     }
-- 
1.8.5.3.321.g14598b9


From ada1ba7a047db001432cc3d72707c099828ed338 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <[email protected]>
Date: Mon, 13 Jan 2014 08:42:25 -0800
Subject: [PATCH 2/5] dfasearch: skip kwset optimization when
 multi-byte+case-insensitive

Now that DFA searching works with multi-byte locales, the only remaining
reason to case-convert the searched input is the kwset optimization.
But multi-byte case-conversion is so expensive that it's not
worthwhile even to attempt that optimization.

* src/dfasearch.c (kwsmusts): Skip this function in ignore-case mode
when the locale is multi-byte.
(EGexecute): Now that this code need not handle multi-byte case-ignoring
matches, remove the expensive copy/case-conversion code.
---
 src/dfasearch.c | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/src/dfasearch.c b/src/dfasearch.c
index 46581ff..0b0f506 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -95,6 +95,12 @@ kwsmusts (void)
   struct dfamust const *dm;
   char const *err;

+  /* With case-insensitive matching in a multi-byte locale, do not
+     use kwsearch, because in that case, it would be too expensive,
+     requiring that we case-convert all searched input.  */
+  if (MB_CUR_MAX > 1 && match_icase)
+    return;
+
   dm = dfamusts (dfa);
   if (dm)
     {
@@ -220,19 +226,6 @@ EGexecute (char const *buf, size_t size, size_t 
*match_size,
   size_t i, ret_val;
   mb_len_map_t *map = NULL;

-  if (MB_CUR_MAX > 1)
-    {
-      if (match_icase)
-        {
-          /* mbtolower adds a NUL byte at the end.  That will provide
-             space for the sentinel byte dfaexec may add.  */
-          char *case_buf = mbtolower (buf, &size, &map);
-          if (start_ptr)
-            start_ptr = case_buf + (start_ptr - buf);
-          buf = case_buf;
-        }
-    }
-
   mb_start = buf;
   buflim = buf + size;

-- 
1.8.5.3.321.g14598b9


From 3893ea4a026399e942c4af5b0a2c71a071296686 Mon Sep 17 00:00:00 2001
From: Jim Meyering <[email protected]>
Date: Tue, 14 Jan 2014 07:36:09 -0800
Subject: [PATCH 3/5] also remove call to mb_case_map_apply

* src/dfasearch.c (EGexecute): With no case-converted buffer, there
is no longer any need to call this length/offset mapping function.
---
 src/dfasearch.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/dfasearch.c b/src/dfasearch.c
index 0b0f506..f640b9b 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -223,8 +223,7 @@ EGexecute (char const *buf, size_t size, size_t *match_size,
   regoff_t start;
   size_t len, best_len;
   struct kwsmatch kwsm;
-  size_t i, ret_val;
-  mb_len_map_t *map = NULL;
+  size_t i;

   mb_start = buf;
   buflim = buf + size;
@@ -411,8 +410,6 @@ EGexecute (char const *buf, size_t size, size_t *match_size,
   len = end - beg;
  success_in_len:;
   size_t off = beg - buf;
-  mb_case_map_apply (map, &off, &len);
   *match_size = len;
-  ret_val = off;
-  return ret_val;
+  return off;
 }
-- 
1.8.5.3.321.g14598b9


From 26531c30f83e6277d7a8ec9de249f41d03f64e5b Mon Sep 17 00:00:00 2001
From: Jim Meyering <[email protected]>
Date: Tue, 14 Jan 2014 07:53:50 -0800
Subject: [PATCH 4/5] test more

* tests/turkish-eyes: Test all of -E, -F and -G.
---
 tests/turkish-eyes | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/turkish-eyes b/tests/turkish-eyes
index 68301e7..453e30f 100755
--- a/tests/turkish-eyes
+++ b/tests/turkish-eyes
@@ -38,7 +38,9 @@ i=$(printf '\304\261') # lowercase dotless i
 search_str="$i:i I:$I"
 printf "$data\n" > in || framework_failure_

-LC_ALL=$L grep -i "^$search_str\$" in > out || fail=1
-compare out in || fail=1
+for opt in -E -F -G; do
+  LC_ALL=$L grep $opt -i "^$search_str\$" in > out || fail=1
+  compare out in || fail=1
+done

 Exit $fail
-- 
1.8.5.3.321.g14598b9


From e269b64fd335eb73e28a5a2e61bf67e9459cce5a Mon Sep 17 00:00:00 2001
From: Jim Meyering <[email protected]>
Date: Sat, 25 Jan 2014 16:47:17 -0800
Subject: [PATCH 5/5] maint: move two local variable declarations

* src/dfasearch.c (kwsmusts): Move one declaration down to the point
of definition.  Move another into the sole scope where it is used.
---
 src/dfasearch.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/dfasearch.c b/src/dfasearch.c
index f640b9b..634a082 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -92,18 +92,16 @@ kwsincr_case (const char *must)
 static void
 kwsmusts (void)
 {
-  struct dfamust const *dm;
-  char const *err;
-
   /* With case-insensitive matching in a multi-byte locale, do not
      use kwsearch, because in that case, it would be too expensive,
      requiring that we case-convert all searched input.  */
   if (MB_CUR_MAX > 1 && match_icase)
     return;

-  dm = dfamusts (dfa);
+  struct dfamust const *dm = dfamusts (dfa);
   if (dm)
     {
+      char const *err;
       kwsinit (&kwset);
       /* First, we compile in the substrings known to be exact
          matches.  The kwset matcher will return the index
-- 
1.8.5.3.321.g14598b9

Reply via email to