From 0a88f6c25065cf5258e19e40ee7a22133995d96a Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <noritnk@kcn.ne.jp>
Date: Sun, 17 Nov 2019 07:20:41 +0900
Subject: [PATCH 1/2] grep: fix performance degradation from previous patch

* src/kwsearch.c (Fexecute): Avoid unnecessary back-up in non-UTF8
multibyte locales.
---
 src/kwsearch.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/kwsearch.c b/src/kwsearch.c
index 5edff79..f590d19 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -224,9 +224,18 @@ Fexecute (void *vcp, char const *buf, size_t size, size_t *match_size,
         goto success;

       /* We need a preceding mb_start pointer.  Use the beginning of line
-         if there is a preceding newline, else BUF.  */
-      char const *nl = memrchr (mb_start, eol, beg - mb_start);
-      mb_start = nl ? nl + 1 : buf;
+         if there is a preceding newline.  */
+      if (mb_check)
+        {
+           char const *nl = memrchr (buf, eol, beg - buf);
+           mb_start = nl ? nl + 1 : buf;
+        }
+      else
+        {
+           char const *nl = memrchr (mb_start, eol, beg - mb_start);
+           if (nl)
+             mb_start = nl + 1;
+        }

       /* Succeed if neither the preceding nor the following character is a
          word constituent.  If the preceding is not, yet the following
-- 
2.24.0.155.gd9f6f3b619


From 15ada78014cfa212971f4cbfacc01eb5559ab792 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <noritnk@kcn.ne.jp>
Date: Sun, 17 Nov 2019 07:29:15 +0900
Subject: [PATCH 2/2] grep: improve grep -Fw performance in non-UTF8 multibyte
 locales

* src/searchutils.c (mb_goback): New parameter MBCLEN.  All callers
changed.
* src/search.h (mb_goback): Update prototype.
* src/kwsearch.c (Fexecute): Use mb_goback's MBCLEN to detect a
word-boundary even more efficiently.
---
 src/dfasearch.c   |  2 +-
 src/kwsearch.c    | 22 +++++++++++-----------
 src/search.h      |  3 ++-
 src/searchutils.c | 24 +++++++++++++++---------
 4 files changed, 29 insertions(+), 22 deletions(-)

diff --git a/src/dfasearch.c b/src/dfasearch.c
index 3ebd25e..6c95d8c 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -279,7 +279,7 @@ EGexecute (void *vdc, char const *buf, size_t size, size_t *match_size,
                     goto success;
                   if (mb_start < beg)
                     mb_start = beg;
-                  if (mb_goback (&mb_start, match, buflim) == 0)
+                  if (mb_goback (&mb_start, NULL, match, buflim) == 0)
                     goto success;
                   /* The matched line starts in the middle of a multibyte
                      character.  Perform the DFA search starting from the
diff --git a/src/kwsearch.c b/src/kwsearch.c
index f590d19..f121816 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -161,6 +161,7 @@ Fexecute (void *vcp, char const *buf, size_t size, size_t *match_size,
   bool longest;
   struct kwsearch *kwsearch = vcp;
   kwset_t kwset = kwsearch->kwset;
+  size_t mbclen;

   if (match_lines)
     mb_check = longest = false;
@@ -194,7 +195,9 @@ Fexecute (void *vcp, char const *buf, size_t size, size_t *match_size,
           return EGexecute (kwsearch->re, buf, size, match_size, start_ptr);
         }

-      if (mb_check && mb_goback (&mb_start, beg + offset, buf + size) != 0)
+      mbclen = 0;
+      if (mb_check
+          && mb_goback (&mb_start, &mbclen, beg + offset, buf + size) != 0)
         {
           /* We have matched a single byte that is not at the beginning of a
              multibyte character.  mb_goback has advanced MB_START past that
@@ -225,22 +228,19 @@ Fexecute (void *vcp, char const *buf, size_t size, size_t *match_size,

       /* We need a preceding mb_start pointer.  Use the beginning of line
          if there is a preceding newline.  */
-      if (mb_check)
+      if (mbclen == 0)
         {
-           char const *nl = memrchr (buf, eol, beg - buf);
-           mb_start = nl ? nl + 1 : buf;
-        }
-      else
-        {
-           char const *nl = memrchr (mb_start, eol, beg - mb_start);
-           if (nl)
-             mb_start = nl + 1;
+          char const *nl = memrchr (mb_start, eol, beg - mb_start);
+          if (nl)
+            mb_start = nl + 1;
         }

       /* Succeed if neither the preceding nor the following character is a
          word constituent.  If the preceding is not, yet the following
          character IS a word constituent, keep trying with shorter matches.  */
-      if (! wordchar_prev (mb_start, beg, buf + size))
+      if (mbclen > 0
+          ? ! wordchar_next (beg - mbclen, buf + size)
+          : ! wordchar_prev (mb_start, beg, buf + size))
         for (;;)
           {
             if (! wordchar_next (beg + len, buf + size))
diff --git a/src/search.h b/src/search.h
index a782a0c..d6010b9 100644
--- a/src/search.h
+++ b/src/search.h
@@ -52,7 +52,8 @@ extern size_t wordchars_size (char const *, char const *) _GL_ATTRIBUTE_PURE;
 extern size_t wordchar_next (char const *, char const *) _GL_ATTRIBUTE_PURE;
 extern size_t wordchar_prev (char const *, char const *, char const *)
   _GL_ATTRIBUTE_PURE;
-extern ptrdiff_t mb_goback (char const **, char const *, char const *);
+extern ptrdiff_t mb_goback (char const **, size_t *, char const *,
+                            char const *);

 /* dfasearch.c */
 extern void *GEAcompile (char *, size_t, reg_syntax_t);
diff --git a/src/searchutils.c b/src/searchutils.c
index 9bb35fd..d6a36f1 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -75,18 +75,21 @@ kwsinit (bool mb_trans)
    back from CUR to the previous boundary, where a "boundary" is the
    start of a multibyte character or is an error-encoding byte.  The
    buffer ends at END (i.e., one past the address of the buffer's last
-   byte).  If CUR is already at a boundary, return 0.  If *MB_START is
-   greater than CUR, return the negative value CUR - *MB_START.
+   byte).  If CUR is already at a boundary, return 0.  If CUR is no
+   larger than *MB_START, return CUR - *MB_START without modifying
+   *MB_START or *MBCLEN.

    When returning zero, set *MB_START to CUR.  When returning a
-   positive value, set *MB_START to the next boundary after CUR, or to
-   END if there is no such boundary.  When returning a negative value,
-   leave *MB_START alone.  */
+   positive value, set *MB_START to the next boundary after CUR, or
+   to END if there is no such boundary.  In both cases, if MBCLEN is
+   non-null, set *MBCLEN to the length of the preceding character.  */
 ptrdiff_t
-mb_goback (char const **mb_start, char const *cur, char const *end)
+mb_goback (char const **mb_start, size_t *mbclen, char const *cur,
+           char const *end)
 {
   const char *p = *mb_start;
   const char *p0 = p;
+  size_t clen;

   if (cur <= p)
     return cur - p;
@@ -94,13 +97,14 @@ mb_goback (char const **mb_start, char const *cur, char const *end)
   if (localeinfo.using_utf8)
     {
       p = cur;
+      clen = 1;

       if (cur < end && (*cur & 0xc0) == 0x80)
         for (int i = 1; i <= 3; i++)
           if ((cur[-i] & 0xc0) != 0x80)
             {
               mbstate_t mbs = { 0 };
-              size_t clen = mb_clen (cur - i, end - (cur - i), &mbs);
+              clen = mb_clen (cur - i, end - (cur - i), &mbs);
               if (i < clen && clen < (size_t) -2)
                 {
                   p0 = cur - i;
@@ -114,7 +118,7 @@ mb_goback (char const **mb_start, char const *cur, char const *end)
       mbstate_t mbs = { 0 };
       do
         {
-          size_t clen = mb_clen (p, end - p, &mbs);
+          clen = mb_clen (p, end - p, &mbs);

           if ((size_t) -2 <= clen)
             {
@@ -130,6 +134,8 @@ mb_goback (char const **mb_start, char const *cur, char const *end)
     }

   *mb_start = p;
+  if (mbclen)
+    *mbclen = clen;
   return p == cur ? 0 : cur - p0;
 }

@@ -192,6 +198,6 @@ wordchar_prev (char const *buf, char const *cur, char const *end)
       || (localeinfo.using_utf8 && localeinfo.sbclen[b] != -2))
     return sbwordchar[b];
   char const *p = buf;
-  cur -= mb_goback (&p, cur, end);
+  cur -= mb_goback (&p, NULL, cur, end);
   return wordchar_next (cur, end);
 }
-- 
2.24.0.155.gd9f6f3b619

