On 06/05/2014 04:32 AM, Norihiro Tanaka wrote:
+      memchr (cp, *lookfor, lenin - (cp - lookin));
+      if (!cp)

Thanks, but this part can't be right, as memchr's result is discarded.

It seems to me that much of the performance benefit comes from using a faster implementation of strstr, and that the DFA code will be better off if it simply uses the system strstr rather than rolling its own. (The DFA code dates back before strstr was standardized, which is why it has its own implementation.)

I installed the attached patch to do that and got a big speedup:

$ printf '%08192d\n' 0 | time -p src/old/grep -f - /dev/null
real 16.14
user 16.13
sys 0.00
$ printf '%08192d\n' 0 | time -p src/grep -f - /dev/null
real 0.79
user 0.79
sys 0.00

Could you please look at the remaining part of your patch and see whether it's a win if it's merged to what's now installed? Thanks.

PS. Aharon, I assume this'll affect Gawk, in that you'll need to provide a strstr if you want to be portable to ancient systems that lack it. strstr was standardized in C89 so it'd have to be a pretty ancient system, and it may be better just to let this slide.
diff --git a/NEWS b/NEWS
index fdad83c..1af3def 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,10 @@ GNU grep NEWS                                    -*- outline -*-
 
 * Noteworthy changes in release ?.? (????-??-??) [?]
 
+** Improvements
+
+  Performance has improved for very long strings in patterns.
+
 
 * Noteworthy changes in release 2.20 (2014-06-03) [stable]
 
diff --git a/bootstrap.conf b/bootstrap.conf
index 9f3457d..1ab730d 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -77,6 +77,7 @@ stdlib
 stpcpy
 strerror
 string
+strstr
 strtoull
 strtoumax
 sys_stat
diff --git a/src/dfa.c b/src/dfa.c
index 48a83cd..522a027 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -3752,19 +3752,6 @@ icatalloc (char *old, char const *new)
   return result;
 }
 
-static char *_GL_ATTRIBUTE_PURE
-istrstr (char const *lookin, char const *lookfor)
-{
-  char const *cp;
-  size_t len;
-
-  len = strlen (lookfor);
-  for (cp = lookin; *cp != '\0'; ++cp)
-    if (strncmp (cp, lookfor, len) == 0)
-      return (char *) cp;
-  return NULL;
-}
-
 static void
 freelist (char **cpp)
 {
@@ -3780,7 +3767,7 @@ enlist (char **cpp, char *new, size_t len)
   new[len] = '\0';
   /* Is there already something in the list that's new (or longer)?  */
   for (i = 0; cpp[i] != NULL; ++i)
-    if (istrstr (cpp[i], new) != NULL)
+    if (strstr (cpp[i], new) != NULL)
       {
         free (new);
         return cpp;
@@ -3788,7 +3775,7 @@ enlist (char **cpp, char *new, size_t len)
   /* Eliminate any obsoleted strings.  */
   j = 0;
   while (cpp[j] != NULL)
-    if (istrstr (new, cpp[j]) == NULL)
+    if (strstr (new, cpp[j]) == NULL)
       ++j;
     else
       {

Reply via email to