On Wed, Feb 19, 2014 at 6:22 AM, Norihiro Tanaka <[email protected]> wrote:
> for i in $(seq 10); do env LC_ALL=ja_JP.eucJP time src/grep -i n in; done
Wow. You're right. With the attached patch, I see a speedup of more
than 130x in this case: (fyi, the "time" output is slightly different,
because I have installed GNU time)
grep-2.17$ for i in $(seq 5); do env LC_ALL=ja_JP.eucJP time grep -i n in; done
2.78 real 2.78 user 0.00 sys
2.73 real 2.73 user 0.00 sys
2.75 real 2.75 user 0.00 sys
2.73 real 2.73 user 0.00 sys
2.74 real 2.74 user 0.00 sys
2.17+patch$ for i in $(seq 5); do env LC_ALL=ja_JP.eucJP time src/grep
-i n in; done
0.02 real 0.02 user 0.00 sys
0.02 real 0.02 user 0.00 sys
0.02 real 0.02 user 0.00 sys
0.02 real 0.02 user 0.00 sys
0.02 real 0.02 user 0.00 sys
I haven't investigated they "why" yet, but expect that I will make
grep-2.18 with just this one performance-improving patch.
Thank you, Norihiro,
Jim
From 65cba556a62016952dc76e9a943d49a2e2b27ec3 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <[email protected]>
Date: Wed, 19 Feb 2014 11:14:52 -0800
Subject: [PATCH] grep: make -i up to 130x faster in a multi-byte locale
This reverts the grep-source changes of commit v2.16-4-g97318f5,
"grep: make --ignore-case (-i) faster (sometimes 10x) in multibyte
locales", but leaves that commit's added tests.
* src/main.c (trivial_case_convert): Remove function.
(main): Remove regexp preprocessing for -i.
* NEWS (Improvements): Mention this.
---
NEWS | 4 +++
src/main.c | 111 +------------------------------------------------------------
2 files changed, 5 insertions(+), 110 deletions(-)
diff --git a/NEWS b/NEWS
index 6785a96..c6d78d0 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,10 @@ GNU grep NEWS -*- outline
-*-
* Noteworthy changes in release ?.? (????-??-??) [?]
+** Improvements
+
+ grep -i in a multibyte locale may be over 130 times faster than in 2.17
+
* Noteworthy changes in release 2.17 (2014-02-17) [stable]
diff --git a/src/main.c b/src/main.c
index bd20297..28d3720 100644
--- a/src/main.c
+++ b/src/main.c
@@ -27,7 +27,6 @@
#include <fcntl.h>
#include <inttypes.h>
#include <stdio.h>
-#include <assert.h>
#include "system.h"
#include "argmatch.h"
@@ -1642,14 +1641,13 @@ if any error occurs and -q is not given, the exit
status is 2.\n"));
exit (status);
}
-static char const *matcher;
-
/* If M is NULL, initialize the matcher to the default. Otherwise set the
matcher to M if available. Exit in case of conflicts or if M is not
available. */
static void
setmatcher (char const *m)
{
+ static char const *matcher;
unsigned int i;
if (!m)
@@ -1864,84 +1862,6 @@ parse_grep_colors (void)
return;
}
-#define MBRTOWC(pwc, s, n, ps) \
- (MB_CUR_MAX == 1 ? \
- (*(pwc) = btowc (*(unsigned char *) (s)), 1) : \
- mbrtowc ((pwc), (s), (n), (ps)))
-
-#define WCRTOMB(s, wc, ps) \
- (MB_CUR_MAX == 1 ? \
- (*(s) = wctob ((wint_t) (wc)), 1) : \
- wcrtomb ((s), (wc), (ps)))
-
-/* If the newline-separated regular expressions, KEYS (with length, LEN
- and no trailing NUL byte), are amenable to transformation into
- otherwise equivalent case-ignoring ones, perform the transformation,
- put the result into malloc'd memory, *NEW_KEYS with length *NEW_LEN,
- and return true. Otherwise, return false. */
-static bool
-trivial_case_ignore (size_t len, char const *keys,
- size_t *new_len, char **new_keys)
-{
- /* FIXME: consider removing the following restriction:
- Reject if KEYS contain ASCII '\\' or '['. */
- if (memchr (keys, '\\', len) || memchr (keys, '[', len))
- return false;
-
- /* Worst case is that each byte B of KEYS is ASCII alphabetic and each
- other_case(B) character, C, occupies MB_CUR_MAX bytes, so each B
- maps to [BC], which requires MB_CUR_MAX + 3 bytes. */
- *new_keys = xnmalloc (MB_CUR_MAX + 3, len + 1);
- char *p = *new_keys;
-
- mbstate_t mb_state;
- memset (&mb_state, 0, sizeof mb_state);
- while (len)
- {
- wchar_t wc;
- int n = MBRTOWC (&wc, keys, len, &mb_state);
-
- /* For an invalid, incomplete or L'\0', skip this optimization. */
- if (n <= 0)
- {
- skip_case_ignore_optimization:
- free (*new_keys);
- return false;
- }
-
- char const *orig = keys;
- keys += n;
- len -= n;
-
- if (!iswalpha (wc))
- {
- memcpy (p, orig, n);
- p += n;
- }
- else
- {
- *p++ = '[';
- memcpy (p, orig, n);
- p += n;
-
- wchar_t wc2 = iswupper (wc) ? towlower (wc) : towupper (wc);
- char buf[MB_CUR_MAX];
- int n2 = WCRTOMB (buf, wc2, &mb_state);
- if (n2 <= 0)
- goto skip_case_ignore_optimization;
- assert (n2 <= MB_CUR_MAX);
- memcpy (p, buf, n2);
- p += n2;
-
- *p++ = ']';
- }
- }
-
- *new_len = p - *new_keys;
-
- return true;
-}
-
int
main (int argc, char **argv)
{
@@ -2336,35 +2256,6 @@ main (int argc, char **argv)
else
usage (EXIT_TROUBLE);
- /* As currently implemented, case-insensitive matching is expensive in
- multi-byte locales because of a few outlier locales in which some
- characters change size when converted to upper or lower case. To
- accommodate those, we revert to searching the input one line at a
- time, rather than using the much more efficient buffer search.
- However, if we have a regular expression, /foo/i, we can convert
- it to an equivalent case-insensitive /[fF][oO][oO]/, and thus
- avoid the expensive read-and-process-a-line-at-a-time requirement.
- Optimize-away the "-i" option, when possible, converting each
- candidate alpha, C, in the regexp to [Cc]. */
- if (match_icase)
- {
- size_t new_keycc;
- char *new_keys;
- /* It is not possible with -F, not useful with -P (pcre) and there is no
- point when there is no regexp. It also depends on which constructs
- appear in the regexp. See trivial_case_ignore for those details. */
- if (keycc
- && ! (matcher
- && (STREQ (matcher, "fgrep") || STREQ (matcher, "pcre")))
- && trivial_case_ignore (keycc, keys, &new_keycc, &new_keys))
- {
- match_icase = 0;
- free (keys);
- keys = new_keys;
- keycc = new_keycc;
- }
- }
-
#if MBS_SUPPORT
if (MB_CUR_MAX > 1)
build_mbclen_cache ();
--
1.9.0