Norihiro Tanaka wrote:
By the way, if `FGREP_NO_DFA' env. is set, fgrep still uses fgrep matcher.
I'd rather not introduce another environment variable. GREP_COLOR and GREP_COLORS and GREP_OPTIONS are now commonly considered to have been mistakes, and I'd rather not compound them.
Instead, how about removing the fgrep matcher entirely? That'll make the code simpler and easier to maintain, which is a real plus. Perhaps it'll be slower in a few cases, but as long as significant slowdowns are rare, that's OK.
Something like the attached patch, say.
From ce7edb6943f6efd50dcda06bd994855ae609e86c Mon Sep 17 00:00:00 2001 From: Paul Eggert <[email protected]> Date: Tue, 6 May 2014 23:36:54 -0700 Subject: [PATCH] grep: simplify by removing Fexecute, kwsearch.c, etc. These were present only for performance, but often Fexecute is slower and this code doesn't seem to be worth the maintenance hassle. From a suggestion by Norihiro Tanaka in: http://bugs.gnu.org/17420 * src/Makefile.am (grep_SOURCES): Remove kwsearch.c. * src/grep.c (compile, execute): Initialize at link-time instead of at runtime. (do_execute): Simplify, since there is no Fexecute any more. (Fcompile): New function, replacing the old kwsearch.c function. It mostly consists of the old fgrep_to_grep_pattern's contents. (matchers): Use EGexecute for fgrep. (contains_encoding_error, fgrep_to_grep_pattern): Remove. (main): Remove no-longer-necessary initializations. Move special-case code for -F into Fcompile. * src/kwsearch.c: Remove. * src/search.h (Fcompile, Fexecute): Remove decls. --- src/Makefile.am | 2 +- src/grep.c | 130 +++++++++++++++------------------------ src/kwsearch.c | 188 -------------------------------------------------------- src/search.h | 4 -- 4 files changed, 49 insertions(+), 275 deletions(-) delete mode 100644 src/kwsearch.c diff --git a/src/Makefile.am b/src/Makefile.am index f8c9415..ae8d13a 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -25,7 +25,7 @@ bin_PROGRAMS = grep bin_SCRIPTS = egrep fgrep grep_SOURCES = grep.c searchutils.c \ dfa.c dfasearch.c \ - kwset.c kwsearch.c \ + kwset.c \ pcresearch.c noinst_HEADERS = grep.h dfa.h kwset.h search.h system.h diff --git a/src/grep.c b/src/grep.c index a661fc0..cb5e40f 100644 --- a/src/grep.c +++ b/src/grep.c @@ -416,8 +416,9 @@ usable_st_size (struct stat const *st) /* Functions we'll use to search. */ typedef void (*compile_fp_t) (char const *, size_t); typedef size_t (*execute_fp_t) (char const *, size_t, size_t *, char const *); -static compile_fp_t compile; -static execute_fp_t execute; +static void Gcompile (char const *, size_t); +static compile_fp_t compile = Gcompile; +static execute_fp_t execute = EGexecute; /* Like error, but suppress the diagnostic if requested. */ static void @@ -1055,9 +1056,8 @@ do_execute (char const *buf, size_t size, size_t *match_size, size_t result; const char *line_next; - /* With the current implementation, using --ignore-case with a multi-byte - character set is very inefficient when applied to a large buffer - containing many matches. We can avoid much of the wasted effort + /* -Pi in a multibyte locale is inefficient when applied to a large + buffer containing many matches. Avoid much of the wasted effort by matching line-by-line. FIXME: this is just an ugly workaround, and it doesn't really @@ -1067,8 +1067,7 @@ do_execute (char const *buf, size_t size, size_t *match_size, to struct matcher to split the buffer passed to execute. It would perform the memchr if line-by-line matching is necessary, or just return buf + size otherwise. */ - if (! (execute == Fexecute || execute == Pexecute) - || MB_CUR_MAX == 1 || !match_icase) + if (! (execute == Pexecute && match_icase && MB_CUR_MAX > 1)) return execute (buf, size, match_size, start_ptr); for (line_next = buf; line_next < buf + size; ) @@ -1671,6 +1670,47 @@ PAcompile (char const *pattern, size_t size) GEAcompile (pattern, size, RE_SYNTAX_POSIX_AWK); } +static void +Fcompile (char const *keys, size_t len) +{ + /* Implement grep -F in terms of plain grep. */ + + char *new_keys = xnmalloc (len + 1, 2); + char *p = new_keys; + mbstate_t mb_state = { 0 }; + size_t n; + + for (; len; keys += n, len -= n) + { + wchar_t wc; + n = mbrtowc (&wc, keys, len, &mb_state); + switch (n) + { + case (size_t) -2: + n = len; + /* Fall through. */ + default: + p = mempcpy (p, keys, n); + break; + + case (size_t) -1: + memset (&mb_state, 0, sizeof mb_state); + /* Fall through. */ + case 1: + *p = '\\'; + p += strchr ("$*.[\\^", *keys) != NULL; + /* Fall through. */ + case 0: + *p++ = *keys; + n = 1; + break; + } + } + + Gcompile (new_keys, p - new_keys); + free (new_keys); +} + struct matcher { char const name[16]; @@ -1680,7 +1720,7 @@ struct matcher static struct matcher const matchers[] = { { "grep", Gcompile, EGexecute }, { "egrep", Ecompile, EGexecute }, - { "fgrep", Fcompile, Fexecute }, + { "fgrep", Fcompile, EGexecute }, { "awk", Acompile, EGexecute }, { "gawk", GAcompile, EGexecute }, { "posixawk", PAcompile, EGexecute }, @@ -1887,61 +1927,6 @@ parse_grep_colors (void) return; } -/* Return true if PAT (of length PATLEN) contains an encoding error. */ -static bool -contains_encoding_error (char const *pat, size_t patlen) -{ - mbstate_t mbs = { 0 }; - size_t i, charlen; - - for (i = 0; i < patlen; i += charlen + (charlen == 0)) - { - charlen = mbrlen (pat + i, patlen - i, &mbs); - if ((size_t) -2 <= charlen) - return true; - } - return false; -} - -/* Change a pattern for fgrep into grep. */ -static void -fgrep_to_grep_pattern (size_t len, char const *keys, - size_t *new_len, char **new_keys) -{ - char *p = *new_keys = xnmalloc (len + 1, 2); - mbstate_t mb_state = { 0 }; - size_t n; - - for (; len; keys += n, len -= n) - { - wchar_t wc; - n = mbrtowc (&wc, keys, len, &mb_state); - switch (n) - { - case (size_t) -2: - n = len; - /* Fall through. */ - default: - p = mempcpy (p, keys, n); - break; - - case (size_t) -1: - memset (&mb_state, 0, sizeof mb_state); - /* Fall through. */ - case 1: - *p = '\\'; - p += strchr ("$*.[\\^", *keys) != NULL; - /* Fall through. */ - case 0: - *p++ = *keys; - n = 1; - break; - } - } - - *new_len = p - *new_keys; -} - int main (int argc, char **argv) { @@ -1988,8 +1973,6 @@ main (int argc, char **argv) last_recursive = 0; prepended = prepend_default_options (getenv ("GREP_OPTIONS"), &argc, &argv); - compile = matchers[0].compile; - execute = matchers[0].execute; while (prev_optind = optind, (opt = get_nondigit_option (argc, argv, &default_context)) != -1) @@ -2333,23 +2316,6 @@ main (int argc, char **argv) else usage (EXIT_TROUBLE); - /* If fgrep in a multibyte locale, then use grep if either - (1) case is ignored (where grep is typically faster), or - (2) the pattern has an encoding error (where fgrep might not work). */ - if (compile == Fcompile && MB_CUR_MAX > 1 - && (match_icase || contains_encoding_error (keys, keycc))) - { - size_t new_keycc; - char *new_keys; - fgrep_to_grep_pattern (keycc, keys, &new_keycc, &new_keys); - free (keys); - keys = new_keys; - keycc = new_keycc; - matcher = "grep"; - compile = Gcompile; - execute = EGexecute; - } - if (MB_CUR_MAX > 1) build_mbclen_cache (); diff --git a/src/kwsearch.c b/src/kwsearch.c deleted file mode 100644 index 6bd516a..0000000 --- a/src/kwsearch.c +++ /dev/null @@ -1,188 +0,0 @@ -/* kwsearch.c - searching subroutines using kwset for grep. - Copyright 1992, 1998, 2000, 2007, 2009-2014 Free Software Foundation, Inc. - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 3, or (at your option) - any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA - 02110-1301, USA. */ - -/* Written August 1992 by Mike Haertel. */ - -#include <config.h> -#include "search.h" - -/* Whether -w considers WC to be a word constituent. */ -static bool -wordchar (wint_t wc) -{ - return wc == L'_' || iswalnum (wc); -} - -/* KWset compiled pattern. For Ecompile and Gcompile, we compile - a list of strings, at least one of which is known to occur in - any string matching the regexp. */ -static kwset_t kwset; - -void -Fcompile (char const *pattern, size_t size) -{ - size_t total = size; - mb_len_map_t *map = NULL; - char const *pat = (match_icase && MB_CUR_MAX > 1 - ? mbtoupper (pattern, &total, &map) - : pattern); - - kwsinit (&kwset); - - char const *p = pat; - do - { - size_t len; - char const *sep = memchr (p, '\n', total); - if (sep) - { - len = sep - p; - sep++; - total -= (len + 1); - } - else - { - len = total; - total = 0; - } - - char *buf = NULL; - if (match_lines) - { - buf = xmalloc (len + 2); - buf[0] = eolbyte; - memcpy (buf + 1, p, len); - buf[len + 1] = eolbyte; - p = buf; - len += 2; - } - kwsincr (kwset, p, len); - free (buf); - - p = sep; - } - while (p); - - kwsprep (kwset); -} - -/* Apply the MAP (created by mbtoupper) to the uppercase-buffer-relative - *OFF and *LEN, converting them to be relative to the original buffer. */ - -static void -mb_case_map_apply (mb_len_map_t const *map, size_t *off, size_t *len) -{ - if (map) - { - size_t off_incr = 0; - size_t len_incr = 0; - size_t k; - for (k = 0; k < *off; k++) - off_incr += map[k]; - for (; k < *off + *len; k++) - len_incr += map[k]; - *off += off_incr; - *len += len_incr; - } -} - -size_t -Fexecute (char const *buf, size_t size, size_t *match_size, - char const *start_ptr) -{ - char const *beg, *try, *end, *mb_start; - size_t len; - char eol = eolbyte; - struct kwsmatch kwsmatch; - size_t ret_val; - mb_len_map_t *map = NULL; - - if (MB_CUR_MAX > 1) - { - if (match_icase) - { - char *case_buf = mbtoupper (buf, &size, &map); - if (start_ptr) - start_ptr = case_buf + (start_ptr - buf); - buf = case_buf; - } - } - - for (mb_start = beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++) - { - size_t offset = kwsexec (kwset, beg - match_lines, - buf + size - beg + match_lines, &kwsmatch); - if (offset == (size_t) -1) - goto failure; - len = kwsmatch.size[0] - match_lines; - if (!match_lines && MB_CUR_MAX > 1 && !using_utf8 () - && mb_goback (&mb_start, beg + offset, buf + size) != 0) - { - /* The match was a part of multibyte character, advance at least - one byte to ensure no infinite loop happens. */ - beg = mb_start; - continue; - } - beg += offset; - if (start_ptr && !match_words) - goto success_in_beg_and_len; - if (match_lines) - goto success_in_beg_and_len; - if (match_words) - for (try = beg; ; ) - { - if (wordchar (mb_prev_wc (buf, try, buf + size))) - break; - if (wordchar (mb_next_wc (try + len, buf + size))) - { - if (!len) - break; - offset = kwsexec (kwset, beg, --len, &kwsmatch); - if (offset == (size_t) -1) - break; - try = beg + offset; - len = kwsmatch.size[0]; - } - else if (!start_ptr) - goto success; - else - goto success_in_beg_and_len; - } /* for (try) */ - else - goto success; - } /* for (beg in buf) */ - - failure: - return -1; - - success: - if ((end = memchr (beg + len, eol, (buf + size) - (beg + len))) != NULL) - end++; - else - end = buf + size; - while (buf < beg && beg[-1] != eol) - --beg; - len = end - beg; - success_in_beg_and_len:; - size_t off = beg - buf; - mb_case_map_apply (map, &off, &len); - - *match_size = len; - ret_val = off; - return ret_val; -} diff --git a/src/search.h b/src/search.h index 14877bc..ddb8bbd 100644 --- a/src/search.h +++ b/src/search.h @@ -53,10 +53,6 @@ extern wint_t mb_next_wc (char const *, char const *); extern void GEAcompile (char const *, size_t, reg_syntax_t); extern size_t EGexecute (char const *, size_t, size_t *, char const *); -/* kwsearch.c */ -extern void Fcompile (char const *, size_t); -extern size_t Fexecute (char const *, size_t, size_t *, char const *); - /* pcresearch.c */ extern void Pcompile (char const *, size_t); extern size_t Pexecute (char const *, size_t, size_t *, char const *); -- 1.9.0
