diff --git a/src/main.c b/src/main.c
index 3f16061..738c130 100644
--- a/src/main.c
+++ b/src/main.c
@@ -46,6 +46,7 @@
 #include "propername.h"
 #include "quote.h"
 #include "safe-read.h"
+#include "search.h"
 #include "version-etc.h"
 #include "xalloc.h"
 #include "xstrtol.h"
@@ -2364,6 +2365,11 @@ main (int argc, char **argv)
         }
     }
 
+#if MBS_SUPPORT
+  if (MB_CUR_MAX > 1)
+    build_mbclen_cache ();
+#endif
+
   compile (keys, keycc);
   free (keys);
 
diff --git a/src/search.h b/src/search.h
index 61dcf95..12d0822 100644
--- a/src/search.h
+++ b/src/search.h
@@ -46,6 +46,7 @@ typedef signed char mb_len_map_t;
 extern void kwsinit (kwset_t *);
 
 extern char *mbtolower (const char *, size_t *, mb_len_map_t **);
+extern void build_mbclen_cache (void);
 extern bool is_mb_middle (const char **, const char *, const char *, size_t);
 
 /* dfasearch.c */
diff --git a/src/searchutils.c b/src/searchutils.c
index 778f4ad..6fee756 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -19,9 +19,18 @@
 #include
 #include
 #include "search.h"
+#if HAVE_LANGINFO_CODESET
+# include <langinfo.h>
+#endif
+
+#define STREQ(a, b) (strcmp (a, b) == 0)
 
 #define NCHAR (UCHAR_MAX + 1)
 
+/* mbclen_cache[B] is the value mbrlen returned for the single byte B
+   when build_mbclen_cache was run. */
+static size_t mbclen_cache[NCHAR];
+
 void
 kwsinit (kwset_t *kwset)
 {
@@ -207,6 +216,23 @@ mbtolower (const char *beg, size_t *n, mb_len_map_t **len_map_p)
 
   return out;
 }
+
+/* Fill mbclen_cache: for each possible byte value, store what mbrlen
+   returns for that byte alone, starting from a zeroed mbstate_t.
+   main calls this before compile when MB_CUR_MAX > 1. */
+void
+build_mbclen_cache (void)
+{
+  int i;
+
+  for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
+    {
+      char c = i;
+      unsigned char uc = i;
+      mbstate_t mbs = { 0 };
+      mbclen_cache[uc] = mbrlen (&c, 1, &mbs);
+    }
+}
 
 bool
 is_mb_middle (const char **good, const char *buf, const char *end,
@@ -215,16 +241,41 @@ is_mb_middle (const char **good, const char *buf, const char *end,
   const char *p = *good;
   const char *prev = p;
   mbstate_t cur_state;
+#if HAVE_LANGINFO_CODESET
+  /* 1 if the locale's codeset is UTF-8, 0 if not, -1 if not yet known. */
+  static int is_utf8 = -1;
+
+  if (is_utf8 == -1)
+    is_utf8 = STREQ (nl_langinfo (CODESET), "UTF-8");
+
+  /* NOTE(review): P is reset to BUF before the loop test, so
+     BUF - P > MB_CUR_MAX is false at once and P stays at BUF; confirm
+     whether a backward scan of up to MB_CUR_MAX bytes was intended. */
+  if (is_utf8 && buf - p > MB_CUR_MAX)
+    {
+      for (p = buf; buf - p > MB_CUR_MAX; p--)
+        if (mbclen_cache[(unsigned char) *p] != (size_t) -1)
+          break;
+
+      if (buf - p == MB_CUR_MAX)
+        p = buf;
+    }
+#endif
+
+  memset(&cur_state, 0, sizeof cur_state);
 
-  /* TODO: can be optimized for UTF-8. */
-  memset(&cur_state, 0, sizeof(mbstate_t));
   while (p < buf)
     {
-      size_t mbclen = mbrlen(p, end - p, &cur_state);
+      /* Take the cached single-byte result; only (size_t) -2, an
+         incomplete sequence, needs mbrlen on the remaining bytes. */
+      size_t mbclen = mbclen_cache[(unsigned char) *p];
+
+      if (mbclen == (size_t) -2)
+        mbclen = mbrlen (p, end - p, &cur_state);
 
       /* Store the beginning of the previous complete multibyte character. */
       if (mbclen != (size_t) -2)
-	prev = p;
+        prev = p;
 
       if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
         {