diff --git a/src/main.c b/src/main.c
index bfd0982..2a3a8af 100644
--- a/src/main.c
+++ b/src/main.c
@@ -46,6 +46,7 @@
 #include "propername.h"
 #include "quote.h"
 #include "safe-read.h"
+#include "search.h"
 #include "version-etc.h"
 #include "xalloc.h"
 #include "xstrtol.h"
@@ -2372,6 +2373,11 @@ main (int argc, char **argv)
         }
     }
 
+#if MBS_SUPPORT
+  if (MB_CUR_MAX > 1)
+    build_mbclen_guess ();
+#endif
+
   compile (keys, keycc);
   free (keys);
 
diff --git a/src/search.h b/src/search.h
index 61dcf95..6d77d34 100644
--- a/src/search.h
+++ b/src/search.h
@@ -46,6 +46,7 @@ typedef signed char mb_len_map_t;
 extern void kwsinit (kwset_t *);
 
 extern char *mbtolower (const char *, size_t *, mb_len_map_t **);
+extern void build_mbclen_guess (void);
 extern bool is_mb_middle (const char **, const char *, const char *, size_t);
 
 /* dfasearch.c */
diff --git a/src/searchutils.c b/src/searchutils.c
index 778f4ad..9493a1b 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -19,9 +19,17 @@
 #include <config.h>
 #include <assert.h>
 #include "search.h"
+#if HAVE_LANGINFO_CODESET
+# include <langinfo.h>
+#endif
 
 #define NCHAR (UCHAR_MAX + 1)
 
+/* Per-lead-byte guess of a multibyte character's length, filled in by
+   build_mbclen_guess.  Entries use the mbrlen return convention:
+   (size_t) -1 for an invalid byte, (size_t) -2 for an incomplete one.  */
+static size_t mbclen_guess[NCHAR];
+
 void
 kwsinit (kwset_t *kwset)
 {
@@ -207,6 +215,55 @@ mbtolower (const char *beg, size_t *n, mb_len_map_t **len_map_p)
 
   return out;
 }
+
+/* Fill mbclen_guess with a length for each possible first byte.  In a
+   UTF-8 locale the table is derived from the lead-byte ranges; otherwise
+   each byte is passed on its own to mbrlen.  Called from main when
+   MB_CUR_MAX > 1, before the pattern is compiled.  */
+void
+build_mbclen_guess (void)
+{
+  mbstate_t mbs;
+  int i;
+
+#if defined HAVE_LANGINFO_CODESET
+  if (strcmp (nl_langinfo (CODESET), "UTF-8") == 0)
+    {
+      /* Continuation bytes (0x80..0xbf) and the bytes that can never
+         start a character keep the "invalid" marker.  */
+      for (i = 0; i < NCHAR; ++i)
+        mbclen_guess[i] = -1;
+
+      for (i = 0x00; i <= 0x7f; ++i)
+        mbclen_guess[i] = 1;
+      /* 0xdf is the last two-byte lead byte, so the bound is inclusive.  */
+      for (i = 0xc2; i <= 0xdf; ++i)
+        mbclen_guess[i] = 2;
+      for (i = 0xe0; i <= 0xef; ++i)
+        mbclen_guess[i] = 3;
+      for (i = 0xf0; i <= 0xf7; ++i)
+        mbclen_guess[i] = 4;
+      for (i = 0xf8; i <= 0xfb; ++i)
+        mbclen_guess[i] = 5;
+      for (i = 0xfc; i <= 0xfd; ++i)
+        mbclen_guess[i] = 6;
+
+      return;
+    }
+#endif
+
+  for (i = 0; i < NCHAR; ++i)
+    {
+      /* Pass the byte itself, not the first byte of the int, whose
+         position depends on endianness.  */
+      unsigned char c = i;
+
+      /* Start each probe from the initial state: a previous call may
+         have left MBS mid-character or unspecified.  */
+      memset (&mbs, '\0', sizeof (mbstate_t));
+      mbclen_guess[i] = mbrlen ((const char *) &c, 1, &mbs);
+    }
+}
 
 bool
 is_mb_middle (const char **good, const char *buf, const char *end,
@@ -215,16 +272,53 @@ is_mb_middle (const char **good, const char *buf, const char *end,
   const char *p = *good;
   const char *prev = p;
   mbstate_t cur_state;
+#if HAVE_LANGINFO_CODESET
+  static int is_utf8 = (strcmp (nl_langinfo (CODESET), "UTF-8") == 0);
+
+  if (!is_utf8)
+#endif
+    memset (&cur_state, 0, sizeof (mbstate_t));
 
-  /* TODO: can be optimized for UTF-8. */
-  memset(&cur_state, 0, sizeof(mbstate_t));
   while (p < buf)
     {
-      size_t mbclen = mbrlen(p, end - p, &cur_state);
+      size_t mbclen = mbclen_guess[(unsigned char) *p];
+
+      if (mbclen == (size_t) -2)
+        mbclen = mbrlen (p, end - p, &cur_state);
+      else if (mbclen == (size_t) -1 || mbclen == 0)
+        mbclen = 1;
+#if defined HAVE_LANGINFO_CODESET
+
+      /* For UTF-8 check the second and subsequent bytes. */
+      else if (is_utf8 && mbclen > (size_t) 1)
+        {
+          char *q = p + mbclen;
+
+          if (q <= buf)
+            {
+              for (++p; p < q; ++p)
+                {
+                  if (*p < 0x80 || 0xbf < *p)
+                    break;
+                }
+            }
+
+          if (p < q)
+            {
+              /* An invalid sequence, or a truncated multibyte character.
+                 We treat it as a single byte character. */
+              prev = p;
+
+              /* Store the beginning of the previous complete multibyte
+                 character. */
+              mbclen = 1;
+            }
+        }
+#endif
 
       /* Store the beginning of the previous complete multibyte character. */
       if (mbclen != (size_t) -2)
-	prev = p;
+        prev = p;
 
       if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
         {