On Fri, 19 Dec 2014 00:54:58 +0900
Norihiro Tanaka <[email protected]> wrote:
> On Thu, 18 Dec 2014 01:40:18 -0800
> Thanks, I understood that you said. You are right. I changed the patch
> so that always_character_boundary is not pruned even if WCP != NULL, and
> fixed the API document.
I fixed a mismatch with the comment. It does not changes the behavior.
We expect that skip_remains_mb() is inlined, and "*WCP = WC" is merged
into "IF (P < MBP)" in caller.
--
+ exceeds P. If WCP is non-NULL and the result is greater than p, set
+ *WCP to the final wide character processed, or if no wide character
+ is processed, set it to WEOF. Both P and MBP must be no larger than
+ END.
........
- if (wcp != NULL)
+ if (wcp != NULL && p < mbp)
From 0ac87ef3d35e1bd14dc0cf1d3c4a62e335c88f5e Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <[email protected]>
Date: Mon, 15 Dec 2014 23:40:17 +0900
Subject: [PATCH] dfa: improvement for checking of multibyte character boundary
When found single bytes that cannot occur inside a multibyte character
we can skip check for multibyte character boundary before the character.
The improvement speeds up about 40% for input string which doesn't match
even the first part of a pattern.
* src/dfa.c (always_character_boundary): Add a new variable. It caches
whether each byte can occur inside a multibyte character or not.
(dfaalwayscb): Add a new function.
(dfacomp): Use it.
(skip_remains_mb): If an input character is single bytes that cannot
occur inside a multibyte character, skip check for multibyte character
boundary until there.
---
src/dfa.c | 38 ++++++++++++++++++++++++++++++++++----
1 file changed, 34 insertions(+), 4 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index 806cb04..2406387 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -451,6 +451,29 @@ struct dfa
static void dfamust (struct dfa *dfa);
static void regexp (void);
+/* True if each byte can not occur inside a multibyte character */
+static bool always_character_boundary[NOTCHAR];
+
+static void
+dfaalwayscb (void)
+{
+ int i;
+ if (using_utf8 ())
+ {
+ for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
+ {
+ unsigned char uc = i;
+ always_character_boundary[uc] = !((uc & 0xc0) ^ 0x80);
+ }
+ }
+ else
+ {
+ unsigned char const ucs[] = { '\0', '\n', '\r', '.', '/' };
+ for (i = 0; i < sizeof ucs / sizeof ucs[0]; ++i)
+ always_character_boundary[ucs[i]] = true;
+ }
+}
+
static void
dfambcache (struct dfa *d)
{
@@ -3271,18 +3294,24 @@ transit_state (struct dfa *d, state_num s, unsigned
char const **pp,
character.
Given DFA state d, use mbs_to_wchar to advance MBP until it reaches or
- exceeds P. If WCP is non-NULL, set *WCP to the final wide character
- processed, or if no wide character is processed, set it to WEOF.
- Both P and MBP must be no larger than END. */
+ exceeds P. If WCP is non-NULL and the result is greater than p, set
+ *WCP to the final wide character processed, or if no wide character
+ is processed, set it to WEOF. Both P and MBP must be no larger than
+ END.
+
+ If next character is character boundary, it always reaches P. So if
+ WCP is NULL, can omit check step by step and immediately return P. */
static unsigned char const *
skip_remains_mb (struct dfa *d, unsigned char const *p,
unsigned char const *mbp, char const *end, wint_t *wcp)
{
wint_t wc = WEOF;
+ if (always_character_boundary[*p])
+ return p;
while (mbp < p)
mbp += mbs_to_wchar (&wc, (char const *) mbp,
end - (char const *) mbp, d);
- if (wcp != NULL)
+ if (wcp != NULL && p < mbp)
*wcp = wc;
return mbp;
}
@@ -3713,6 +3742,7 @@ void
dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
{
dfainit (d);
+ dfaalwayscb ();
dfambcache (d);
dfaparse (s, len, d);
dfamust (d);
--
2.2.0