Hi.
Here is my proposed patch for merging the byte-to-wide-character caches
in gawk by using the one in dfa.
I renamed the one in dfa to 'btowc_cache' since it caches bytes,
not multibyte characters. This compiles and gets through the test
suite.
I also changed the check on the return value of mbrtowc, since it returns
an unsigned type (size_t) and reports errors as (size_t) -1 or (size_t) -2.
Thanks,
Arnold
diff --git a/awk.h b/awk.h
index 86c8883..636be96 100644
--- a/awk.h
+++ b/awk.h
@@ -1591,10 +1591,6 @@ extern const wchar_t *wcasestrstr(const wchar_t *haystack, size_t hs_len,
const wchar_t *needle, size_t needle_len);
extern void r_free_wstr(NODE *n);
#define free_wstr(n) do { if ((n)->flags & WSTRCUR) r_free_wstr(n); } while(0)
-extern wint_t btowc_cache[];
-#define btowc_cache(x) btowc_cache[(x)&0xFF]
-extern void init_btowc_cache();
-#define is_valid_character(b) (btowc_cache[(b)&0xFF] != WEOF)
/* re.c */
extern Regexp *make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal);
extern int research(Regexp *rp, char *str, int start, size_t len, int flags);
diff --git a/dfa.c b/dfa.c
index fff4599..a2c73b1 100644
--- a/dfa.c
+++ b/dfa.c
@@ -464,10 +464,10 @@ static void regexp (void);
/* A table indexed by byte values that contains the corresponding wide
character (if any) for that byte. WEOF means the byte is not a
valid single-byte character. */
-static wint_t mbrtowc_cache[NOTCHAR];
+wint_t btowc_cache[NOTCHAR];
/* Store into *PWC the result of converting the leading bytes of the
- multibyte buffer S of length N bytes, using the mbrtowc_cache in *D
+ multibyte buffer S of length N bytes, using the btowc_cache in *D
and updating the conversion state in *D. On conversion error,
convert just a single byte, to WEOF. Return the number of bytes
converted.
@@ -476,7 +476,7 @@ static wint_t mbrtowc_cache[NOTCHAR];
* PWC points to wint_t, not to wchar_t.
* The last arg is a dfa *D instead of merely a multibyte conversion
- state D->mbs. D also contains an mbrtowc_cache for speed.
+ state D->mbs. D also contains a btowc_cache for speed.
* N must be at least 1.
* S[N - 1] must be a sentinel byte.
* Shift encodings are not supported.
@@ -487,7 +487,7 @@ static size_t
mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d)
{
unsigned char uc = s[0];
- wint_t wc = mbrtowc_cache[uc];
+ wint_t wc = btowc_cache[uc];
if (wc == WEOF)
{
@@ -695,7 +695,7 @@ static charclass newline;
static bool
unibyte_word_constituent (unsigned char c)
{
- return mbrtowc_cache[c] != WEOF && (isalnum (c) || (c) == '_');
+ return btowc_cache[c] != WEOF && (isalnum (c) || (c) == '_');
}
static int
@@ -718,25 +718,44 @@ wchar_context (wint_t wc)
return CTX_NONE;
}
+void init_btowc_cache(void) /* Fill btowc_cache[] with each byte's wide character, or WEOF. */
+{
+ static bool inited = false; /* becomes true once the table has been filled */
+ int i;
+ /* NOTE(review): dfasyntax used to refill the table on every call; with this guard a later locale change is not picked up -- confirm that is acceptable. */
+ if (inited) /* repeat calls return without touching the table */
+ return;
+
+ for (i = CHAR_MIN; i <= CHAR_MAX; ++i) /* visit every value a plain char can hold */
+ {
+ char c = i;
+ unsigned char uc = i; /* same byte as an unsigned value, used as the table index */
+ mbstate_t s = { 0 }; /* fresh conversion state for each byte */
+ wchar_t wc;
+ size_t ret = mbrtowc (&wc, &c, 1, &s); /* try to convert this one byte on its own */
+ btowc_cache[uc] = (ret == (size_t)-1 || ret == (size_t) -2) ? WEOF : wc; /* either failure code is stored as WEOF */
+ }
+
+ inited = true;
+}
+
/* Entry point to set syntax options. */
void
dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
{
int i;
+
syntax_bits_set = 1;
syntax_bits = bits;
case_fold = fold != 0;
eolbyte = eol;
+ init_btowc_cache();
+ /* Now that btowc_cache[uc] is set, use it to calculate sbit. */
for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
{
- char c = i;
unsigned char uc = i;
- mbstate_t s = { 0 };
- wchar_t wc;
- mbrtowc_cache[uc] = mbrtowc (&wc, &c, 1, &s) <= 1 ? wc : WEOF;
- /* Now that mbrtowc_cache[uc] is set, use it to calculate sbit. */
sbit[uc] = char_context (uc);
switch (sbit[uc])
{
diff --git a/dfa.h b/dfa.h
index 18be7f5..f2dd656 100644
--- a/dfa.h
+++ b/dfa.h
@@ -120,4 +120,15 @@ extern void dfawarn (const char *);
The user must supply a dfaerror. */
extern _Noreturn void dfaerror (const char *);
+/* General support routines. */
+
+/* using_utf8() lets us know if our locale is one based on UTF-8. */
extern int using_utf8 (void);
+
+/* init_btowc_cache() initializes the cache that maps bytes to wide characters. */
+extern void init_btowc_cache(void);
+
+/* is_valid_character() tells us if a byte is, by itself, a valid character (its btowc_cache entry is not WEOF). */
+extern wint_t btowc_cache[];
+#define is_valid_character(byte) (btowc_cache[(byte)&0xFF] != WEOF)
+#define btowc_cache(x) btowc_cache[(x)&0xFF]
diff --git a/node.c b/node.c
index a7c19db..22119d2 100644
--- a/node.c
+++ b/node.c
@@ -949,19 +949,6 @@ get_ieee_magic_val(const char *val)
return v;
}
-wint_t btowc_cache[256];
-
-/* init_btowc_cache --- initialize the cache */
-
-void init_btowc_cache()
-{
- int i;
-
- for (i = 0; i < 255; i++) {
- btowc_cache[i] = btowc(i);
- }
-}
-
#define BLOCKCHUNK 100
BLOCK nextfree[BLOCK_MAX] = {