This should help merge changes from Gawk, which always uses the char32_t API though that’s sometimes implemented with the wchar_t API even on platforms where wchar_t and char32_t act differently. The idea is to use char32_t uniformly in both the dfa and regex modules, so that they get consistent answers on all platforms. * lib/dfa.c, lib/localeinfo.c, lib/localeinfo.h: If GAWK, do not include <wctype.h> or redefine the Gnulib char32_t types and functions to be wchar.h and wctype.h functions or define mbszero and streq, as I think I have a better way to do this with Gawk that is less intrusive here; instead, always include <uchar.h>. * lib/dfa.c: Do not include <wchar.h>. Include "gettext.h" before including "xalloc.h" and "localeinfo.h", as Gnulib doesn’t care about the order and this works better with Gawk’s way of overriding Gnulib. (parse_bracket_exp): Use && instead of &; either is correct and both are equally fast nowadays but & triggers a warning in some Gawk compiles. * lib/dfa.h (_GL_ATTRIBUTE_MALLOC, _GL_ATTRIBUTE_DEALLOC) (_GL_ATTRIBUTE_DEALLOC_FREE) (_GL_ATTRIBUTE_RETURNS_NONNULL) [!_GL_ATTRIBUTE_MALLOC]: Remove, as Gawk’s custom.h can define them. * lib/localeinfo.c: Go back to using <verify.h> and ‘verify’ instead of using static_assert which Gawk can’t easily use because it does not use Gnulib’s assert-h module. * lib/localeinfo.h: Do not include <limits.h>, avoiding some namespace pollution. (struct localeinfo): Use (unsigned char) -1 instead of UCHAR_MAX to avoid the need to include <limits.h>. * modules/dfa (Depends-on): Remove wchar-h. --- ChangeLog | 30 ++++++++++++++++++++++++++++++ lib/dfa.c | 25 +++++-------------------- lib/dfa.h | 11 ++--------- lib/localeinfo.c | 21 ++++++--------------- lib/localeinfo.h | 14 +++----------- modules/dfa | 1 - 6 files changed, 46 insertions(+), 56 deletions(-)
diff --git a/ChangeLog b/ChangeLog index 6a07acdad5..13cdd0d1fb 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,35 @@ 2026-04-24 Paul Eggert <[email protected]> + dfa: always use char32_t not wchar_t + This should help merge changes from Gawk, which always uses the + char32_t API though that’s sometimes implemented with the wchar_t + API even on platforms where wchar_t and char32_t act differently. + The idea is to use char32_t uniformly in both the dfa and regex + modules, so that they get consistent answers on all platforms. + * lib/dfa.c, lib/localeinfo.c, lib/localeinfo.h: If GAWK, do not + include <wctype.h> or redefine the Gnulib char32_t types and + functions to be wchar.h and wctype.h functions or define mbszero + and streq, as I think I have a better way to do this with Gawk + that is less intrusive here; instead, always include <uchar.h>. + * lib/dfa.c: Do not include <wchar.h>. Include "gettext.h" before + including "xalloc.h" and "localeinfo.h", as Gnulib doesn’t care + about the order and this works better with Gawk’s way of overriding Gnulib. + (parse_bracket_exp): Use && instead of &; either is correct and + both are equally fast nowadays but & triggers a warning in some + Gawk compiles. + * lib/dfa.h (_GL_ATTRIBUTE_MALLOC, _GL_ATTRIBUTE_DEALLOC) + (_GL_ATTRIBUTE_DEALLOC_FREE) + (_GL_ATTRIBUTE_RETURNS_NONNULL) [!_GL_ATTRIBUTE_MALLOC]: + Remove, as Gawk’s custom.h can define them. + * lib/localeinfo.c: Go back to using <verify.h> and ‘verify’ + instead of using static_assert which Gawk can’t easily use because + it does not use Gnulib’s assert-h module. + * lib/localeinfo.h: Do not include <limits.h>, avoiding some + namespace pollution. + (struct localeinfo): Use (unsigned char) -1 instead of UCHAR_MAX + to avoid the need to include <limits.h>. + * modules/dfa (Depends-on): Remove wchar-h. + uchar-h: <string.h> etc. 
namespace cleanup This is only a partial cleanup; to be cleaner we’d need to move declarations of Gnulib extensions like c32isalpha diff --git a/lib/dfa.c b/lib/dfa.c index 7bbe94405b..7719e80669 100644 --- a/lib/dfa.c +++ b/lib/dfa.c @@ -33,29 +33,14 @@ #include <stdlib.h> #include <limits.h> #include <string.h> -#include <wchar.h> - -#include "xalloc.h" -#include "localeinfo.h" #include "gettext.h" #define _(msgid) dgettext (GNULIB_TEXT_DOMAIN, msgid) -#if GAWK -/* Use ISO C 99 API. */ -# include <wctype.h> -# define char32_t wchar_t -# define mbrtoc32 mbrtowc -# define c32rtomb wcrtomb -# define c32tob wctob -# define c32isprint iswprint -# define c32isspace iswspace -# define mbszero(p) memset (p, 0, sizeof (mbstate_t)) -# define streq(a, b) (strcmp (a, b) == 0) -#else -/* Use ISO C 11 + gnulib API. */ -# include <uchar.h> -#endif +#include "xalloc.h" +#include "localeinfo.h" + +#include <uchar.h> /* Pacify gcc -Wanalyzer-null-dereference in areas where GCC understandably cannot deduce that the input comes from a @@ -1099,7 +1084,7 @@ parse_bracket_exp (struct dfa *dfa) if (wc != wc2 || wc == WEOF) { if (dfa->localeinfo.simple - || (c_isdigit (c) & c_isdigit (c2))) + || (c_isdigit (c) && c_isdigit (c2))) { for (int ci = c; ci <= c2; ci++) if (dfa->syntax.case_fold && isalpha (ci)) diff --git a/lib/dfa.h b/lib/dfa.h index f3454f0c0e..71a944176e 100644 --- a/lib/dfa.h +++ b/lib/dfa.h @@ -19,7 +19,8 @@ #ifndef DFA_H_ #define DFA_H_ -/* This file uses _Noreturn, _GL_ATTRIBUTE_DEALLOC, _GL_ATTRIBUTE_MALLOC, +/* This file uses _Noreturn, _GL_ATTRIBUTE_DEALLOC, + _GL_ATTRIBUTE_DEALLOC_FREE, _GL_ATTRIBUTE_MALLOC, _GL_ATTRIBUTE_PURE, _GL_ATTRIBUTE_RETURNS_NONNULL. */ #if !_GL_CONFIG_H_INCLUDED #error "Please include config.h first." @@ -49,14 +50,6 @@ struct dfamust /* The dfa structure. It is completely opaque. */ struct dfa; -/* Needed when Gnulib is not used. 
*/ -#ifndef _GL_ATTRIBUTE_MALLOC -# define _GL_ATTRIBUTE_MALLOC -# define _GL_ATTRIBUTE_DEALLOC(f, i) -# define _GL_ATTRIBUTE_DEALLOC_FREE -# define _GL_ATTRIBUTE_RETURNS_NONNULL -#endif - /* Entry points. */ /* Allocate a struct dfa. The struct dfa is completely opaque. diff --git a/lib/localeinfo.c b/lib/localeinfo.c index 89a169a462..3d1ee529a7 100644 --- a/lib/localeinfo.c +++ b/lib/localeinfo.c @@ -21,25 +21,16 @@ #include <localeinfo.h> +#include <verify.h> + #include <limits.h> #include <locale.h> #include <stdlib.h> #include <string.h> -#if GAWK -/* Use ISO C 99 API. */ -# include <wctype.h> -# define char32_t wchar_t -# define mbrtoc32 mbrtowc -# define c32tolower towlower -# define c32toupper towupper -# define mbszero(p) memset (p, 0, sizeof (mbstate_t)) -#else -/* Use ISO C 11 + gnulib API. */ -# include <uchar.h> -#endif +#include <uchar.h> /* The sbclen implementation relies on this. */ -static_assert (MB_LEN_MAX <= SCHAR_MAX); +verify (MB_LEN_MAX <= SCHAR_MAX); /* Return true if the locale uses UTF-8. */ @@ -130,8 +121,8 @@ static unsigned short int const lonesome_lower[] = /* Verify that the worst case fits. This is 1 for towupper, 1 for towlower, and 1 for each entry in LONESOME_LOWER. */ -static_assert (1 + 1 + sizeof lonesome_lower / sizeof *lonesome_lower - <= CASE_FOLDED_BUFSIZE); +verify (1 + 1 + sizeof lonesome_lower / sizeof *lonesome_lower + <= CASE_FOLDED_BUFSIZE); /* Find the characters equal to C after case-folding, other than C itself, and store them into FOLDED. Return the number of characters diff --git a/lib/localeinfo.h b/lib/localeinfo.h index 7e6ecbc014..3bc84564be 100644 --- a/lib/localeinfo.h +++ b/lib/localeinfo.h @@ -17,15 +17,7 @@ /* Written by Paul Eggert. */ -#include <limits.h> -#include <wchar.h> -#if GAWK -/* Use ISO C 99 API. */ -# define char32_t wchar_t -#else -/* Use ISO C 11 + gnulib API. 
*/ -# include <uchar.h> -#endif +#include <uchar.h> #ifdef __cplusplus extern "C" { @@ -50,13 +42,13 @@ struct localeinfo single-byte character, -1 if B is an encoding error, and -2 if B is the leading byte of a multibyte character that contains more than one byte. */ - signed char sbclen[UCHAR_MAX + 1]; + signed char sbclen[(unsigned char) -1 + 1]; /* An array indexed by byte values B that contains the corresponding 32-bit wide character (if any) for B if sbclen[B] == 1. WEOF means the byte is not a valid single-byte character, i.e., sbclen[B] == -1 or -2. */ - wint_t sbctowc[UCHAR_MAX + 1]; + wint_t sbctowc[(unsigned char) -1 + 1]; }; extern void init_localeinfo (struct localeinfo *); diff --git a/modules/dfa b/modules/dfa index 5f9610109e..f06d747a6d 100644 --- a/modules/dfa +++ b/modules/dfa @@ -38,7 +38,6 @@ uchar-h # to enable it. #uchar-h-c23 verify -wchar-h xalloc xalloc-die -- 2.53.0
