For many years, processing multibyte strings required the mbrtowc family of
functions and the 'wchar_t' type.
The major limitation of this API is that on Windows platforms (Cygwin as well
as native Windows) and in 32-bit mode on AIX, a 'wchar_t' is limited to 16 bits,
and this causes all sorts of bugs with characters outside the Unicode BMP.
Before 2010, we thought that this would only impact rarely used Chinese
characters. But nowadays, emoticons are in Unicode, outside the BMP, and
are frequently used on the web. So, supporting characters outside the BMP
has become more important.
In 2011, ISO C added the 'char32_t' type as a "32-bit wide character" type.
By now, many OSes provide this type and the corresponding mbrtoc32 function.
Elements of this type are actual Unicode code points. The ISO C 11 standard
only hinted at this, but ISO C 23 actually requires it. All platforms that
have the mbrtoc32 function fulfil this requirement, and Gnulib's substitute
(module 'mbrtoc32') does so as well.
In particular, on glibc systems: since glibc 2.24, mbrtoc32 is identical to
mbrtowc. And the Gnulib convenience functions for char32_t characters
just delegate to the corresponding glibc functions for wchar_t wide characters.
So, we are now in a position to support characters outside the BMP in GNU
programs overall and in a portable and maintainable way.
I added some documentation a month ago:
https://www.gnu.org/software/gnulib/manual/html_node/Strings-and-Characters.html
The migration from wchar_t to char32_t can be done by writing 'char32_t'
instead of 'wchar_t', and replacing function names according to this table:
wchar_t char32_t
------- --------
7.31.2
*wprintf -- rarely used
*wscanf -- rarely used
7.31.3
fgetwc -- rarely used, see "The wchar_t mess"
fputwc -- rarely used
7.31.4.1
wcsto{f,d,ld} -- rarely used
wcsto{l,ll,ul,ull} -- rarely used
7.31.4.2
wcscpy u32_strcpy
wcsncpy u32_strncpy
wmemcpy u32_cpy
wmemmove u32_move
7.31.4.3
wcscat u32_strcat
wcsncat u32_strncat
7.31.4.4
wcscmp u32_strcmp
wcscoll u32_strcoll
wcsncmp u32_strncmp
wcsxfrm -- rarely used
wmemcmp u32_cmp
7.31.4.5/6
wcschr u32_strchr
wcscspn u32_strcspn
wcspbrk u32_strpbrk
wcsrchr u32_strrchr
wcsspn u32_strspn
wcsstr u32_strstr
wcstok u32_strtok
wmemchr u32_chr
7.31.4.7
wcslen u32_strlen
wmemset u32_set
7.31.5
wcsftime -- rarely used
7.31.6.1
btowc btoc32
wctob c32tob
7.31.6.2
mbsinit mbsinit
7.31.6.3
mbrlen -- rarely used, use mbrtoc32 instead
mbrtowc mbrtoc32
wcrtomb c32rtomb
7.31.6.4
mbsrtowcs mbsrtoc32s
wcsrtombs c32srtombs
7.32.2.1
iswalnum c32isalnum
iswalpha c32isalpha
iswblank c32isblank
iswcntrl c32iscntrl
iswdigit c32isdigit
iswgraph c32isgraph
iswlower c32islower
iswprint c32isprint
iswpunct c32ispunct
iswspace c32isspace
iswupper c32isupper
iswxdigit c32isxdigit
7.32.2.2
iswctype -- rarely used
wctype -- rarely used
7.32.3.1
towlower c32tolower
towupper c32toupper
7.32.3.2
towctrans -- rarely used
wctrans -- rarely used
POSIX
wcwidth c32width
wcswidth c32swidth
Paul has already started this migration, in diffutils:
https://git.savannah.gnu.org/gitweb/?p=diffutils.git;a=commitdiff;h=a2e301b52cc5bdb44540aa66860dc59fa1fa5a89
In Gnulib, the following areas will need migration:
* lib/mbchar.h
lib/mbiter.h
lib/mbuiter.h
Draft patch attached.
* lib/dfa.c
lib/localeinfo.h
lib/localeinfo.c
Needs to be carefully done, so as to not break gawk.
* lib/regcomp.c
lib/regexec.c
lib/regex_internal.h
lib/regex_internal.c
Needs to be done in a way that is acceptable to glibc upstream.
* lib/fnmatch.c
Likewise.
* lib/exclude.c
* lib/nstrftime.c
* lib/quotearg.c
Bruno
diff --git a/doc/strings.texi b/doc/strings.texi
index aa0830f1a5..73cb56120f 100644
--- a/doc/strings.texi
+++ b/doc/strings.texi
@@ -44,7 +44,7 @@
functions, standardized by ISO C and POSIX, that assume this
representation of strings.
-An @emph{character encoding}, or @emph{encoding} for short, describes
+A @emph{character encoding}, or @emph{encoding} for short, describes
how the elements of a character set are represented as a sequence of
bytes. For example, in the @code{ASCII} encoding, the UNDERSCORE
character is represented by a single byte, with value 0x5F. As another
diff --git a/lib/exclude.c b/lib/exclude.c
index 7bd0ec8c71..af204cd300 100644
--- a/lib/exclude.c
+++ b/lib/exclude.c
@@ -209,10 +209,10 @@ string_hasher_ci (void const *data, size_t n_buckets)
for (mbui_init (iter, p); mbui_avail (iter); mbui_advance (iter))
{
mbchar_t m = mbui_cur (iter);
- wchar_t wc;
+ char32_t wc;
if (m.wc_valid)
- wc = towlower (m.wc);
+ wc = c32tolower (m.wc);
else
wc = *m.ptr;
diff --git a/lib/mbchar.h b/lib/mbchar.h
index a2ff1d8b21..c183772cc6 100644
--- a/lib/mbchar.h
+++ b/lib/mbchar.h
@@ -17,10 +17,10 @@
/* Written by Bruno Haible <[email protected]>. */
/* A multibyte character is a short subsequence of a char* string,
- representing a single wide character.
+ representing a single 32-bit wide character.
- We use multibyte characters instead of wide characters because of
- the following goals:
+ We use multibyte characters instead of 32-bit wide characters because
+ of the following goals:
1) correct multibyte handling, i.e. operate according to the LC_CTYPE
locale,
2) ease of maintenance, i.e. the maintainer needs not know all details
@@ -28,8 +28,7 @@
3) don't fail grossly if the input is not in the encoding set by the
locale, because often different encodings are in use in the same
countries (ISO-8859-1/UTF-8, EUC-JP/Shift_JIS, ...),
- 4) fast in the case of ASCII characters,
- 5) portability, i.e. don't make unportable assumptions about wchar_t.
+ 4) fast in the case of ASCII characters.
Multibyte characters are only accessed through the mb* macros.
@@ -150,8 +149,7 @@
#endif
#include <string.h>
-#include <wchar.h>
-#include <wctype.h>
+#include <uchar.h>
_GL_INLINE_HEADER_BEGIN
#ifndef MBCHAR_INLINE
@@ -164,8 +162,8 @@ struct mbchar
{
const char *ptr; /* pointer to current character */
size_t bytes; /* number of bytes of current character, > 0 */
- bool wc_valid; /* true if wc is a valid wide character */
- wchar_t wc; /* if wc_valid: the current character */
+ bool wc_valid; /* true if wc is a valid 32-bit wide character */
+ char32_t wc; /* if wc_valid: the current character */
char buf[MBCHAR_BUF_SIZE]; /* room for the bytes, used for file input only */
};
@@ -184,7 +182,7 @@ typedef struct mbchar mbchar_t;
#define mb_cmp(mbc1, mbc2) \
((mbc1).wc_valid \
? ((mbc2).wc_valid \
- ? (int) (mbc1).wc - (int) (mbc2).wc \
+ ? _GL_CMP ((mbc1).wc, (mbc2).wc) \
: -1) \
: ((mbc2).wc_valid \
? 1 \
@@ -196,7 +194,7 @@ typedef struct mbchar mbchar_t;
#define mb_casecmp(mbc1, mbc2) \
((mbc1).wc_valid \
? ((mbc2).wc_valid \
- ? (int) towlower ((mbc1).wc) - (int) towlower ((mbc2).wc) \
+ ? _GL_CMP (c32tolower ((mbc1).wc), c32tolower ((mbc2).wc)) \
: -1) \
: ((mbc2).wc_valid \
? 1 \
@@ -212,25 +210,25 @@ typedef struct mbchar mbchar_t;
&& memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes) == 0)
#define mb_caseequal(mbc1, mbc2) \
((mbc1).wc_valid && (mbc2).wc_valid \
- ? towlower ((mbc1).wc) == towlower ((mbc2).wc) \
+ ? c32tolower ((mbc1).wc) == c32tolower ((mbc2).wc) \
: (mbc1).bytes == (mbc2).bytes \
&& memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes) == 0)
/* <ctype.h>, <wctype.h> classification. */
#define mb_isascii(mbc) \
((mbc).wc_valid && (mbc).wc >= 0 && (mbc).wc <= 127)
-#define mb_isalnum(mbc) ((mbc).wc_valid && iswalnum ((mbc).wc))
-#define mb_isalpha(mbc) ((mbc).wc_valid && iswalpha ((mbc).wc))
-#define mb_isblank(mbc) ((mbc).wc_valid && iswblank ((mbc).wc))
-#define mb_iscntrl(mbc) ((mbc).wc_valid && iswcntrl ((mbc).wc))
-#define mb_isdigit(mbc) ((mbc).wc_valid && iswdigit ((mbc).wc))
-#define mb_isgraph(mbc) ((mbc).wc_valid && iswgraph ((mbc).wc))
-#define mb_islower(mbc) ((mbc).wc_valid && iswlower ((mbc).wc))
-#define mb_isprint(mbc) ((mbc).wc_valid && iswprint ((mbc).wc))
-#define mb_ispunct(mbc) ((mbc).wc_valid && iswpunct ((mbc).wc))
-#define mb_isspace(mbc) ((mbc).wc_valid && iswspace ((mbc).wc))
-#define mb_isupper(mbc) ((mbc).wc_valid && iswupper ((mbc).wc))
-#define mb_isxdigit(mbc) ((mbc).wc_valid && iswxdigit ((mbc).wc))
+#define mb_isalnum(mbc) ((mbc).wc_valid && c32isalnum ((mbc).wc))
+#define mb_isalpha(mbc) ((mbc).wc_valid && c32isalpha ((mbc).wc))
+#define mb_isblank(mbc) ((mbc).wc_valid && c32isblank ((mbc).wc))
+#define mb_iscntrl(mbc) ((mbc).wc_valid && c32iscntrl ((mbc).wc))
+#define mb_isdigit(mbc) ((mbc).wc_valid && c32isdigit ((mbc).wc))
+#define mb_isgraph(mbc) ((mbc).wc_valid && c32isgraph ((mbc).wc))
+#define mb_islower(mbc) ((mbc).wc_valid && c32islower ((mbc).wc))
+#define mb_isprint(mbc) ((mbc).wc_valid && c32isprint ((mbc).wc))
+#define mb_ispunct(mbc) ((mbc).wc_valid && c32ispunct ((mbc).wc))
+#define mb_isspace(mbc) ((mbc).wc_valid && c32isspace ((mbc).wc))
+#define mb_isupper(mbc) ((mbc).wc_valid && c32isupper ((mbc).wc))
+#define mb_isxdigit(mbc) ((mbc).wc_valid && c32isxdigit ((mbc).wc))
/* Extra <wchar.h> function. */
@@ -238,12 +236,12 @@ typedef struct mbchar mbchar_t;
#define MB_UNPRINTABLE_WIDTH 1
MBCHAR_INLINE int
-mb_width_aux (wint_t wc)
+mb_width_aux (char32_t wc)
{
- int w = wcwidth (wc);
+ int w = c32width (wc);
/* For unprintable characters, arbitrarily return 0 for control characters
and MB_UNPRINTABLE_WIDTH otherwise. */
- return (w >= 0 ? w : iswcntrl (wc) ? 0 : MB_UNPRINTABLE_WIDTH);
+ return (w >= 0 ? w : c32iscntrl (wc) ? 0 : MB_UNPRINTABLE_WIDTH);
}
#define mb_width(mbc) \
diff --git a/lib/mbfile.h b/lib/mbfile.h
index 3482f394b9..7c6d70fcae 100644
--- a/lib/mbfile.h
+++ b/lib/mbfile.h
@@ -110,7 +110,7 @@ mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi *mbf)
{
/* These characters are part of the basic character set. ISO C 99
guarantees that their wide character code is identical to their
- char code. */
+ char code. The 32-bit wide character code is the same as well. */
mbc->wc = mbc->buf[0] = mbf->buf[0];
mbc->wc_valid = true;
mbc->ptr = &mbc->buf[0];
@@ -136,7 +136,7 @@ mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi *mbf)
behaviour will clobber it. */
mbstate_t backup_state = mbf->state;
- bytes = mbrtowc (&mbc->wc, &mbf->buf[0], mbf->bufcount, &mbf->state);
+ bytes = mbrtoc32 (&mbc->wc, &mbf->buf[0], mbf->bufcount, &mbf->state);
if (bytes == (size_t) -1)
{
@@ -178,7 +178,7 @@ mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi *mbf)
{
if (bytes == 0)
{
- /* A null wide character was encountered. */
+ /* A null 32-bit wide character was encountered. */
bytes = 1;
assert (mbf->buf[0] == '\0');
assert (mbc->wc == 0);
diff --git a/lib/mbiter.h b/lib/mbiter.h
index 7b41870b55..93bad990a1 100644
--- a/lib/mbiter.h
+++ b/lib/mbiter.h
@@ -90,7 +90,7 @@
#include <assert.h>
#include <stddef.h>
#include <string.h>
-#include <wchar.h>
+#include <uchar.h>
#include "mbchar.h"
@@ -106,11 +106,11 @@ struct mbiter_multi
mbstate_t state; /* if in_shift: current shift state */
bool next_done; /* true if mbi_avail has already filled the following */
struct mbchar cur; /* the current character:
- const char *cur.ptr pointer to current character
+ const char *cur.ptr pointer to current character
The following are only valid after mbi_avail.
- size_t cur.bytes number of bytes of current character
- bool cur.wc_valid true if wc is a valid wide character
- wchar_t cur.wc if wc_valid: the current character
+ size_t cur.bytes number of bytes of current character
+ bool cur.wc_valid true if wc is a valid 32-bit wide character
+ char32_t cur.wc if wc_valid: the current character
*/
};
@@ -136,8 +136,8 @@ mbiter_multi_next (struct mbiter_multi *iter)
assert (mbsinit (&iter->state));
iter->in_shift = true;
with_shift:
- iter->cur.bytes = mbrtowc (&iter->cur.wc, iter->cur.ptr,
- iter->limit - iter->cur.ptr, &iter->state);
+ iter->cur.bytes = mbrtoc32 (&iter->cur.wc, iter->cur.ptr,
+ iter->limit - iter->cur.ptr, &iter->state);
if (iter->cur.bytes == (size_t) -1)
{
/* An invalid multibyte sequence was encountered. */
diff --git a/lib/mbscasestr.c b/lib/mbscasestr.c
index d92b847ba7..0753aeb864 100644
--- a/lib/mbscasestr.c
+++ b/lib/mbscasestr.c
@@ -64,7 +64,7 @@ knuth_morris_pratt_multibyte (const char *haystack, const char *needle,
{
mb_copy (&needle_mbchars[j], &mbui_cur (iter));
if (needle_mbchars[j].wc_valid)
- needle_mbchars[j].wc = towlower (needle_mbchars[j].wc);
+ needle_mbchars[j].wc = c32tolower (needle_mbchars[j].wc);
}
}
@@ -152,7 +152,7 @@ knuth_morris_pratt_multibyte (const char *haystack, const char *needle,
mb_copy (&c, &mbui_cur (phaystack));
if (c.wc_valid)
- c.wc = towlower (c.wc);
+ c.wc = c32tolower (c.wc);
if (mb_equal (needle_mbchars[j], c))
{
j++;
@@ -237,7 +237,7 @@ mbscasestr (const char *haystack, const char *needle)
mb_copy (&b, &mbui_cur (iter_needle));
if (b.wc_valid)
- b.wc = towlower (b.wc);
+ b.wc = c32tolower (b.wc);
mbui_init (iter_haystack, haystack);
for (;; mbui_advance (iter_haystack))
@@ -279,7 +279,7 @@ mbscasestr (const char *haystack, const char *needle)
comparison_count++;
mb_copy (&c, &mbui_cur (iter_haystack));
if (c.wc_valid)
- c.wc = towlower (c.wc);
+ c.wc = c32tolower (c.wc);
if (mb_equal (c, b))
/* The first character matches. */
{
diff --git a/lib/mbuiter.h b/lib/mbuiter.h
index 7a619f19e1..632def10c5 100644
--- a/lib/mbuiter.h
+++ b/lib/mbuiter.h
@@ -114,11 +114,11 @@ struct mbuiter_multi
mbstate_t state; /* if in_shift: current shift state */
bool next_done; /* true if mbui_avail has already filled the following */
struct mbchar cur; /* the current character:
- const char *cur.ptr pointer to current character
+ const char *cur.ptr pointer to current character
The following are only valid after mbui_avail.
- size_t cur.bytes number of bytes of current character
- bool cur.wc_valid true if wc is a valid wide character
- wchar_t cur.wc if wc_valid: the current character
+ size_t cur.bytes number of bytes of current character
+ bool cur.wc_valid true if wc is a valid 32-bit wide character
+ char32_t cur.wc if wc_valid: the current character
*/
};
@@ -144,9 +144,9 @@ mbuiter_multi_next (struct mbuiter_multi *iter)
assert (mbsinit (&iter->state));
iter->in_shift = true;
with_shift:
- iter->cur.bytes = mbrtowc (&iter->cur.wc, iter->cur.ptr,
- strnlen1 (iter->cur.ptr, MB_CUR_MAX),
- &iter->state);
+ iter->cur.bytes = mbrtoc32 (&iter->cur.wc, iter->cur.ptr,
+ strnlen1 (iter->cur.ptr, MB_CUR_MAX),
+ &iter->state);
if (iter->cur.bytes == (size_t) -1)
{
/* An invalid multibyte sequence was encountered. */
diff --git a/modules/exclude b/modules/exclude
index 841dd826cd..93bfdaf4cf 100644
--- a/modules/exclude
+++ b/modules/exclude
@@ -7,6 +7,7 @@ lib/exclude.c
Depends-on:
assert-h
+c32tolower
filename
fnmatch
fopen-gnu
diff --git a/modules/mbchar b/modules/mbchar
index b1fa0fa4ac..51a1c8e1b9 100644
--- a/modules/mbchar
+++ b/modules/mbchar
@@ -10,12 +10,21 @@ Depends-on:
extensions
extern-inline
stdbool
-wchar
-wctype-h
-iswblank
-iswdigit
-iswxdigit
-wcwidth
+uchar
+c32isalnum
+c32isalpha
+c32isblank
+c32iscntrl
+c32isdigit
+c32isgraph
+c32islower
+c32isprint
+c32ispunct
+c32isspace
+c32isupper
+c32isxdigit
+c32tolower
+c32width
memcmp
configure.ac:
diff --git a/modules/mbiter b/modules/mbiter
index 42305d62cd..082afd42f2 100644
--- a/modules/mbiter
+++ b/modules/mbiter
@@ -10,9 +10,9 @@ m4/mbrtowc.m4
Depends-on:
extern-inline
mbchar
-mbrtowc
+mbrtoc32
mbsinit
-wchar
+uchar
stdbool
configure.ac:
diff --git a/modules/mbscasestr b/modules/mbscasestr
index 2892c2fc2b..672cac8960 100644
--- a/modules/mbscasestr
+++ b/modules/mbscasestr
@@ -11,6 +11,7 @@ stdbool
string
mbslen
malloca
+c32tolower
strnlen
configure.ac:
diff --git a/modules/mbuiter b/modules/mbuiter
index b9e41031d5..63a11ff2f5 100644
--- a/modules/mbuiter
+++ b/modules/mbuiter
@@ -10,9 +10,9 @@ m4/mbrtowc.m4
Depends-on:
extern-inline
mbchar
-mbrtowc
+mbrtoc32
mbsinit
-wchar
+uchar
stdbool
strnlen1