Running a testdir of the modules
mbrtowc mbrlen mbslen mbsstr mbmemcasecoll
on NetBSD 10 shows three test failures:
FAIL: test-mbmemcasecoll-3.sh
=============================
../../gltests/test-mbmemcasecmp.h:432: assertion 'my_casecmp (input1, countof
(input1), input2, countof (input2)) == 0' failed
[1] Abort trap (core dumped) LC_ALL="${testlocale}" ${CHECKER} ./test-mbmem...
FAIL test-mbmemcasecoll-3.sh (exit status: 134)
FAIL: test-mbslen.sh
====================
../../gltests/test-mbslen.c:62: assertion 'mbslen ("\341") == 1' failed
[1] Abort trap (core dumped) LC_ALL="${testlocale}" ${CHECKER} ./test-mbsle...
FAIL test-mbslen.sh (exit status: 134)
FAIL: test-mbsstr2.sh
=====================
../../gltests/test-mbsstr2.c:127: assertion 'result == input + 1' failed
[1] Abort trap (core dumped) LC_ALL="${testlocale}" ${CHECKER} ./test-mbsst...
FAIL test-mbsstr2.sh (exit status: 134)
This patch fixes it by doing the mbrtowc processing in UTF-8 locales ourselves.
2026-06-02 Bruno Haible <[email protected]>
mbrtowc, mbrlen: Work around a NetBSD bug in UTF-8 locales.
* m4/mbrtowc.m4 (gl_MBRTOWC_INVALID_UTF8): New macro.
(gl_FUNC_MBRTOWC): Invoke it. Define MBRTOWC_INVALID_UTF8_BUG if mbrtowc
does not recognize some invalid UTF-8 byte sequences.
* lib/mbrtowc.c (is_locale_utf8, is_locale_utf8_cached): Define also if
MBRTOWC_INVALID_UTF8_BUG.
(rpl_mbrtowc): Handle UTF-8 locales specially also on NetBSD.
* tests/test-mbrtowc.c (main): Add more test cases for the UTF-8
encoding.
* tests/test-mbrlen.c (main): Likewise.
* doc/posix-functions/mbrtowc.texi: Mention the NetBSD bug.
* doc/posix-functions/mbrlen.texi: Likewise.
diff --git a/doc/posix-functions/mbrlen.texi b/doc/posix-functions/mbrlen.texi
index 07ece94fee..acc4b52f0b 100644
--- a/doc/posix-functions/mbrlen.texi
+++ b/doc/posix-functions/mbrlen.texi
@@ -36,6 +36,10 @@
character, on some platforms:
HP-UX 11.11, Solaris 11 2010-11.
@item
+This function returns @code{(size_t) -2} instead of @code{(size_t) -1}
+for some invalid byte sequences on some platforms:
+NetBSD 10.
+@item
This function may not return 0 when parsing the NUL character on some
platforms:
Solaris 9.
@end itemize
diff --git a/doc/posix-functions/mbrtowc.texi b/doc/posix-functions/mbrtowc.texi
index 9666c58d48..b5d155cd2b 100644
--- a/doc/posix-functions/mbrtowc.texi
+++ b/doc/posix-functions/mbrtowc.texi
@@ -42,6 +42,10 @@
character, on some platforms:
HP-UX 11.11, Solaris 11 2010-11, mingw, MSVC 14.
@item
+This function returns @code{(size_t) -2} instead of @code{(size_t) -1}
+for some invalid byte sequences on some platforms:
+NetBSD 10.
+@item
This function may not return 0 when parsing the NUL character on some
platforms:
Solaris 9.
@end itemize
diff --git a/lib/mbrtowc.c b/lib/mbrtowc.c
index f533b554f3..48316a068a 100644
--- a/lib/mbrtowc.c
+++ b/lib/mbrtowc.c
@@ -83,7 +83,7 @@ mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
# include <locale.h>
# endif
-# if (GNULIB_WCHAR_SINGLE_LOCALE && __GLIBC__ >= 2 && !__UCLIBC__)
+# if MBRTOWC_INVALID_UTF8_BUG || (GNULIB_WCHAR_SINGLE_LOCALE && __GLIBC__ >= 2
&& !__UCLIBC__)
/* Returns 1 if the current locale is an UTF-8 locale, 0 otherwise. */
static inline int
@@ -110,7 +110,8 @@ is_locale_utf8_cached (void)
size_t
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
-# if MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG ||
(GNULIB_WCHAR_SINGLE_LOCALE && __GLIBC__ >= 2)
+# if (MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG ||
MBRTOWC_INVALID_UTF8_BUG \
+ || (GNULIB_WCHAR_SINGLE_LOCALE && __GLIBC__ >= 2))
if (s == NULL)
{
pwc = NULL;
@@ -119,24 +120,26 @@ rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n,
mbstate_t *ps)
}
# endif
-# if (MBRTOC32_EMPTY_INPUT_BUG || _GL_SMALL_WCHAR_T \
+# if (MBRTOC32_EMPTY_INPUT_BUG || MBRTOWC_INVALID_UTF8_BUG ||
_GL_SMALL_WCHAR_T \
|| (GNULIB_WCHAR_SINGLE_LOCALE && __GLIBC__ >= 2 && !__UCLIBC__))
if (n == 0)
return (size_t) -2;
# endif
-# if (GNULIB_WCHAR_SINGLE_LOCALE && __GLIBC__ >= 2 && !__UCLIBC__)
+# if MBRTOWC_INVALID_UTF8_BUG || (GNULIB_WCHAR_SINGLE_LOCALE && __GLIBC__ >= 2
&& !__UCLIBC__)
/* Optimize the frequent case of an UTF-8 locale.
Since here we are in the !GNULIB_defined_mbstate_t case, i.e. we use
the system's mbstate_t type and have to provide interoperability with
the system's mbsinit() function, this requires knowledge about how the
system's UTF-8 mbrtowc() function stores the state. This knowledge is
- platform-specific. For simplicity, we handle only glibc systems. */
+ platform-specific. For simplicity, we handle only glibc and NetBSD
+ systems. */
if (is_locale_utf8_cached ())
{
static mbstate_t internal_state;
if (ps == NULL)
ps = &internal_state;
+ #if __GLIBC__ >= 2
/* Structure of mbstate_t =
{ int __count; union { wint_t __wch; char __wchb[4]; } __value; }
(see glibc/iconv/gconv_simple.c function utf8_internal_loop):
@@ -145,10 +148,25 @@ rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n,
mbstate_t *ps)
entire byte sequence.
__value.__wch is the already inferrable bits of the character, of
the form (x << (r*6)) when r bytes are still expected. */
+ #endif
+ #ifdef __NetBSD__
+ /* Structure of mbstate_t =
+ union { int64_t __mbstateL; char __mbstate8[128]; }
+ (see src/lib/libc/citrus/modules/citrus_utf8.c):
+ { void *header; char ch[6]; int chlen; },
+ i.e. ch[0..5] is __mbstate8[sizeof(void*)+0..sizeof(void*)+5],
+ chlen is __mbstate8[sizeof(void*)+8..sizeof(void*)+11]. */
+ #endif
/* Here n > 0. */
- size_t nstate = ps->__count & 7;
+ size_t nstate;
+ #if __GLIBC__ >= 2
+ nstate = ps->__count & 7;
+ #endif
+ #ifdef __NetBSD__
+ nstate = *(int *) &ps->__mbstate8[sizeof (void *) + 8];
+ #endif
char buf[4];
const char *p;
size_t m;
@@ -160,6 +178,7 @@ rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n,
mbstate_t *ps)
}
else
{
+ #if __GLIBC__ >= 2
size_t t = ps->__count >> 8; /* total expected number of bytes */
if (t > nstate && t <= 4)
{
@@ -181,6 +200,18 @@ rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n,
mbstate_t *ps)
errno = EINVAL;
return (size_t)(-1);
}
+ #endif
+ #ifdef __NetBSD__
+ buf[0] = ps->__mbstate8[sizeof (void *) + 0];
+ if (nstate >= 2)
+ {
+ buf[1] = ps->__mbstate8[sizeof (void *) + 1];
+ if (nstate >= 3)
+ {
+ buf[2] = ps->__mbstate8[sizeof (void *) + 2];
+ }
+ }
+ #endif
p = buf;
m = nstate;
buf[m++] = s[0];
@@ -206,12 +237,18 @@ rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n,
mbstate_t *ps)
if (nstate >= (res > 0 ? res : 1))
abort ();
res -= nstate;
+ #if __GLIBC__ >= 2
ps->__count = 0;
+ #endif
+ #ifdef __NetBSD__
+ *(int *) &ps->__mbstate8[sizeof (void *) + 8] = 0;
+ #endif
return res;
incomplete:
/* Here 0 < m < 4. */
{
+ #if __GLIBC__ >= 2
unsigned char c = (unsigned char) p[0];
if (c < 0xE0)
{
@@ -233,6 +270,19 @@ rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n,
mbstate_t *ps)
| (m > 1 ? ((unsigned char) p[1] & 0x3F) << 12 : 0)
| (m > 2 ? ((unsigned char) p[2] & 0x3F) << 6 : 0);
}
+ #endif
+ #ifdef __NetBSD__
+ *(int *) &ps->__mbstate8[sizeof (void *) + 8] = m;
+ ps->__mbstate8[sizeof (void *) + 0] = p[0];
+ if (m > 1)
+ {
+ ps->__mbstate8[sizeof (void *) + 1] = p[1];
+ if (m > 2)
+ {
+ ps->__mbstate8[sizeof (void *) + 2] = p[2];
+ }
+ }
+ #endif
}
return (size_t)(-2);
diff --git a/m4/mbrtowc.m4 b/m4/mbrtowc.m4
index 381b22dd21..fdc05da3a1 100644
--- a/m4/mbrtowc.m4
+++ b/m4/mbrtowc.m4
@@ -1,5 +1,5 @@
# mbrtowc.m4
-# serial 49
+# serial 50
dnl Copyright (C) 2001-2002, 2004-2005, 2008-2026 Free Software Foundation,
dnl Inc.
dnl This file is free software; the Free Software Foundation
@@ -38,6 +38,7 @@ AC_DEFUN([gl_FUNC_MBRTOWC]
gl_MBRTOWC_STORES_INCOMPLETE
gl_MBRTOWC_EMPTY_INPUT
gl_MBRTOWC_C_LOCALE
+ gl_MBRTOWC_INVALID_UTF8
case "$gl_cv_func_mbrtowc_null_arg1" in
*yes) ;;
*) AC_DEFINE([MBRTOWC_NULL_ARG1_BUG], [1],
@@ -81,6 +82,13 @@ AC_DEFUN([gl_FUNC_MBRTOWC]
REPLACE_MBRTOWC=1
;;
esac
+ case "$gl_cv_func_mbrtowc_invalid_UTF8" in
+ *yes) ;;
+ *) AC_DEFINE([MBRTOWC_INVALID_UTF8_BUG], [1],
+ [Define if the mbrtowc function does not recognize some invalid
UTF-8 byte sequences.])
+ REPLACE_MBRTOWC=1
+ ;;
+ esac
fi
fi
if test $REPLACE_MBSTATE_T = 1; then
@@ -700,6 +708,65 @@ AC_DEFUN([gl_MBRTOWC_C_LOCALE]
])
])
+dnl Test whether mbrtowc recognizes invalid UTF-8 byte sequences.
+
+AC_DEFUN([gl_MBRTOWC_INVALID_UTF8],
+[
+ AC_REQUIRE([gt_LOCALE_EN_UTF8])
+ AC_REQUIRE([AC_CANONICAL_HOST]) dnl for cross-compiles
+ AC_CACHE_CHECK([whether mbrtowc recognizes invalid UTF-8],
+ [gl_cv_func_mbrtowc_invalid_UTF8],
+ [
+ dnl Initial guess, used when cross-compiling or when no suitable locale
+ dnl is present.
+changequote(,)dnl
+ case "$host_os" in
+ # Guess no on NetBSD.
+ netbsd*) gl_cv_func_mbrtowc_invalid_UTF8="guessing no" ;;
+ # Guess yes otherwise.
+ *) gl_cv_func_mbrtowc_invalid_UTF8="guessing yes" ;;
+ esac
+changequote([,])dnl
+ if test "$LOCALE_EN_UTF8" != none; then
+ AC_RUN_IFELSE(
+ [AC_LANG_SOURCE([[
+#include <locale.h>
+#include <string.h>
+#include <wchar.h>
+int main ()
+{
+ if (setlocale (LC_ALL, "$LOCALE_EN_UTF8") != NULL)
+ {
+ int result = 0;
+ /* This test fails on NetBSD 10. */
+ {
+ mbstate_t state;
+ wchar_t wc;
+
+ memset (&state, '\0', sizeof (mbstate_t));
+ if (mbrtowc (&wc, "\340x", 2, &state) != (size_t)(-1))
+ result |= 1;
+ }
+ /* This test fails on NetBSD 10. */
+ {
+ mbstate_t state;
+ wchar_t wc;
+
+ memset (&state, '\0', sizeof (mbstate_t));
+ if (mbrtowc (&wc, "\360x\360", 3, &state) != (size_t)(-1))
+ result |= 2;
+ }
+ return result;
+ }
+ return 0;
+}]])],
+ [gl_cv_func_mbrtowc_invalid_UTF8=yes],
+ [gl_cv_func_mbrtowc_invalid_UTF8=no],
+ [:])
+ fi
+ ])
+])
+
# Prerequisites of lib/mbrtowc.c and lib/lc-charset-dispatch.c.
AC_DEFUN([gl_PREREQ_MBRTOWC], [
AC_REQUIRE([AC_C_INLINE])
diff --git a/tests/test-mbrlen.c b/tests/test-mbrlen.c
index 77e0f0ea35..b38173ded6 100644
--- a/tests/test-mbrlen.c
+++ b/tests/test-mbrlen.c
@@ -209,6 +209,17 @@ main (int argc, char *argv[])
ASSERT (ret == 1);
ASSERT (mbsinit (&state));
}
+ /* Test recognition of invalid byte sequences. */
+ {
+ memset (&state, 0, sizeof (mbstate_t));
+ ret = mbrlen ("\340x", 2, &state);
+ ASSERT (ret == (size_t)(-1));
+ }
+ {
+ memset (&state, 0, sizeof (mbstate_t));
+ ret = mbrlen ("\360x\360", 3, &state);
+ ASSERT (ret == (size_t)(-1));
+ }
return test_exit_status;
case '4':
diff --git a/tests/test-mbrtowc.c b/tests/test-mbrtowc.c
index 3b10e9daed..a5d0741ef2 100644
--- a/tests/test-mbrtowc.c
+++ b/tests/test-mbrtowc.c
@@ -297,6 +297,19 @@ main (int argc, char *argv[])
ASSERT (wctob (wc) == EOF);
ASSERT (mbsinit (&state));
}
+ /* Test recognition of invalid byte sequences. */
+ {
+ memset (&state, 0, sizeof (mbstate_t));
+ wc = (wchar_t) {0xBADFACE};
+ ret = mbrtowc (&wc, "\340x", 2, &state);
+ ASSERT (ret == (size_t)(-1));
+ }
+ {
+ memset (&state, 0, sizeof (mbstate_t));
+ wc = (wchar_t) {0xBADFACE};
+ ret = mbrtowc (&wc, "\360x\360", 3, &state);
+ ASSERT (ret == (size_t)(-1));
+ }
return test_exit_status;
case '4':