Module Name: src
Committed By: riastradh
Date: Sun Aug 18 02:19:35 UTC 2024
Modified Files:
src/lib/libc/locale: c16rtomb.3 c16rtomb.c c32rtomb.3 c8rtomb.3
c8rtomb.c
src/tests/lib/libc/locale: t_c16rtomb.c t_c8rtomb.c
Log Message:
c8rtomb(3), c16rtomb(3): Fix NUL handling.
PR lib/58615: incomplete c8rtomb, c16rtomb handles NUL termination
wrong
To generate a diff of this commit:
cvs rdiff -u -r1.7 -r1.8 src/lib/libc/locale/c16rtomb.3 \
src/lib/libc/locale/c32rtomb.3
cvs rdiff -u -r1.5 -r1.6 src/lib/libc/locale/c16rtomb.c \
src/lib/libc/locale/c8rtomb.3
cvs rdiff -u -r1.4 -r1.5 src/lib/libc/locale/c8rtomb.c
cvs rdiff -u -r1.2 -r1.3 src/tests/lib/libc/locale/t_c16rtomb.c \
src/tests/lib/libc/locale/t_c8rtomb.c
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:
Index: src/lib/libc/locale/c16rtomb.3
diff -u src/lib/libc/locale/c16rtomb.3:1.7 src/lib/libc/locale/c16rtomb.3:1.8
--- src/lib/libc/locale/c16rtomb.3:1.7 Sat Aug 17 01:52:51 2024
+++ src/lib/libc/locale/c16rtomb.3 Sun Aug 18 02:19:35 2024
@@ -1,4 +1,4 @@
-.\" $NetBSD: c16rtomb.3,v 1.7 2024/08/17 01:52:51 riastradh Exp $
+.\" $NetBSD: c16rtomb.3,v 1.8 2024/08/18 02:19:35 riastradh Exp $
.\"
.\" Copyright (c) 2024 The NetBSD Foundation, Inc.
.\" All rights reserved.
@@ -75,11 +75,13 @@ with the same state
.Fa ps ,
the sequence of
.Fa c16
-values must be a well-formed UTF-16 code unit sequence.
+values must be a well-formed UTF-16 code unit sequence, or an
+incomplete UTF-16 code unit sequence followed by null.
If
.Fa c16 ,
when appended to the sequence of code units passed in previous calls,
-does not form a well-formed UTF-16 code unit sequence, then
+is not null and does not form a well-formed UTF-16 code unit sequence,
+then
.Nm
returns
.Li (size_t)-1
@@ -95,6 +97,17 @@ is a null pointer, no output is stored,
and the return value are unchanged.
.Pp
If
+.Fa c16
+is null,
+.Nm
+discards any pending incomplete UTF-16 code unit sequence in
+.Fa ps ,
+outputs a (possibly empty) shift sequence to restore the initial state
+followed by a null byte, and resets
+.Fa ps
+to the initial conversion state.
+.Pp
+If
.Fa ps
is a null pointer,
.Nm
@@ -128,10 +141,11 @@ on failure.
.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
.Sh EXAMPLES
Convert a UTF-16 code unit sequence to a multibyte string,
-NUL-terminate it, and print it:
+NUL-terminate it (with any shift sequence needed to restore the initial
+state), and print it:
.Bd -literal -offset indent
char16_t c16[] = { 0xd83d, 0xdca9 };
-char buf[__arraycount(c16)*MB_CUR_MAX + 1], *s = buf;
+char buf[(__arraycount(c16) + 1)*MB_CUR_MAX], *s = buf;
size_t i;
mbstate_t mbs = {0}; /* initial conversion state */
@@ -144,7 +158,10 @@ for (i = 0; i < __arraycount(c16); i++)
assert(len < sizeof(buf) - (s - buf));
s += len;
}
-*s = '\e0'; /* NUL-terminate */
+len = c16rtomb(s, L'\e0', &mbs); /* NUL-terminate */
+if (len == (size_t)-1)
+ err(1, "c16rtomb");
+assert(len <= sizeof(buf) - (s - buf));
printf("%s\en", buf);
.Ed
.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
@@ -198,32 +215,20 @@ function first appeared in
.Nx 11.0 .
.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
.Sh BUGS
-It is not clear from the standard how
-.Nm
-is supposed to behave when given a high surrogate code point followed
-by a NUL:
-.Bd -literal -offset indent
-c16rtomb(s, 0xd800, ps);
-c16rtomb(s, L'\e0', ps);
-.Ed
-.Pp
-Currently this fails with
-.Er EILSEQ
-which matches other implementations, but this is at odds with language
-in the standard which suggests that passing
-.Li L'\e0'
-should unconditionally store a null byte and reset
-.Fa ps
-to the initial conversion state:
+The standard requires that a null code unit unconditionally reset the
+conversion state and output null:
.Bd -filled -offset indent
If
-.Fa c16
-is a null wide character, a null byte is stored, preceded by any shift
+.Fa c8
+is a null character, a null byte is stored, preceded by any shift
sequence needed to restore the initial shift state; the resulting state
described is the initial conversion state.
.Ed
.Pp
-However, it is unclear what else this should store besides a null
-byte.
-Should it discard the pending high surrogate, or convert it to
-something else and store that?
+However, some implementations such as
+.Fx 14.0 ,
+.Ox 7.4 ,
+and glibc 2.36 ignore this clause and, if the null was preceded by an
+incomplete UTF-16 code unit sequence, fail with
+.Er EILSEQ
+instead.
Index: src/lib/libc/locale/c32rtomb.3
diff -u src/lib/libc/locale/c32rtomb.3:1.7 src/lib/libc/locale/c32rtomb.3:1.8
--- src/lib/libc/locale/c32rtomb.3:1.7 Sat Aug 17 01:52:51 2024
+++ src/lib/libc/locale/c32rtomb.3 Sun Aug 18 02:19:35 2024
@@ -1,4 +1,4 @@
-.\" $NetBSD: c32rtomb.3,v 1.7 2024/08/17 01:52:51 riastradh Exp $
+.\" $NetBSD: c32rtomb.3,v 1.8 2024/08/18 02:19:35 riastradh Exp $
.\"
.\" Copyright (c) 2024 The NetBSD Foundation, Inc.
.\" All rights reserved.
@@ -82,6 +82,15 @@ is a null pointer, no output is stored,
and the return value are unchanged.
.Pp
If
+.Fa c32
+is null,
+.Nm
+outputs a (possibly empty) shift sequence to restore the initial state
+followed by a null byte and resets
+.Fa ps
+to the initial conversion state.
+.Pp
+If
.Fa ps
is a null pointer,
.Nm
@@ -115,10 +124,11 @@ on failure.
.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
.Sh EXAMPLES
Convert a sequence of Unicode scalar values to a multibyte sequence,
-NUL-terminate it, and print it:
+NUL-terminate it (with any shift sequence needed to restore the initial
+state), and print it:
.Bd -literal -offset indent
char32_t c32[] = { 0x1f4a9, 0x20ac, 0x21 };
-char buf[__arraycountb(c32)*MB_CUR_MAX + 1], *s = buf;
+char buf[(__arraycount(c32) + 1)*MB_CUR_MAX], *s = buf;
size_t i;
mbstate_t mbs = {0}; /* initial conversion state */
@@ -131,7 +141,10 @@ for (i = 0; i < __arraycount(c32); i++)
assert(len < sizeof(buf) - (s - buf));
s += len;
}
-*s = '\e0'; /* NUL-terminate */
+len = c32rtomb(s, L'\e0', &mbs); /* NUL-terminate */
+if (len == (size_t)-1)
+ err(1, "c32rtomb");
+assert(len <= sizeof(buf) - (s - buf));
printf("%s\en", buf);
.Ed
.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
Index: src/lib/libc/locale/c16rtomb.c
diff -u src/lib/libc/locale/c16rtomb.c:1.5 src/lib/libc/locale/c16rtomb.c:1.6
--- src/lib/libc/locale/c16rtomb.c:1.5 Sat Aug 17 21:24:53 2024
+++ src/lib/libc/locale/c16rtomb.c Sun Aug 18 02:19:35 2024
@@ -1,4 +1,4 @@
-/* $NetBSD: c16rtomb.c,v 1.5 2024/08/17 21:24:53 riastradh Exp $ */
+/* $NetBSD: c16rtomb.c,v 1.6 2024/08/18 02:19:35 riastradh Exp $ */
/*-
* Copyright (c) 2024 The NetBSD Foundation, Inc.
@@ -66,7 +66,7 @@
*/
#include <sys/cdefs.h>
-__RCSID("$NetBSD: c16rtomb.c,v 1.5 2024/08/17 21:24:53 riastradh Exp $");
+__RCSID("$NetBSD: c16rtomb.c,v 1.6 2024/08/18 02:19:35 riastradh Exp $");
#include "namespace.h"
@@ -140,37 +140,37 @@ c16rtomb_l(char *restrict s, char16_t c1
*/
S = (struct c16rtombstate *)(void *)ps;
-#if 0
/*
- * `If c16 is a null wide character, a null byte is stored,
- * preceded by any shift sequence needed to restore the
- * initial shift state; the resulting state described is the
- * initial conversion state.'
+ * Handle several cases:
*
- * XXX But what else gets stored? Do we just discard any
- * pending high surrogate, or do we convert it to something
- * else, or what?
- */
- if (c16 == L'\0') {
+ * 1. c16 is null.
+ * 2. Pending high surrogate.
+ * 3. No pending high surrogate and c16 is a high surrogate.
+ * 4. No pending high surrogate and c16 is a low surrogate.
+ * 5. No pending high surrogate and c16 is a BMP scalar value.
+ */
+ if (c16 == L'\0') { /* 1. null */
+ /*
+ * `If c16 is a null wide character, a null byte is
+ * stored, preceded by any shift sequence needed to
+ * restore the initial shift state; the resulting
+ * state described is the initial conversion state.'
+ *
+ * So if c16 is null, discard any pending high
+ * surrogate -- there's nothing we can legitimately do
+ * with it -- and convert a null scalar value, which by
+ * definition of c32rtomb writes out any shift sequence
+ * reset followed by a null byte.
+ */
S->surrogate = 0;
- }
-#endif
-
- /*
- * Check whether:
- *
- * 1. We had previously decoded a high surrogate.
- * => Decode the low surrogate -- reject if it's not a low
- * surrogate -- and combine them to output a scalar
- * value; clear the high surrogate for next time.
- * 2. This is a high surrogate.
- * => Save it and wait for the low surrogate with no output.
- * 3. This is a low surrogate.
- * => Reject.
- * 4. This is not a surrogate.
- * => Output a scalar value.
- */
- if (S->surrogate != 0) { /* 1. pending surrogate pair */
+ c32 = 0;
+ } else if (S->surrogate != 0) { /* 2. pending high surrogate */
+ /*
+ * If the previous code unit was a high surrogate, the
+ * next code unit must be a low surrogate. Reject it
+ * if not; otherwise clear the high surrogate for next
+ * time and combine them to output a scalar value.
+ */
if (c16 < 0xdc00 || c16 > 0xdfff) {
errno = EILSEQ;
return (size_t)-1;
@@ -182,13 +182,27 @@ c16rtomb_l(char *restrict s, char16_t c1
__SHIFTIN(__SHIFTOUT(w2, __BITS(9,0)), __BITS(9,0)));
c32 += 0x10000;
S->surrogate = 0;
- } else if (c16 >= 0xd800 && c16 <= 0xdbff) { /* 2. high surrogate */
+ } else if (c16 >= 0xd800 && c16 <= 0xdbff) { /* 3. high surrogate */
+ /*
+ * No pending high surrogate and this code unit is a
+ * high surrogate. Save it for next time, and output
+ * nothing -- we don't yet know what the next scalar
+ * value will be until we receive the low surrogate.
+ */
S->surrogate = c16;
return 0; /* produced nothing */
- } else if (c16 >= 0xdc00 && c16 <= 0xdfff) { /* 3. low surrogate */
+ } else if (c16 >= 0xdc00 && c16 <= 0xdfff) { /* 4. low surrogate */
+ /*
+ * No pending high surrogate and this code unit is a
+ * low surrogate. That's invalid; fail with EILSEQ.
+ */
errno = EILSEQ;
return (size_t)-1;
- } else { /* 4. not a surrogate */
+ } else { /* 5. not a surrogate */
+ /*
+ * Code unit is a scalar value in the BMP. Just output
+ * it as is.
+ */
c32 = c16;
}
Index: src/lib/libc/locale/c8rtomb.3
diff -u src/lib/libc/locale/c8rtomb.3:1.5 src/lib/libc/locale/c8rtomb.3:1.6
--- src/lib/libc/locale/c8rtomb.3:1.5 Sat Aug 17 00:32:19 2024
+++ src/lib/libc/locale/c8rtomb.3 Sun Aug 18 02:19:35 2024
@@ -1,4 +1,4 @@
-.\" $NetBSD: c8rtomb.3,v 1.5 2024/08/17 00:32:19 riastradh Exp $
+.\" $NetBSD: c8rtomb.3,v 1.6 2024/08/18 02:19:35 riastradh Exp $
.\"
.\" Copyright (c) 2024 The NetBSD Foundation, Inc.
.\" All rights reserved.
@@ -75,11 +75,13 @@ with the same state
.Fa ps ,
the sequence of
.Fa c8
-values must be a well-formed UTF-8 code unit sequence.
+values must be a well-formed UTF-8 code unit sequence, or an
+incomplete UTF-8 code unit sequence followed by null.
If
.Fa c8 ,
when appended to the sequence of code units passed in previous calls,
-does not form a well-formed UTF-8 code unit sequence, then
+is not null and does not form a well-formed UTF-8 code unit sequence,
+then
.Nm
returns
.Li (size_t)-1
@@ -95,6 +97,17 @@ is a null pointer, no output is stored,
and the return value are unchanged.
.Pp
If
+.Fa c8
+is null,
+.Nm
+discards any pending incomplete UTF-8 code unit sequence in
+.Fa ps ,
+outputs a (possibly empty) shift sequence to restore the initial state
+followed by a null byte, and resets
+.Fa ps
+to the initial conversion state.
+.Pp
+If
.Fa ps
is a null pointer,
.Nm
@@ -128,10 +141,11 @@ on failure.
.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
.Sh EXAMPLES
Convert a UTF-8 code unit sequence to a multibyte string,
-NUL-terminate it, and print it:
+NUL-terminate it (with any shift sequence needed to restore the initial
+state), and print it:
.Bd -literal -offset indent
char8_t c8[] = { 0xf0, 0x9f, 0x92, 0xa9 };
-char buf[__arraycount(c8)*MB_CUR_MAX + 1], *s = buf;
+char buf[(__arraycount(c8) + 1)*MB_CUR_MAX], *s = buf;
size_t i;
mbstate_t mbs = {0}; /* initial conversion state */
@@ -144,7 +158,10 @@ for (i = 0; i < __arraycount(c8); i++) {
assert(len < sizeof(buf) - (s - buf));
s += len;
}
-*s = '\e0'; /* NUL-terminate */
+len = c8rtomb(s, '\e0', &mbs); /* NUL-terminate */
+if (len == (size_t)-1)
+ err(1, "c16rtomb");
+assert(len <= sizeof(buf) - (s - buf));
printf("%s\en", buf);
.Ed
.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
@@ -197,26 +214,9 @@ The
function first appeared in
.Nx 11.0 .
.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
-.Sh BUGS
-It is not clear from the standard how
-.Nm
-is supposed to behave when given an incomplete UTF-8 code unit sequence
-followed by a NUL:
-.Bd -literal -offset indent
-c8rtomb(s, 0xf0, ps);
-c8rtomb(s, 0x9f, ps);
-c8rtomb(s, 0x92, ps);
-c8rtomb(s, '\e0', ps);
-.Ed
-.Pp
-Currently this fails with
-.Er EILSEQ
-which matches other implementations, but this is at odds with language
-in the standard which suggests that passing
-.Li '\e0'
-should unconditionally store a null byte and reset
-.Fa ps
-to the initial conversion state:
+.Sh CAVEATS
+The standard requires that a null code unit unconditionally reset the
+conversion state and output null:
.Bd -filled -offset indent
If
.Fa c8
@@ -225,7 +225,8 @@ sequence needed to restore the initial s
described is the initial conversion state.
.Ed
.Pp
-However, it is unclear what else this should store besides a null
-byte.
-Should it discard the pending UTF-8 code unit sequence, or convert it
-to something else and store that?
+However, some implementations such as glibc 2.36 ignore this clause
+and, if the null was preceded by an incomplete UTF-8 code unit
+sequence, fail with
+.Er EILSEQ
+instead.
Index: src/lib/libc/locale/c8rtomb.c
diff -u src/lib/libc/locale/c8rtomb.c:1.4 src/lib/libc/locale/c8rtomb.c:1.5
--- src/lib/libc/locale/c8rtomb.c:1.4 Sat Aug 17 21:24:53 2024
+++ src/lib/libc/locale/c8rtomb.c Sun Aug 18 02:19:35 2024
@@ -1,4 +1,4 @@
-/* $NetBSD: c8rtomb.c,v 1.4 2024/08/17 21:24:53 riastradh Exp $ */
+/* $NetBSD: c8rtomb.c,v 1.5 2024/08/18 02:19:35 riastradh Exp $ */
/*-
* Copyright (c) 2024 The NetBSD Foundation, Inc.
@@ -55,7 +55,7 @@
*/
#include <sys/cdefs.h>
-__RCSID("$NetBSD: c8rtomb.c,v 1.4 2024/08/17 21:24:53 riastradh Exp $");
+__RCSID("$NetBSD: c8rtomb.c,v 1.5 2024/08/18 02:19:35 riastradh Exp $");
#include "namespace.h"
@@ -173,22 +173,21 @@ c8rtomb_l(char *restrict s, char8_t c8,
*/
S = (struct c8rtombstate *)(void *)ps;
-#if 0
/*
* `If c8 is a null character, a null byte is stored, preceded
* by any shift sequence needed to restore the initial shift
* state; the resulting state described is the initial
* conversion state.'
*
- * XXX But what else gets stored? Do we just discard any
- * pending sequence, or do we convert it to something else, or
- * what?
- */
- if (c8 == u8'\0') {
- memset(S->buf, 0, sizeof(S->buf));
- S->n = 0;
+ * So if c8 is null, discard any buffered input -- there's
+ * nothing we can legitimately do with it -- and convert a null
+ * scalar value, which by definition of c32rtomb writes out any
+ * shift sequence reset followed by a null byte.
+ */
+ if (c8 == '\0') {
+ c32 = 0;
+ goto accept;
}
-#endif
/*
* Get the current state and buffer.
@@ -218,6 +217,7 @@ c8rtomb_l(char *restrict s, char8_t c8,
__SHIFTIN(c32, __BITS(23,0)));
return 0;
case UTF8_ACCEPT:
+ accept:
/*
* We have a scalar value. Clear the state and output
* the scalar value.
Index: src/tests/lib/libc/locale/t_c16rtomb.c
diff -u src/tests/lib/libc/locale/t_c16rtomb.c:1.2 src/tests/lib/libc/locale/t_c16rtomb.c:1.3
--- src/tests/lib/libc/locale/t_c16rtomb.c:1.2 Sat Aug 17 21:31:22 2024
+++ src/tests/lib/libc/locale/t_c16rtomb.c Sun Aug 18 02:19:35 2024
@@ -1,4 +1,4 @@
-/* $NetBSD: t_c16rtomb.c,v 1.2 2024/08/17 21:31:22 riastradh Exp $ */
+/* $NetBSD: t_c16rtomb.c,v 1.3 2024/08/18 02:19:35 riastradh Exp $ */
/*-
* Copyright (c) 2002 Tim J. Robbins
@@ -33,7 +33,7 @@
*/
#include <sys/cdefs.h>
-__RCSID("$NetBSD: t_c16rtomb.c,v 1.2 2024/08/17 21:31:22 riastradh Exp $");
+__RCSID("$NetBSD: t_c16rtomb.c,v 1.3 2024/08/18 02:19:35 riastradh Exp $");
#include <errno.h>
#include <limits.h>
@@ -108,8 +108,6 @@ ATF_TC_BODY(c16rtomb_c_locale_test, tc)
memset(&s, 0, sizeof(s));
memset(buf, 0xcc, sizeof(buf));
ATF_CHECK_EQ_MSG((n = c16rtomb(buf, 0xd83d, &s)), 0, "n=%zu", n);
- atf_tc_expect_fail("PR lib/58615:"
- " incomplete c8rtomb, c16rtomb handles NUL termination wrong");
ATF_CHECK_EQ_MSG((n = c16rtomb(buf, L'\0', &s)), 1, "n=%zu", n);
ATF_CHECK_MSG(((unsigned char)buf[0] == '\0' &&
(unsigned char)buf[1] == 0xcc),
@@ -189,8 +187,6 @@ ATF_TC_BODY(c16rtomb_utf_8_test, tc)
memset(&s, 0, sizeof(s));
memset(buf, 0xcc, sizeof(buf));
ATF_CHECK_EQ_MSG((n = c16rtomb(buf, 0xd83d, &s)), 0, "n=%zu", n);
- atf_tc_expect_fail("PR lib/58615:"
- " incomplete c8rtomb, c16rtomb handles NUL termination wrong");
ATF_CHECK_EQ_MSG((n = c16rtomb(buf, L'\0', &s)), 1,
"n=%zu", n);
ATF_CHECK_MSG(((unsigned char)buf[0] == '\0' &&
Index: src/tests/lib/libc/locale/t_c8rtomb.c
diff -u src/tests/lib/libc/locale/t_c8rtomb.c:1.2 src/tests/lib/libc/locale/t_c8rtomb.c:1.3
--- src/tests/lib/libc/locale/t_c8rtomb.c:1.2 Sat Aug 17 21:31:22 2024
+++ src/tests/lib/libc/locale/t_c8rtomb.c Sun Aug 18 02:19:35 2024
@@ -1,4 +1,4 @@
-/* $NetBSD: t_c8rtomb.c,v 1.2 2024/08/17 21:31:22 riastradh Exp $ */
+/* $NetBSD: t_c8rtomb.c,v 1.3 2024/08/18 02:19:35 riastradh Exp $ */
/*-
* Copyright (c) 2002 Tim J. Robbins
@@ -33,7 +33,7 @@
*/
#include <sys/cdefs.h>
-__RCSID("$NetBSD: t_c8rtomb.c,v 1.2 2024/08/17 21:31:22 riastradh Exp $");
+__RCSID("$NetBSD: t_c8rtomb.c,v 1.3 2024/08/18 02:19:35 riastradh Exp $");
#include <errno.h>
#include <limits.h>
@@ -117,8 +117,6 @@ ATF_TC_BODY(c8rtomb_c_locale_test, tc)
memset(&s, 0, sizeof(s));
memset(buf, 0xcc, sizeof(buf));
ATF_CHECK_EQ_MSG((n = c8rtomb(buf, 0xf0, &s)), 0, "n=%zu", n);
- atf_tc_expect_fail("PR lib/58615:"
- " incomplete c8rtomb, c16rtomb handles NUL termination wrong");
ATF_CHECK_EQ_MSG((n = c8rtomb(buf, '\0', &s)), 1, "n=%zu", n);
ATF_CHECK_MSG(((unsigned char)buf[0] == '\0' &&
(unsigned char)buf[1] == 0xcc),
@@ -225,8 +223,6 @@ ATF_TC_BODY(c8rtomb_utf_8_test, tc)
memset(&s, 0, sizeof(s));
memset(buf, 0xcc, sizeof(buf));
ATF_CHECK_EQ_MSG((n = c8rtomb(buf, 0xf0, &s)), 0, "n=%zu", n);
- atf_tc_expect_fail("PR lib/58615:"
- " incomplete c8rtomb, c16rtomb handles NUL termination wrong");
ATF_CHECK_EQ_MSG((n = c8rtomb(buf, '\0', &s)), 1, "n=%zu", n);
ATF_CHECK_MSG(((unsigned char)buf[0] == '\0' &&
(unsigned char)buf[1] == 0xcc),