[CCing diffutils-devel.]
Paul Eggert wrote in
<https://lists.gnu.org/archive/html/bug-gnulib/2023-07/msg00021.html>:
> > Level 3: Behave correctly. Don't split a 2-Unicode-character sequence.
> > This is what code that uses mbrtoc32() does, when it has the
> > lines
> >     if (bytes == (size_t) -3)
> >       bytes = 0;
> > and uses !mbsinit (&state) in the loop termination condition.
>
> With diffutils even level 3 would not suffice, since diffutils truncates
> at input byte boundaries, so it doesn't suffice to merely treat (size_t)
> -3 as zero even if one also checks mbsinit. Instead, one would have to
> treat all the characters in the sequence ABBB... (where A is an ordinary
> multibyte character and the Bs all return (size_t) -3) as a single unit,
Yes.
As far as I can see, the proposed patch below handles a (size_t) -3
return value correctly. The "trick" is to put the mbrtoc32 call into a
    do
      {
        ... mbrtoc32 ...
      }
    while (! mbsinit (&state));
loop.
The patch is attached. For better readability, here is its "diff -w" output:
diff --git a/src/side.c b/src/side.c
index 8404c3a..d5149de 100644
--- a/src/side.c
+++ b/src/side.c
@@ -136,34 +136,47 @@ print_half_line (char const *const *line, intmax_t indent, intmax_t out_bound)
break;
default:
+ /* Invariant: mbstate is in the initial state here. */
+ do
{
char32_t wc;
size_t bytes = mbrtoc32 (&wc, tp0, text_limit - tp0, &mbstate);
- if (bytes <= MB_LEN_MAX)
+ if (bytes == (size_t) -1 || bytes == (size_t) -2)
+ {
+ /* An encoding error (bytes == (size_t) -1), as
+ (size_t) -2 cannot happen as the buffer ends in '\n'. */
+ if (tp0 < text_limit)
+ {
+ /* Consume one byte. Assume it has print width 1. */
+ if (ckd_add (&in_position, in_position, 1))
+ return out_position;
+ if (in_position <= out_bound)
+ putc (*tp0, out);
+ tp0++;
+ }
+ memset (&mbstate, '\0', sizeof mbstate);
+ break;
+ }
+ else
{
int width = c32width (wc);
if (0 < width && ckd_add (&in_position, in_position, width))
return out_position;
+ if (bytes == (size_t) -3)
+ bytes = 0;
if (in_position <= out_bound)
{
out_position = in_position;
fwrite (tp0, 1, bytes, out);
}
- text_pointer = tp0 + bytes;
-
- /* Resume scanning for single-byte characters, as
- shift states are not supported. */
- break;
+ tp0 += bytes;
}
}
-
- /* An encoding error (bytes == (size_t) -1),
- as (size_t) -2 cannot happen as the buffer ends in '\n',
- and (size_t) -3 cannot happen on any known platform.
- Reset, and assume the error has print width 1. */
- memset (&mbstate, 0, sizeof mbstate);
- FALLTHROUGH;
+ while (! mbsinit (&mbstate));
+ /* Invariant: mbstate is in the initial state here again. */
+ text_pointer = tp0;
+ break;
/* Print width 1. */
case ' ': case '!': case '"': case '#': case '%':
From 9a7f1dc16cc7696a8a3ddbd09b33106cdb77d2a5 Mon Sep 17 00:00:00 2001
From: Bruno Haible <[email protected]>
Date: Tue, 4 Jul 2023 21:24:59 +0200
Subject: [PATCH] diff: Improve handling of mbrtoc32 result
* src/side.c (print_half_line): When mbrtoc32 has left the mbstate not
in the initial state, continue calling mbrtoc32.
---
src/side.c | 67 ++++++++++++++++++++++++++++++++----------------------
1 file changed, 40 insertions(+), 27 deletions(-)
diff --git a/src/side.c b/src/side.c
index 8404c3a..d5149de 100644
--- a/src/side.c
+++ b/src/side.c
@@ -136,34 +136,47 @@ print_half_line (char const *const *line, intmax_t indent, intmax_t out_bound)
break;
default:
- {
- char32_t wc;
- size_t bytes = mbrtoc32 (&wc, tp0, text_limit - tp0, &mbstate);
-
- if (bytes <= MB_LEN_MAX)
- {
- int width = c32width (wc);
- if (0 < width && ckd_add (&in_position, in_position, width))
- return out_position;
- if (in_position <= out_bound)
- {
- out_position = in_position;
- fwrite (tp0, 1, bytes, out);
- }
- text_pointer = tp0 + bytes;
-
- /* Resume scanning for single-byte characters, as
- shift states are not supported. */
- break;
- }
- }
+ /* Invariant: mbstate is in the initial state here. */
+ do
+ {
+ char32_t wc;
+ size_t bytes = mbrtoc32 (&wc, tp0, text_limit - tp0, &mbstate);
- /* An encoding error (bytes == (size_t) -1),
- as (size_t) -2 cannot happen as the buffer ends in '\n',
- and (size_t) -3 cannot happen on any known platform.
- Reset, and assume the error has print width 1. */
- memset (&mbstate, 0, sizeof mbstate);
- FALLTHROUGH;
+ if (bytes == (size_t) -1 || bytes == (size_t) -2)
+ {
+ /* An encoding error (bytes == (size_t) -1), as
+ (size_t) -2 cannot happen as the buffer ends in '\n'. */
+ if (tp0 < text_limit)
+ {
+ /* Consume one byte. Assume it has print width 1. */
+ if (ckd_add (&in_position, in_position, 1))
+ return out_position;
+ if (in_position <= out_bound)
+ putc (*tp0, out);
+ tp0++;
+ }
+ memset (&mbstate, '\0', sizeof mbstate);
+ break;
+ }
+ else
+ {
+ int width = c32width (wc);
+ if (0 < width && ckd_add (&in_position, in_position, width))
+ return out_position;
+ if (bytes == (size_t) -3)
+ bytes = 0;
+ if (in_position <= out_bound)
+ {
+ out_position = in_position;
+ fwrite (tp0, 1, bytes, out);
+ }
+ tp0 += bytes;
+ }
+ }
+ while (! mbsinit (&mbstate));
+ /* Invariant: mbstate is in the initial state here again. */
+ text_pointer = tp0;
+ break;
/* Print width 1. */
case ' ': case '!': case '"': case '#': case '%':
--
2.34.1