On 2023-07-27 12:19, Paul Eggert wrote:
--- a/lib/mbcel.h
+++ b/lib/mbcel.h
@@ -191,3 +191,3 @@ mbcel_scan (char const *p, char const *lim)
if (_GL_UNLIKELY ((size_t) -1 / 2 < len))
- return (mbcel_t) { .err = *p, .len = 1 };
+ return (mbcel_t) { .err = *p, .len = len == (size_t) -2 ? lim - p : 1 };
Come to think of it, this would merely make mbcel compatible with
mbu?iterf?, by causing mbcel to return a length greater than 1 given an
incomplete character at input end. But even with this change, mbcel
would still not implement the multi-byte-per-encoding-error
interpretation ("MEE") behavior that Kuhn and/or the Unicode standard
describe. This is because mbu?iterf? doesn't implement MEE either.
For MEE, mbiterf would need something like the attached untested patch,
and mbiter, mbcel, etc. would all need similar patches. I'm not
suggesting that we make this change, though, as it would bloat the code
for little benefit to many callers.
It would be better to change mbu?iterf? to use
single-byte-per-encoding-error ("SEE") behavior, as this is simpler and
is more consistent with how Emacs etc. behave. Any programs that need
MEE can implement it themselves, or if the need is common enough we
could add a Gnulib API that an app can use to support MEE when
mbiter/mbcel etc. indicate an encoding error.

diff --git a/lib/mbiterf.h b/lib/mbiterf.h
index dea6aaef58..bd202e9353 100644
--- a/lib/mbiterf.h
+++ b/lib/mbiterf.h
@@ -129,6 +129,7 @@ mbiterf_next (struct mbif_state *ps, const char *iter, const char *endptr)
#if !GNULIB_MBRTOC32_REGULAR
ps->in_shift = true;
with_shift:;
+ mbstate_t prev_state = ps->state;
#endif
size_t bytes;
char32_t wc;
@@ -136,12 +137,27 @@ mbiterf_next (struct mbif_state *ps, const char *iter, const char *endptr)
if (bytes == (size_t) -1)
{
/* An invalid multibyte sequence was encountered. */
+
+ /* Find the length of the smallest invalid prefix of the input,
+ so that the caller can if desired replace it with
+ a single replacement character. */
+ for (bytes = 1; bytes < endptr - iter; bytes++)
+ {
+ #if GNULIB_MBRTOC32_REGULAR
+ mbszero (&ps->state);
+ #else
+ ps->state = prev_state;
+ #endif
+ if (mbrtoc32 (&wc, iter, bytes, &ps->state) == (size_t) -1)
+ break;
+ }
+
/* Allow the next invocation to continue from a sane state. */
#if !GNULIB_MBRTOC32_REGULAR
ps->in_shift = false;
#endif
mbszero (&ps->state);
- return (mbchar_t) { .ptr = iter, .bytes = 1, .wc_valid = false };
+ return (mbchar_t) { .ptr = iter, .bytes = bytes, .wc_valid = false };
}
else if (bytes == (size_t) -2)
{