ansic_quote calls mbrtowc with a NULL parser state argument. This uses a
static anonymous shift state that ends up in an undefined state after an
invalid sequence. AFAICT most libcs handle UTF-8 conversions with the
static state just fine, but on Android [1] the bionic libc implementation
will, in fact, produce incorrect results if the state is reused after an
invalid sequence:
$ LC_ALL=C printf '%q\n' $'\200\321\215'
$'\200\321�'
I couldn't find a clean place in the existing code to add the parser
state reset, so I reorganized some of the surrounding code for this patch.
[1] Termux by default disables the C locale. The behavior above is only
visible in Termux once the Termux setlocale override is removed.
---
lib/sh/strtrans.c | 67 ++++++++++++++++++++++-------------------------
1 file changed, 31 insertions(+), 36 deletions(-)
diff --git a/lib/sh/strtrans.c b/lib/sh/strtrans.c
index af75dcfa..f204cb79 100644
--- a/lib/sh/strtrans.c
+++ b/lib/sh/strtrans.c
@@ -227,30 +227,24 @@ ansic_quote (const char *str, int flags, int *rlen)
{
char *r, *ret;
const char *s;
- size_t l, rsize;
unsigned char c;
+#if defined (HANDLE_MULTIBYTE)
size_t clen;
int b;
-#if defined (HANDLE_MULTIBYTE)
wchar_t wc;
+ DECLARE_MBSTATE;
#endif
if (str == 0 || *str == 0)
return ((char *)0);
- l = strlen (str);
- rsize = 4 * l + 4;
- r = ret = (char *)xmalloc (rsize);
+ r = ret = (char *)xmalloc (4 * strlen (str) + 4);
*r++ = '$';
*r++ = '\'';
for (s = str; c = *s; s++)
{
- b = 1; /* 1 == add backslash; 0 == no backslash */
- l = 1;
- clen = 1;
-
switch (c)
{
case ESC: c = 'E'; break;
@@ -266,37 +260,38 @@ ansic_quote (const char *str, int flags, int *rlen)
break;
default:
#if defined (HANDLE_MULTIBYTE)
- b = is_basic (c);
- /* XXX - clen comparison to 0 is dicey */
- if ((b == 0 && ((clen = mbrtowc (&wc, s, MB_CUR_MAX, 0)) < 0 ||
MB_INVALIDCH (clen) || iswprint (wc) == 0)) ||
- (b == 1 && ISPRINT (c) == 0))
-#else
- if (ISPRINT (c) == 0)
-#endif
+ if (is_basic (c) == 0)
{
- *r++ = '\\';
- *r++ = TOCHAR ((c >> 6) & 07);
- *r++ = TOCHAR ((c >> 3) & 07);
- *r++ = TOCHAR (c & 07);
- continue;
+ clen = mbrtowc (&wc, s, MB_CUR_MAX, &state);
+ if (clen == 0)
+ break;
+ if (MB_INVALIDCH (clen))
+ memset (&state, 0, sizeof (mbstate_t));
+ else if (iswprint (wc))
+ {
+ for (b = 0; b < (int)clen; b++)
+ *r++ = (unsigned char)s[b];
+ s += clen - 1; /* -1 because of the increment above */
+ continue;
+ }
}
- l = 0;
- break;
+ else
+#endif
+ if (ISPRINT (c))
+ {
+ *r++ = c;
+ continue;
+ }
+
+ *r++ = '\\';
+ *r++ = TOCHAR ((c >> 6) & 07);
+ *r++ = TOCHAR ((c >> 3) & 07);
+ *r++ = TOCHAR (c & 07);
+ continue;
}
- if (b == 0 && clen == 0)
- break;
- if (l)
- *r++ = '\\';
-
- if (clen == 1)
- *r++ = c;
- else
- {
- for (b = 0; b < (int)clen; b++)
- *r++ = (unsigned char)s[b];
- s += clen - 1; /* -1 because of the increment above */
- }
+ *r++ = '\\';
+ *r++ = c;
}
*r++ = '\'';
--
2.51.0