ansic_quote calls mbrtowc with a NULL parser state argument. This uses a
static anonymous shift state that ends up in an undefined state after an
invalid sequence. AFAICT most libcs handle UTF-8 conversions with the
static state just fine, but on Android [1] the bionic libc implementation
will, in fact, produce incorrect results if the state is reused after an
invalid sequence:

    $ LC_ALL=C printf '%q\n' $'\200\321\215'
    $'\200\321�'

I couldn't find a clean place in the existing code to add the parser
state reset, so I reorganized some of the surrounding code for this patch.

[1] Termux by default disables the C locale. The behavior above is only
    visible in Termux once the Termux setlocale override is removed.
---
 lib/sh/strtrans.c | 67 ++++++++++++++++++++++-------------------------
 1 file changed, 31 insertions(+), 36 deletions(-)

diff --git a/lib/sh/strtrans.c b/lib/sh/strtrans.c
index af75dcfa..f204cb79 100644
--- a/lib/sh/strtrans.c
+++ b/lib/sh/strtrans.c
@@ -227,30 +227,24 @@ ansic_quote (const char *str, int flags, int *rlen)
 {
   char *r, *ret;
   const char  *s;
-  size_t l, rsize;
   unsigned char c;
+#if defined (HANDLE_MULTIBYTE)
   size_t clen;
   int b;
-#if defined (HANDLE_MULTIBYTE)
   wchar_t wc;
+  DECLARE_MBSTATE;
 #endif
 
   if (str == 0 || *str == 0)
     return ((char *)0);
 
-  l = strlen (str);
-  rsize = 4 * l + 4;
-  r = ret = (char *)xmalloc (rsize);
+  r = ret = (char *)xmalloc (4 * strlen (str) + 4);
 
   *r++ = '$';
   *r++ = '\'';
 
   for (s = str; c = *s; s++)
     {
-      b = 1;           /* 1 == add backslash; 0 == no backslash */
-      l = 1;
-      clen = 1;
-
       switch (c)
        {
        case ESC: c = 'E'; break;
@@ -266,37 +260,38 @@ ansic_quote (const char *str, int flags, int *rlen)
          break;
        default:
 #if defined (HANDLE_MULTIBYTE)
-         b = is_basic (c);
-         /* XXX - clen comparison to 0 is dicey */
-         if ((b == 0 && ((clen = mbrtowc (&wc, s, MB_CUR_MAX, 0)) < 0 || 
MB_INVALIDCH (clen) || iswprint (wc) == 0)) ||
-             (b == 1 && ISPRINT (c) == 0))
-#else
-         if (ISPRINT (c) == 0)
-#endif
+         if (is_basic (c) == 0)
            {
-             *r++ = '\\';
-             *r++ = TOCHAR ((c >> 6) & 07);
-             *r++ = TOCHAR ((c >> 3) & 07);
-             *r++ = TOCHAR (c & 07);
-             continue;
+             clen = mbrtowc (&wc, s, MB_CUR_MAX, &state);
+             if (clen == 0)
+               break;
+             if (MB_INVALIDCH (clen))
+               memset (&state, 0, sizeof (mbstate_t));
+             else if (iswprint (wc))
+               {
+                 for (b = 0; b < (int)clen; b++)
+                   *r++ = (unsigned char)s[b];
+                 s += clen - 1;        /* -1 because of the increment above */
+                 continue;
+               }
            }
-         l = 0;
-         break;
+         else
+#endif
+           if (ISPRINT (c))
+             {
+               *r++ = c;
+               continue;
+             }
+
+           *r++ = '\\';
+           *r++ = TOCHAR ((c >> 6) & 07);
+           *r++ = TOCHAR ((c >> 3) & 07);
+           *r++ = TOCHAR (c & 07);
+           continue;
        }
-      if (b == 0 && clen == 0)
-       break;
 
-      if (l)
-       *r++ = '\\';
-
-      if (clen == 1)
-       *r++ = c;
-      else
-       {
-         for (b = 0; b < (int)clen; b++)
-           *r++ = (unsigned char)s[b];
-         s += clen - 1;        /* -1 because of the increment above */
-       }
+      *r++ = '\\';
+      *r++ = c;
     }
 
   *r++ = '\'';
-- 
2.51.0


Reply via email to