Fwd: probable bug(s) in do_conversion() in iri.c

Derek Martin Tue, 20 Apr 2021 14:17:59 -0700

Resending this without attachments, since as I feared the list gateway
seems to have eaten the version with them.  I'll wait for a response
before trying to provide details (code or patches).


----- Forwarded message from Derek Martin <[email protected]> -----

Hi,

I have been trying to sort out the fix for CVE-2019-5953, which based
on a diff of wget-1.20 vs. 1.21.1, appears to be simply this, in
do_conversion, iri.c:

@@ -189,9 +190,10 @@
         {
           tooshort++;
           done = len;
-          len = outlen = done + inlen * 2;
-          s = xrealloc (s, outlen + 1);
-          *out = s + done;
+          len = done + inlen * 2;
+          s = xrealloc (s, len + 1);
+          *out = s + done - outlen;
+          outlen += inlen * 2;
         }
       else /* Weird, we got an unspecified error */
         {

However, regardless of what the fix actually was, it appears to me
this function in 1.21.1 produces incorrect results.  I have no valid
IRIs to test this against, so what I did to test was pull the function
out of wget, with very minor modifications to get it to compile, and
provide a tiny wrapper.

Even that proved challenging, because I could not find useful detailed
info about the bug, or the case that caused the buffer overflow, or
even how to get this code path to execute with natural valid input.
So what I did instead was to make one additional small change, to
force the initial allocation of the output buffer to be too small to
hold the converted output data, for the vast majority of cases.  

For input, I used the EUC-KR encoding of the Korean string,
"안녕하세요." (Annyeong haseyo):

$ od -xa test_input.txt
0000000    c8be    e7b3    cfc7    bcbc    e4bf    0a2e
          >   H   3   g   G   O   <   <   ?   d   .  nl
0000014

Then attempted to convert it from EUC-KR to UTF-8 using this function.
Doing so produced garbage, because after the (successful) conversion
from iconv, the code inserts a null into the string somewhere in the
middle, truncating the output (and in this case, breaking one of the 
characters:

$ gcc -g -o dct do_conversion_test.c
$ ./dct `cat test_input.txt` euc-kr utf-8
output buffer too small case hit
converted '�ȳ��ϼ���.' (euc-kr) -> '�' (utf-8)

I rewrote the function, and my updated version produces correct
results:

$ gcc -g -o dct_fix  do_conversion_fix.c
$ ./dct_fix `cat test_input.txt` euc-kr utf-8
output buffer too small case hit
converted ''
converted '�ȳ��ϼ���.' (euc-kr) -> '안녕하세요.' (utf-8)

Naturally the input data encoded in EUC-KR prints as gibberish since
it doesn't match my terminal settings, but importantly the output in
UTF-8 (which does match my settings) is correct.

I believe the 1.21.1 version of the function also has a case where:

+          *out = s + done - outlen;

can set *out to an address before the allocated buffer, because done
is zero and outlen is not.

Another *potential* issue with the wget-1.21.1 version is that it only
leaves 1 byte for the terminating null, however if the output format
is, say, UTF-32, the terminating null will be 4 bytes.  My version of
the function leaves 4 bytes for the null and fills them all in.  I did
not attempt to address the middle case (where errno == EINVAL or
EILSEQ) because I wasn't quite sure what the intended behavior there
is, so a future fix would need to address that as well, since I
renamed/changed some of the variables and how they're used to make the
logic a bit clearer (to me at least). [I just had my version return
false in this case...]

I've attempted to attach three files to this message:

do_conversion_test.c - The extracted function with minimal modifications
do_conversion_fix.c - My fix to this function 
test_input.txt - The input text I described above.

Thanks,
Derek Martin



#include <stdio.h>
#include <iconv.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include <error.h>
#include <stdbool.h>

#define logprintf fprintf
#define debug_logprintf(args...) fprintf(stderr, ##args)
#define ICONV_CONST
#define IF_DEBUG
#define LOG_VERBOSE stderr
#define _ 

int usage(void)
{
    fprintf(stderr, "Takes 3 args: <string to convert> <from encoding> <to 
encoding>\n");
    return -1;
}

static bool
do_conversion (const char *tocode, const char *fromcode, char const *in_org, 
size_t inlen, char **out)
{
  iconv_t cd;
  /* sXXXav : hummm hard to guess... */
  size_t len, done, outlen;
  int invalid = 0, tooshort = 0;
  char *s, *in, *in_save;

  cd = iconv_open (tocode, fromcode);
  if (cd == (iconv_t)(-1))
    {
      logprintf (LOG_VERBOSE, _("Conversion from %s to %s isn't supported\n"),
                 fromcode, tocode);
      *out = NULL;
      return false;
    }

  /* iconv() has to work on an unescaped string */
  in_save = in = strndup (in_org, inlen);
  // url_unescape_except_reserved (in)
  inlen = strlen(in);

  // len = outlen = inlen * 2;
  // force initial buffer size to be too small for vast majority of cases
  len = outlen = inlen = 2;
  *out = s = (char *)malloc(outlen + 1);
  done = 0;

  for (;;)
    {
      if (iconv (cd, (ICONV_CONST char **) &in, &inlen, out, &outlen) != 
(size_t)(-1) &&
          iconv (cd, NULL, NULL, out, &outlen) != (size_t)(-1))
        {
          *out = s;
          *(s + len - outlen - done) = '\0';
          free(in_save);
          iconv_close(cd);
          IF_DEBUG
          {
            /* not not print out embedded passwords, in_org might be an URL */
            if (!strchr(in_org, '@') && !strchr(*out, '@'))
              debug_logprintf ("converted '%s' (%s) -> '%s' (%s)\n", in_org, 
fromcode, *out, tocode);
            else
              debug_logprintf ("logging suppressed, strings may contain 
password\n");
          }
          return true;
        }

      /* Incomplete or invalid multibyte sequence */
      if (errno == EINVAL || errno == EILSEQ)
        {
          if (!invalid)
            logprintf (LOG_VERBOSE,
                      _("Incomplete or invalid multibyte sequence 
encountered\n"));

          invalid++;
          **out = *in;
          in++;
          inlen--;
          (*out)++;
          outlen--;
        }
      else if (errno == E2BIG) /* Output buffer full */
        {
          tooshort++;
          printf("output buffer too small case hit\n");
          done = len;
          len = done + inlen * 2;
          s = realloc(s, len + 1);
          *out = s + done - outlen;
          outlen += inlen * 2;
        }
      else /* Weird, we got an unspecified error */
        {
          logprintf (LOG_VERBOSE, _("Unhandled errno %d\n"), errno);
          break;
        }
    }

    free(in_save);
    iconv_close(cd);
    IF_DEBUG
    {
      /* not not print out embedded passwords, in_org might be an URL */
      if (!strchr(in_org, '@') && !strchr(*out, '@'))
        debug_logprintf ("dbg: converted '%s' (%s) -> '%s' (%s)\n", in_org, 
fromcode, *out, tocode);
      else
        debug_logprintf ("dbg: logging suppressed, strings may contain 
password\n");
    }
    return false;
}

int main(int argc, char **argv)
{
    char *out;

    /* Takes a string, a from encoding, and a to encoding, then converts */
    if (argc != 4) return usage();
    bool rc = do_conversion(argv[3], argv[2], argv[1], strlen(argv[1]), &out);
    if (!rc){
        fprintf(stderr, "conversion failed\n");
        return -1;
    } else {
        FILE *o = fopen("output", "w+");
        fprintf(o, "%s", out);
    }
    free(out);
    return 0;
}
    



#include <stdio.h>
#include <iconv.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include <error.h>
#include <stdbool.h>

#define _ 
#define LOG_VERBOSE stderr
#define logprintf fprintf
#define debug_logprintf(args...) fprintf(stderr, ##args)
#define ICONV_CONST
#define IF_DEBUG

int usage(void)
{
    fprintf(stderr, "Takes 3 args: <string to convert> <from encoding> <to 
encoding>\n");
    return -1;
}

static bool
do_conversion (const char *tocode, const char *fromcode, char const *in_org, 
size_t inlen, char **out)
{
  iconv_t cd;
  /* sXXXav : hummm hard to guess... */
  size_t outbuf_len = 0; 
  size_t out_remain = 0;
  size_t converted = 0; 
  size_t in_remain = inlen;
  int invalid = 0;
  char *outcur, *in, *in_save;

  cd = iconv_open (tocode, fromcode);
  if (cd == (iconv_t)(-1))
    {
      logprintf (LOG_VERBOSE, _("Conversion from %s to %s isn't supported\n"),
                 fromcode, tocode);
      *out = NULL;
      return false;
    }

  /* iconv() has to work on an unescaped string */
  in = strndup (in_org, inlen);
  in_save = in;
  inlen = strlen(in);

  // leave 4 bytes for NULL for e.g. utf-32
  //outbuf_len = 2 * inlen;
  outbuf_len = 2;
  out_remain = outbuf_len;
  *out =  (char *)malloc(outbuf_len + 4);
  outcur = *out;

  for (;;)
    {
      if (iconv (cd, (ICONV_CONST char **) &in, &in_remain, &outcur, 
&out_remain) != (size_t)(-1) &&
          iconv (cd, NULL, NULL, &outcur, &outbuf_len) != (size_t)(-1))
        {
          for (int i = 0; i < 4; ++i) *(outcur + i) = '\0';
          free(in_save);
          iconv_close(cd);
          IF_DEBUG
          {
            /* not not print out embedded passwords, in_org might be an URL */
            if (!strchr(in_org, '@') && !strchr(*out, '@'))
              debug_logprintf ("converted '%s' (%s) -> '%s' (%s)\n", in_org, 
fromcode, *out, tocode);
            else
              debug_logprintf ("logging suppressed, strings may contain 
password\n");
          }
          return true;
        }

      /* Incomplete or invalid multibyte sequence */
      if (errno == EINVAL || errno == EILSEQ)
        {
          if (!invalid)
            logprintf (LOG_VERBOSE,
                      _("Incomplete or invalid multibyte sequence 
encountered\n"));
          /* 

          invalid++;
          **out = *in_save;
          in_save++;
          inlen--;
          (*out)++;
          outlen--;
          */
          return false;
        }
      else if (errno == E2BIG) /* Output buffer full */
        {
          printf("output buffer too small case hit\n");
          converted = outbuf_len - out_remain;
          outbuf_len += in_remain * 2;
          *out = realloc(*out, outbuf_len + 4);
          outcur = *out + converted;
          IF_DEBUG
          {
            for (int i = 0; i < 4; ++i) *(outcur + i) = '\0';
            fprintf(stderr, "converted '%s'\n", *out);
          }
          out_remain = outbuf_len - converted;
        }
      else /* Weird, we got an unspecified error */
        {
          logprintf (LOG_VERBOSE, _("Unhandled errno %d\n"), errno);
          break;
        }
    }

    free(in_save);
    iconv_close(cd);
    IF_DEBUG
    {
      /* not not print out embedded passwords, in_org might be an URL */
      if (!strchr(in_org, '@') && !strchr(*out, '@'))
        debug_logprintf ("dbg: converted '%s' (%s) -> '%s' (%s)\n", in_org, 
fromcode, *out, tocode);
      else
        debug_logprintf ("dbg: logging suppressed, strings may contain 
password\n");
    }
    return false;
}

int main(int argc, char **argv)
{
    char *out;

    /* Takes a string, a from encoding, and a to encoding, then converts */
    if (argc != 4) return usage();
    bool rc = do_conversion(argv[3], argv[2], argv[1], strlen(argv[1]), &out);
    if (!rc){
        fprintf(stderr, "conversion failed\n");
        return -1;
    } else {
        FILE *o = fopen("output", "w+");
        fprintf(o, "%s", out);
    }
    free(out);
    return 0;
}
    



?ȳ??ϼ???.


----- End forwarded message -----

-- 
Derek Martin
Senior System Software Engineer II Lead
Akamai Technologies
[email protected]

Fwd: probable bug(s) in do_conversion() in iri.c

Reply via email to