This patch is a port of my GMime RFC 2047 decoder, which is even more liberal in what it accepts than Thunderbird's decoder. It is what I will be committing to svn.
It closes bugs #302991, #315513 and #502178.

Jeff
Index: camel-mime-utils.c =================================================================== --- camel-mime-utils.c (revision 8315) +++ camel-mime-utils.c (working copy) @@ -821,116 +821,321 @@ *in = inptr; } -/* decode rfc 2047 encoded string segment */ static char * -rfc2047_decode_word(const char *in, size_t len) +camel_iconv_strndup (iconv_t cd, const char *string, size_t n) { - const char *inptr = in+2; - const char *inend = in+len-2; + size_t inleft, outleft, converted = 0; + char *out, *outbuf; const char *inbuf; - const char *charset; - char *encname, *p; - int tmplen; - size_t ret; - char *decword = NULL; - char *decoded = NULL; - char *outbase = NULL; - char *outbuf; - size_t inlen, outlen; - gboolean retried = FALSE; - iconv_t ic; - - d(printf("rfc2047: decoding '%.*s'\n", len, in)); - - /* quick check to see if this could possibly be a real encoded word */ - if (len < 8 || !(in[0] == '=' && in[1] == '?' && in[len-1] == '=' && in[len-2] == '?')) { - d(printf("invalid\n")); - return NULL; - } - - /* skip past the charset to the encoding type */ - inptr = memchr (inptr, '?', inend-inptr); - if (inptr != NULL && inptr < inend + 2 && inptr[2] == '?') { - d(printf("found ?, encoding is '%c'\n", inptr[0])); - inptr++; - tmplen = inend-inptr-2; - decword = g_alloca (tmplen); /* this will always be more-than-enough room */ - switch(toupper(inptr[0])) { - case 'Q': - inlen = quoted_decode((const unsigned char *) inptr+2, tmplen, (unsigned char *) decword); - break; - case 'B': { - int state = 0; - unsigned int save = 0; - - inlen = camel_base64_decode_step((unsigned char *) inptr+2, tmplen, (unsigned char *) decword, &state, &save); - /* if state != 0 then error? 
*/ - break; + size_t outlen; + int errnosav; + + if (cd == (iconv_t) -1) + return g_strndup (string, n); + + outlen = n * 2 + 16; + out = g_malloc (outlen + 4); + + inbuf = string; + inleft = n; + + do { + errno = 0; + outbuf = out + converted; + outleft = outlen - converted; + + converted = iconv (cd, (char **) &inbuf, &inleft, &outbuf, &outleft); + if (converted == (size_t) -1) { + if (errno != E2BIG && errno != EINVAL) + goto fail; } - default: - /* uhhh, unknown encoding type - probably an invalid encoded word string */ - return NULL; + + /* + * E2BIG There is not sufficient room at *outbuf. + * + * We just need to grow our outbuffer and try again. + */ + + converted = outbuf - out; + if (errno == E2BIG) { + outlen += inleft * 2 + 16; + out = g_realloc (out, outlen + 4); + outbuf = out + converted; } - d(printf("The encoded length = %d\n", inlen)); - if (inlen > 0) { - /* yuck, all this snot is to setup iconv! */ - tmplen = inptr - in - 3; - encname = g_alloca (tmplen + 1); - memcpy (encname, in + 2, tmplen); - encname[tmplen] = '\0'; + } while (errno == E2BIG && inleft > 0); + + /* + * EINVAL An incomplete multibyte sequence has been encounĀ + * tered in the input. + * + * We'll just have to ignore it... + */ + + /* flush the iconv conversion */ + iconv (cd, NULL, NULL, &outbuf, &outleft); + + /* Note: not all charsets can be nul-terminated with a single + nul byte. UCS2, for example, needs 2 nul bytes and UCS4 + needs 4. I hope that 4 nul bytes is enough to terminate all + multibyte charsets? */ + + /* nul-terminate the string */ + memset (outbuf, 0, 4); + + /* reset the cd */ + iconv (cd, NULL, NULL, NULL, NULL); + + return out; + + fail: + + errnosav = errno; + + w(g_warning ("camel_iconv_strndup: %s at byte %lu", strerror (errno), n - inleft)); + + g_free (out); + + /* reset the cd */ + iconv (cd, NULL, NULL, NULL, NULL); + + errno = errnosav; + + return NULL; +} - /* rfc2231 updates rfc2047 encoded words... 
- * The ABNF given in RFC 2047 for encoded-words is: - * encoded-word := "=?" charset "?" encoding "?" encoded-text "?=" - * This specification changes this ABNF to: - * encoded-word := "=?" charset ["*" language] "?" encoding "?" encoded-text "?=" - */ +#define is_ascii(c) isascii ((int) ((unsigned char) (c))) - /* trim off the 'language' part if it's there... */ - p = strchr (encname, '*'); - if (p) - *p = '\0'; - - charset = e_iconv_charset_name (encname); - - inbuf = decword; - - outlen = inlen * 6 + 16; - outbase = g_alloca (outlen); - outbuf = outbase; - - retry: - ic = e_iconv_open ("UTF-8", charset); - if (ic != (iconv_t) -1) { - ret = e_iconv (ic, &inbuf, &inlen, &outbuf, &outlen); - if (ret != (size_t) -1) { - e_iconv (ic, NULL, 0, &outbuf, &outlen); - *outbuf = 0; - decoded = g_strdup (outbase); +static char * +decode_8bit (const char *text, size_t len, const char *default_charset) +{ + const char *charsets[4] = { "UTF-8", NULL, NULL, NULL }; + size_t inleft, outleft, outlen, rc, min, n; + const char *locale_charset, *best; + char *out, *outbuf; + const char *inbuf; + iconv_t cd; + int i = 1; + + if (default_charset && g_ascii_strcasecmp (default_charset, "UTF-8") != 0) + charsets[i++] = default_charset; + + locale_charset = e_iconv_locale_charset (); + if (g_ascii_strcasecmp (locale_charset, "UTF-8") != 0) + charsets[i++] = locale_charset; + + min = len; + best = charsets[0]; + + outlen = (len * 2) + 16; + out = g_malloc (outlen + 1); + + for (i = 0; charsets[i]; i++) { + if ((cd = e_iconv_open ("UTF-8", charsets[i])) == (iconv_t) -1) + continue; + + outleft = outlen; + outbuf = out; + inleft = len; + inbuf = text; + n = 0; + + do { + rc = iconv (cd, (char **) &inbuf, &inleft, &outbuf, &outleft); + if (rc == (size_t) -1) { + if (errno == EINVAL) { + /* incomplete sequence at the end of the input buffer */ + n += inleft; + break; } - e_iconv_close (ic); - } else { - w(g_warning ("Cannot decode charset, header display may be corrupt: %s: %s", - charset, 
strerror (errno))); - - if (!retried) { - charset = e_iconv_locale_charset (); - if (!charset) - charset = "iso-8859-1"; - - retried = TRUE; - goto retry; + + if (errno == E2BIG) { + outlen += (inleft * 2) + 16; + rc = (size_t) (outbuf - out); + out = g_realloc (out, outlen + 1); + outleft = outlen - rc; + outbuf = out + rc; + } else { + inleft--; + inbuf++; + n++; } - - /* we return the encoded word here because we've got to return valid utf8 */ - decoded = g_strndup (in, inlen); } + } while (inleft > 0); + + rc = iconv (cd, NULL, NULL, &outbuf, &outleft); + *outbuf = '\0'; + + e_iconv_close (cd); + + if (rc != (size_t) -1 && n == 0) + return out; + + if (n < min) { + best = charsets[i]; + min = n; } } + + /* if we get here, then none of the charsets fit the 8bit text flawlessly... + * try to find the one that fit the best and use that to convert what we can, + * replacing any byte we can't convert with a '?' */ + + if ((cd = e_iconv_open ("UTF-8", best)) == (iconv_t) -1) { + /* this shouldn't happen... but if we are here, then + * it did... 
the only thing we can do at this point + * is replace the 8bit garbage and pray */ + register const char *inptr = text; + const char *inend = inptr + len; + + outbuf = out; + + while (inptr < inend) { + if (is_ascii (*inptr)) + *outbuf++ = *inptr++; + else + *outbuf++ = '?'; + } + + *outbuf = '\0'; + + return out; + } + + outleft = outlen; + outbuf = out; + inleft = len; + inbuf = text; + + do { + rc = iconv (cd, (char **) &inbuf, &inleft, &outbuf, &outleft); + if (rc == (size_t) -1) { + if (errno == EINVAL) { + /* incomplete sequence at the end of the input buffer */ + break; + } + + if (errno == E2BIG) { + rc = outbuf - out; + outlen += inleft * 2 + 16; + out = g_realloc (out, outlen + 1); + outleft = outlen - rc; + outbuf = out + rc; + } else { + *outbuf++ = '?'; + outleft--; + inleft--; + inbuf++; + } + } + } while (inleft > 0); + + iconv (cd, NULL, NULL, &outbuf, &outleft); + *outbuf = '\0'; + + e_iconv_close (cd); + + return out; +} - d(printf("decoded '%s'\n", decoded)); +#define is_rfc2047_encoded_word(atom, len) (len >= 7 && !strncmp (atom, "=?", 2) && !strncmp (atom + len - 2, "?=", 2)) - return decoded; +/* decode an rfc2047 encoded-word token */ +static char * +rfc2047_decode_word (const char *in, size_t inlen, const char *default_charset) +{ + const unsigned char *instart = (const unsigned char *) in; + const register unsigned char *inptr = instart + 2; + const unsigned char *inend = instart + inlen - 2; + unsigned char *decoded; + const char *charset; + char *charenc, *p; + guint32 save = 0; + ssize_t declen; + int state = 0; + size_t len; + iconv_t cd; + char *buf; + + /* skip over the charset */ + if (!(inptr = memchr (inptr, '?', inend - inptr)) || inptr[2] != '?') + return NULL; + + inptr++; + + switch (*inptr) { + case 'B': + case 'b': + inptr += 2; + decoded = g_alloca (inend - inptr); + declen = camel_base64_decode_step ((unsigned char *) inptr, inend - inptr, decoded, &state, &save); + break; + case 'Q': + case 'q': + inptr += 2; + decoded = 
g_alloca (inend - inptr); + declen = quoted_decode (inptr, inend - inptr, decoded); + + if (declen == -1) { + d(fprintf (stderr, "encountered broken 'Q' encoding\n")); + return NULL; + } + break; + default: + d(fprintf (stderr, "unknown encoding\n")); + return NULL; + } + + len = (inptr - 3) - (instart + 2); + charenc = g_alloca (len + 1); + memcpy (charenc, in + 2, len); + charenc[len] = '\0'; + charset = charenc; + + /* rfc2231 updates rfc2047 encoded words... + * The ABNF given in RFC 2047 for encoded-words is: + * encoded-word := "=?" charset "?" encoding "?" encoded-text "?=" + * This specification changes this ABNF to: + * encoded-word := "=?" charset ["*" language] "?" encoding "?" encoded-text "?=" + */ + + /* trim off the 'language' part if it's there... */ + if ((p = strchr (charset, '*'))) + *p = '\0'; + + /* slight optimization? */ + if (!g_ascii_strcasecmp (charset, "UTF-8")) { + p = (char *) decoded; + len = declen; + + while (!g_utf8_validate (p, len, (const char **) &p)) { + len = declen - (p - (char *) decoded); + *p = '?'; + } + + return g_strndup ((char *) decoded, declen); + } + + if (charset[0]) + charset = e_iconv_charset_name (charset); + + if (!charset[0] || (cd = e_iconv_open ("UTF-8", charset)) == (iconv_t) -1) { + w(g_warning ("Cannot convert from %s to UTF-8, header display may " + "be corrupt: %s", charset[0] ? 
charset : "unspecified charset", + g_strerror (errno))); + + return decode_8bit ((char *) decoded, declen, default_charset); + } + + buf = camel_iconv_strndup (cd, (char *) decoded, declen); + e_iconv_close (cd); + + if (buf != NULL) + return buf; + + w(g_warning ("Failed to convert \"%.*s\" to UTF-8, display may be " + "corrupt: %s", declen, decoded, g_strerror (errno))); + + return decode_8bit ((char *) decoded, declen, default_charset); } /* ok, a lot of mailers are BROKEN, and send iso-latin1 encoded @@ -988,7 +1193,7 @@ } static GString * -append_quoted_pair (GString *str, const char *in, gssize inlen) +append_quoted_pair (GString *str, const char *in, size_t inlen) { register const char *inptr = in; const char *inend = in + inlen; @@ -1007,67 +1212,117 @@ /* decodes a simple text, rfc822 + rfc2047 */ static char * -header_decode_text (const char *in, size_t inlen, int ctext, const char *default_charset) +header_decode_text (const char *in, int ctext, const char *default_charset) { + register const char *inptr = in; + gboolean encoded = FALSE; + const char *lwsp, *text; + size_t nlwsp, n; + gboolean ascii; + char *decoded; GString *out; - const char *inptr, *inend, *start, *chunk, *locale_charset; - GString *(* append) (GString *, const char *, gssize); - char *dword = NULL; - guint32 mask; - - locale_charset = e_iconv_locale_charset (); - - if (ctext) { - mask = (CAMEL_MIME_IS_SPECIAL | CAMEL_MIME_IS_SPACE | CAMEL_MIME_IS_CTRL); - append = append_quoted_pair; - } else { - mask = (CAMEL_MIME_IS_LWSP); - append = g_string_append_len; - } - - out = g_string_new (""); - inptr = in; - inend = inptr + inlen; - chunk = NULL; - - while (inptr < inend) { - start = inptr; - while (inptr < inend && camel_mime_is_type (*inptr, mask)) + + if (in == NULL) + return g_strdup (""); + + out = g_string_sized_new (strlen (in) + 1); + + while (*inptr != '\0') { + lwsp = inptr; + while (camel_mime_is_lwsp (*inptr)) inptr++; - - if (inptr == inend) { - append (out, start, inptr - 
start); + + nlwsp = (size_t) (inptr - lwsp); + + if (*inptr != '\0') { + text = inptr; + ascii = TRUE; + + if (!strncmp (inptr, "=?", 2)) { + inptr += 2; + + /* skip past the charset (if one is even declared, sigh) */ + while (*inptr && *inptr != '?') { + ascii = ascii && is_ascii (*inptr); + inptr++; + } + + /* sanity check encoding type */ + if (inptr[0] != '?' || !strchr ("BbQq", inptr[1]) || inptr[2] != '?') + goto non_rfc2047; + + inptr += 3; + + /* find the end of the rfc2047 encoded word token */ + while (*inptr && strncmp (inptr, "?=", 2) != 0) { + ascii = ascii && is_ascii (*inptr); + inptr++; + } + + if (!strncmp (inptr, "?=", 2)) + inptr += 2; + } else { + non_rfc2047: + /* stop if we encounter a possible rfc2047 encoded + * token even if it's inside another word, sigh. */ + while (*inptr && !camel_mime_is_lwsp (*inptr) && + strncmp (inptr, "=?", 2) != 0) { + ascii = ascii && is_ascii (*inptr); + inptr++; + } + } + + n = (size_t) (inptr - text); + if (is_rfc2047_encoded_word (text, n)) { + if ((decoded = rfc2047_decode_word (text, n, default_charset))) { + /* rfc2047 states that you must ignore all + * whitespace between encoded words */ + if (!encoded) + g_string_append_len (out, lwsp, nlwsp); + + g_string_append (out, decoded); + g_free (decoded); + + encoded = TRUE; + } else { + /* append lwsp and invalid rfc2047 encoded-word token */ + g_string_append_len (out, lwsp, nlwsp + n); + encoded = FALSE; + } + } else { + /* append lwsp */ + g_string_append_len (out, lwsp, nlwsp); + + /* append word token */ + if (!ascii) { + /* *sigh* I hate broken mailers... 
*/ + decoded = decode_8bit (text, n, default_charset); + n = strlen (decoded); + text = decoded; + } else { + decoded = NULL; + } + + if (!ctext) + g_string_append_len (out, text, n); + else + append_quoted_pair (out, text, n); + + g_free (decoded); + + encoded = FALSE; + } + } else { + /* appending trailing lwsp */ + g_string_append_len (out, lwsp, nlwsp); break; - } else if (dword == NULL) { - append (out, start, inptr - start); - } else { - chunk = start; } - - start = inptr; - while (inptr < inend && !camel_mime_is_type (*inptr, mask)) - inptr++; - - dword = rfc2047_decode_word(start, inptr-start); - if (dword) { - g_string_append(out, dword); - g_free(dword); - } else { - if (!chunk) - chunk = start; - - if ((default_charset == NULL || !append_8bit (out, chunk, inptr-chunk, default_charset)) - && (locale_charset == NULL || !append_8bit(out, chunk, inptr-chunk, locale_charset))) - append_latin1(out, chunk, inptr-chunk); - } - - chunk = NULL; } - - dword = out->str; + + decoded = out->str; g_string_free (out, FALSE); - - return dword; + + return decoded; } @@ -1086,7 +1341,8 @@ { if (in == NULL) return NULL; - return header_decode_text (in, strlen (in), FALSE, default_charset); + + return header_decode_text (in, FALSE, default_charset); } @@ -1106,7 +1362,8 @@ { if (in == NULL) return NULL; - return header_decode_text (in, strlen (in), TRUE, default_charset); + + return header_decode_text (in, TRUE, default_charset); } /* how long a sequence of pre-encoded words should be less than, to attempt to @@ -2342,8 +2599,7 @@ g_free(text); /* or maybe that we've added up a bunch of broken bits to make an encoded word */ - text = rfc2047_decode_word(name->str, name->len); - if (text) { + if ((text = rfc2047_decode_word (name->str, name->len, charset))) { g_string_truncate(name, 0); g_string_append(name, text); g_free(text); @@ -2901,7 +3157,7 @@ node->next = NULL; node->name = name; if (strncmp(value, "=?", 2) == 0 - && (node->value = header_decode_text(value, 
strlen(value), FALSE, NULL))) { + && (node->value = header_decode_text(value, FALSE, NULL))) { g_free(value); } else if (g_ascii_strcasecmp (name, "boundary") != 0 && !g_utf8_validate(value, -1, NULL)) { const char *charset = e_iconv_locale_charset(); Index: camel-charset-map.c =================================================================== --- camel-charset-map.c (revision 8315) +++ camel-charset-map.c (working copy) @@ -52,8 +52,9 @@ #include <glib.h> static struct { - char *name; - unsigned int bit; /* assigned bit */ + char *name; /* charset name */ + int multibyte; /* charset type */ + unsigned int bit; /* assigned bit */ } tables[] = { /* These are the 8bit character sets (other than iso-8859-1, * which is special-cased) which are supported by both other @@ -61,20 +62,35 @@ * they're listed in is the order they'll be tried in, so put * the more-popular ones first. */ - { "iso-8859-2", 0 }, /* Central/Eastern European */ - { "iso-8859-4", 0 }, /* Baltic */ - { "koi8-r", 0 }, /* Russian */ - { "koi8-u", 0 }, /* Ukranian */ - { "iso-8859-5", 0 }, /* Least-popular Russian encoding */ - { "iso-8859-7", 0 }, /* Greek */ - { "iso-8859-8", 0 }, /* Hebrew; Visual */ - { "iso-8859-9", 0 }, /* Turkish */ - { "iso-8859-13", 0 }, /* Baltic again */ - { "iso-8859-15", 0 }, /* New-and-improved iso-8859-1, but most - * programs that support this support UTF8 - */ - { "windows-1251", 0 }, /* Russian */ - { 0, 0 } + { "iso-8859-2", 0, 0 }, /* Central/Eastern European */ + { "iso-8859-4", 0, 0 }, /* Baltic */ + { "koi8-r", 0, 0 }, /* Russian */ + { "koi8-u", 0, 0 }, /* Ukranian */ + { "iso-8859-5", 0, 0 }, /* Least-popular Russian encoding */ + { "iso-8859-6", 0, 0 }, /* Arabic */ + { "iso-8859-7", 0, 0 }, /* Greek */ + { "iso-8859-8", 0, 0 }, /* Hebrew; Visual */ + { "iso-8859-9", 0, 0 }, /* Turkish */ + { "iso-8859-13", 0, 0 }, /* Baltic again */ + { "iso-8859-15", 0, 0 }, /* New-and-improved iso-8859-1, but most + * programs that support this support UTF8 + */ + { 
"windows-1251", 0, 0 }, /* Russian */ + + /* These are the multibyte character sets which are commonly + * supported by other mail clients. Note: order for multibyte + * charsets does not affect priority unlike the 8bit charsets + * listed above. + */ + { "iso-2022-jp", 1, 0 }, /* Japanese designed for use over the Net */ + { "Shift-JIS", 1, 0 }, /* Japanese as used by Windows and MacOS systems */ + { "euc-jp", 1, 0 }, /* Japanese traditionally used on Unix systems */ + { "euc-kr", 1, 0 }, /* Korean */ + { "iso-2022-kr", 1, 0 }, /* Korean (less popular than euc-kr) */ + { "gb2312", 1, 0 }, /* Simplified Chinese */ + { "Big5", 1, 0 }, /* Traditional Chinese */ + { "euc-tw", 1, 0 }, + { NULL, 0, 0 } }; unsigned int encoding_map[256 * 256]; @@ -85,118 +101,196 @@ #define UCS "UCS-4LE" #endif -int main (void) +static guint +block_hash (gconstpointer v) { - int i, j; - int max, min; - int bit = 0x01; - int k; + const signed char *p = v; + guint32 h = *p++; + int i; + + for (i = 0; i < 256; i++) + h = (h << 5) - h + *p++; + + return h; +} + +static int +block_equal (gconstpointer v1, gconstpointer v2) +{ + return !memcmp (v1, v2, 256); +} + +int main (int argc, char **argv) +{ + unsigned char *block = NULL; + unsigned int bit = 0x01; + GHashTable *table_hash; + size_t inleft, outleft; + char *inbuf, *outbuf; + guint32 out[128], c; + char in[128]; + int i, j, k; int bytes; iconv_t cd; - char in[128]; - guint32 out[128]; - char *inptr, *outptr; - size_t inlen, outlen; - + /* dont count the terminator */ - bytes = ((sizeof(tables)/sizeof(tables[0]))+7-1)/8; - + bytes = ((sizeof (tables) / sizeof (tables[0])) + 7 - 1) / 8; + g_assert (bytes <= 4); + for (i = 0; i < 128; i++) in[i] = i + 128; - - for (j = 0; tables[j].name; j++) { + + for (j = 0; tables[j].name && !tables[j].multibyte; j++) { cd = iconv_open (UCS, tables[j].name); - if (cd == (iconv_t)-1) - exit (1); - inptr = in; - outptr = (char *)(out); - inlen = sizeof (in); - outlen = sizeof (out); - while (iconv (cd, 
&inptr, &inlen, &outptr, &outlen) == -1) { + inbuf = in; + inleft = sizeof (in); + outbuf = (char *) out; + outleft = sizeof (out); + while (iconv (cd, &inbuf, &inleft, &outbuf, &outleft) == -1) { if (errno == EILSEQ) { - inptr++; - inlen--; + inbuf++; + inleft--; } else { - printf ("%s\n", strerror (errno)); + g_warning ("iconv (%s->UCS4, ..., %d, ..., %d): %s", + tables[j].name, inleft, outleft, + g_strerror (errno)); exit (1); } } iconv_close (cd); - - for (i = 0; i < 128 - outlen / 4; i++) { + + for (i = 0; i < 128 - outleft / 4; i++) { encoding_map[i] |= bit; encoding_map[out[i]] |= bit; } - + tables[j].bit = bit; bit <<= 1; } - - printf("/* This file is automatically generated: DO NOT EDIT */\n\n"); - - for (i=0;i<256;i++) { - /* first, do we need this block? */ - for (k=0;k<bytes;k++) { - for (j=0;j<256;j++) { - if ((encoding_map[i*256 + j] & (0xff << (k*8))) != 0) - break; + + /* Mutibyte tables */ + for ( ; tables[j].name && tables[j].multibyte; j++) { + cd = iconv_open (tables[j].name, UCS); + if (cd == (iconv_t) -1) + continue; + + for (c = 128, i = 0; c < 65535 && i < 65535; c++) { + inbuf = (char *) &c; + inleft = sizeof (c); + outbuf = in; + outleft = sizeof (in); + + if (iconv (cd, &inbuf, &inleft, &outbuf, &outleft) != (size_t) -1) { + /* this is a legal character in charset table[j].name */ + iconv (cd, NULL, NULL, &outbuf, &outleft); + encoding_map[i++] |= bit; + encoding_map[c] |= bit; + } else { + /* reset the iconv descriptor */ + iconv (cd, NULL, NULL, NULL, NULL); } - if (j < 256) { - /* yes, dump it */ - printf("static const unsigned char m%02x%x[256] = {\n\t", i, k); - for (j=0;j<256;j++) { - printf("0x%02x, ", (encoding_map[i*256+j] >> (k*8)) & 0xff ); - if (((j+1)&7) == 0 && j<255) - printf("\n\t"); + } + + iconv_close (cd); + + tables[j].bit = bit; + bit <<= 1; + } + + printf ("/* This file is automatically generated: DO NOT EDIT */\n\n"); + + table_hash = g_hash_table_new_full (block_hash, block_equal, g_free, g_free); + + for (i = 0; i 
< 256; i++) { + for (k = 0; k < bytes; k++) { + char name[32], *alias; + int has_bits = FALSE; + + if (!block) { + /* we reuse malloc'd blocks that are not added to the + * hash table to avoid unnecessary malloc/free's */ + block = g_malloc (256); + } + + for (j = 0; j < 256; j++) { + if ((block[j] = (encoding_map[i * 256 + j] >> (k * 8)) & 0xff)) + has_bits = TRUE; + } + + if (!has_bits) + continue; + + sprintf (name, "m%02x%x", i, k); + + if ((alias = g_hash_table_lookup (table_hash, block))) { + /* this block is identical to an earlier block, just alias it */ + printf ("#define %s %s\n\n", name, alias); + } else { + /* unique block, dump it */ + g_hash_table_insert (table_hash, block, g_strdup (name)); + + printf ("static unsigned char %s[256] = {\n\t", name); + for (j = 0; j < 256; j++) { + printf ("0x%02x, ", block[j]); + if (((j + 1) & 7) == 0 && j < 255) + printf ("\n\t"); } - printf("\n};\n\n"); + printf ("\n};\n\n"); + + /* force the next loop to malloc a new block */ + block = NULL; } } } - - printf("static const struct {\n"); - for (k=0;k<bytes;k++) { - printf("\tconst unsigned char *bits%d;\n", k); - } - printf("} camel_charmap[256] = {\n\t"); - for (i=0;i<256;i++) { - /* first, do we need this block? 
*/ - printf("{ "); - for (k=0;k<bytes;k++) { - for (j=0;j<256;j++) { - if ((encoding_map[i*256 + j] & (0xff << (k*8))) != 0) + + g_hash_table_destroy (table_hash); + g_free (block); + + printf ("struct {\n"); + for (k = 0; k < bytes; k++) + printf ("\tunsigned char *bits%d;\n", k); + + printf ("} camel_charmap[256] = {\n\t"); + for (i = 0; i < 256; i++) { + printf ("{ "); + for (k = 0; k < bytes; k++) { + for (j = 0; j < 256; j++) { + if ((encoding_map[i * 256 + j] & (0xff << (k * 8))) != 0) break; } - if (j < 256) { - printf("m%02x%x, ", i, k); - } else { - printf("NULL, "); - } + + if (j < 256) + printf ("m%02x%x, ", i, k); + else + printf ("NULL, "); } - printf("}, "); - if (((i+1)&7) == 0 && i<255) - printf("\n\t"); + + printf ("}, "); + if (((i + 1) & 3) == 0 && i < 255) + printf ("\n\t"); } - printf("\n};\n\n"); - - printf("static const struct {\n\tconst char *name;\n\tunsigned int bit;\n} camel_charinfo[] = {\n"); - for (j=0;tables[j].name;j++) { - printf("\t{ \"%s\", 0x%04x },\n", tables[j].name, tables[j].bit); - } - printf("};\n\n"); - - printf("#define charset_mask(x) \\\n"); - for (k=0;k<bytes;k++) { - if (k!=0) - printf("\t| "); + printf ("\n};\n\n"); + + printf ("struct {\n\tconst char *name;\n\tunsigned int bit;\n} camel_charinfo[] = {\n"); + for (j = 0; tables[j].name; j++) + printf ("\t{ \"%s\", 0x%08x },\n", tables[j].name, tables[j].bit); + printf ("};\n\n"); + + printf ("#define charset_mask(x) \\\n"); + for (k = 0; k < bytes; k++) { + if (k != 0) + printf ("\t| "); else - printf("\t"); - printf("(camel_charmap[(x)>>8].bits%d?camel_charmap[(x)>>8].bits%d[(x)&0xff]<<%d:0)", k, k, k*8); - if (k<bytes-1) - printf("\t\\\n"); + printf ("\t"); + + printf ("(camel_charmap[(x) >> 8].bits%d ? camel_charmap[(x) >> 8].bits%d[(x) & 0xff] << %d : 0)", + k, k, k * 8); + + if (k < bytes - 1) + printf ("\t\\\n"); } - printf("\n\n"); - + printf ("\n\n"); + return 0; }
_______________________________________________ Evolution-hackers mailing list Evolution-hackers@gnome.org http://mail.gnome.org/mailman/listinfo/evolution-hackers