[commit] master: add support for UTF-7 mailbox names

Oswald Buddenhagen via isync-devel Sun, 19 Jun 2022 07:40:10 -0700

commit 52c063fd45f327169a08d2eadbb2904678f2bb40
Author: Oswald Buddenhagen <[email protected]>
Date:   Mon May 23 10:12:38 2022 +0200


    add support for UTF-7 mailbox names
    
    this finally makes us compliant with IMAP4rev1. how fitting that the
    meanwhile released IMAP4rev2 demoted UTF-7 to legacy status ...
    
    based on a patch by Georgy Kibardin <[email protected]>.

 TODO                |   2 -
 src/.gitignore      |   1 +
 src/Makefile.am     |   6 +-
 src/drv_imap.c      |  21 ++++
 src/imap_p.h        |   4 +
 src/imap_utf7.c     | 288 ++++++++++++++++++++++++++++++++++++++++++++
 src/tst_imap_utf7.c | 116 ++++++++++++++++++
 7 files changed, 434 insertions(+), 4 deletions(-)

diff --git a/TODO b/TODO
index dacdeb4e..a04b2536 100644
--- a/TODO
+++ b/TODO
@@ -6,8 +6,6 @@ automatically resume upon transient errors, e.g. "connection 
reset by peer"
 or timeout after some data was already transmitted.
 possibly also try to handle Exchange's "glitches" somehow.
 
-add support for IMAP UTF-7 (for internationalized mailbox names).
-
 uidvalidity lock timeout handling would be a good idea.
 
 should complain when multiple Channels match the same folders.
diff --git a/src/.gitignore b/src/.gitignore
index 5e7fc35d..3139876a 100644
--- a/src/.gitignore
+++ b/src/.gitignore
@@ -3,6 +3,7 @@
 /mbsync
 /mdconvert
 /tst_imap_msgs
+/tst_imap_utf7
 /tst_msg_cvt
 /tst_timers
 /tmp
diff --git a/src/Makefile.am b/src/Makefile.am
index ab564183..69cf29db 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -5,7 +5,7 @@
 mbsync_SOURCES = \
        util.c config.c socket.c \
        driver.c drv_proxy.c \
-       drv_imap.c imap_msgs.c \
+       drv_imap.c imap_msgs.c imap_utf7.c \
        drv_maildir.c \
        sync.c sync_state.c sync_msg_cvt.c \
        main.c main_sync.c main_list.c
@@ -54,10 +54,12 @@ man_MANS = mbsync.1 $(mdconvert_man)
 
 tst_imap_msgs_SOURCES = tst_imap_msgs.c imap_msgs.c util.c
 
+tst_imap_utf7_SOURCES = tst_imap_utf7.c imap_utf7.c util.c
+
 tst_msg_cvt_SOURCES = tst_msg_cvt.c sync_msg_cvt.c util.c
 tst_msg_cvt_CFLAGS = -DQPRINTF_BUFF=10000
 
-check_PROGRAMS = tst_imap_msgs tst_msg_cvt
+check_PROGRAMS = tst_imap_msgs tst_imap_utf7 tst_msg_cvt
 TESTS = $(check_PROGRAMS)
 
 tst_timers_SOURCES = tst_timers.c util.c
diff --git a/src/drv_imap.c b/src/drv_imap.c
index 911df572..ad95e3d2 100644
--- a/src/drv_imap.c
+++ b/src/drv_imap.c
@@ -1577,6 +1577,7 @@ list3_rsp_atom( imap_store_t *ctx, char *arg, uint len, 
int type ATTR_UNUSED )
        string_list_t *narg;
        int argl = (int)len;
        uint l;
+       char rarg[1130];  // See imap_utf7_to_utf8() for the origin of that 
number
 
        if (!arg)
                return LIST_BAD;
@@ -1608,6 +1609,16 @@ list3_rsp_atom( imap_store_t *ctx, char *arg, uint len, 
int type ATTR_UNUSED )
        }
        if (argl >= 5 && !memcmp( arg + argl - 5, ".lock", 5 )) /* workaround 
broken servers */
                return LIST_OK;
+       if (!(CAP(UTF8_ACCEPT) || CAP(UTF8_ONLY))) {
+               int rargl = imap_utf7_to_utf8( arg, argl, rarg );
+               if (rargl < 0) {
+                       error( "IMAP error: invalid modified-UTF-7 string 
'%.*s'.\n", argl, arg );
+                       return LIST_BAD;
+               }
+               assert( (uint)rargl < sizeof(rarg) );
+               arg = rarg;
+               argl = rargl;
+       }
        if (map_name( arg, argl, (char **)&narg, offsetof(string_list_t, 
string), ctx->delimiter, "/") < 0) {
                warn( "IMAP warning: ignoring mailbox %.*s (reserved character 
'/' in name)\n", argl, arg );
                return LIST_OK;
@@ -1665,6 +1676,16 @@ prepare_name( char **buf, const imap_store_t *ctx, const 
char *prefix, const cha
                return -1;
        default:
                memcpy( *buf, prefix, pl );
+               if (!(CAP(UTF8_ACCEPT) || CAP(UTF8_ONLY))) {
+                       char *nbuf = imap_utf8_to_utf7( *buf );
+                       if (!nbuf) {
+                               error( "IMAP error: invalid UTF-8 string 
'%s'\n", *buf );
+                               free( *buf );
+                               return -1;
+                       }
+                       free( *buf );
+                       *buf = nbuf;
+               }
                return 0;
        }
 }
diff --git a/src/imap_p.h b/src/imap_p.h
index 76e02e21..1c7933eb 100644
--- a/src/imap_p.h
+++ b/src/imap_p.h
@@ -10,6 +10,7 @@
 #include "driver.h"
 
 //#define DEBUG_IMAP_MSGS
+//#define DEBUG_IMAP_UTF7
 
 typedef union imap_message {
        message_t gen;
@@ -45,4 +46,7 @@ void reset_imap_messages( imap_messages_t *msgs );
 void imap_ensure_relative( imap_messages_t *msgs );
 void imap_ensure_absolute( imap_messages_t *msgs );
 
+// Converts the NUL-terminated UTF-8 string buf to IMAP modified UTF-7.
+// Returns a newly allocated NUL-terminated string which the caller must
+// free(), or NULL if buf is not valid UTF-8.
+char *imap_utf8_to_utf7( const char *buf );
+// Converts argl bytes of modified UTF-7 at buf to UTF-8 stored in outbuf.
+// Returns the number of bytes written, or -1 if the input is malformed.
+// No NUL terminator is appended and no output size is passed in; the
+// sizing note in the implementation says to reserve argl * 9 / 8 bytes.
+int imap_utf7_to_utf8( const char *buf, int argl, char *outbuf );
+
 #endif
diff --git a/src/imap_utf7.c b/src/imap_utf7.c
new file mode 100644
index 00000000..ac91cdb6
--- /dev/null
+++ b/src/imap_utf7.c
@@ -0,0 +1,288 @@
+// SPDX-FileCopyrightText: 2018-2021 Georgy Kibardin <[email protected]>
+// SPDX-FileCopyrightText: 2022 Oswald Buddenhagen <[email protected]>
+// SPDX-License-Identifier: GPL-2.0-or-later WITH 
LicenseRef-isync-GPL-exception
+//
+// mbsync - mailbox synchronizer
+//
+
+#include "imap_p.h"
+
+#ifdef DEBUG_IMAP_UTF7
+# define dbg(...) print(__VA_ARGS__)
+#else
+# define dbg(...) do { } while (0)
+#endif
+
+// Bit queue used to repack between 6-bit base64 digits and 16/32-bit
+// UTF-16 units: bits are appended at the low end and consumed from the top.
+struct bit_fifo {
+       // Queued bits; only the low `bits` bits are meaningful.
+       unsigned long long value;
+       // Number of bits currently queued.
+       uint bits;
+};
+
+// Appends `size` bits to the queue. `bits` is OR-ed in unmasked, so it
+// must not have anything set above its low `size` bits.
+static void
+add_bits( struct bit_fifo *fifo, uint bits, uint size )
+{
+       unsigned long long shifted = fifo->value << size;
+
+       fifo->value = shifted | bits;
+       fifo->bits = fifo->bits + size;
+       assert( fifo->bits <= sizeof(fifo->value) * 8 );
+}
+
+// Removes the oldest `size` bits from the queue and returns them.
+static uint
+eat_bits( struct bit_fifo *fifo, uint size )
+{
+       long long mask = (1LL << size) - 1;
+
+       fifo->bits -= size;
+       return (fifo->value >> fifo->bits) & mask;
+}
+
+// Returns the oldest `size` bits of the queue without removing them.
+static uint
+peek_bits( struct bit_fifo *fifo, uint size )
+{
+       long long mask = (1LL << size) - 1;
+       uint skip = fifo->bits - size;
+
+       return (fifo->value >> skip) & mask;
+}
+
+// Stores chr (truncated to char) at *p and advances *p by one.
+static void
+add_char( char **p, uint chr )
+{
+       char *dst = *p;
+
+       *dst = (char)chr;
+       *p = dst + 1;
+}
+
+// Returns the byte at *p as an unsigned value and advances *p by one.
+static uchar
+eat_char( const char **p )
+{
+       const char *src = *p;
+
+       *p = src + 1;
+       return (uchar)*src;
+}
+
+// Decodes one UTF-8 sequence starting at *utf8_buf_p and advances the
+// pointer past every byte it examined (including a byte that fails the
+// continuation check). A NUL byte never passes the continuation check,
+// so decoding a NUL-terminated string does not read past the terminator.
+//
+// Returns the code point, or ~0U if the sequence is malformed: invalid
+// lead byte, invalid continuation byte, overlong form, UTF-16 surrogate
+// (U+D800..U+DFFF), or a value above U+10FFFF.
+static uint
+read_as_utf8( const char **utf8_buf_p )
+{
+       uchar lead = eat_char( utf8_buf_p );
+       if (lead < 0x80)
+               return lead;
+
+       uint chr;    // code point accumulated so far
+       uint extra;  // number of continuation bytes to read
+       uint min;    // smallest code point that needs this sequence length
+       if ((lead & 0xf8) == 0xf0) {
+               chr = lead & 0x7;
+               extra = 3;
+               min = 0x10000;
+       } else if ((lead & 0xf0) == 0xe0) {
+               chr = lead & 0xf;
+               extra = 2;
+               min = 0x800;
+       } else if ((lead & 0xe0) == 0xc0) {
+               chr = lead & 0x1f;
+               extra = 1;
+               min = 0x80;
+       } else {
+               // Stray continuation byte or a lead byte of 0xf8 and above.
+               return ~0U;
+       }
+
+       for (uint i = 0; i < extra; i++) {
+               uchar cont = eat_char( utf8_buf_p );
+               if ((cont & 0xc0) != 0x80)
+                       return ~0U;
+               chr = (chr << 6) | (cont & 0x3f);
+       }
+
+       // An overlong form could otherwise smuggle in a NUL (e.g. "\xc0\x80"),
+       // which the caller would mistake for the end of the string.
+       if (chr < min)
+               return ~0U;
+       // Surrogates and values beyond U+10FFFF cannot be represented in
+       // UTF-16, and thus not in modified UTF-7 either.
+       if (chr > 0x10ffff || (chr >= 0xd800 && chr <= 0xdfff))
+               return ~0U;
+       return chr;
+}
+
+// Returns non-zero if chr must go into a base64 shift sequence: everything
+// outside 0x20..0x7e, except for NUL, which the caller uses as terminator.
+static int
+needs_encoding( uint chr )
+{
+       if (!chr)
+               return 0;
+       return chr < 0x20 || chr > 0x7e;
+}
+
+// Splits a code point above 0xffff into a UTF-16 surrogate pair, returned
+// as a single value with the high surrogate in the upper 16 bits.
+static uint
+utf16_encode( uint chr )
+{
+       uint offset = chr - 0x10000;
+       uint high = 0xd800 + (offset >> 10);
+       uint low = 0xdc00 + (offset & 0x3ff);
+
+       return (high << 16) | low;
+}
+
+// Maps a 6-bit value to its digit in the modified base64 alphabet, which
+// ends in "+," here.
+static uchar
+b64_encode( uint chr )
+{
+       static const char alphabet[] =
+               "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
+
+       assert( chr <= 0x3f );
+       return alphabet[chr];
+}
+
+// Converts the NUL-terminated UTF-8 string buf to modified UTF-7.
+// Returns a newly allocated NUL-terminated string, or NULL (after
+// releasing the buffer) if buf contains an invalid UTF-8 sequence.
+char *
+imap_utf8_to_utf7( const char *buf )
+{
+       // Output size per input length l (input bytes => output bytes):
+       // - pass-through: l, 1 => 1
+       // - all "&": l * 2, 1 => 2
+       // - 7-bit: (l * 2 * 4 + 2) / 3 + 2, ~ l * 2.7, 1 => 5
+       // - 3-octet: (l / 3 * 2 * 4 + 2) / 3 + 2, ~ l * 0.9, 3 => 5
+       // - 4-octet: (l / 4 * 2 * 2 * 4 + 2) / 3 + 2, ~ l * 1.3, 4 => 8
+       // => worst case is "&" alternating with 7-bit: l * 3.5, 2 => 7
+       int cap = strlen( buf ) * 7 / 2 + 3;
+       char *out = nfmalloc( cap );
+       char *wr = out;
+       struct bit_fifo queue = { 0, 0 };
+       int shifted = 0;  // inside a "&...-" sequence
+
+       for (;;) {
+               uint cp = read_as_utf8( &buf );
+               if (cp == ~0U) {
+                       dbg( "Error: invalid UTF-8 string\n" );
+                       free( out );
+                       return NULL;
+               }
+
+               if (needs_encoding( cp )) {
+                       if (!shifted) {
+                               add_char( &wr, '&' );
+                               shifted = 1;
+                       }
+                       if (cp > 0xffff)
+                               add_bits( &queue, utf16_encode( cp ), 32 );
+                       else
+                               add_bits( &queue, cp, 16 );
+                       // Emit every complete base64 digit right away.
+                       while (queue.bits >= 6)
+                               add_char( &wr, b64_encode( eat_bits( &queue, 6 ) ) );
+                       continue;
+               }
+
+               if (shifted) {
+                       if (queue.bits) {
+                               // Left-align the remaining bits in a final digit.
+                               uint pad = 6 - queue.bits;
+                               uint rest = eat_bits( &queue, queue.bits );
+                               add_char( &wr, b64_encode( rest << pad ) );
+                       }
+                       add_char( &wr, '-' );
+                       shifted = 0;
+               }
+               // This also stores the terminating NUL.
+               add_char( &wr, cp );
+               if (cp == '&')
+                       add_char( &wr, '-' );
+               if (!cp)
+                       break;
+       }
+       assert( (int)(wr - out) <= cap );
+       return out;
+}
+
+// Appends the UTF-8 encoding of code point chr at *outp and advances
+// *outp by the number of bytes written (1 to 4).
+static void
+write_as_utf8( char **outp, uint chr )
+{
+       if (chr <= 0x7f) {
+               add_char( outp, chr );
+       } else if (chr <= 0x7ff) {
+               add_char( outp, (chr >> 6) | 0xc0 );
+               add_char( outp, (chr & 0x3f) | 0x80 );
+       } else if (chr <= 0xffff) {
+               add_char( outp, (chr >> 12) | 0xe0 );
+               add_char( outp, ((chr >> 6) & 0x3f) | 0x80 );
+               add_char( outp, (chr & 0x3f) | 0x80 );
+       } else {
+               // A surrogate pair decodes to at most
+               // 0x10000 + (0x3ff << 10) + 0x3ff == 0x10ffff.
+               assert( chr <= 0x10ffff );
+               add_char( outp, (chr >> 18) | 0xf0 );
+               add_char( outp, ((chr >> 12) & 0x3f) | 0x80 );
+               add_char( outp, ((chr >> 6) & 0x3f) | 0x80 );
+               add_char( outp, (chr & 0x3f) | 0x80 );
+       }
+}
+
+// Returns non-zero if the low 16 bits of `bits` are a UTF-16 high
+// surrogate (0xd800..0xdbff), i.e., a second 16-bit unit has to follow.
+static int
+need_another_16bit( uint bits )
+{
+       uint top6 = (bits >> 10) & 0x3f;
+
+       return top6 == (0xd800 >> 10);
+}
+
+// Combines a surrogate pair (high surrogate in the upper 16 bits of
+// subject, low surrogate in the lower 16 bits) into a code point.
+static uint
+utf16_decode( uint subject )
+{
+       uint high = subject >> 16;
+       uint low = subject & 0xffff;
+
+       return 0x10000 + ((high - 0xd800) << 10) + (low - 0xdc00);
+}
+
+// Maps a modified-base64 digit to its 6-bit value.
+// Returns ~0U for any byte outside the alphabet, including bytes >= 0x80,
+// which would otherwise index past the end of the table.
+static uint
+b64_decode( uchar chr )
+{
+       // Read-only lookup table indexed by ASCII code.
+       static const uint lu[128] = {
+               ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0,
+               ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0,
+               ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 62, 63, ~0, ~0, ~0,
+               52, 53, 54, 55, 56, 57, 58, 59, 60, 61, ~0, ~0, ~0, ~0, ~0, ~0,
+               ~0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+               15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, ~0, ~0, ~0, ~0, ~0,
+               ~0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
+               41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ~0, ~0, ~0, ~0, ~0,
+       };
+       if (chr >= sizeof(lu) / sizeof(lu[0]))
+               return ~0U;
+       return lu[chr];
+}
+
+// Converts bufl bytes of modified UTF-7 at buf to UTF-8 stored in outbuf.
+//
+// Returns the number of bytes written to outbuf, or -1 if the input is
+// malformed: 8-bit byte, unterminated shift sequence, byte outside the
+// base64 alphabet, unpaired UTF-16 surrogate (high or low), or leftover
+// bits that amount to an incomplete code unit.
+//
+// No NUL terminator is appended, and the size of outbuf is not checked
+// here; the caller must provide enough room (see the sizing note below).
+int
+imap_utf7_to_utf8( const char *buf, int bufl, char *outbuf )
+{
+       // Size requirements:
+       // - pass-through: l (shortest worst case)
+       // - all "&": l / 2, 2 => 1, * .5
+       // - 7-bit: ((l - 2) * 3 + 1) / 4 / 2, ~ l * .38, 5 => 1, * .2
+       // - 3-octet: ((l - 2) * 3 + 1) / 4 / 2 * 3, ~ l * 1.13, 5 => 3, * .6 (generic worst case)
+       // - 4-octet: ((l - 2) * 3 + 1) / 4 / 2 / 2 * 4, ~ l * .75, 8 => 4, * .5
+       // => reserve bufl * 9 / 8
+       char *outp = outbuf;
+       struct bit_fifo fifo = { 0, 0 };
+       const char *bufe = buf + bufl;
+       while (buf != bufe) {
+               uchar chr = *buf++;
+               if (chr != '&') {
+                       if (chr & 0x80) {
+                               dbg( "Error: 8-bit char %x\n", chr );
+                               return -1;
+                       }
+                       add_char( &outp, chr );
+                       continue;
+               }
+               if (buf == bufe) {
+                       dbg( "Error: unterminated shift sequence\n" );
+                       return -1;
+               }
+               chr = *buf++;
+               if (chr == '-') {
+                       // "&-" is the escape for a literal ampersand.
+                       add_char( &outp, '&' );
+                       continue;
+               }
+               fifo.bits = 0;
+               do {
+                       if (chr & 0x80) {
+                               dbg( "Error: 8-bit char %x\n", chr );
+                               return -1;
+                       }
+                       uint bits = b64_decode( chr );
+                       if (bits == ~0U) {
+                               dbg( "Error: char %x outside alphabet\n", chr );
+                               return -1;
+                       }
+                       add_bits( &fifo, bits, 6 );
+                       if (fifo.bits >= 16) {
+                               if (need_another_16bit( peek_bits( &fifo, 16 ) )) {
+                                       // High surrogate: wait until the second unit is complete.
+                                       if (fifo.bits >= 32) {
+                                               uint utf16 = eat_bits( &fifo, 32 );
+                                               if ((utf16 & 0xfc00) != 0xdc00) {
+                                                       dbg( "Error: unpaired UTF-16 surrogate\n" );
+                                                       return -1;
+                                               }
+                                               write_as_utf8( &outp, utf16_decode( utf16 ) );
+                                       }
+                               } else {
+                                       uint unit = eat_bits( &fifo, 16 );
+                                       // A low surrogate without a preceding high one
+                                       // would end up as invalid UTF-8.
+                                       if ((unit & 0xfc00) == 0xdc00) {
+                                               dbg( "Error: unpaired UTF-16 surrogate\n" );
+                                               return -1;
+                                       }
+                                       write_as_utf8( &outp, unit );
+                               }
+                       }
+                       if (buf == bufe) {
+                               dbg( "Error: unterminated shift sequence\n" );
+                               return -1;
+                       }
+                       chr = *buf++;
+               } while (chr != '-');
+               // A proper encoding leaves at most 4 padding bits; 6 or more
+               // means that at least one whole digit did not contribute to
+               // any complete code unit.
+               if (fifo.bits >= 6) {
+                       dbg( "Error: incomplete code point\n" );
+                       return -1;
+               }
+       }
+       return (int)(outp - outbuf);
+}
diff --git a/src/tst_imap_utf7.c b/src/tst_imap_utf7.c
new file mode 100644
index 00000000..dcc3013b
--- /dev/null
+++ b/src/tst_imap_utf7.c
@@ -0,0 +1,116 @@
+// SPDX-FileCopyrightText: 2022 Oswald Buddenhagen <[email protected]>
+// SPDX-License-Identifier: GPL-2.0-or-later
+//
+// isync test suite
+//
+
+#include "imap_p.h"
+
+// Conversion test vectors. An entry with both members set is checked in
+// both directions. A NULL utf8 member marks a UTF-7 string whose decoding
+// must fail; a NULL utf7 member marks a UTF-8 string whose encoding must
+// fail.
+static struct {
+       const char *utf8, *utf7;
+} data[] = {
+       { u8"", "" },
+       { u8"1", "1" },
+       { u8"word", "word" },
+       { u8"&", "&-" },
+       { NULL, "&" },
+       { NULL, "&-&" },
+       { u8"&&", "&-&-" },
+       { u8"1&1", "1&-1" },
+       { u8"&1&", "&-1&-" },
+       { u8"\t", "&AAk-" },
+       { NULL, "&AAk" },
+       { NULL, "&AA-" },
+       { NULL, "&*Ak-" },
+       { NULL, "&&-" },
+       { u8"m\x7f""ll", "m&AH8-ll" },
+       { u8"\t&", "&AAk-&-" },
+       { u8"\t&\t", "&AAk-&-&AAk-" },
+       { u8"&\t", "&-&AAk-" },
+       { u8"&\t&", "&-&AAk-&-" },
+       { u8"ä", "&AOQ-" },
+       { u8"\x83\x84", NULL },
+       { u8"\xc3\xc4", NULL },
+       { u8"\xc3", NULL },
+       { u8"äö", "&AOQA9g-" },
+       { u8"äöü", "&AOQA9gD8-" },
+       { u8"Ḁ", "&HgA-" },
+       { u8"\xe1\xc8\x80", NULL },
+       { u8"\xe1\xb8\xf0", NULL },
+       { u8"\xe1\xb8", NULL },
+       { u8"\xe1", NULL },
+       { u8"Ḁḁ", "&HgAeAQ-" },
+       { u8"😂", "&2D3eAg-" },
+       { u8"\xf8\x9f\x98\x82", NULL },
+       { u8"\xf0\xcf\x98\x82", NULL },
+       { u8"\xf0\x9f\xd8\x82", NULL },
+       { u8"\xf0\x9f\x98\xe2", NULL },
+       { u8"\xf0\x9f\x98", NULL },
+       { u8"\xf0\x9f", NULL },
+       { u8"\xf0", NULL },
+       { NULL, "&2D0-" },
+       { u8"😈😎", "&2D3eCNg93g4-" },
+       { u8"müll", "m&APw-ll" },
+       { u8"mü", "m&APw-" },
+       { u8"über", "&APw-ber" },
+};
+
+// Runs every test vector through both converters and reports deviations.
+// Returns 0 if all vectors behaved as expected, 1 otherwise.
+int
+main( void )
+{
+       int ret = 0;
+
+       // Pass 1: UTF-8 => UTF-7, for every entry that has a UTF-8 member.
+       for (uint i = 0; i < as(data); i++) {
+               if (!data[i].utf8)
+                       continue;
+               xprintf( "To UTF-7 \"%s\" (\"%!s\") ...\n", data[i].utf8, 
data[i].utf8 );
+               char *utf7 = imap_utf8_to_utf7( data[i].utf8 );
+               if (utf7) {
+                       if (!data[i].utf7) {
+                               // The input was supposed to be rejected.
+                               xprintf( "Unexpected success: \"%s\" 
(\"%!s\")\n", utf7, utf7 );
+                               ret = 1;
+                       } else if (strcmp( utf7, data[i].utf7 )) {
+                               xprintf( "Mismatch, got \"%s\" (\"%!s\"), want 
\"%!s\"\n",
+                                        utf7, utf7, data[i].utf7 );
+                               ret = 1;
+                       }
+                       free( utf7 );
+               } else {
+                       if (data[i].utf7) {
+                               xprintf( "Conversion failure.\n" );
+                               ret = 1;
+                       }
+               }
+       }
+
+       // Pass 2: UTF-7 => UTF-8, for every entry that has a UTF-7 member.
+       for (uint i = 0; i < as(data); i++) {
+               if (!data[i].utf7)
+                       continue;
+               xprintf( "From UTF-7 \"%!s\" ...\n", data[i].utf7 );
+               int utf7len = strlen( data[i].utf7 );
+               char utf8buf[1000];
+               int utf8len = imap_utf7_to_utf8( data[i].utf7, utf7len, utf8buf 
);
+               if (utf8len >= 0) {
+                       if (!data[i].utf8) {
+                               // The input was supposed to be rejected.
+                               xprintf( "Unexpected success: \"%.*s\" 
(\"%.*!s\")\n",
+                                        utf8len, utf8buf, utf8len, utf8buf );
+                               ret = 1;
+                       } else {
+                               // The output is not NUL-terminated, so compare by length.
+                               int wantlen = strlen( data[i].utf8 );
+                               if (utf8len != wantlen || memcmp( utf8buf, 
data[i].utf8, utf8len )) {
+                                       xprintf( "Mismatch, got \"%.*s\" 
(\"%.*!s\"), want \"%s\" (\"%!s\")\n",
+                                                utf8len, utf8buf, utf8len, 
utf8buf, data[i].utf8, data[i].utf8 );
+                                       ret = 1;
+                               }
+                       }
+                       // Verify the output size bound callers use for sizing buffers.
+                       assert( utf8len < utf7len * 9 / 8 + 1 );
+               } else {
+                       if (data[i].utf8) {
+                               xprintf( "Conversion failure.\n" );
+                               ret = 1;
+                       }
+               }
+       }
+
+       return ret;
+}


_______________________________________________
isync-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/isync-devel

[commit] master: add support for UTF-7 mailbox names

Reply via email to