Hi,

On Sat, Jun 16, 2007 at 08:41:20PM +0200, Roland Rosenfeld wrote:
> Tobias Schlemmer schrieb am Dienstag, den 07. März 2006:
> 
> > Package: lbdb
> > Version: 0.31.1-0ts1
> > Severity: wishlist
> > Tags: l10n patch
> 
> > I have added charset conversion to rfc2047.c and (lbdb-)?fetchaddr using
> > iconv. I don't know how portable it is, so try it out.
> > Now my .procmailrc has an entry of
> > 
> > :0hc
> > | lbdb-fetchaddr -d '%d.%m.%Y %H.%M' -c utf-8
> > 
> > It seems to work fine for me, even with evolution and muttalias ;-). And
> > I think it's a very small step towards real internationalization. 
> 

I tried out Tobias' patch a few days ago, to cope with the mess of
having both ISO-8859-1 and UTF-8 encoded personal names in lbdb's
m_inmail database.

While conversion of characters from ISO-8859-1 to the desired UTF-8
charset works as expected, long names are mysteriously cut off at the
end. Here's an example:

  $ echo 'From: =?ISO-8859-1?Q?El_nombre_m=E1s_largo_del_mundo?= <[EMAIL PROTECTED]>' | /usr/lib/lbdb/fetchaddr -c utf-8
  [EMAIL PROTECTED] El nombre más larg      2007-10-27 17:03

Instead, the correct result should be:

  [EMAIL PROTECTED] El nombre más largo del mundo   2007-10-27 17:06


Looking at the code, the first thing I spotted was that the
destination buffer length passed to iconv is erroneously set to the
length of the source buffer, which contains the decoded string.
This is the cause of the cut-off strings, as a single byte in
one charset may correspond to multiple bytes in another charset,
e.g. when converting non-ASCII characters from ISO-8859-1 to UTF-8.


Below, I included a revised version of Tobias' charset conversion patch.

In summary, the following changes were made:

* Set the destination buffer length (variable `len') to the maximum
  available length (variable `dlen' passed to `rfc2047_decode_word').

  This fixes the above problem, but I am not quite sure whether the
  converted string will always be shorter (in bytes) than the RFC2047
  encoded string, which determines the maximum buffer length.

* In the iconv loop, the source buffer length (`in') has to be
  decremented, too, if I understand the iconv(3) error behaviour
  correctly.

* Set the `in' length to include the terminating '\0' character.

* Ignore all changes concerning only white space.

* Wrap all iconv-related code blocks with an `#ifdef HAVE_ICONV'.
  This allows for compilation on platforms without iconv support.

* Cosmetic change to the lbdb-fetchaddr(1) man page paragraph.

* Fix a segfault in qpto8bit if no arguments are supplied.


> For the record: I didn't forget to include this patch; the problem
> is that lbdb should run not only on Debian but on all Unix systems,
> including systems where iconv is not available or not installed.  I
> don't want to add a build dependency on iconv on all systems, so I'm
> looking for some autoconf stuff to make the iconv patch configurable
> using --without-iconv (default, if iconv is not available),
> --with-iconv (default, if iconv is available), or
> --with-iconv=/some/dir.
> 
> Any help with this autoconf stuff is greatly appreciated...

Roland, are there any sources available for lbdb (for example, a CVS
or SVN repository) with the original automake/autoconf macro files?

Judging from the installation section of libiconv[1], it does not seem
too hard to make the iconv support configurable. So if you could
give me a hint about the macro files, I would be glad to look into
this again.

Regards,
Peter

[1] http://www.gnu.org/software/libiconv/
--- lbdb-0.35.1.orig/fetchaddr.c
+++ lbdb-0.35.1/fetchaddr.c
@@ -119,6 +119,9 @@
   char *headerlist = NULL;
   char *fieldname, *next;
   char create_real_name = 0;
+#ifdef HAVE_ICONV
+  const char **charsetptr = &Charset;
+#endif
 
   /* process command line arguments: */
   if (argc > 1) {
@@ -128,6 +131,10 @@
 	datefmt = argv[++i];
       } else if (!strcmp (argv[i], "-x") && i+1 < argc) {
 	headerlist = argv[++i];
+#ifdef HAVE_ICONV
+      } else if (!strcmp (argv[i], "-c") && i+1 < argc) {
+	*charsetptr = argv[++i];
+#endif
       } else if (!strcmp (argv[i], "-a")) {
 	create_real_name = 1;
       } else {
--- lbdb-0.35.1.orig/lbdb-fetchaddr.man.in
+++ lbdb-0.35.1/lbdb-fetchaddr.man.in
@@ -24,6 +24,8 @@
 .IR dateformat ]
 .RB [ -x
 .IR headerfieldlist ]
+.RB [ -c
+.IR charset ]
 .RB [ -a ]
 .br
 .B lbdb-fetchaddr
@@ -88,6 +90,12 @@
 mail addresses.  If this option isn't given, we fall back to
 .RB ` from:to:cc:resent-from:resent-to '.
 .TP
+.BI -c " charset"
+The charset which will be used to write the database. This should be
+the charset which the application expects (normally the one from your
+current locale).  If this option isn't given, we fall back to
+.RB ` iso-8859-15 '.
+.TP
 .B -a
 Also grab addresses without a real name.  Use the local part of the
 mail address as real name.
--- lbdb-0.35.1.orig/lbdb-fetchaddr.sh.in
+++ lbdb-0.35.1/lbdb-fetchaddr.sh.in
@@ -41,6 +41,7 @@
     echo "       -h                 this short help"
     echo "       -d 'dateformat'    select date format using strftime(3)"
     echo "       -x 'from:to:cc'    colon separated list of header fields"
+    echo "       -c 'charset'       charset for the database storage"
     echo "       -a                 also grep addresses without realname"
 }
 
@@ -69,6 +70,13 @@
 	    hdrlist="-x $1"
 	fi
 	;;
+    -c)
+	if [ $# -gt 1 ]
+	then
+	    shift
+	    charset="-c $1"
+	fi
+	;;
     -a)
 	additional_param="$additional_param $1"
 	;;
@@ -112,7 +120,7 @@
   exit 1
 fi
 
-if $fetchaddr $additional_param -d "$datefmt" $hdrlist >> $db ; then
+if $fetchaddr $additional_param -d "$datefmt" $hdrlist $charset >> $db ; then
   touch $db.dirty
 fi
 
--- lbdb-0.35.1.orig/qpto8bit.c
+++ lbdb-0.35.1/qpto8bit.c
@@ -27,9 +27,17 @@
 #include "rfc822.h"
 #include "rfc2047.h"
 
-int main ()
+int main (int argc, char * argv[])
 {
   char buff[2048];
+#ifdef HAVE_ICONV
+  const char **charsetptr = &Charset;
+#endif
+
+#ifdef HAVE_ICONV
+  if (argc > 1)
+    *charsetptr = argv[1];
+#endif
 
   while (fgets (buff, sizeof (buff), stdin)) {
     rfc2047_decode (buff, buff, sizeof (buff));
--- lbdb-0.35.1.orig/rfc2047.c
+++ lbdb-0.35.1/rfc2047.c
@@ -20,6 +20,11 @@
 
 #include <ctype.h>
 #include <string.h>
+#ifdef HAVE_ICONV
+#include <iconv.h>
+#include <errno.h>
+#include <limits.h>
+#endif
 
 #include "rfc822.h"
 #include "rfc2047.h"
@@ -36,7 +41,7 @@
 };
 
 const char MimeSpecials[] = "@.,;<>[]\\\"()?/=";
-const char Charset[] = "iso-8859-1"; /* XXX - hack */
+const char *Charset = "iso-8859-15"; /* XXX - hack */
 
 
 int Index_hex[128] = {
@@ -68,12 +73,18 @@
 #define hexval(c) Index_hex[(unsigned int)(c)]
 #define base64val(c) Index_64[(unsigned int)(c)]
 
-static int rfc2047_decode_word (char *d, const char *s, size_t len)
+static int rfc2047_decode_word (char *d, const char *s, size_t dlen)
 {
   char *p = safe_strdup (s);
   char *pp = p;
   char *pd = d;
+  size_t len = dlen;
   int enc = 0, filter = 0, count = 0, c1, c2, c3, c4;
+#ifdef HAVE_ICONV
+  char *fromcharset;
+  iconv_t cd;
+  size_t in;
+#endif
 
   while ((pp = strtok (pp, "?")) != NULL)
   {
@@ -82,7 +93,12 @@
     {
       case 2:
 	if (strcasecmp (pp, Charset) != 0)
+	{
 	  filter = 1;
+#ifdef HAVE_ICONV
+	  fromcharset = safe_strdup (pp);
+#endif
+	}
 	break;
       case 3:
 	if (toupper (*pp) == 'Q')
@@ -152,13 +168,42 @@
   safe_free (&p);
   if (filter)
   {
-    pd = d;
-    while (*pd)
+#ifdef HAVE_ICONV
+    if ((cd = iconv_open (Charset, fromcharset)) == (iconv_t)(-1))
     {
-      if (!IsPrint (*pd))
+#endif
+      pd = d;
+      while (*pd)
+      {
+	if (!IsPrint (*pd))
+	  *pd = '?';
+	pd++;
+      }
+#ifdef HAVE_ICONV
+    } else {
+      p = safe_strdup (d);
+      pp = p;
+      in = strlen (d) + 1;
+      pd = d;
+      /* maximum available buffer length for converted string */
+      len = dlen;
+      while (*pd && iconv (cd, &pp, &in, &pd, &len) == (size_t)(-1))
+      {
+	if (errno == E2BIG)
+	  break;
+
 	*pd = '?';
-      pd++;
+	pp++;
+	in--;
+	pd++;
+	len--;
+      }
+      iconv (cd, NULL, NULL, &pd, &len);
+      iconv_close (cd);
+      safe_free (&p);
     }
+    safe_free (&fromcharset);
+#endif
   }
   return (0);
 }
--- lbdb-0.35.1.orig/rfc2047.h
+++ lbdb-0.35.1/rfc2047.h
@@ -18,6 +18,7 @@
 
 /* $Id: rfc2047.h,v 1.3 2005-10-29 14:48:11 roland Exp $ */
 
+extern const char *Charset;
 void rfc2047_encode_string (char *, size_t, const unsigned char *);
 void rfc2047_encode_adrlist (ADDRESS *);
 

Attachment: signature.asc
Description: Digital signature

Reply via email to