On Thu, Sep 21, 2023 at 11:26:11AM +0200, Omar Polo wrote:
> On 2023/09/21 10:55:47 +0200, Walter Alejandro Iglesias <w...@roquesor.com> 
> wrote:
> > On Wed, Sep 20, 2023 at 08:36:23PM +0200, Walter Alejandro Iglesias wrote:
> > > On Wed, Sep 20, 2023 at 07:44:12PM +0200, Walter Alejandro Iglesias wrote:
> > > > And this new idea simplifies all to this:
> > > 
> > > In case anyone else is worried.  Crystal Kolipe already pointed me out
> > > that a better UTF-8 checking is needed, I know, I'll get to that
> > > tomorrow.
> > 
> > The following version checks for not valid UTF-8 characters.  I could
> > make it fail in this case and send a dead.letter but I imagine that
> > those who really use mail(1) surely do it mostly in a tty console where,
> > at least with a non US keyboard, is too easy to type some non valid utf-8
> > character, hence this feature would be more a hassle than a help, so I
> > chose to make it simply skip adding any MIME header in this case (how it
> > has been used until now and no one complained :-)).  If you prefer the
> > other behavior let me know.
> > 
> > 
> > Index: send.c
> > ===================================================================
> > RCS file: /cvs/src/usr.bin/mail/send.c,v
> > retrieving revision 1.26
> > diff -u -p -r1.26 send.c
> > --- send.c  8 Mar 2023 04:43:11 -0000       1.26
> > +++ send.c  21 Sep 2023 08:40:11 -0000
> > @@ -33,6 +33,15 @@
> >  #include "rcv.h"
> >  #include "extern.h"
> >  
> > +/*
> > + * Variables and functions declared here will be useful to check the
> > + * character set of the message to add the appropiate MIME headers.
> > + */
> > +static char nascii = 0;
> > +static char nutf8 = 0;
> 
> There's no need to explicitly zero static (or global) variables.
> 
> > +static int not_ascii(struct __sFILE *s);
> > +static int not_utf8(struct __sFILE *s, int len);
> 
> I'd use FILE * instead of struct __sFILE
> 
> >  static volatile sig_atomic_t sendsignal;   /* Interrupted by a signal? */
> >  
> >  /*
> > @@ -341,6 +350,15 @@ mail1(struct header *hp, int printheader
> >             else
> >                     puts("Null message body; hope that's ok");
> >     }
> > +
> > +   /* Check for non ASCII characters in the message */
> > +   nascii = not_ascii(mtf);
> > +   rewind(mtf);
> > +
> > +   /* Check for non valid UTF-8 characters in the message */
> > +   nutf8 = not_utf8(mtf, fsize(mtf));
> > +   rewind(mtf);
> 
> assuming that we care for this two checks, why not doing everything in
> a single pass?
> 
> Do we really need the two checks?
> 
> >     /*
> >      * Now, take the user names from the combined
> >      * to and cc lists and do all the alias
> > @@ -525,6 +543,14 @@ puthead(struct header *hp, FILE *fo, int
> >             fmt("To:", hp->h_to, fo, w&GCOMMA), gotcha++;
> >     if (hp->h_subject != NULL && w & GSUBJECT)
> >             fprintf(fo, "Subject: %s\n", hp->h_subject), gotcha++;
> > +   if (!nascii)
> > +           fprintf(fo, "MIME-Version: 1.0\n"
> > +                   "Content-Type: text/plain; charset=us-ascii\n"
> > +                   "Content-Transfer-Encoding: 7bit\n"), gotcha++;
> 
> +1 for splitting the string in multiple lines, this is an improvements
> over previous versions, but please
> 
>  - use four spaces of indentation for continuation lines
> 
>  - although existing code uses ", gotcha++" I'd split that in a
>    separate line for clarity.
> 
> > +   else if (nutf8 == 0)
> > +           fprintf(fo, "MIME-Version: 1.0\n"
> > +                   "Content-Type: text/plain; charset=utf-8\n"
> > +                   "Content-Transfer-Encoding: 8bit\n"), gotcha++;
> >     if (hp->h_cc != NULL && w & GCC)
> >             fmt("Cc:", hp->h_cc, fo, w&GCOMMA), gotcha++;
> >     if (hp->h_bcc != NULL && w & GBCC)
> > @@ -609,4 +635,67 @@ sendint(int s)
> >  {
> >  
> >     sendsignal = s;
> > +}
> > +
> > +/* Search non ASCII characters in the message */
> > +static int
> > +not_ascii(struct __sFILE *s)
> > +{
> > +   int ch, n;
> > +   n = 0;
> > +        while ((ch = getc(s)) != EOF)
> 
> There are some spacing issues, both here and below.
> 
> > +                if (ch > 0x7f)
> > +                   n = 1;
> > +
> > +   return n;
> > +}
> > +
> > +/* Search non valid UTF-8 characters in the message */
> > +static int
> > +not_utf8(struct __sFILE *message, int len)
> > +{
> > +   int i, nou8;
> > +   char c;
> > +   unsigned char s[len + 1];
> 
> Please don't.  Variable length arrays (VLA) with a possibly large len
> are a bad idea.  They have more or less the same issues as alloca(3),
> see the CAVEATS section of it to get an idea.
> 
> > +
> > +   i = 0;
> > +        while ((c = getc(message)) != EOF)
> > +           s[i++] = c;
> 
> and even then, fread() is simpler :-)
> 
> > +   s[i] = '\0';
> > +
> > +   i = nou8 = 0;
> > +   while (i != len)
> 
> ...and even then, mbtowc is easier to use.  See Ingo'
> /src/usr/bin/ls/utf8.c for an example usage.
> 
> > +           if (s[i] <= 0x7f)
> > +                   ++i;
> > +           /* Two bytes case */
> > +           else if (s[i] >= 0xc2 && s[i] < 0xe0 &&
> > +                   s[i + 1] >= 0x80 && s[i + 1] <= 0xbf)
> > +                   i += 2;
> > +           /* Special three bytes case */
> > +           else if ((s[i] == 0xe0 &&
> > +                   s[i + 1] >= 0xa0 && s[i + 1] <= 0xbf &&
> > +                   s[i + 2] >= 0x80 && s[i + 2] <= 0xbf) ||
> > [...]
> >  }
> 
> 
> Admittedly I haven't followed the thread too closely, although I hope
> something like this will end up in mail(1) sometimes :-)

Hi Omar,

I corrected many of the things you pointed me, but not all.  The
function I use to check utf8 is mine, I use it in a pair of little
programs which I've *hardly* checked for memory leacks.  I know my
function looks BIG :-), but I know for sure that it does the job.  I
could just filter only 0xfe and 0xff as Crystal suggests, but that lets
pass iso-latin characters as utf-8, if that doesn't bother anyone I can
do that.  I don't know the other solutions you mention, I still have to
learn them. :-)

What concerns me now is to add a utf8 check for the subject.


> 
> Thanks,
> 
> Omar Polo



Index: send.c
===================================================================
RCS file: /cvs/src/usr.bin/mail/send.c,v
retrieving revision 1.26
diff -u -p -r1.26 send.c
--- send.c      8 Mar 2023 04:43:11 -0000       1.26
+++ send.c      21 Sep 2023 11:01:58 -0000
@@ -33,6 +33,10 @@
 #include "rcv.h"
 #include "extern.h"
 
+/* To check charset of the message and add the appropiate MIME headers  */
+static char nutf8;
+static int not_utf8(FILE *s, int len);
+
 static volatile sig_atomic_t sendsignal;       /* Interrupted by a signal? */
 
 /*
@@ -341,6 +345,11 @@ mail1(struct header *hp, int printheader
                else
                        puts("Null message body; hope that's ok");
        }
+
+       /* Check non valid UTF-8 characters in the message */
+       nutf8 = not_utf8(mtf, fsize(mtf));
+       rewind(mtf);
+
        /*
         * Now, take the user names from the combined
         * to and cc lists and do all the alias
@@ -525,6 +534,14 @@ puthead(struct header *hp, FILE *fo, int
                fmt("To:", hp->h_to, fo, w&GCOMMA), gotcha++;
        if (hp->h_subject != NULL && w & GSUBJECT)
                fprintf(fo, "Subject: %s\n", hp->h_subject), gotcha++;
+       if (nutf8 == 0)
+               fprintf(fo, "MIME-Version: 1.0\n"
+                   "Content-Type: text/plain; charset=us-ascii\n"
+                   "Content-Transfer-Encoding: 7bit\n"), gotcha++;
+       else if (nutf8 == 1)
+               fprintf(fo, "MIME-Version: 1.0\n"
+                   "Content-Type: text/plain; charset=utf-8\n"
+                   "Content-Transfer-Encoding: 8bit\n"), gotcha++;
        if (hp->h_cc != NULL && w & GCC)
                fmt("Cc:", hp->h_cc, fo, w&GCOMMA), gotcha++;
        if (hp->h_bcc != NULL && w & GBCC)
@@ -609,4 +626,60 @@ sendint(int s)
 {
 
        sendsignal = s;
+}
+
+/* Search non valid UTF-8 characters in the message */
+static int
+not_utf8(FILE *message, int len)
+{
+       int i, n, nonascii;
+       char c;
+       unsigned char s[len + 1];
+
+       i = 0;
+        while ((c = getc(message)) != EOF)
+               s[i++] = c;
+
+       s[i] = '\0';
+
+       i = n = nonascii = 0;
+       while (i != len)
+               if (s[i] <= 0x7f) {
+                       i++;
+               /* Two bytes case */
+               } else if (s[i] >= 0xc2 && s[i] < 0xe0 &&
+                   s[i + 1] >= 0x80 && s[i + 1] <= 0xbf) {
+                       i += 2;
+                       nonascii++;
+               /* Special three bytes case */
+               } else if ((s[i] == 0xe0 &&
+                   s[i + 1] >= 0xa0 && s[i + 1] <= 0xbf &&
+                   s[i + 2] >= 0x80 && s[i + 2] <= 0xbf) ||
+                   /* Three bytes case */
+                   (s[i] > 0xe0 && s[i] < 0xf0 &&
+                   s[i + 1] >= 0x80 && s[i + 1] <= 0xbf &&
+                   s[i + 2] >= 0x80 && s[i + 2] <= 0xbf)) {
+                       i += 3;
+                       nonascii++;
+               /* Special four bytes case */
+               } else if ((s[i] == 0xf0 &&
+                   s[i + 1] >= 0x90 && s[i + 1] <= 0xbf &&
+                   s[i + 2] >= 0x80 && s[i + 2] <= 0xbf &&
+                   s[i + 3] >= 0x80 && s[i + 3] <= 0xbf) ||
+                   /* Four bytes case */
+                   (s[i] > 0xf0 &&
+                   s[i + 1] >= 0x80 && s[i + 1] <= 0xbf &&
+                   s[i + 2] >= 0x80 && s[i + 2] <= 0xbf &&
+                   s[i + 3] >= 0x80 && s[i + 3] <= 0xbf)) {
+                       i += 4;
+                       nonascii++;
+               } else {
+                       n = i + 1;
+                       break;
+               }
+
+       if (nonascii)
+               n++;
+
+       return n;
 }


-- 
Walter

Reply via email to