On Wed, Sep 20, 2023 at 08:36:23PM +0200, Walter Alejandro Iglesias wrote:
> On Wed, Sep 20, 2023 at 07:44:12PM +0200, Walter Alejandro Iglesias wrote:
> > And this new idea simplifies all to this:
> 
> In case anyone else is worried: Crystal Kolipe has already pointed out
> to me that better UTF-8 checking is needed.  I know, and I'll get to
> that tomorrow.

The following version checks for invalid UTF-8 characters.  I could
make it fail in this case and save a dead.letter, but I imagine that
those who really use mail(1) mostly do so on a tty console where, at
least with a non-US keyboard, it is too easy to type an invalid UTF-8
character.  Such a feature would be more of a hassle than a help, so I
chose to simply skip adding any MIME headers in this case (which is how
mail(1) has behaved until now, and no one complained :-)).  If you
prefer the other behavior, let me know.


Index: send.c
===================================================================
RCS file: /cvs/src/usr.bin/mail/send.c,v
retrieving revision 1.26
diff -u -p -r1.26 send.c
--- send.c      8 Mar 2023 04:43:11 -0000       1.26
+++ send.c      21 Sep 2023 08:40:11 -0000
@@ -33,6 +33,15 @@
 #include "rcv.h"
 #include "extern.h"
 
+/*
+ * Character-set state of the message body, set in mail1() and read by
+ * puthead() to choose MIME headers; int so checker results never truncate.
+ */
+static int nascii = 0;
+static int nutf8 = 0;
+static int not_ascii(FILE *s);
+static int not_utf8(FILE *s, int len);
+
 static volatile sig_atomic_t sendsignal;       /* Interrupted by a signal? */
 
 /*
@@ -341,6 +350,15 @@ mail1(struct header *hp, int printheader
                else
                        puts("Null message body; hope that's ok");
        }
+
+       /* Check for non ASCII characters in the message */
+       nascii = not_ascii(mtf);
+       rewind(mtf);
+
+       /* Check for non valid UTF-8 characters in the message */
+       nutf8 = not_utf8(mtf, fsize(mtf));
+       rewind(mtf);
+
        /*
         * Now, take the user names from the combined
         * to and cc lists and do all the alias
@@ -525,6 +543,14 @@ puthead(struct header *hp, FILE *fo, int
                fmt("To:", hp->h_to, fo, w&GCOMMA), gotcha++;
        if (hp->h_subject != NULL && w & GSUBJECT)
                fprintf(fo, "Subject: %s\n", hp->h_subject), gotcha++;
+       if (!nascii)
+               fprintf(fo, "MIME-Version: 1.0\n"
+                       "Content-Type: text/plain; charset=us-ascii\n"
+                       "Content-Transfer-Encoding: 7bit\n"), gotcha++;
+       else if (nutf8 == 0)
+               fprintf(fo, "MIME-Version: 1.0\n"
+                       "Content-Type: text/plain; charset=utf-8\n"
+                       "Content-Transfer-Encoding: 8bit\n"), gotcha++;
        if (hp->h_cc != NULL && w & GCC)
                fmt("Cc:", hp->h_cc, fo, w&GCOMMA), gotcha++;
        if (hp->h_bcc != NULL && w & GBCC)
@@ -609,4 +635,67 @@ sendint(int s)
 {
 
        sendsignal = s;
+}
+
+/* Return 1 if any byte from the current position of s is non ASCII, else 0 */
+static int
+not_ascii(FILE *s)
+{
+       int ch;
+
+       /* Stop at the first hit; the caller rewinds the stream anyway */
+       while ((ch = getc(s)) != EOF)
+               if (ch > 0x7f)
+                       return 1;
+       return 0;
+}
+
+/*
+ * Check whether the rest of the stream is well-formed UTF-8 (RFC 3629).
+ * Returns 0 if it is, 1 at the first invalid or truncated sequence;
+ * overlong forms, UTF-16 surrogates and values above U+10FFFF are
+ * rejected.  Bytes are validated as they are read, so no buffer of the
+ * message is needed; len is unused and kept only for the caller.
+ */
+static int
+not_utf8(FILE *message, int len)
+{
+       int c, lo, hi, need;
+
+       (void)len;
+       /* c must be an int: a char cannot tell byte 0xff from EOF */
+       while ((c = getc(message)) != EOF) {
+               if (c <= 0x7f)
+                       continue;
+               /* Range allowed for the first continuation byte */
+               lo = 0x80;
+               hi = 0xbf;
+               if (c >= 0xc2 && c <= 0xdf)
+                       need = 1;
+               else if (c >= 0xe0 && c <= 0xef) {
+                       need = 2;
+                       if (c == 0xe0)
+                               lo = 0xa0;      /* overlong */
+                       else if (c == 0xed)
+                               hi = 0x9f;      /* surrogates */
+               } else if (c >= 0xf0 && c <= 0xf4) {
+                       need = 3;
+                       if (c == 0xf0)
+                               lo = 0x90;      /* overlong */
+                       else if (c == 0xf4)
+                               hi = 0x8f;      /* above U+10FFFF */
+               } else
+                       return 1;       /* invalid lead byte */
+
+               while (need-- > 0) {
+                       c = getc(message);
+                       if (c == EOF || c < lo || c > hi)
+                               return 1;
+                       /* Later continuation bytes use the full range */
+                       lo = 0x80;
+                       hi = 0xbf;
+               }
+       }
+
+       return 0;
 }


-- 
Walter

Reply via email to