HTML demangler for mailing lists

Tim Pierce Sun, 6 Jun 1999 03:49:26 -0700

There's considerable demand for a tool that will strip out the HTML
portions of multipart messages and leave only text/plain, but as far
as I can tell, no such tool exists.  So I wrote one.

This program reads a message on standard input and prints a demangled
version on standard output.  If the message has a content-type of
`multipart/alternative', the body is discarded and replaced with the
first text/plain subpart that can be found.  If the message isn't
multipart/alternative, or if it contains no text/plain subparts, the
original message is passed through unmodified.  For example, it can
easily be hooked into SmartList in rc.local.s00 (and .r00):

  :0 wf
  * ^Content-Type: multipart
  | unhtml

It is not guaranteed to be bug-free and should be considered beta
software, at best.  For what it's worth, it's been running in
production on RootsWeb's mail servers for several days (an excellent
torture test) and the initial bugs seem to have been shaken out.

Anyway, I give it unto the world for them what wants it.  Corrections
and fixes welcomed.

-- 
Regards,
Tim Pierce
RootsWeb Genealogical Data Cooperative
system obfuscator and hack-of-all-trades

/*
 * unhtml: parse a MIME multipart message and, if `multipart/alternative',
 * discard all but the `text/plain' part.
 *
 * This code is in the public domain except where noted otherwise.
 *
 * Tim Pierce <[EMAIL PROTECTED]>
 * 3 June 1999
 */

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>

/*
 * A node in the singly linked list of sub-parts attached to a message
 * (see the `parts' field of struct message).
 */
struct list {
  struct message *data;   /* the sub-part held by this node */
  struct list *next;      /* following node, or NULL at the end of the list */
};

/*
 * The `message' struct is used to represent a MIME multipart message,
 * as defined in RFC 2045 and RFC 2046.
 *
 * The `header' and `preamble' fields are flat text arrays containing
 * the raw text of the message header and preamble (any text preceding
 * the first body part).  In the case of a single-part message, the
 * `preamble' field is used to store the whole body.
 *
 * The `epilogue' field stores any text following the last body part.
 *
 * The `content_type' field contains the value of the message's
 * Content-Type header as returned by getheader, i.e. the media type
 * together with any parameters that follow it (mime_parse searches
 * this same string for the `boundary' parameter).  It is NULL if the
 * message lacks a Content-Type header.  The `boundary' field contains
 * the value of the `boundary' parameter to the Content-Type field, if
 * present.
 *
 * The `is8bit' field is set by mime_decode_qp when decoding produces
 * a byte greater than 0x7f.
 *
 * The `hsize' field is set by mime_parse to the number of header
 * bytes it copied.  The `bsize' field is never assigned by the code
 * in this file.  Neither value is read anywhere and both may be
 * discarded.
 *
 * The `parts' field is the list of sub-parts of a multipart message,
 * or the single encapsulated message of a message/rfc822; it is NULL
 * when there are none.
 */

struct message {
  char *header;
  char *preamble;       /* text (if any) preceding the first body-part */
  char *epilogue;       /* text (if any) following the last body-part */
  char *content_type;
  char *boundary;
  int is8bit;           /* does this message contain 8bit data? */
  int hsize;    /* bytes allocated for hdr (may be more than necessary) */
  int bsize;    /* bytes allocated for body (may be more than necessary) */
  struct list *parts;
};

/* Forward declarations of the functions defined later in this file. */

/* Parsing and output of whole messages. */
struct message *mime_parse (char *body);
void mime_write (struct message *msg);
/* Diagnostics, written to standard error. */
void mime_warn (char *s);
void mime_fatal (char *s);
void mime_destroy (struct message *msg);
/* Helpers used while parsing. */
int check_msgtype (struct message *msg, char *type);
void getheader (struct message *msg, char *hdrname, char **hdrvalp, int *hdrlen);
char *next_boundary (char *body, char *boundary);
int add_subpart (struct message *msg, struct message *part);
void print_msg_info (struct message *msg, int indent);
/* Content-Transfer-Encoding handling. */
void mime_decode_qp (struct message *msg);
void mime_decode_b64 (char *buf);
void mime_output_qp (char *text);

/*
 * write_rewritten_header: copy HEADER to standard output one header
 *      field at a time, dropping Content-Length and
 *      Content-Transfer-Encoding and replacing Content-Type.
 *
 * The HEADER argument is the raw header text of the enclosing message.
 * The NEW_TYPE argument is the Content-Type value to emit in place of
 *      the original one; if it is NULL the Content-Type field is simply
 *      dropped.
 *
 * A field consists of its first line plus any continuation lines
 * (lines beginning with a space or tab).  Working field by field means
 * that consecutive fields and a field on the very first line are all
 * handled the same way.
 */

static void
write_rewritten_header (char *header, char *new_type)
{
  char *line = header;

  while (*line != '\0')
    {
      char *end = line;

      /* Find the end of this field, including folded lines. */
      do
        {
          end = strchr (end, '\n');
          if (end == NULL)
            end = line + strlen (line);
          else
            ++end;
        }
      while (*end == ' ' || *end == '\t');

      if (!strncasecmp (line, "Content-Length:", 15) ||
          !strncasecmp (line, "Content-Transfer-Encoding:", 26))
        {
          /* These no longer describe the body we are about to write. */
        }
      else if (!strncasecmp (line, "Content-Type:", 13))
        {
          if (new_type != NULL)
            printf ("Content-Type: %s\n", new_type);
        }
      else
        fwrite (line, 1, (size_t) (end - line), stdout);

      line = end;
    }
}

/*
 * main: read a message from standard input (or from the file named by
 *      the first argument), and write a demangled copy to standard
 *      output.
 *
 * If the message is multipart/alternative and has a sub-part that is
 * text/plain (or has no Content-Type at all), the header is rewritten
 * and only that sub-part's body is written.  In every other case,
 * including a message that cannot be parsed, the input is written back
 * unmodified.
 *
 * Returns 0 on success; exits with status 1 if the input cannot be
 * opened or read, or if memory runs out while reading it.
 */

int
main (int argc, char **argv)
{
  enum { READ_CHUNK = 1000 };
  struct message *msg;
  struct list *part;
  char *body;
  size_t used = 0;
  size_t cap = 2 * READ_CHUNK;

  if (argc > 1)
    {
      if (freopen (argv[1], "r", stdin) == NULL)
        {
          perror ("unhtml: freopening standard input");
          exit (1);
        }
    }

  /* Read the message. */
  body = malloc (cap);
  if (body == NULL)
    {
      perror ("unhtml: could not malloc memory for body");
      exit (1);
    }

  for (;;)
    {
      size_t got;

      /* Keep room for one more chunk plus the terminating NUL. */
      if (cap - used < READ_CHUNK + 1)
        {
          char *bigger = realloc (body, cap * 2);
          if (bigger == NULL)
            {
              perror ("unhtml: could not realloc memory for body");
              free (body);
              exit (1);
            }
          body = bigger;
          cap *= 2;
        }

      /* fread only returns a short count at end of file or on error,
         so a slow pipe cannot truncate the message. */
      got = fread (body + used, 1, READ_CHUNK, stdin);
      used += got;
      if (got < READ_CHUNK)
        break;
    }
  if (ferror (stdin))
    {
      perror ("unhtml: error reading standard input");
      free (body);
      exit (1);
    }
  body[used] = '\0';

  /* mime_parse returns NULL for a message it cannot make sense of;
     such a message falls through to the pass-through code below. */
  msg = mime_parse (body);

  /*
   * Here's the important stuff: walk through the parts of a
   * multipart/alternative and look for a text/plain attachment.
   * If we find one, rewrite the headers of the parent message
   * and output the text/plain body.
   */
  if (msg != NULL && msg->content_type != NULL &&
      !strncasecmp (msg->content_type, "multipart/alternative", 21))
    {
      for (part = msg->parts; part != NULL; part = part->next)
        {
          struct message *sub = part->data;

          if (sub == NULL)
            continue;

          /* A part without a Content-Type is treated as text/plain. */
          if (sub->content_type == NULL ||
              !strncasecmp (sub->content_type, "text/plain", 10))
            {
              write_rewritten_header (msg->header, sub->content_type);
              putchar ('\n');
              /* puts supplies the newline that preceded the boundary
                 line and was cut off when the part was parsed. */
              puts (sub->preamble != NULL ? sub->preamble : "");
              return 0;
            }
        }
    }

  /*
   * If we got here, either the message wasn't multipart/alternative,
   * it didn't have a text/plain component, or it could not be parsed.
   * In every case we give up and write the original message to stdout.
   */
  fputs (body, stdout);
  return 0;
}

/*
 * cte_matches: return nonzero if the LEN-byte header value at VALUE is
 *      exactly NAME, ignoring case.
 */

static int
cte_matches (char *value, int len, char *name)
{
  return (size_t) len == strlen (name) &&
         strncasecmp (value, name, (size_t) len) == 0;
}

/*
 * mime_parse: process an RFC 2046 multipart message and return
 *      a message structure with all the necessary fields filled in.
 *
 * The BODY argument is a NUL-terminated character array containing the
 * raw text of the message to be parsed.  While sub-parts are being
 * parsed a byte of BODY is temporarily overwritten, so BODY must be
 * writable; it is restored before mime_parse returns.
 *
 * On a MIME parsing error, or if memory runs out after the message
 * structure itself has been allocated, a message is printed on
 * standard error, everything built so far is handed to mime_destroy
 * and the return value is NULL.  If the message structure itself
 * cannot be allocated the program exits with status 1.
 *
 * For a message/rfc822 the single entry in `parts' holds the parsed
 * encapsulated message; its `data' pointer is NULL if that inner
 * message could not be parsed.
 */

struct message *
mime_parse (char *body)
{
  char *p, *bodyp;
  int len;
  struct message *msg;

  /* Initialize the message. */
  msg = malloc (sizeof *msg);
  if (msg == NULL)
    {
      perror ("unhtml: mime_parse could not malloc new message struct");
      exit (1);
    }
  msg->header = NULL;
  msg->preamble = NULL;
  msg->epilogue = NULL;
  msg->content_type = NULL;
  msg->boundary = NULL;
  msg->parts = NULL;
  msg->is8bit = 0;
  msg->hsize = 0;
  msg->bsize = 0;

  /* Get the header. */
  /* Special case for message with zero-length header. */
  if (body[0] == '\n')
    {
      msg->header = malloc (1);
      if (msg->header == NULL)
        {
          perror ("unhtml: mime_parse could not malloc header buffer");
          mime_destroy (msg);
          return NULL;
        }
      msg->header[0] = '\0';
      bodyp = body + 1;
    }
  else
    {
      bodyp = strstr (body, "\n\n");
      if (bodyp == NULL)
        {
          mime_fatal ("no message header found");
          mime_destroy (msg);
          return NULL;
        }
      /* Keep the newline that ends the last header line. */
      msg->hsize = (int) (bodyp - body) + 1;
      msg->header = malloc ((size_t) msg->hsize + 1);
      if (msg->header == NULL)
        {
          perror ("unhtml: mime_parse could not malloc header buffer");
          mime_destroy (msg);
          return NULL;
        }
      memcpy (msg->header, body, (size_t) msg->hsize);
      msg->header[msg->hsize] = '\0';
      bodyp += 2;
    }

  /* Find the content-type. */
  getheader (msg, "Content-Type", &p, &len);
  if (p != NULL)
    {
      msg->content_type = malloc ((size_t) len + 1);
      if (msg->content_type == NULL)
        {
          perror ("unhtml: mime_parse could not malloc Content-Type buffer");
          mime_destroy (msg);
          return NULL;
        }
      memcpy (msg->content_type, p, (size_t) len);
      msg->content_type[len] = '\0';
    }

  /*
   * If this is a message/rfc822, then the body is an encapsulated
   * message.  Parse it, attach the result to the current message,
   * and we're done.  The type is matched by prefix, as mime_write
   * does, because content_type also holds any parameters.
   */
  if (check_msgtype (msg, "message/rfc822"))
    {
      msg->parts = malloc (sizeof *msg->parts);
      if (msg->parts == NULL)
        {
          perror ("unhtml: mime_parse could not malloc attachment list");
          mime_destroy (msg);
          return NULL;
        }
      msg->parts->next = NULL;
      msg->parts->data = mime_parse (bodyp);    /* may be NULL */
      return msg;
    }

  /* Find the message boundary. */
  if (check_msgtype (msg, "multipart/"))
    {
      /* Skip to next semicolon and see what keyword follows it. */
      p = msg->content_type;
      while ((p = strchr (p, ';')) != NULL)
        {
          ++p;
          p += strspn (p, " \t\v\r\n");
          if (strncasecmp (p, "boundary", 8) == 0)
            {
              char *dest;
              p += 8 + strspn (p+8, " \t\v\r\n");
              if (*p++ != '=')
                {
                  mime_fatal ("expected `=' after `boundary' parameter");
                  mime_destroy (msg);
                  return NULL;
                }
              p += strspn (p, " \t\v\r\n");
              /* The value is at most as long as the rest of the
                 header; the extra byte holds the terminating NUL. */
              dest = msg->boundary = malloc (strlen (p) + 1);
              if (dest == NULL)
                {
                  perror ("unhtml: mime_parse could not malloc boundary");
                  mime_destroy (msg);
                  return NULL;
                }
              /* If next char is a quote, read the following quoted-string. */
              if (*p == '"')
                {
                  ++p;
                  while (*p != '\0' && *p != '"')
                    {
                      /* A backslash quotes the next character, unless
                         it is the last character of the header. */
                      if (*p == '\\' && p[1] != '\0')
                        ++p;
                      *dest++ = *p++;
                    }
                }
              else      /* Generic non-special characters. */
                {
                  while (*p != '\0' && !isspace ((unsigned char) *p) &&
                         !strchr ("()<>@,;:\\\"/[]?=", *p))
                    *dest++ = *p++;
                }
              *dest = '\0';
              break;
            }
        }
      /* An empty boundary cannot delimit anything, so treat it the
         same as a missing one. */
      if (msg->boundary == NULL || msg->boundary[0] == '\0')
        {
          mime_fatal ("Content-Type lacks required `boundary' parameter");
          mime_destroy (msg);
          return NULL;
        }
    }

  /* Break up multiparts. */
  if (check_msgtype (msg, "multipart/"))
    {
      size_t blen = strlen (msg->boundary);
      char *rest = NULL;        /* where the epilogue starts, once known */

      /* Preamble. */
      p = next_boundary (bodyp, msg->boundary);
      if (p == NULL)
        {
          msg->preamble = strdup (bodyp);
          if (msg->preamble == NULL)
            {
              perror ("unhtml: mime_parse could not malloc preamble buffer");
              mime_destroy (msg);
              return NULL;
            }
        }
      else
        {
          /* P points just past the boundary text, which is preceded by
             a newline and two dashes that are not part of the preamble. */
          long psize = (long) (p - bodyp) - (long) blen - 3;

          /* Special case: a boundary line may occur at the very beginning
             of the body, which means that no newline precedes it and psize
             is negative. */
          if (psize < 0)
            psize = 0;
          msg->preamble = malloc ((size_t) psize + 1);
          if (msg->preamble == NULL)
            {
              perror ("unhtml: mime_parse could not malloc preamble buffer");
              mime_destroy (msg);
              return NULL;
            }
          memcpy (msg->preamble, bodyp, (size_t) psize);
          msg->preamble[psize] = '\0';
        }

      /* Scan to each boundary and parse the body part contained therein.
         A boundary followed by two dashes is the closing delimiter. */
      while (p != NULL && *p != '\0' && strncmp (p, "--", 2) != 0)
        {
          char *nextpart, *part_end;
          struct message *part;

          /* Move to the line after the boundary line without ever
             stepping past the end of the buffer. */
          while (*p != '\0' && *p != '\n')
            ++p;
          if (*p == '\n')
            ++p;

          nextpart = next_boundary (p, msg->boundary);
          if (nextpart == NULL)
            {
              char buf[512];
              snprintf (buf, sizeof buf, "no terminating `%s' boundary found",
                        msg->boundary);
              mime_warn (buf);
              /* Whatever is left is kept verbatim as the epilogue. */
              rest = p;
              break;
            }

          /* The part ends at the newline preceding the "--boundary". */
          part_end = nextpart - blen - 3;
          if (part_end >= p)
            {
              char saved = *part_end;

              /* XXX: Parsing a body part should not require munging
                 the buffer passed to mime_parse. */
              *part_end = '\0';
              part = mime_parse (p);
              *part_end = saved;
              if (part == NULL)
                {
                  mime_destroy (msg);
                  return NULL;
                }
              if (!add_subpart (msg, part))
                {
                  mime_destroy (part);
                  mime_destroy (msg);
                  return NULL;
                }
            }
          /* else: two boundary lines in a row, i.e. an empty part,
             which is skipped. */

          p = nextpart;
        }

      /* Get epilogue. */
      if (p != NULL)
        {
          if (rest == NULL)
            {
              /* Skip the remainder of the closing boundary line. */
              rest = p;
              while (*rest != '\0' && *rest != '\n')
                ++rest;
              if (*rest == '\n')
                ++rest;
            }
          if ((msg->epilogue = strdup (rest)) == NULL)
            {
              perror ("unhtml: mime_parse could not strdup message epilogue");
              mime_destroy (msg);
              return NULL;
            }
        }
    }
  else  /* not multipart */
    {
      msg->preamble = strdup (bodyp);
      if (msg->preamble == NULL)
        {
          perror ("unhtml: mime_parse could not strdup message body");
          mime_destroy (msg);
          return NULL;
        }
    }

  /* Decode the body (preamble), if appropriate. */
  getheader (msg, "Content-Transfer-Encoding", &p, &len);
  if (p != NULL)
    {
      /* Ignore trailing whitespace, then require the whole value to
         match so that an empty or partial value is not mistaken for
         a known encoding. */
      while (len > 0 && isspace ((unsigned char) p[len-1]))
        --len;

      if (cte_matches (p, len, "quoted-printable"))
        mime_decode_qp (msg);
      else if (cte_matches (p, len, "base64"))
        mime_decode_b64 (msg->preamble);
      else if (!cte_matches (p, len, "7bit") &&
               !cte_matches (p, len, "8bit") &&
               !cte_matches (p, len, "binary"))
        mime_warn ("unknown Content-Transfer-Encoding");
    }

  return msg;
}

/*
 * check_msgtype: report whether a message's Content-Type begins with
 *      a given string, ignoring case.
 *
 * The MSG argument is the message to examine.
 * The TYPE argument is the prefix to look for, e.g. "multipart/".
 *
 * Return 1 if MSG has a Content-Type and it starts with TYPE,
 * 0 otherwise.
 */

int
check_msgtype (msg, type)
     struct message *msg;
     char *type;
{
  size_t prefix_len;

  if (msg->content_type == NULL)
    return 0;

  prefix_len = strlen (type);
  if (strncasecmp (msg->content_type, type, prefix_len) != 0)
    return 0;

  return 1;
}

/*
 * getheader: examine a MIME message for a particular header, and
 *      record the location of that header's value (following the header name)
 *      and its length (excluding the trailing newline).
 *
 * The MSG argument is a message structure containing a parsed MIME message.
 * The HDRNAME argument is the name of the desired header, e.g. "Content-Type".
 * The HDRVALP argument stores a pointer to the beginning of the header
 *      contents, if that header is found in the message; otherwise it
 *      is set to NULL.
 * The HDRLEN argument stores the length of the header contents, or 0
 *      if the header is not found.
 *
 * The value pointer refers into MSG's own header text and is not
 * NUL-terminated at HDRLEN; a value folded over several lines is
 * returned with its embedded newlines.  Only the first matching
 * header is reported.
 */

void
getheader (struct message *msg, char *hdrname, char **hdrvalp, int *hdrlen)
{
  char *line;
  size_t namelen;

  *hdrvalp = NULL;
  *hdrlen = 0;

  namelen = strlen (hdrname);

  line = msg->header;
  while (line != NULL && *line != '\0')
    {
      /* The colon must follow the name directly; otherwise this is a
         different header whose name merely starts the same way, and
         the search moves on to the next line. */
      if (strncasecmp (line, hdrname, namelen) == 0 && line[namelen] == ':')
        {
          char *val = line + namelen + 1;
          int vlen;

          /* Skip leading whitespace, including the line break of a
             folded value, but stop at the newline that ends the field
             so an empty value does not swallow the next header. */
          for (;;)
            {
              if (*val == '\n')
                {
                  if (!isspace ((unsigned char) val[1]))
                    break;
                }
              else if (!isspace ((unsigned char) *val))
                break;
              ++val;
            }

          /* The value runs up to the first newline that is not
             followed by whitespace, or to the end of the header. */
          for (vlen = 0; val[vlen] != '\0'; ++vlen)
            {
              if (val[vlen] == '\n' && !isspace ((unsigned char) val[vlen+1]))
                break;
            }
          *hdrvalp = val;
          *hdrlen = vlen;
          return;
        }

      line = strchr (line, '\n');
      if (line != NULL)
        ++line;
    }
}

/*
 * next_boundary: find the next MIME multipart boundary in a message.
 *      The return value is a pointer to the end of the boundary text,
 *      or NULL if no boundary can be found in this message.
 *
 * The BODY argument is a character array containing the message body.
 *      It is assumed to begin at the start of a line.
 * The BOUNDARY argument is a character array containing the boundary
 *      delimiter.
 *
 * An occurrence of BOUNDARY counts as a delimiter only if it is
 * preceded by two dashes at the start of a line and followed by
 * whitespace, two dashes or the end of the text.  Occurrences that
 * fail this test (boundary text quoted in the body, or a longer
 * boundary of which this one is a prefix) are skipped and the search
 * continues.
 *
 * Because the return value points to the end of the boundary, it
 * will point to `\n' if this is an ordinary boundary or `--\n' if
 * it is a final boundary.
 */

char *
next_boundary (char *body, char *boundary)
{
  size_t blen = strlen (boundary);
  char *p;

  /* For efficiency reasons, look for the boundary first and then
     examine the characters around it. */

  for (p = strstr (body, boundary);
       p != NULL;
       p = (*p != '\0') ? strstr (p + 1, boundary) : NULL)
    {
      size_t offset = (size_t) (p - body);
      char *end = p + blen;
      int at_line_start;

      /* Never look at memory in front of BODY. */
      if (offset >= 3)
        at_line_start = (memcmp (p - 3, "\n--", 3) == 0);
      else if (offset == 2)
        at_line_start = (body[0] == '-' && body[1] == '-');
      else
        at_line_start = 0;

      if (at_line_start &&
          (*end == '\0' || isspace ((unsigned char) *end) ||
           (end[0] == '-' && end[1] == '-')))
        return end;
    }
  return NULL;
}

/*
 * add_subpart: attach a message to the end of another message's list
 *      of sub-parts.
 *
 * The argument MSG is the multipart message that receives the part.
 * The argument PART is the message (possibly itself multipart) to be
 *      stored in the new last node of MSG's `parts' list.
 *
 * Return 1 on success.  If the list node cannot be allocated, print a
 * diagnostic with perror, leave the list as it was and return 0.
 */

int
add_subpart (msg, part)
     struct message *msg;
     struct message *part;
{
  struct list **link;
  struct list *node;
  int list_was_empty;

  list_was_empty = (msg->parts == NULL);

  /* Walk to the NULL link that terminates the list. */
  link = &msg->parts;
  while (*link != NULL)
    link = &(*link)->next;

  node = (struct list *) malloc (sizeof(struct list));
  if (node == NULL)
    {
      if (list_was_empty)
        perror ("unhtml: add_subpart could not malloc attachment list");
      else
        perror ("unhtml: add_subpart could not malloc attachment buffer");
      return 0;
    }

  node->data = part;
  node->next = NULL;
  *link = node;

  return 1;
}

/*
 * mime_write: write a parsed message, including all of its sub-parts,
 *      to standard output.
 *
 * The MSG argument is the message to write; NULL is ignored.
 *
 * The header is written followed by a blank line.  For a
 * message/rfc822 the encapsulated message is then written in the same
 * way.  Otherwise the preamble is written, followed, when there are
 * sub-parts, by each sub-part behind a boundary line, the closing
 * boundary line and the epilogue.  Fields that are NULL (for example
 * the encapsulated message when mime_parse could not parse it) are
 * skipped.
 */

void
mime_write (struct message *msg)
{
  struct list *p;

  if (msg == NULL)
    return;

  if (msg->header)
    {
      fputs (msg->header, stdout);
      putc ('\n', stdout);
    }

  /* message/rfc822 needs special handling. */
  if (msg->content_type &&
      !strncasecmp (msg->content_type, "message/rfc822", 14))
    {
      if (msg->parts != NULL)
        mime_write (msg->parts->data);
      return;
    }

  /* XXX: watch out for 8bit data here. */
  if (msg->preamble != NULL)
    fputs (msg->preamble, stdout);

  if (msg->parts != NULL)
    {
      for (p = msg->parts; p != NULL; p = p->next)
        {
          printf ("\n--%s\n", msg->boundary);
          mime_write (p->data);
        }
      printf ("\n--%s--\n", msg->boundary);
      if (msg->epilogue != NULL)
        fputs (msg->epilogue, stdout);
    }
}


/*
 * mime_warn: print the warning text S on standard error, prefixed to
 *      identify it as a MIME parser warning and followed by a newline.
 */

void
mime_warn (s)
     char *s;
{
  FILE *out = stderr;

  (void) fprintf (out, "MIME parser: warning: %s\n", s);
}

/*
 * mime_fatal: print the error text S on standard error, prefixed to
 *      identify it as a fatal MIME parser error and followed by a
 *      newline.  The function only reports; it returns normally.
 */

void
mime_fatal (s)
     char *s;
{
  FILE *out = stderr;

  (void) fprintf (out, "MIME parser: fatal: %s\n", s);
}

/*
 * mime_destroy: release a message structure and everything it owns.
 *
 * The MSG argument is a message obtained from mime_parse (or built the
 * same way, with malloc); NULL is ignored.
 *
 * All text fields, every node of the `parts' list, every sub-part
 * (recursively) and finally MSG itself are freed, so MSG must not be
 * used or freed again after this call.
 */

void
mime_destroy (struct message *msg)
{
  struct list *node, *next;

  if (msg == NULL)
    return;

  free (msg->header);
  free (msg->preamble);
  free (msg->epilogue);
  free (msg->content_type);
  free (msg->boundary);

  for (node = msg->parts; node != NULL; node = next)
    {
      next = node->next;
      mime_destroy (node->data);
      free (node);
    }

  /* The structure itself was allocated by mime_parse. */
  free (msg);
}

/*
 * print_msg_info: debugging aid that prints the header, Content-Type
 *      and boundary of a message on standard output, then does the
 *      same for each of its sub-parts.
 *
 * The MSG argument is the message to describe; NULL is ignored.
 * The INDENT argument is the number of spaces to print in front of
 *      each label line; negative values are treated as 0.  Sub-parts
 *      are printed with four more spaces than their parent.
 *
 * The raw header text is printed as is, without indentation.  Fields
 * that are NULL are shown as "(none)".
 */

void
print_msg_info (struct message *msg, int indent)
{
  struct list *p;

  if (msg == NULL)
    return;
  if (indent < 0)
    indent = 0;

  /* "%*s" with an empty string prints exactly INDENT spaces, so no
     fixed-size indentation buffer is needed. */
  printf ("%*sHeader:\n", indent, "");
  printf ("%*s--BEGIN--\n", indent, "");
  printf ("%s\n", msg->header ? msg->header : "");
  printf ("%*s--END--\n", indent, "");
  printf ("%*sContent-Type: %s\n", indent, "",
          msg->content_type ? msg->content_type : "(none)");
  printf ("%*sBoundary: %s\n", indent, "",
          msg->boundary ? msg->boundary : "(none)");
  printf ("%*s----------------------------------------\n", indent, "");

  for (p = msg->parts; p != NULL; p = p->next)
    print_msg_info (p->data, indent + 4);
}

/*
 * hex2dec_char: convert one hexadecimal digit to its numeric value.
 *
 * The CH argument is the character to convert; both upper and lower
 * case letters are accepted.
 *
 * Return a value from 0 to 15, or -1 if CH is not a hexadecimal digit.
 */

int
hex2dec_char (char ch)
{
  if (ch >= '0' && ch <= '9')
    return ch - '0';
  if (ch >= 'A' && ch <= 'F')
    return ch - 'A' + 10;
  if (ch >= 'a' && ch <= 'f')
    return ch - 'a' + 10;
  return -1;
}

/*
 * mime_decode_qp: convert the preamble of MSG from a quoted-printable
 *      encoding to raw text.
 */
void
mime_decode_qp (msg)
     struct message *msg;
{
  unsigned char *src, *dst;

  dst = src = msg->preamble;
  while (*src != '\0')
    {
      if (*src == '=')
        {
          if (*++src == '\n')
            {
              ++src;
              continue;
            }
          else
            {
              int hi, lo;
              hi = hex2dec_char(*src++);
              lo = hex2dec_char(*src);
              *dst = hi*16 + lo;
              if (*dst > 0x7f)
                msg->is8bit = 1;
            }
        }
      else
        *dst = *src;
      ++dst, ++src;
    }
}

/*
 * mime_output_qp: placeholder for writing TEXT in quoted-printable
 *      form.  It is not implemented: the function does nothing and
 *      leaves TEXT untouched.
 */

void
mime_output_qp (text)
     char *text;
{
  /* XXX: write this. */
  (void) text;
}

/*
 * The char64 macro and `mime_decode_b64' routine are taken from
 * metamail 2.7, which is copyright (c) 1991 Bell Communications
 * Research, Inc. (Bellcore).  The following license applies to all
 * code below this point:
 *
 * Permission to use, copy, modify, and distribute this material 
 * for any purpose and without fee is hereby granted, provided 
 * that the above copyright notice and this permission notice 
 * appear in all copies, and that the name of Bellcore not be 
 * used in advertising or publicity pertaining to this 
 * material without the specific, prior written permission 
 * of an authorized representative of Bellcore.  BELLCORE 
 * MAKES NO REPRESENTATIONS ABOUT THE ACCURACY OR SUITABILITY 
 * OF THIS MATERIAL FOR ANY PURPOSE.  IT IS PROVIDED "AS IS", 
 * WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES.
 */

/*
 * index_64: value of each ASCII character in the base64 alphabet, or
 * -1 for characters that are not part of it.  The element type is
 * explicitly signed so that -1 survives on platforms where plain
 * `char' is unsigned.
 */
static const signed char index_64[128] = {
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, -1,-1,-1,63,
    52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1,-1,-1,-1,
    -1, 0, 1, 2,  3, 4, 5, 6,  7, 8, 9,10, 11,12,13,14,
    15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1,
    -1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
    41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1
};

/* char64: base64 value of character C, or -1 if it has none. */
#define char64(c)  (((c) < 0 || (c) > 127) ? -1 : index_64[(c)])

/*
 * b64_next: return the next significant character of base64 input.
 *
 * The SRCP argument points to the current read position, which is
 * advanced past the character returned.  Whitespace and every other
 * character outside the base64 alphabet are skipped; `=' is returned
 * because it marks padding.  At the end of the text the return value
 * is '\0' and the read position stays on the terminator, so calling
 * again is safe.
 */
static int
b64_next (char **srcp)
{
  char *s = *srcp;

  while (*s != '\0')
    {
      int c = *s++;

      if (c == '=' || char64 (c) >= 0)
        {
          *srcp = s;
          return c;
        }
    }
  *srcp = s;
  return '\0';
}

/*
 * mime_decode_b64: decode base64 text in place.
 *
 * The SRC argument is a NUL-terminated buffer holding the encoded
 * text; on return it holds the decoded bytes followed by a NUL.
 * NULL is ignored.  The decoded data is never longer than the input.
 *
 * Decoding stops at the first padding character.  If the text ends in
 * the middle of a four-character group, a warning is printed on
 * standard error and the complete groups decoded so far are kept.
 */
void
mime_decode_b64 (char *src)
{
  char *dst;
  int c1, c2, c3, c4;

  if (src == NULL)
    return;

  dst = src;
  while ((c1 = b64_next (&src)) != '\0')
    {
      c2 = b64_next (&src);
      c3 = b64_next (&src);
      c4 = b64_next (&src);
      if (c2 == '\0' || c3 == '\0' || c4 == '\0')
        {
          fprintf (stderr, "Warning: base64 decoder saw premature EOF!\n");
          break;
        }

      /* Padding in the first two positions carries no data. */
      if (c1 == '=' || c2 == '=')
        break;

      c1 = char64 (c1);
      c2 = char64 (c2);
      *dst++ = (char) ((c1 << 2) | ((c2 & 0x30) >> 4));
      if (c3 == '=')
        break;

      c3 = char64 (c3);
      *dst++ = (char) (((c2 & 0xF) << 4) | ((c3 & 0x3C) >> 2));
      if (c4 == '=')
        break;

      c4 = char64 (c4);
      *dst++ = (char) (((c3 & 0x03) << 6) | c4);
    }
  *dst = '\0';
}

HTML demangler for mailing lists

Reply via email to