Hi, folks.  I finally bit the bullet last Friday and developed some code
for the external converters idea that had been tossed around before,
and has been rattling around in my brain off and on since.  This was
precipitated by a couple of e-mail messages I received in the past couple
of weeks: first a request for an external parser for Excel spreadsheets,
then an announcement for an Excel to HTML converter, xlHtml.

Rather than coming up with a whole new C++ class, and another config
attribute, and figuring how to tie it all into the Retriever and Document
classes, I opted for a simpler approach, which I hope you won't think is
too much of a kludge.  I designed it as an extension to the ExternalParser
class, which kept the changes localized to one source file.  A lot of the
work was already done for me as well, so it's a fairly simple change.

The only thing I'm not wild about is the fact I needed to duplicate
some of the Parsable selection code from the Document class, because
the parser isn't given the Document object (and because I can't delete
the current ExternalParser object from within the method that's working
on it).  Not a big deal, though.

Please have a look, give it a try, and let me know what you think.

The way it works is that, instead of specifying just a single content-type
as the first string of a pair in the external_parsers attribute, you specify
two types as type1->type2, as one string with no spaces.  The second string
of the pair then defines an external converter rather than an external
parser, to convert the first type to the second.

E.g.:
  external_parsers: application/pdf /usr/local/bin/parse_doc.pl \
                    application/msword->text/html /usr/local/bin/wordtohtml

If the second type is "user-defined", then it's up to the converter script
to put out a "Content-Type: text/foo" header followed by a blank line,
to indicate to htdig what type it should expect for the output, sort of
like what a CGI script would do.

E.g.:
  external_parsers: application/msword->user-defined /usr/local/bin/mswordconv \
                    application/x-gunzip->user-defined /usr/local/bin/ungzipper

  where mswordconv would output a "Content-Type: text/html" header,
  a blank line, then the HTML output of mswordview, and ungzipper would
  gunzip its input, determine the type, and output the appropriate header
  before outputting the unzipped output.

I'd also eventually like to add a second type of "magic", which would
make htdig determine the content-type by looking at the start of the
script's output.  If anyone wants to develop a function to do that
(or "borrow" it from Apache's mod_mime_magic), it'd be a big help.

Here's my patch, too late for the feature freeze of course, unless you
vote it in.  (It's a patch for 3.1.3, but should apply to 3.2, if you
change the unlink() call in the last hunk to match the 3.2 source.)

--- htdig-3.1.3/htdig/ExternalParser.cc.noconv  Wed Sep 22 11:18:40 1999
+++ htdig-3.1.3/htdig/ExternalParser.cc Tue Oct 19 16:40:09 1999
@@ -11,6 +11,9 @@ static char RCSid[] = "$Id: ExternalPars
 #endif
 
 #include "ExternalParser.h"
+#include "HTML.h"
+#include "Plaintext.h"
+#include "PDF.h"
 #include "htdig.h"
 #include "htString.h"
 #include "QuotedStringList.h"
@@ -21,6 +24,7 @@ static char RCSid[] = "$Id: ExternalPars
 #include "good_strtok.h"
 
 static Dictionary      *parsers = 0;
+static Dictionary      *toTypes = 0;
 extern String          configFile;
 
 //*****************************************************************************
@@ -88,13 +92,25 @@ ExternalParser::canParse(char *contentTy
     if (!parsers)
     {
        parsers = new Dictionary();
+       toTypes = new Dictionary();
        
        QuotedStringList        qsl(config["external_parsers"], " \t");
+       String                  from, to;
        int                     i;
+       int                     sep;
 
        for (i = 0; qsl[i]; i += 2)
        {
-           parsers->Add(qsl[i], new String(qsl[i + 1]));
+           from = qsl[i];
+           to = "";
+           sep = from.indexOf("->");
+           if (sep != -1)
+           {
+               to = from.sub(sep+2).get();
+               from = from.sub(0, sep).get();
+           }
+           parsers->Add(from, new String(qsl[i + 1]));
+           toTypes->Add(from, new String(to));
        }
     }
     return parsers->Exists(contentType);
@@ -150,8 +166,45 @@ ExternalParser::parse(Retriever &retriev
     char       *token1, *token2, *token3;
     int                loc, hd;
     URL                url;
+    String     convertToType = ((String *)toTypes->Find(contentType))->get();
+    int                get_hdr = (mystrcasecmp(convertToType, "user-defined") == 0);
+    int                get_file = (convertToType.length() != 0);
+    String     newcontent;
     while (readLine(input, line))
     {
+       if (get_hdr)
+       {
+           line.chop('\r');
+           if (line.length() == 0)
+               get_hdr = FALSE;
+           else if (mystrncasecmp(line, "content-type:", 13) == 0)
+           {
+               token1 = line.get() + 13;
+               while (*token1 && isspace(*token1))
+                   token1++;
+               token1 = strtok(token1, "\n\t");
+               convertToType = token1;
+           }
+           continue;
+       }
+       if (get_file)
+       {
+           if (newcontent.length() == 0 &&
+               !canParse(convertToType) &&
+               mystrncasecmp(convertToType, "text/", 5) != 0 &&
+               mystrncasecmp(convertToType, "application/pdf", 15) != 0)
+           {
+               if (mystrcasecmp(convertToType, "user-defined") == 0)
+                   cerr << "External parser error: no Content-Type given\n";
+               else
+                   cerr << "External parser error: can't parse Content-Type \""
+                        << convertToType << "\"\n";
+               cerr << " URL: " << base.get() << "\n";
+               break;
+           }
+           newcontent << line << '\n';
+           continue;
+       }
        token1 = strtok(line, "\t");
        if (token1 == NULL)
            token1 = "";
@@ -340,6 +393,50 @@ ExternalParser::parse(Retriever &retriev
     }
     pclose(input);
     unlink(path);
+
+    if (newcontent.length() > 0)
+    {
+       static HTML                     *html = 0;
+       static Plaintext                *plaintext = 0;
+       static PDF                      *pdf = 0;
+       Parsable                        *parsable = 0;
+
+       contentType = convertToType;
+       if (canParse(contentType))
+       {
+           currentParser = ((String *)parsers->Find(contentType))->get();
+           parsable = this;
+       }
+       else if (mystrncasecmp(contentType, "text/html", 9) == 0)
+       {
+           if (!html)
+               html = new HTML();
+           parsable = html;
+       }
+       else if (mystrncasecmp(contentType, "text/plain", 10) == 0)
+       {
+           if (!plaintext)
+               plaintext = new Plaintext();
+           parsable = plaintext;
+       }
+       else if (mystrncasecmp(contentType, "application/pdf", 15) == 0)
+       {
+           if (!pdf)
+               pdf = new PDF();
+           parsable = pdf;
+       }
+       else
+       {
+           if (!plaintext)
+               plaintext = new Plaintext();
+           parsable = plaintext;
+           if (debug)
+               cout << "External parser error: \"" << contentType <<
+                       "\" not a recognized type.  Assuming text\n";
+       }
+       parsable->setContents(newcontent.get(), newcontent.length());
+       parsable->parse(retriever, base);
+    }
 }
 
 

-- 
Gilles R. Detillieux              E-mail: <[EMAIL PROTECTED]>
Spinal Cord Research Centre       WWW:    http://www.scrc.umanitoba.ca/~grdetil
Dept. Physiology, U. of Manitoba  Phone:  (204)789-3766
Winnipeg, MB  R3E 3J7  (Canada)   Fax:    (204)789-3930

------------------------------------
To unsubscribe from the htdig3-dev mailing list, send a message to
[EMAIL PROTECTED] containing the single word "unsubscribe" in
the SUBJECT of the message.

Reply via email to