Hi, folks. I finally bit the bullet last Friday and developed some code
for the external converters idea that had been tossed around before,
and has been rattling around in my brain off and on since. This was
precipitated by a couple of e-mail messages I received in the past couple
of weeks: first a request for an external parser for Excel spreadsheets,
then an announcement for an Excel to HTML converter, xlHtml.
Rather than coming up with a whole new C++ class, and another config
attribute, and figuring how to tie it all into the Retriever and Document
classes, I opted for a simpler approach, which I hope you won't think is
too much of a kludge. I designed it as an extension to the ExternalParser
class, which kept the changes localized to one source file. A lot of the
work was already done for me as well, so it's a fairly simple change.
The only thing I'm not wild about is the fact I needed to duplicate
some of the Parsable selection code from the Document class, because
the parser isn't given the Document object (and because I can't delete
the current ExternalParser object from within the method that's working
on it). Not a big deal, though.
Please have a look, give it a try, and let me know what you think.
The way it works is that, instead of specifying just a single content-type
as the first string of a pair in the external_parsers attribute, you specify
two types as type1->type2, as one string with no spaces. The second
string then defines an external converter rather than an external parser,
to convert the first type to the second.
E.g.:
external_parsers: application/pdf /usr/local/bin/parse_doc.pl \
application/msword->text/html /usr/local/bin/wordtohtml
If the second type is "user-defined", then it's up to the converter script
to put out a "Content-Type: text/foo" header followed by a blank line,
to indicate to htdig what type it should expect for the output, sort of
like what a CGI script would do.
E.g.:
external_parsers: application/msword->user-defined /usr/local/bin/mswordconv \
application/x-gunzip->user-defined /usr/local/bin/ungzipper
where mswordconv would output a "Content-Type: text/html" header,
a blank line, then the HTML output of mswordview, and ungzipper would
gunzip its input, determine the type, and output the appropriate header
before outputting the unzipped output.
I'd also eventually like to add a second type of "magic", which would
make htdig determine the content-type by looking at the start of the
script's output. If anyone wants to develop a function to do that
(or "borrow" it from Apache's mod_magic), it'd be a big help.
Here's my patch, too late for the feature freeze of course, unless you
vote it in. (It's a patch for 3.1.3, but should apply to 3.2, if you
change the unlink() call in the last hunk to match the 3.2 source.)
--- htdig-3.1.3/htdig/ExternalParser.cc.noconv Wed Sep 22 11:18:40 1999
+++ htdig-3.1.3/htdig/ExternalParser.cc Tue Oct 19 16:40:09 1999
@@ -11,6 +11,9 @@ static char RCSid[] = "$Id: ExternalPars
#endif
#include "ExternalParser.h"
+#include "HTML.h"
+#include "Plaintext.h"
+#include "PDF.h"
#include "htdig.h"
#include "htString.h"
#include "QuotedStringList.h"
@@ -21,6 +24,7 @@ static char RCSid[] = "$Id: ExternalPars
#include "good_strtok.h"
static Dictionary *parsers = 0;
+static Dictionary *toTypes = 0;
extern String configFile;
//*****************************************************************************
@@ -88,13 +92,25 @@ ExternalParser::canParse(char *contentTy
if (!parsers)
{
parsers = new Dictionary();
+ toTypes = new Dictionary();
QuotedStringList qsl(config["external_parsers"], " \t");
+ String from, to;
int i;
+ int sep;
for (i = 0; qsl[i]; i += 2)
{
- parsers->Add(qsl[i], new String(qsl[i + 1]));
+ from = qsl[i];
+ to = "";
+ sep = from.indexOf("->");
+ if (sep != -1)
+ {
+ to = from.sub(sep+2).get();
+ from = from.sub(0, sep).get();
+ }
+ parsers->Add(from, new String(qsl[i + 1]));
+ toTypes->Add(from, new String(to));
}
}
return parsers->Exists(contentType);
@@ -150,8 +166,45 @@ ExternalParser::parse(Retriever &retriev
char *token1, *token2, *token3;
int loc, hd;
URL url;
+ String convertToType = ((String *)toTypes->Find(contentType))->get();
+ int get_hdr = (mystrcasecmp(convertToType, "user-defined") == 0);
+ int get_file = (convertToType.length() != 0);
+ String newcontent;
while (readLine(input, line))
{
+ if (get_hdr)
+ {
+ line.chop('\r');
+ if (line.length() == 0)
+ get_hdr = FALSE;
+ else if (mystrncasecmp(line, "content-type:", 13) == 0)
+ {
+ token1 = line.get() + 13;
+ while (*token1 && isspace(*token1))
+ token1++;
+ token1 = strtok(token1, "\n\t");
+ convertToType = token1;
+ }
+ continue;
+ }
+ if (get_file)
+ {
+ if (newcontent.length() == 0 &&
+ !canParse(convertToType) &&
+ mystrncasecmp(convertToType, "text/", 5) != 0 &&
+ mystrncasecmp(convertToType, "application/pdf", 15) != 0)
+ {
+ if (mystrcasecmp(convertToType, "user-defined") == 0)
+ cerr << "External parser error: no Content-Type given\n";
+ else
+ cerr << "External parser error: can't parse Content-Type \""
+ << convertToType << "\"\n";
+ cerr << " URL: " << base.get() << "\n";
+ break;
+ }
+ newcontent << line << '\n';
+ continue;
+ }
token1 = strtok(line, "\t");
if (token1 == NULL)
token1 = "";
@@ -340,6 +393,50 @@ ExternalParser::parse(Retriever &retriev
}
pclose(input);
unlink(path);
+
+ if (newcontent.length() > 0)
+ {
+ static HTML *html = 0;
+ static Plaintext *plaintext = 0;
+ static PDF *pdf = 0;
+ Parsable *parsable = 0;
+
+ contentType = convertToType;
+ if (canParse(contentType))
+ {
+ currentParser = ((String *)parsers->Find(contentType))->get();
+ parsable = this;
+ }
+ else if (mystrncasecmp(contentType, "text/html", 9) == 0)
+ {
+ if (!html)
+ html = new HTML();
+ parsable = html;
+ }
+ else if (mystrncasecmp(contentType, "text/plain", 10) == 0)
+ {
+ if (!plaintext)
+ plaintext = new Plaintext();
+ parsable = plaintext;
+ }
+ else if (mystrncasecmp(contentType, "application/pdf", 15) == 0)
+ {
+ if (!pdf)
+ pdf = new PDF();
+ parsable = pdf;
+ }
+ else
+ {
+ if (!plaintext)
+ plaintext = new Plaintext();
+ parsable = plaintext;
+ if (debug)
+ cout << "External parser error: \"" << contentType <<
+ "\" not a recognized type. Assuming text\n";
+ }
+ parsable->setContents(newcontent.get(), newcontent.length());
+ parsable->parse(retriever, base);
+ }
}
--
Gilles R. Detillieux E-mail: <[EMAIL PROTECTED]>
Spinal Cord Research Centre WWW: http://www.scrc.umanitoba.ca/~grdetil
Dept. Physiology, U. of Manitoba Phone: (204)789-3766
Winnipeg, MB R3E 3J7 (Canada) Fax: (204)789-3930
------------------------------------
To unsubscribe from the htdig3-dev mailing list, send a message to
[EMAIL PROTECTED] containing the single word "unsubscribe" in
the SUBJECT of the message.