[htdig] yet another pdf parser

Stefan Nehlsen Thu, 06 Sep 2001 04:16:56 -0700

hi,

maybe you will find this useful.

This Perl script can be used by htdig as an external parser for PDF files.

cu, Stefan
-- 
Stefan Nehlsen | ParlaNet Administration | [EMAIL PROTECTED] | +49 431 988-1260

#!/usr/bin/perl --
#
# parse pdf files for htdig
#
# - generate anchor tags
# - do site-specific rewriting of the URL into a title
#   for missing or bad titles
# - I suppose it is faster than parse_doc.pl
#
# based on:
#       - htdig documentation
#       - parse_doc.pl
#       - pdftodig.py (http://po.gaillard.free.fr/pdftodig.py)
#
# Stefan Nehlsen  [EMAIL PROTECTED]

# External tools from the xpdf package.
$parser = "/usr/bin/pdftotext";
$info   = "/usr/bin/pdfinfo";

# Command-line arguments.  Only the input file and the URL are used by
# this script; the content type and the config file name are accepted
# and ignored.
my($infile, $content_type, $url, $config) = @ARGV;

# paranoid: verify the tools and the input before doing any work
die "pdfinfo \"$info\" not executable!\n"  unless -x $info;
die "parser \"$parser\" not executable!\n" unless -x $parser;
die "usage: $0 infile content-type url config-file\n" unless defined $infile;
die "\"$infile\" not readable\n" unless -f $infile;

# Three-argument open with a lexical handle: the file name is taken
# literally, so leading/trailing mode characters ('>', '|', ...) in the
# name can never turn the open into a write or a command.
open(my $pdf_fh, '<', $infile) or die "opening $infile failed: $!\n";
$text = <$pdf_fh>; # read first line only; it carries the PDF signature
close $pdf_fh;
die "\"$infile\" is not a PDF-File!\n"
        unless defined $text and $text =~ /^%PDF-\d\.\d/;
# everything seems to be ok

# Use pdfinfo to retrieve meta information; only the "Title:" line is
# evaluated.  The result is left in $title (undefined when no usable
# title was found).
#
# pdfinfo is started with a forking open plus a list-form exec, so the
# file name is handed over as one argument and is never parsed by a
# shell (quotes, backticks or $(...) in the name are harmless).
my $info_pid = open(INFO, "-|");
if (!defined $info_pid) {
        # fork failed: carry on without a title, as before
        warn "$info \"$infile\" failed\n";
} else {
        if ($info_pid == 0) {
                # child: STDOUT is the pipe to the parent; drop diagnostics
                open(STDERR, '>', '/dev/null');
                exec { $info } $info, $infile or exit 127;
        }
        while (<INFO>) {
                chomp; # only strips a newline, never a real character
                if (s/^Title:\s*//) {
                        s/\s+$//; s/\s+/ /g; s/[\376\377]//g; # delete unicode (?) marker
                        # if title is a filename we better use the real filename;
                        # the same goes for long titles ending in "..."
                        $title = $_ unless /\.pdf$|Microsoft\s+Word\s+-/i or
                                        (length($_) > 16 and /\.\.\.$/);
                        last;
                }
        }
        close INFO;
}

# At this point I do some site-specific rewriting of the title
# based on structured urls and/or an external database.

# Read the complete text of the document from pdftotext into $text.
#
# As with pdfinfo, the tool is started with a forking open and a
# list-form exec so that the file name never passes through a shell.
# The trailing "-" is the output-file argument (presumably "write to
# stdout", as in the original command line).
my $parser_pid = open(PDF, "-|");
die "error opening pdf \"$infile\"\n" unless defined $parser_pid;
if ($parser_pid == 0) {
        # child: STDOUT is the pipe to the parent; drop diagnostics
        open(STDERR, '>', '/dev/null');
        exec { $parser } $parser, '-raw', '-q', $infile, '-' or exit 127;
}
{
        local $/;      # slurp mode, restricted to this single read
        $text = <PDF>; # read whole file
}
close PDF;
$text = '' unless defined $text; # no output at all: treat as empty text

# Return a copy of the argument with the characters &, < and > replaced
# by their HTML entities.  An undefined argument yields the empty string.
sub escape_html {
        my ($string) = @_;
        $string = '' unless defined $string;
        $string =~ s/&/\&amp\;/g;
        $string =~ s/</\&lt\;/g;
        $string =~ s/>/\&gt\;/g;
        return $string;
}

# the point of no return: from here on records are printed to stdout

# Without a usable title from pdfinfo, fall back to the URL; when the URL
# ends in ".pdf" it is reduced to "PDF Dokument <file name>".
# length() is tested instead of plain truth so that a title of "0" is kept.
($title = $url) =~ s#^.*/(.*?\.pdf$)#PDF Dokument $1#i
        unless defined $title and length $title;
$title = escape_html($title);
print "t\t", $title, "\n";

# Trim surrounding whitespace (this also drops leading/trailing form feeds).
$text =~ s/^[\s\n\f]*//s; $text =~ s/[\s\n\f]*$//s;
$text =~ s/-\s*\n+\s*([a-z\340-\377])/$1/gs; # dehyphen

# The header record is the whole text with every run of whitespace
# collapsed to one blank; it is skipped only when the text is empty.
($header = $text) =~ s/[\s\n\f]+/ /gs;
if( defined $header and length $header ){
        $header = escape_html($header);
        print "h\t", $header, "\n";
}

# Break a text into the tokens that are turned into output records:
#   - every form feed as a token of its own, and
#   - every run of letters (ASCII and \300-\377) that is at least three
#     characters long.
# Form feeds are matched separately from the letters.  Treating "\f" as
# part of a word (as a split on non-letters would) glues it to an adjacent
# word, e.g. "\fStart", which loses the page anchor and puts a raw form
# feed into the word list.
sub tokenize_text {
        my ($string) = @_;
        return () unless defined $string;
        return grep { $_ eq "\f" or length($_) >= 3 }
               $string =~ /\f|[A-Za-z\300-\377]+/g;
}

@words = tokenize_text($text);

# $k scales the running word counter $n into the range 0..999.
# $page starts at 2: an anchor is printed at each form feed, i.e.
# presumably at the start of the page following the first one.
$n = 0; $page = 2; $k = @words ? 1000 / @words : 0;
foreach my $word (@words) {
        if( $word eq "\f" ){
                printf "a\tpage=%d\n", $page++;
        } else {
                printf "w\t%s\t%d\t0\n", $word, $n++ * $k;
        }
}

[htdig] yet another pdf parser

Reply via email to