#!/usr/bin/perl -w
#
# Original message, kept for context:
#
#   On 22 Aug 2003, Jeremy Newman wrote:
#   [...]
#   > - No <html><head><body> tags. Just the content. ie, everything that
#   >   would be between the <body> tags.
#
#   I have a Perl script that does that part. Should be easy to extend to
#   also extract the title... Here goes as a starting point. Maybe I'll work
#   a bit more on it tomorrow but if anyone feels like hacking on it, feel
#   free!
#
# Usage: <this script> FILE
#
# Rewrites FILE in place so that it only contains what was between its
# opening <body ...> tag and its closing </body> tag.  The original content
# is kept in FILE.bak; when FILE.bak already exists it is used as the input
# and is never overwritten.

use strict;
use warnings;
use File::Copy;

# Reports an error on STDERR in the script's two-line format and terminates
# with a non-zero status.
#   $message - what could not be done
#   $reason  - the detail line (typically the "$!" text of the failed call)
sub fail {
    my ($message, $reason) = @_;
    print STDERR "error: $message:\n";
    print STDERR " $reason\n";
    exit 1;
}

# Reads from $in until the opening <body ...> tag has been consumed.
# Returns the text following the tag on the line where the tag ends, or
# nothing (undef) when the input holds no complete opening tag.
sub skip_to_body {
    my ($in) = @_;

    while (defined(my $line = <$in>)) {
        # Whole tag on one line: drop it together with whatever precedes it
        # (e.g. a trailing </head>), which is not body content.
        if ($line =~ s/\A.*?<body\b[^>]*>//is) {
            print "matched <body>: $line";
            return $line;
        }

        # The tag starts here but its '>' is on a later line.
        if ($line =~ s/<body\b[^>]*\z//i) {
            print "matched <body: $line";
            while (defined($line = <$in>)) {
                print "looking for > $line";
                return $line if $line =~ s/\A[^>]*>//;
            }
            return;    # end of file before the tag was closed
        }
    }
    return;
}

my $filename = $ARGV[0];
if (!defined $filename) {
    print STDERR "usage: $0 FILE\n";
    exit 1;
}
print " $filename\n";

#FIXME:assuming that because there is a .bak file, this is what we want is
#probably flawed. Or is it???
if (!-e "$filename.bak") {
    copy($filename, "$filename.bak")
        or fail("unable to make a backup of $filename", $!);
}

open(my $in, '<', "$filename.bak")
    or fail("unable to open $filename.bak for reading", $!);

# Locate the body before touching the target: opening it for writing
# truncates it, so bail out first if there is nothing to extract.
my $line = skip_to_body($in);
if (!defined $line) {
    fail("no <body> tag found in $filename.bak",
         "leaving $filename untouched");
}

open(my $out, '>', $filename)
    or fail("unable to open $filename for writing", $!);

# Copy lines until the closing tag.  The first line goes through the same
# check because </body> may sit on the line where <body ...> ended.
while (defined $line) {
    if ($line =~ s{</body\b.*}{}is) {
        # Keep only the text before the closing tag; the tag itself and
        # anything after it (such as </html>) is not body content.
        print {$out} "$line\n" if length $line;
        last;
    }
    print {$out} $line;
    $line = <$in>;
}

close($in);
# Buffered write errors only surface when the handle is closed.
close($out)
    or fail("unable to finish writing $filename", $!);

exit 0;

# --
# Francois Gouget [EMAIL PROTECTED] http://fgouget.free.fr/
# "Lotto: A tax on people who are bad at math." -- unknown
# "Windows: Microsoft's tax on computer illiterates." -- WE7U