I had some trouble in the past with HTML::Parser when dealing with the
fat
blobs of code that come from WYSWYG HTML editors. Not only was it
parsing
an object tree with hundreds of elements when I was only interested in a
few, but some of those editors are not so good about closing tags and
the
reconstituted document was sometimes mangled. My solution was to skip
parsing and go to a simple regex-based tag extractor. There are some
disadvantages - notably the lack of support for nested tags and poor
handling of mis-matched quotes - but I have found it to be quick enough
for my purposes, and no one complains anymore about mangled content.
YMMV.
These are excerpts from a library published under GPL. The library has
not
been made generaly available because there are still some serious warts
to remove...
-jh
----------------------------------------8<----------------------------------------
# Copyright (C) 1999, John Hurst
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# Note that the second argument can be a regular expression, so you can
# look for the next one of a set of tags by saying
#
# $struct = extract_tag($text, 'tag1|tag2|tag3')
#
#
sub extract_tag {
my $text = shift;
my $tag = shift;
my $retval = undef;
if($text =~ /<($tag)( [^>]*)?>/is){
$tag = $1;
my $attrib_list = $2;
$retval = {'_tag' => lc $tag,
'_fore' => $`,
'_aft' => $',
'_content' => '',
};
# did we get all of it? assumes odd number of " means no...
while($retval->{'_aft'} =~ />/ && _num_quotes($attrib_list) %
2){
$retval->{'_aft'} =~ /^([^>]*)>/s;
$attrib_list .= ">$1";
$retval->{'_aft'} = $';
}
foreach(shellwords($attrib_list)){
my ($name, $val) = split('=', $_, 2);
$retval->{lc $name} = $val;
}
# is this an 'empty' tag?
if($retval->{'_aft'} =~ /<\/$tag\s*>\s*/i){ # nope
my $tmp_1 = $`;
my $tmp_2 = $';
unless($tmp_1 =~ /<$tag/i){
$retval->{'_content'} = $tmp_1;
$retval->{'_aft'} = $tmp_2;
}
}
}
$retval;
}
#
#
sub _num_quotes {
my $str = shift;
my ($count, $ofst);
while(my $tmp = index($str, '"', $ofst)){
last if($tmp == -1);
$ofst = $tmp + 1;
$count++;
}
$count;
}
# Building on the function above, this one returns the template with all
# specified tags removed, followed by the found tag structures in the
order
# in which they were encountered.
#
#
sub extract_tags {
my $template = shift;
my $tag = shift;
my ($tmp, @retval);
while($tmp = extract_tag($template, $tag)){
$template = "$tmp->{'_fore'}$tmp->{'_aft'}";
delete($tmp->{'_fore'});
delete($tmp->{'_aft'});
push(@retval, $tmp);
}
($template, @retval);
}
----------------------------------------8<----------------------------------------