I had some trouble in the past with HTML::Parser when dealing with the
fat
blobs of code that come from WYSWYG HTML editors. Not only was it
parsing
an object tree with hundreds of elements when I was only interested in a
few, but some of those editors are not so good about closing tags and
the
reconstituted document was sometimes mangled. My solution was to skip
parsing and go to a simple regex-based tag extractor. There are some
disadvantages - notably the lack of support for nested tags and poor
handling of mis-matched quotes - but I have found it to be quick enough
for my purposes, and no one complains anymore about mangled content.
YMMV.

These are excerpts from a library published under GPL. The library has
not
been made generaly available because there are still some serious warts
to remove...

-jh

----------------------------------------8<----------------------------------------
# Copyright (C) 1999, John Hurst

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.


#
# Note that the second argument can be a regular expression, so you can
# look for the next one of a set of tags by saying
#
#   $struct = extract_tag($text, 'tag1|tag2|tag3')
#
#
sub extract_tag {
    my $text = shift;
    my $tag = shift;
    my $retval = undef;

    if($text =~ /<($tag)( [^>]*)?>/is){
        $tag = $1;
        my $attrib_list = $2;

        $retval = {'_tag'     => lc $tag,
                   '_fore'    => $`,
                   '_aft'     => $',
                   '_content' => '',
               };

        # did we get all of it? assumes odd number of " means no...
        while($retval->{'_aft'} =~ />/ && _num_quotes($attrib_list) %
2){
            $retval->{'_aft'} =~ /^([^>]*)>/s;
            $attrib_list .= ">$1";
            $retval->{'_aft'} = $';
        }

        foreach(shellwords($attrib_list)){
            my ($name, $val) = split('=', $_, 2);
            $retval->{lc $name} = $val;
        }

        # is this an 'empty' tag?
        if($retval->{'_aft'} =~ /<\/$tag\s*>\s*/i){ # nope
            my $tmp_1 = $`;
            my $tmp_2 = $';

            unless($tmp_1 =~ /<$tag/i){
                $retval->{'_content'} = $tmp_1;
                $retval->{'_aft'} = $tmp_2;
            }
        }
    }

    $retval;
}


#
#
sub _num_quotes {
    my $str = shift;
    my ($count, $ofst);

    while(my $tmp = index($str, '"', $ofst)){
        last if($tmp == -1);
        $ofst = $tmp + 1;
        $count++;
    }

    $count;
}


# Building on the function above, this one returns the template with all
# specified tags removed, followed by the found tag structures in the
order
# in which they were encountered.
#
#
sub extract_tags {
    my $template = shift;
    my $tag = shift;
    my ($tmp, @retval);

    while($tmp = extract_tag($template, $tag)){
        $template = "$tmp->{'_fore'}$tmp->{'_aft'}";
        delete($tmp->{'_fore'});
        delete($tmp->{'_aft'});
        push(@retval, $tmp);
    }

    ($template, @retval);
}

----------------------------------------8<----------------------------------------

Reply via email to