stas 02/03/27 18:19:48
Modified: lib/DocSet/Source HTML.pm
Log:
sync with DocSet 0.12, the latest change:
- add code that parses source HTML and escapes unsafe URI/HTML chars
in links.
Revision Changes Path
1.4 +106 -19 modperl-docs/lib/DocSet/Source/HTML.pm
Index: HTML.pm
===================================================================
RCS file: /home/cvs/modperl-docs/lib/DocSet/Source/HTML.pm,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- HTML.pm 30 Jan 2002 16:55:04 -0000 1.3
+++ HTML.pm 28 Mar 2002 02:19:48 -0000 1.4
@@ -9,6 +9,8 @@
require DocSet::Doc;
@ISA = qw(DocSet::Doc);
+use constant ENCODE_CHARS => '<>&" ';
+
sub retrieve_meta_data {
my($self) = @_;
@@ -31,6 +33,26 @@
# there is no autogenerated TOC for HTML files
}
+my %linkElements = ( # from HTML::Element.pm
+ body => 'background',
+ base => 'href',
+ a => 'href',
+ img => [qw(src lowsrc usemap)], # lowsrc is a Netscape invention
+ form => 'action',
+ input => 'src',
+ 'link' => 'href', # need quoting since link is a perl builtin
+ frame => 'src',
+ applet => 'codebase',
+ area => 'href',
+);
+my %tag_attr;
+for my $tag (keys %linkElements) {
+ my $tagval = $linkElements{$tag};
+ for my $attr (ref $tagval ? @$tagval : $tagval) {
+ $tag_attr{"$tag $attr"}++;
+ }
+}
+
# currently retrieves these parts from the source HTML
# head.title
# head.meta.description
@@ -41,30 +63,95 @@
# already parsed
return if exists $self->{parsed_tree} && $self->{parsed_tree};
- # this one retrievs the body and the title of the given html
require HTML::Parser;
- sub start_h {
- my($self, $tagname, $attr) = @_;
- if ($tagname eq 'meta' && lc $attr->{name} eq 'description') {
- $self->{parsed_tree}->{abstract} = $attr->{content};
+ require HTML::Entities;
+
+ my $new_content;
+
+ # this parsing is for fixing up unsafe chars in URLs
+ {
+ # accum_h(self, $text)
+ sub accum_h {
+ my $self = shift;
+ #print "[ @_ ]";
+ $self->{content} .= join '', @_;
+ }
+
+ # encode unsafe chars in the URL attributes
+ sub start_h {
+ my($self, $tagname, $attr, $text) = @_;
+
+ # store away the HTML as is
+ unless ($linkElements{$tagname}) {
+ accum_h($self, $text);
+ return;
+ }
+
+ # escape thpse that include link elements
+ accum_h($self, qq{<$tagname});
+ for (keys %$attr) {
+ accum_h($self, qq{ $_="});
+ my $val = $attr->{$_};
+ if ($tag_attr{"$tagname $_"}) {
+ $val = HTML::Entities::encode($val, ENCODE_CHARS);
+ }
+ accum_h($self, $val);
+ }
+ accum_h($self, qq{">});
+ }
+
+ sub end_h {
+ my($self, $tagname) = @_;
+ accum_h($self, "</$tagname>");
}
+
+ sub text_h {
+ my($self, $text) = @_;
+ accum_h($self, $text);
+ }
+
+ my $p = HTML::Parser->new(api_version => 3,
+ start_h => [\&start_h, "self, tagname,
attr, text"],
+ end_h => [\&end_h, "self, tagname"],
+ text_h => [\&text_h, "self, text"],
+ );
+ # Parse document text chunk by chunk
+ $p->parse(${ $self->{content} });
+ $p->eof;
+ $new_content = $p->{content};
+ $self->{content} = \$new_content;
+ #print $new_content, "\n\n\n";
}
- sub end_h {
- my($self, $tagname, $skipped_text) = @_;
- # use $p itself as a tmp storage (ok according to the docs)
- $self->{parsed_tree}->{$tagname} = $skipped_text;
+
+ {
+ # this one retrieves and stashes away the description (As
'abstract')
+ # and the body and the title of the given html
+ my $start_h = sub {
+ my($self, $tagname, $attr) = @_;
+ if ($tagname eq 'meta' && lc $attr->{name} eq 'description') {
+ $self->{parsed_tree}->{abstract} = $attr->{content};
+ }
+ };
+
+ my $end_h = sub {
+ my($self, $tagname, $skipped_text) = @_;
+ # use $p itself as a tmp storage (ok according to the docs)
+ $self->{parsed_tree}->{$tagname} = $skipped_text;
+ };
+
+ my $p = HTML::Parser->new(api_version => 3,
+ report_tags => [qw(title meta body)],
+ start_h => [$start_h, "self, tagname,
attr"],
+ end_h => [$end_h, "self, tagname,
skipped_text"],
+ );
+ # Parse document text chunk by chunk
+ $p->parse(${ $self->{content} });
+ $p->eof;
+
+ # store the tree away
+ $self->{parsed_tree} = $p->{parsed_tree};
}
- my $p = HTML::Parser->new(api_version => 3,
- report_tags => [qw(title body meta)],
- start_h => [\&start_h, "self,tagname,attr"],
- end_h => [\&end_h,
"self,tagname,skipped_text"],
- );
- # Parse document text chunk by chunk
- $p->parse(${ $self->{content} });
- $p->eof;
- # store the tree away
- $self->{parsed_tree} = $p->{parsed_tree};
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]