Ok, this is kind of cool. It wraps the sections in <div> and then indexes
the sections separately. So a search will point to the exact section where
the results are found.
The results will show a title like:
3 Perl Reference: Lexical Variables and Symbols -- rank: 923
4 Debugging mod_perl: gdb says there are no debugging symbols -- rank: 905
Note: you will need to edit base_url in dst_html/search/SwishSpiderConfig.pl
so that it points to the correct URL for your server.
SwishSpiderConfig.pl is where all the work of splitting pages by section is
done. Feel free to fix up my Perl; I was in a hurry.
Now, I really have to start packing....
Index: src/search/spider.pl
===================================================================
RCS file: /home/cvspublic/modperl-docs/src/search/spider.pl,v
retrieving revision 1.1
diff -u -r1.1 spider.pl
--- src/search/spider.pl 30 Jan 2002 06:35:00 -0000 1.1
+++ src/search/spider.pl 30 Jan 2002 23:29:41 -0000
@@ -410,7 +410,7 @@
print STDERR "-Skipped indexing $uri some callback set 'no_index'
flag\n" if $server->{debug}&DEBUG_SKIPPED;
} else {
- return unless check_user_function( 'filter_content', $uri, $server,
$response, \$content );
+ return $links_extracted unless check_user_function( 'filter_content',
$uri, $server, $response, \$content );
output_content( $server, \$content, $uri, $response )
unless $server->{no_index};
Index: src/search/swish.conf
===================================================================
RCS file: /home/cvspublic/modperl-docs/src/search/swish.conf,v
retrieving revision 1.1
diff -u -r1.1 swish.conf
--- src/search/swish.conf 30 Jan 2002 06:35:00 -0000 1.1
+++ src/search/swish.conf 30 Jan 2002 23:29:41 -0000
@@ -1,5 +1,3 @@
-SwishProgParameters default http://perl.apache.org/~stas/modperl-site/
-#SwishProgParameters default http://localhost/modperl-site/
IndexDir ./spider.pl
DefaultContents HTML2
StoreDescription HTML2 <body> 100000
Index: tmpl/custom/html/page_body
===================================================================
RCS file: /home/cvspublic/modperl-docs/tmpl/custom/html/page_body,v
retrieving revision 1.11
diff -u -r1.11 page_body
--- tmpl/custom/html/page_body 30 Jan 2002 05:14:46 -0000 1.11
+++ tmpl/custom/html/page_body 30 Jan 2002 23:29:41 -0000
@@ -18,6 +18,7 @@
# render the content
"<!-- SwishCommand index -->";
FOREACH sec = doc.body;
+ '<div class="index_section">';
sec;
"<br><br>";
IF loop.count == loop.size;
@@ -28,6 +29,7 @@
INCLUDE top_link;
END;
"<br><br>";
+ "</div>";
END;
"<!-- SwishCommand noindex -->";
%]
--- /dev/null Wed Apr 12 01:48:29 2000
+++ src/search/SwishSpiderConfig.pl Wed Jan 30 15:27:06 2002
@@ -0,0 +1,113 @@
+
+
+@servers = (
+    {
+        # Edit this to point to the correct URL for your site.
+        base_url => 'http://mardy:40994/dst_html/index.html',
+
+        # Debugging -- see perldoc spider.pl
+        #base_url => 'http://mardy.hank.org:40994/dst_html/docs/guide/index.html',
+        #max_depth => 1,
+        #debug => DEBUG_HEADERS,
+        #debug => DEBUG_URL|DEBUG_SKIPPED|DEBUG_INFO,
+        #debug => DEBUG_LINKS,
+
+        keep_alive => 1,    # enable keep-alive requests
+        email      => '[EMAIL PROTECTED]',
+
+        use_md5 => 1,       # catch duplicates ( / and /index.html )
+
+        delay_min => .0001,
+
+        # Ignore image files: skip any URL whose path ends in .gif, .jpg,
+        # .jpeg or .png (case-insensitive).
+        test_url => sub { $_[0]->path !~ /\.(?:gif|jpe?g|png)$/i },
+
+        # Only index text/html
+        test_response => sub { return $_[2]->content_type =~ m[text/html] },
+
+        # split content - comment out to disable splitting
+        filter_content => \&split_page,
+
+        # optionally validate external links
+        #validate_links => 1,
+    },
+
+);
+
+use HTML::TreeBuilder;
+use HTML::Element;
+
+
+
+# filter_content callback: index each <div class="index_section"> of a page
+# as a separate document.  Args: uri, server, response, ref to HTML content.
+# Returns true when no section was output, false otherwise.
+sub split_page {
+    my ( $uri, $server, $response, $content_ref ) = @_;
+    my %state = (
+        uri      => $uri,      server  => $server,
+        response => $response, content => $content_ref,
+        found    => 0,         # incremented by create_page per section output
+    );
+
+    my $parser = HTML::TreeBuilder->new;
+    $parser->parse($$content_ref);
+    $parser->eof;
+
+    my $page_head = $parser->look_down( _tag => 'head' );
+    create_page( $page_head->clone, $_->clone, \%state )
+        for $parser->look_down( _tag => 'div', class => 'index_section' );
+    $parser->delete;
+    return !$state{found};    # tell spider.pl to not index the page
+}
+
+# Build a stand-alone HTML document for one section and pass it to
+# output_content().
+#   $head    - clone of the page's <head> element (modified and consumed here)
+#   $section - clone of one <div class="index_section"> element
+#   $params  - hash ref holding uri, server, response and the 'found' counter
+sub create_page {
+    my ( $head, $section, $params ) = @_;
+
+    # Work on a copy so the caller's URI object is never modified, even if
+    # output_content() dies part-way through.
+    my $uri = $params->{uri}->clone;
+
+    # The first <a name="..."> in the section names it and becomes the fragment.
+    my $section_name = 'Unknown_Section';
+    my $anchor = $section->look_down( '_tag', 'a',
+        sub { defined( $_[0]->attr('name') ) } );
+
+    if ($anchor) {
+        $section_name = $anchor->attr('name');
+        $uri->fragment($section_name);
+    }
+
+    # Turn underscores into spaces for a readable title.
+    ( my $text_title = $section_name ) =~ tr/_/ /s;
+
+    my $title = $head->look_down( '_tag', 'title' );
+    if ($title) {
+        $title->push_content(": $text_title");
+    }
+    else {
+        # No page title to extend: use the section name alone (no leading ': ').
+        $title = HTML::Element->new('title');
+        $title->push_content($text_title);
+        $head->push_content($title);
+    }
+
+    my $body = HTML::Element->new('body');
+    $body->push_content($section);
+    my $doc = HTML::Element->new('html');
+    $doc->push_content( $head, $body );
+
+    my $new_content = $doc->as_HTML( undef, "\t" );
+    output_content( $params->{server}, \$new_content, $uri, $params->{response} );
+    $params->{found}++;    # set flag;
+    $doc->delete;
+}
+
+
+1;
+
--
Bill Moseley
mailto:[EMAIL PROTECTED]
---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]