[patch] search engine

Bill Moseley 30 Jan 2002 23:39:05 -0000

OK, this is kind of cool.  It wraps each section in a <div class="index_section">
and then indexes the sections separately, so a search result will point to the
exact section where the match was found.


The results will show a title like:

3 Perl Reference: Lexical Variables and Symbols -- rank: 923 
4 Debugging mod_perl: gdb says there are no debugging symbols -- rank: 905 

Note: you will need to edit base_url in dst_html/search/SwishSpiderConfig.pl
so that it points to the correct URL.

SwishSpiderConfig.pl is where all the work is done for splitting by
section.  Feel free to fix up my Perl.  I was in a hurry.

Now, I really have to start packing....

Index: src/search/spider.pl
===================================================================
RCS file: /home/cvspublic/modperl-docs/src/search/spider.pl,v
retrieving revision 1.1
diff -u -r1.1 spider.pl
--- src/search/spider.pl        30 Jan 2002 06:35:00 -0000      1.1
+++ src/search/spider.pl        30 Jan 2002 23:29:41 -0000
@@ -410,7 +410,7 @@
         print STDERR "-Skipped indexing $uri some callback set 'no_index' 
flag\n" if $server->{debug}&DEBUG_SKIPPED;

     } else {
-        return unless check_user_function( 'filter_content', $uri, $server, 
$response, \$content );
+        return $links_extracted unless check_user_function( 'filter_content', 
$uri, $server, $response, \$content );

         output_content( $server, \$content, $uri, $response )
             unless $server->{no_index};
Index: src/search/swish.conf
===================================================================
RCS file: /home/cvspublic/modperl-docs/src/search/swish.conf,v
retrieving revision 1.1
diff -u -r1.1 swish.conf
--- src/search/swish.conf       30 Jan 2002 06:35:00 -0000      1.1
+++ src/search/swish.conf       30 Jan 2002 23:29:41 -0000
@@ -1,5 +1,3 @@
-SwishProgParameters default http://perl.apache.org/~stas/modperl-site/
-#SwishProgParameters default http://localhost/modperl-site/
 IndexDir ./spider.pl
 DefaultContents HTML2
 StoreDescription HTML2 <body> 100000
Index: tmpl/custom/html/page_body
===================================================================
RCS file: /home/cvspublic/modperl-docs/tmpl/custom/html/page_body,v
retrieving revision 1.11
diff -u -r1.11 page_body
--- tmpl/custom/html/page_body  30 Jan 2002 05:14:46 -0000      1.11
+++ tmpl/custom/html/page_body  30 Jan 2002 23:29:41 -0000
@@ -18,6 +18,7 @@
     # render the content
     "<!-- SwishCommand index -->";
     FOREACH sec = doc.body;
+        '<div class="index_section">';
         sec;
         "<br><br>";
         IF loop.count == loop.size;
@@ -28,6 +29,7 @@
             INCLUDE top_link;
         END;
         "<br><br>";
+        "</div>";
     END;
     "<!-- SwishCommand noindex -->";
 %]


--- /dev/null   Wed Apr 12 01:48:29 2000
+++ src/search/SwishSpiderConfig.pl     Wed Jan 30 15:27:06 2002
@@ -0,0 +1,113 @@
+# Spider configuration: one hash per site to crawl.
+# NOTE(review): variable name restored as @servers (what spider.pl reads) -- confirm.
+@servers = (
+    {
+        base_url        => 'http://mardy:40994/dst_html/index.html',  # site-specific: edit
+
+        # Debugging -- see perldoc spider.pl
+
+        #base_url        => 'http://mardy.hank.org:40994/dst_html/docs/guide/index.html',
+        #max_depth => 1,
+        #debug => DEBUG_HEADERS,
+        #debug => DEBUG_URL|DEBUG_SKIPPED|DEBUG_INFO,
+        #debug => DEBUG_LINKS,
+
+        keep_alive      => 1,         # enable keep alives requests
+        email           => '[EMAIL PROTECTED]',
+
+        use_md5         => 1,    # catch duplicates ( / and /index.html )
+
+        delay_min       => .0001,
+
+        # Ignore image files (.gif, .jpg/.jpeg, .png); $_[0] is the URI object
+        test_url        => sub { $_[0]->path !~ /\.(?:gif|jpe?g|png)$/i },
+
+        # Only index text/html
+        test_response   => sub { return $_[2]->content_type =~ m[text/html] },
+
+
+        # split content - comment out to disable splitting
+        filter_content  => \&split_page,
+
+        # optionally validate external links
+        #validate_links => 1,
+    },
+
+);
+
+use HTML::TreeBuilder;
+use HTML::Element;
+
+
+
+sub split_page {
+    # filter_content callback: hands every <div class="index_section"> of the
+    # fetched page to create_page() so each section is indexed on its own.
+    my ( $uri, $server, $response, $content ) = @_;
+    my %params = ( uri => $uri, server => $server, response => $response,
+                   content => $content, found => 0 );
+
+    my $tree = HTML::TreeBuilder->new;
+    $tree->parse( ${$content} );    # $content is a reference to the page text
+    $tree->eof;
+
+    my $head     = $tree->look_down( _tag => 'head' );
+    my @sections = $tree->look_down( _tag => 'div', class => 'index_section' );
+    for my $div (@sections) {
+        create_page( $head->clone, $div->clone, \%params );
+    }
+    $tree->delete;
+
+    # False once any section was emitted, so spider.pl skips the whole page;
+    # true (index the page as usual) when no section was found.
+    return !$params{found};
+}
+
+sub create_page {
+    # Build a stand-alone HTML document from one section and pass it to
+    # output_content().  Callers pass clones: both elements are moved into
+    # the new document and deleted together with it.
+    #   $head    - cloned <head> element of the source page
+    #   $section - cloned <div class="index_section"> element
+    #   $params  - hashref with uri, server, response; 'found' is incremented
+    my ( $head, $section, $params ) = @_;
+    my $uri = $params->{uri};
+
+    # The section name is the first <a name="..."> found inside the section.
+    my $section_name = 'Unknown_Section';
+    my $anchor = $section->look_down( '_tag', 'a', sub { defined($_[0]->attr('name')) } );
+
+    if ( $anchor ) {
+        $section_name = $anchor->attr('name');
+        $uri->fragment( $section_name );    # indexed URL points at the section
+    }
+
+    ( my $text_title = $section_name ) =~ tr/_/ /s;
+
+    my $title = $head->look_down('_tag', 'title');
+
+    if ( $title ) {
+        $title->push_content(": $text_title");
+    } else {
+        # No page title to extend, so the section name is the whole title;
+        # a leading ": " separator would be meaningless here.
+        $title = HTML::Element->new('title');
+        $title->push_content($text_title);
+        $head->push_content( $title );
+    }
+
+    my $body = HTML::Element->new('body');
+    my $doc  = HTML::Element->new('html');
+    $body->push_content( $section );
+    $doc->push_content( $head, $body );
+
+    my $new_content = $doc->as_HTML(undef,"\t");
+    output_content( $params->{server}, \$new_content, $uri, $params->{response} );
+
+    $uri->fragment(undef);    # $uri is shared with the caller; clear it again
+    $params->{found}++;       # lets split_page() know a section was emitted
+    $doc->delete;
+}
+
+
+1;
+

-- 
Bill Moseley
mailto:[EMAIL PROTECTED]

---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

[patch] search engine

Reply via email to