moseley 02/04/19 23:13:05
Modified: src/search spider.pl
Log:
Add debugging and quiet mode for the spider.
perldoc spider.pl
Revision Changes Path
1.4 +47 -7 modperl-docs/src/search/spider.pl
Index: spider.pl
===================================================================
RCS file: /home/cvs/modperl-docs/src/search/spider.pl,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- spider.pl 3 Mar 2002 11:27:22 -0000 1.3
+++ spider.pl 20 Apr 2002 06:13:05 -0000 1.4
@@ -2,7 +2,7 @@
use strict;
-# $Id: spider.pl,v 1.3 2002/03/03 11:27:22 stas Exp $
+# $Id: spider.pl,v 1.4 2002/04/20 06:13:05 moseley Exp $
#
# "prog" document source for spidering web servers
#
@@ -23,7 +23,7 @@
use HTML::Tagset;
use vars '$VERSION';
-$VERSION = sprintf '%d.%02d', q$Revision: 1.3 $ =~ /: (\d+)\.(\d+)/;
+$VERSION = sprintf '%d.%02d', q$Revision: 1.4 $ =~ /: (\d+)\.(\d+)/;
use vars '$bit';
use constant DEBUG_ERRORS => $bit = 1; # program errors
@@ -34,6 +34,17 @@
use constant DEBUG_INFO => $bit <<= 1; # more verbose
use constant DEBUG_LINKS => $bit <<= 1; # prints links as they are
extracted
+my %DEBUG_MAP = (
+ errors => DEBUG_ERRORS,
+ url => DEBUG_URL,
+ headers => DEBUG_HEADERS,
+ failed => DEBUG_FAILED,
+ skipped => DEBUG_SKIPPED,
+ info => DEBUG_INFO,
+ links => DEBUG_LINKS,
+);
+
+
use constant MAX_SIZE => 5_000_000; # Max size of document to fetch
use constant MAX_WAIT_TIME => 30; # request time.
@@ -62,7 +73,7 @@
}
- print STDERR "$0: Reading parameters from '$config'\n";
+ print STDERR "$0: Reading parameters from '$config'\n" unless
$ENV{SPIDER_QUIET};
my $abort;
local $SIG{HUP} = sub { warn "Caught SIGHUP\n"; $abort++ } unless $^O =~
/Win32/i;
@@ -103,8 +114,18 @@
# set defaults
- $server->{debug} ||= 0;
- die "debug parameter '$server->{debug}' must be a number\n" unless
$server->{debug} =~ /^\d+$/;
+ if ( $ENV{SPIDER_DEBUG} ) {
+ $server->{debug} = 0;
+
+ $server->{debug} |= (exists $DEBUG_MAP{lc $_} ? $DEBUG_MAP{lc $_} :
die "Bad debug setting passed in environment '$_'\nOptions are: " . join( ', ',
keys %DEBUG_MAP) ."\n")
+ for split /\s*,\s*/, $ENV{SPIDER_DEBUG};
+
+ } else {
+ $server->{debug} ||= 0;
+ die "debug parameter '$server->{debug}' must be a number\n" unless
$server->{debug} =~ /^\d+$/;
+ }
+
+ $server->{quiet} ||= $ENV{SPIDER_QUIET} || 0;
$server->{max_size} ||= MAX_SIZE;
@@ -137,7 +158,7 @@
my $start = time;
if ( $server->{skip} ) {
- print STDERR "Skipping: $server->{base_url}\n";
+ print STDERR "Skipping: $server->{base_url}\n" unless
$server->{quiet};
return;
}
@@ -235,6 +256,9 @@
eval { spider( $server, $uri ) };
print STDERR $@ if $@;
+ return if $server->{quiet};
+
+
$start = time - $start;
$start++ unless $start;
@@ -246,6 +270,7 @@
$max_num = length $val if length $val > $max_num;
}
+
printf STDERR "\nSummary for: $server->{base_url}\n";
for ( sort keys %{$server->{counts}} ) {
@@ -468,7 +493,7 @@
return;
}
- $response->request->uri->userinfo( undef );
+ $response->request->uri->userinfo( undef ) if $response->request;
# skip excluded by robots.txt
@@ -1339,6 +1364,21 @@
And you will see debugging info as it runs, and the fetched documents will
be saved
in the C<spider.out> file.
+
+Debugging can also be set by an environment variable when running swish.
This will
+override any setting in the configuration file. Set the variable
SPIDER_DEBUG when running
+the spider. You can specify any of the above debugging options, separated
by a comma.
+
+For example with Bourne type shell:
+
+ SPIDER_DEBUG=url,links
+
+=item quiet
+
+If this is true then normal, non-error messages will be suppressed. Quiet
mode can also
+be set by setting the environment variable SPIDER_QUIET to any true value.
+
+ SPIDER_QUIET=1
=item max_depth
---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]