Liam Quinn <[EMAIL PROTECTED]> writes:

> LWP::RobotUA won't parse a robots.txt file if the file does not contain
> "Disallow".  The check for "Disallow" is case sensitive, but according to
> the robot exclusion standard, field names are case insensitive.  This
> causes LWP::RobotUA to ignore some robots.txt files that it should parse.
> 
> Attached is a patch that makes the check for "Disallow" case insensitive.  
> The patch is against libwww-perl 5.76 (RobotUA.pm 1.23).

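Thanks! Applied in a slightly stricter form (case insensitive, but also anchored to the start of a line and requiring the colon after the field name):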

Index: lib/LWP/RobotUA.pm
===================================================================
RCS file: /cvsroot/libwww-perl/lwp5/lib/LWP/RobotUA.pm,v
retrieving revision 1.23
retrieving revision 1.24
diff -u -p -r1.23 -r1.24
--- lib/LWP/RobotUA.pm  24 Oct 2003 11:13:03 -0000      1.23
+++ lib/LWP/RobotUA.pm  6 Apr 2004 11:02:50 -0000       1.24
@@ -1,10 +1,10 @@
 package LWP::RobotUA;
 
-# $Id: RobotUA.pm,v 1.23 2003/10/24 11:13:03 gisle Exp $
+# $Id: RobotUA.pm,v 1.24 2004/04/06 11:02:50 gisle Exp $
 
 require LWP::UserAgent;
 @ISA = qw(LWP::UserAgent);
-$VERSION = sprintf("%d.%02d", q$Revision: 1.23 $ =~ /(\d+)\.(\d+)/);
+$VERSION = sprintf("%d.%02d", q$Revision: 1.24 $ =~ /(\d+)\.(\d+)/);
 
 require WWW::RobotRules;
 require HTTP::Request;
@@ -126,7 +126,7 @@ sub simple_request
        my $fresh_until = $robot_res->fresh_until;
        if ($robot_res->is_success) {
            my $c = $robot_res->content;
-           if ($robot_res->content_type =~ m,^text/, && $c =~ /Disallow/) {
+           if ($robot_res->content_type =~ m,^text/, && $c =~ /^Disallow\s*:/mi) {
                LWP::Debug::debug("Parsing robot rules");
                $self->{'rules'}->parse($robot_url, $c, $fresh_until);
            }

> 
> -- 
> Liam Quinn
> 
> 
> 
> --- LWP/RobotUA.pm.orig       2003-10-24 07:13:03.000000000 -0400
> +++ LWP/RobotUA.pm    2004-04-03 17:59:04.000000000 -0500
> @@ -126,7 +126,7 @@
>       my $fresh_until = $robot_res->fresh_until;
>       if ($robot_res->is_success) {
>           my $c = $robot_res->content;
> -         if ($robot_res->content_type =~ m,^text/, && $c =~ /Disallow/) {
> +         if ($robot_res->content_type =~ m,^text/, && $c =~ /Disallow/i) {
>               LWP::Debug::debug("Parsing robot rules");
>               $self->{'rules'}->parse($robot_url, $c, $fresh_until);
>           }

Reply via email to