Author: jerome Date: Mon Jun 5 14:43:42 2006 New Revision: 411926 URL: http://svn.apache.org/viewvc?rev=411926&view=rev Log: NUTCH-298 : No more NPE if a 404 for a robots.txt + some unit tests
Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java?rev=411926&r1=411925&r2=411926&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java (original) +++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java Mon Jun 5 14:43:42 2006 @@ -70,8 +70,8 @@ * file, and can test paths against those rules. */ public static class RobotRuleSet { - ArrayList tmpEntries; - RobotsEntry[] entries; + ArrayList tmpEntries = new ArrayList(); + RobotsEntry[] entries = null; long expireTime; /** Modified: lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java?rev=411926&r1=411925&r2=411926&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java (original) +++ lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java Mon Jun 5 14:43:42 2006 @@ -25,7 +25,29 @@ private static final String CR= "\r"; private static final String CRLF= "\r\n"; - + private static final boolean[] ACCEPT_ALL = { + true, // "/a", + true, // "/a/", + true, // "/a/bloh/foo.html" + true, // "/b", + true, // "/b/a", + true, // "/b/a/index.html", + true, // "/b/b/foo.html", + true, // "/c", + true, // "/c/a", + true, // "/c/a/index.html", + true, // "/c/b/foo.html", + true, // "/d", + true, // "/d/a", + true, // "/e/a/index.html", + true, // "/e/d", + true, // "/e/d/foo.html", + true, // "/e/doh.html", + true, // "/f/index.html", + true, // "/foo/bar.html", + true, // "/f/", + }; + private static final String[] ROBOTS_STRINGS= new String[] { "User-Agent: Agent1 #foo" + CR + "Disallow: /a" + CR @@ -40,6 +62,7 @@ + "" + CR + "User-Agent: *" + CR + "Disallow: /foo/bar/" + CR, + null // Used to test EMPTY_RULES }; private static final String[] AGENT_STRINGS= new String[] { @@ -57,7 +80,14 @@ false, false, true, - } + }, + { + false, + false, + false, + false, + true, + } }; private static final String[] TEST_PATHS= new String[] { @@ -195,6 +225,13 @@ false, // "/foo/bar.html", true, // "/f/", } + }, + { // ROBOTS_STRINGS[1] + ACCEPT_ALL, // Agent 1 + ACCEPT_ALL, // Agent 2 + ACCEPT_ALL, // Agent 3 + ACCEPT_ALL, // Agent 4 + ACCEPT_ALL, // Agent 5 } }; @@ -233,7 +270,9 @@ for (int i= 1; i < agents.length; i++) agentsString= agentsString + "," + agents[i]; RobotRulesParser p= new RobotRulesParser(agents); - RobotRuleSet rules= p.parseRules(ROBOTS_STRINGS[robotsString].getBytes()); + RobotRuleSet rules= p.parseRules(ROBOTS_STRINGS[robotsString] != null + ? ROBOTS_STRINGS[robotsString].getBytes() + : null); for (int i= 0; i < paths.length; i++) { assertTrue("testing robots file "+robotsString+", on agents (" + agentsString + "), and path " + TEST_PATHS[i] + "; got " @@ -243,4 +282,6 @@ } } + + }