Modified: nutch/branches/2.x/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java (original) +++ nutch/branches/2.x/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java Fri Jan 9 06:34:33 2015 @@ -23,104 +23,47 @@ import org.junit.Before; import org.junit.Test; import static org.junit.Assert.*; - /** * JUnit test for <code>SuffixURLFilter</code>. - * + * * @author Andrzej Bialecki */ public class TestSuffixURLFilter { - private static final String suffixes = - "# this is a comment\n" + - "\n" + - ".gif\n" + - ".jpg\n" + - ".js\n"; - + private static final String suffixes = "# this is a comment\n" + "\n" + + ".gif\n" + ".jpg\n" + ".js\n"; + private static final String[] urls = new String[] { - "http://www.example.com/test.gif", - "http://www.example.com/TEST.GIF", - "http://www.example.com/test.jpg", - "http://www.example.com/test.JPG", - "http://www.example.com/test.html", - "http://www.example.com/test.HTML", - "http://www.example.com/test.html?q=abc.js", - "http://www.example.com/test.js?foo=bar&baz=bar#12333", - }; - - private static String[] urlsModeAccept = new String[] { - null, - urls[1], - null, - urls[3], - urls[4], - urls[5], - null, - urls[7] - }; - - private static String[] urlsModeReject = new String[] { - urls[0], - null, - urls[2], - null, - null, - null, - urls[6], - null - }; - - private static String[] urlsModeAcceptIgnoreCase = new String[] { - null, - null, - null, - null, - urls[4], - urls[5], - null, - urls[7] - }; - - private static String[] urlsModeRejectIgnoreCase = new String[] { - urls[0], - urls[1], - urls[2], - urls[3], - null, - null, - urls[6], - null - }; - - private static String[] urlsModeAcceptAndPathFilter = new String[] { - null, - urls[1], - null, - urls[3], - urls[4], - urls[5], - urls[6], - null - }; - - private static String[] urlsModeAcceptAndNonPathFilter = new String[] { - null, - urls[1], - null, - urls[3], - urls[4], - urls[5], - null, - urls[7] - }; - + "http://www.example.com/test.gif", "http://www.example.com/TEST.GIF", + "http://www.example.com/test.jpg", "http://www.example.com/test.JPG", + "http://www.example.com/test.html", "http://www.example.com/test.HTML", + "http://www.example.com/test.html?q=abc.js", + "http://www.example.com/test.js?foo=bar&baz=bar#12333", }; + + private static String[] urlsModeAccept = new String[] { null, urls[1], null, + urls[3], urls[4], urls[5], null, urls[7] }; + + private static String[] urlsModeReject = new String[] { urls[0], null, + urls[2], null, null, null, urls[6], null }; + + private static String[] urlsModeAcceptIgnoreCase = new String[] { null, null, + null, null, urls[4], urls[5], null, urls[7] }; + + private static String[] urlsModeRejectIgnoreCase = new String[] { urls[0], + urls[1], urls[2], urls[3], null, null, urls[6], null }; + + private static String[] urlsModeAcceptAndPathFilter = new String[] { null, + urls[1], null, urls[3], urls[4], urls[5], urls[6], null }; + + private static String[] urlsModeAcceptAndNonPathFilter = new String[] { null, + urls[1], null, urls[3], urls[4], urls[5], null, urls[7] }; + private SuffixURLFilter filter = null; - + @Before public void setUp() throws IOException { filter = new SuffixURLFilter(new StringReader(suffixes)); } - + @Test public void testModeAccept() { filter.setIgnoreCase(false); @@ -156,7 +99,7 @@ public class TestSuffixURLFilter { assertTrue(urlsModeRejectIgnoreCase[i] == filter.filter(urls[i])); } } - + @Test public void testModeAcceptAndNonPathFilter() { filter.setModeAccept(true); @@ -165,7 +108,7 @@ public class TestSuffixURLFilter { assertTrue(urlsModeAcceptAndNonPathFilter[i] == filter.filter(urls[i])); } } - + @Test public void testModeAcceptAndPathFilter() { filter.setModeAccept(true);
Modified: nutch/branches/2.x/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java (original) +++ nutch/branches/2.x/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java Fri Jan 9 06:34:33 2015 @@ -134,8 +134,10 @@ public class UrlValidator implements URL private int maxTldLength; - private static String TOP_LEVEL_DOMAIN_LENGTH = "urlfilter.tld.length"; // maximum length of TLD - + private static String TOP_LEVEL_DOMAIN_LENGTH = "urlfilter.tld.length"; // maximum + // length + // of + // TLD private static final int TOP_LEVEL_DOMAIN_LENGTH_VALUE = 8; @@ -287,8 +289,7 @@ public class UrlValidator implements URL segCount++; } String topLevel = domainSegment[segCount - 1]; - if (topLevel.length() < 2 - || topLevel.length() > maxTldLength) { + if (topLevel.length() < 2 || topLevel.length() > maxTldLength) { return false; } Modified: nutch/branches/2.x/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java (original) +++ nutch/branches/2.x/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java Fri Jan 9 06:34:33 2015 @@ -113,8 +113,7 @@ public class TestUrlValidator extends Te } /** - * Generate Sample of Invalid Tld. - * character + * Generate Sample of Invalid Tld. character */ public String generateInvalidTld(int length) { Modified: nutch/branches/2.x/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java (original) +++ nutch/branches/2.x/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java Fri Jan 9 06:34:33 2015 @@ -39,177 +39,171 @@ import org.apache.oro.text.regex.*; * </ul> */ public class BasicURLNormalizer extends Configured implements URLNormalizer { - public static final Logger LOG = LoggerFactory.getLogger(BasicURLNormalizer.class); + public static final Logger LOG = LoggerFactory + .getLogger(BasicURLNormalizer.class); - private Perl5Compiler compiler = new Perl5Compiler(); - private ThreadLocal<Perl5Matcher> matchers = new ThreadLocal<Perl5Matcher>() { - protected Perl5Matcher initialValue() { - return new Perl5Matcher(); - } - }; - private final Rule relativePathRule; - private final Rule leadingRelativePathRule; - private final Rule currentPathRule; - private final Rule adjacentSlashRule; - - private Configuration conf; - - public BasicURLNormalizer() { - try { - // this pattern tries to find spots like "/xx/../" in the url, which - // could be replaced by "/" xx consists of chars, different then "/" - // (slash) and needs to have at least one char different from "." - relativePathRule = new Rule(); - relativePathRule.pattern = (Perl5Pattern) - compiler.compile("(/[^/]*[^/.]{1}[^/]*/\\.\\./)", - Perl5Compiler.READ_ONLY_MASK); - relativePathRule.substitution = new Perl5Substitution("/"); - - // this pattern tries to find spots like leading "/../" in the url, - // which could be replaced by "/" - leadingRelativePathRule = new Rule(); - leadingRelativePathRule.pattern = (Perl5Pattern) - compiler.compile("^(/\\.\\./)+", Perl5Compiler.READ_ONLY_MASK); - leadingRelativePathRule.substitution = new Perl5Substitution("/"); - - // this pattern tries to find spots like "/./" in the url, - // which could be replaced by "/" - currentPathRule = new Rule(); - currentPathRule.pattern = (Perl5Pattern) - compiler.compile("(/\\./)", Perl5Compiler.READ_ONLY_MASK); - currentPathRule.substitution = new Perl5Substitution("/"); - - // this pattern tries to find spots like "xx//yy" in the url, - // which could be replaced by a "/" - adjacentSlashRule = new Rule(); - adjacentSlashRule.pattern = (Perl5Pattern) - compiler.compile("/{2,}", Perl5Compiler.READ_ONLY_MASK); - adjacentSlashRule.substitution = new Perl5Substitution("/"); - - } catch (MalformedPatternException e) { - throw new RuntimeException(e); - } + private Perl5Compiler compiler = new Perl5Compiler(); + private ThreadLocal<Perl5Matcher> matchers = new ThreadLocal<Perl5Matcher>() { + protected Perl5Matcher initialValue() { + return new Perl5Matcher(); + } + }; + private final Rule relativePathRule; + private final Rule leadingRelativePathRule; + private final Rule currentPathRule; + private final Rule adjacentSlashRule; + + private Configuration conf; + + public BasicURLNormalizer() { + try { + // this pattern tries to find spots like "/xx/../" in the url, which + // could be replaced by "/" xx consists of chars, different then "/" + // (slash) and needs to have at least one char different from "." + relativePathRule = new Rule(); + relativePathRule.pattern = (Perl5Pattern) compiler.compile( + "(/[^/]*[^/.]{1}[^/]*/\\.\\./)", Perl5Compiler.READ_ONLY_MASK); + relativePathRule.substitution = new Perl5Substitution("/"); + + // this pattern tries to find spots like leading "/../" in the url, + // which could be replaced by "/" + leadingRelativePathRule = new Rule(); + leadingRelativePathRule.pattern = (Perl5Pattern) compiler.compile( + "^(/\\.\\./)+", Perl5Compiler.READ_ONLY_MASK); + leadingRelativePathRule.substitution = new Perl5Substitution("/"); + + // this pattern tries to find spots like "/./" in the url, + // which could be replaced by "/" + currentPathRule = new Rule(); + currentPathRule.pattern = (Perl5Pattern) compiler.compile("(/\\./)", + Perl5Compiler.READ_ONLY_MASK); + currentPathRule.substitution = new Perl5Substitution("/"); + + // this pattern tries to find spots like "xx//yy" in the url, + // which could be replaced by a "/" + adjacentSlashRule = new Rule(); + adjacentSlashRule.pattern = (Perl5Pattern) compiler.compile("/{2,}", + Perl5Compiler.READ_ONLY_MASK); + adjacentSlashRule.substitution = new Perl5Substitution("/"); + + } catch (MalformedPatternException e) { + throw new RuntimeException(e); } + } - public String normalize(String urlString, String scope) - throws MalformedURLException { - if ("".equals(urlString)) // permit empty - return urlString; - - urlString = urlString.trim(); // remove extra spaces - - URL url = new URL(urlString); - - String protocol = url.getProtocol(); - String host = url.getHost(); - int port = url.getPort(); - String file = url.getFile(); - - boolean changed = false; - - if (!urlString.startsWith(protocol)) // protocol was lowercased - changed = true; - - if ("http".equals(protocol) || "https".equals(protocol) || "ftp".equals(protocol)) { - - if (host != null) { - String newHost = host.toLowerCase(); // lowercase host - if (!host.equals(newHost)) { - host = newHost; - changed = true; - } - } - - if (port == url.getDefaultPort()) { // uses default port - port = -1; // so don't specify it - changed = true; - } - - if (file == null || "".equals(file)) { // add a slash - file = "/"; - changed = true; - } - - if (url.getRef() != null) { // remove the ref - changed = true; - } - - // check for unnecessary use of "/../" - String file2 = substituteUnnecessaryRelativePaths(file); - - if (!file.equals(file2)) { - changed = true; - file = file2; - } + public String normalize(String urlString, String scope) + throws MalformedURLException { + if ("".equals(urlString)) // permit empty + return urlString; - } + urlString = urlString.trim(); // remove extra spaces - if (changed) - urlString = new URL(protocol, host, port, file).toString(); + URL url = new URL(urlString); - return urlString; - } + String protocol = url.getProtocol(); + String host = url.getHost(); + int port = url.getPort(); + String file = url.getFile(); - private String substituteUnnecessaryRelativePaths(String file) { - String fileWorkCopy = file; - int oldLen = file.length(); - int newLen = oldLen - 1; - - // All substitutions will be done step by step, to ensure that certain - // constellations will be normalized, too - // - // For example: "/aa/bb/../../cc/../foo.html will be normalized in the - // following manner: - // "/aa/bb/../../cc/../foo.html" - // "/aa/../cc/../foo.html" - // "/cc/../foo.html" - // "/foo.html" - // - // The normalization also takes care of leading "/../", which will be - // replaced by "/", because this is a rather a sign of bad webserver - // configuration than of a wanted link. For example, urls like - // "http://www.foo.com/../" should return a http 404 error instead of - // redirecting to "http://www.foo.com". - // - Perl5Matcher matcher = matchers.get(); - - while (oldLen != newLen) { - // substitue first occurence of "/xx/../" by "/" - oldLen = fileWorkCopy.length(); - fileWorkCopy = Util.substitute - (matcher, relativePathRule.pattern, - relativePathRule.substitution, fileWorkCopy, 1); - - // remove leading "/../" - fileWorkCopy = Util.substitute - (matcher, leadingRelativePathRule.pattern, - leadingRelativePathRule.substitution, fileWorkCopy, 1); - - // remove unnecessary "/./" - fileWorkCopy = Util.substitute - (matcher, currentPathRule.pattern, - currentPathRule.substitution, fileWorkCopy, 1); - - - // collapse adjacent slashes with "/" - fileWorkCopy = Util.substitute - (matcher, adjacentSlashRule.pattern, - adjacentSlashRule.substitution, fileWorkCopy, 1); - - newLen = fileWorkCopy.length(); + boolean changed = false; + + if (!urlString.startsWith(protocol)) // protocol was lowercased + changed = true; + + if ("http".equals(protocol) || "https".equals(protocol) + || "ftp".equals(protocol)) { + + if (host != null) { + String newHost = host.toLowerCase(); // lowercase host + if (!host.equals(newHost)) { + host = newHost; + changed = true; } + } + + if (port == url.getDefaultPort()) { // uses default port + port = -1; // so don't specify it + changed = true; + } + + if (file == null || "".equals(file)) { // add a slash + file = "/"; + changed = true; + } + + if (url.getRef() != null) { // remove the ref + changed = true; + } + + // check for unnecessary use of "/../" + String file2 = substituteUnnecessaryRelativePaths(file); + + if (!file.equals(file2)) { + changed = true; + file = file2; + } - return fileWorkCopy; } + if (changed) + urlString = new URL(protocol, host, port, file).toString(); + + return urlString; + } + + private String substituteUnnecessaryRelativePaths(String file) { + String fileWorkCopy = file; + int oldLen = file.length(); + int newLen = oldLen - 1; + + // All substitutions will be done step by step, to ensure that certain + // constellations will be normalized, too + // + // For example: "/aa/bb/../../cc/../foo.html will be normalized in the + // following manner: + // "/aa/bb/../../cc/../foo.html" + // "/aa/../cc/../foo.html" + // "/cc/../foo.html" + // "/foo.html" + // + // The normalization also takes care of leading "/../", which will be + // replaced by "/", because this is a rather a sign of bad webserver + // configuration than of a wanted link. For example, urls like + // "http://www.foo.com/../" should return a http 404 error instead of + // redirecting to "http://www.foo.com". + // + Perl5Matcher matcher = matchers.get(); + + while (oldLen != newLen) { + // substitue first occurence of "/xx/../" by "/" + oldLen = fileWorkCopy.length(); + fileWorkCopy = Util.substitute(matcher, relativePathRule.pattern, + relativePathRule.substitution, fileWorkCopy, 1); + + // remove leading "/../" + fileWorkCopy = Util.substitute(matcher, leadingRelativePathRule.pattern, + leadingRelativePathRule.substitution, fileWorkCopy, 1); + + // remove unnecessary "/./" + fileWorkCopy = Util.substitute(matcher, currentPathRule.pattern, + currentPathRule.substitution, fileWorkCopy, 1); + + // collapse adjacent slashes with "/" + fileWorkCopy = Util.substitute(matcher, adjacentSlashRule.pattern, + adjacentSlashRule.substitution, fileWorkCopy, 1); - /** - * Class which holds a compiled pattern and its corresponding substition - * string. - */ - private static class Rule { - public Perl5Pattern pattern; - public Perl5Substitution substitution; + newLen = fileWorkCopy.length(); } -} + return fileWorkCopy; + } + + /** + * Class which holds a compiled pattern and its corresponding substition + * string. + */ + private static class Rule { + public Perl5Pattern pattern; + public Perl5Substitution substitution; + } +} Modified: nutch/branches/2.x/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java (original) +++ nutch/branches/2.x/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java Fri Jan 9 06:34:33 2015 @@ -20,3 +20,4 @@ * and dot segments in path. */ package org.apache.nutch.net.urlnormalizer.basic; + Modified: nutch/branches/2.x/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java (original) +++ nutch/branches/2.x/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java Fri Jan 9 06:34:33 2015 @@ -29,7 +29,7 @@ import static org.junit.Assert.*; public class TestBasicURLNormalizer { private BasicURLNormalizer normalizer; private Configuration conf; - + @Before public void setUp() { normalizer = new BasicURLNormalizer(); @@ -59,57 +59,47 @@ public class TestBasicURLNormalizer { // check that references are removed normalizeTest("http://foo.com/foo.html#ref", "http://foo.com/foo.html"); - // // check that encoding is normalized - // normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html"); + // // check that encoding is normalized + // normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html"); // check that unnecessary "../" are removed - normalizeTest("http://foo.com/aa/./foo.html", - "http://foo.com/aa/foo.html" ); - normalizeTest("http://foo.com/aa/../", - "http://foo.com/" ); - normalizeTest("http://foo.com/aa/bb/../", - "http://foo.com/aa/"); - normalizeTest("http://foo.com/aa/..", - "http://foo.com/aa/.."); + normalizeTest("http://foo.com/aa/./foo.html", "http://foo.com/aa/foo.html"); + normalizeTest("http://foo.com/aa/../", "http://foo.com/"); + normalizeTest("http://foo.com/aa/bb/../", "http://foo.com/aa/"); + normalizeTest("http://foo.com/aa/..", "http://foo.com/aa/.."); normalizeTest("http://foo.com/aa/bb/cc/../../foo.html", - "http://foo.com/aa/foo.html"); + "http://foo.com/aa/foo.html"); normalizeTest("http://foo.com/aa/bb/../cc/dd/../ee/foo.html", - "http://foo.com/aa/cc/ee/foo.html"); - normalizeTest("http://foo.com/../foo.html", - "http://foo.com/foo.html" ); - normalizeTest("http://foo.com/../../foo.html", - "http://foo.com/foo.html" ); - normalizeTest("http://foo.com/../aa/../foo.html", - "http://foo.com/foo.html" ); - normalizeTest("http://foo.com/aa/../../foo.html", - "http://foo.com/foo.html" ); + "http://foo.com/aa/cc/ee/foo.html"); + normalizeTest("http://foo.com/../foo.html", "http://foo.com/foo.html"); + normalizeTest("http://foo.com/../../foo.html", "http://foo.com/foo.html"); + normalizeTest("http://foo.com/../aa/../foo.html", "http://foo.com/foo.html"); + normalizeTest("http://foo.com/aa/../../foo.html", "http://foo.com/foo.html"); normalizeTest("http://foo.com/aa/../bb/../foo.html/../../", - "http://foo.com/" ); - normalizeTest("http://foo.com/../aa/foo.html", - "http://foo.com/aa/foo.html" ); - normalizeTest("http://foo.com/../aa/../foo.html", - "http://foo.com/foo.html" ); + "http://foo.com/"); + normalizeTest("http://foo.com/../aa/foo.html", "http://foo.com/aa/foo.html"); + normalizeTest("http://foo.com/../aa/../foo.html", "http://foo.com/foo.html"); normalizeTest("http://foo.com/a..a/foo.html", - "http://foo.com/a..a/foo.html" ); - normalizeTest("http://foo.com/a..a/../foo.html", - "http://foo.com/foo.html" ); + "http://foo.com/a..a/foo.html"); + normalizeTest("http://foo.com/a..a/../foo.html", "http://foo.com/foo.html"); normalizeTest("http://foo.com/foo.foo/../foo.html", - "http://foo.com/foo.html" ); + "http://foo.com/foo.html"); normalizeTest("http://foo.com//aa/bb/foo.html", - "http://foo.com/aa/bb/foo.html" ); + "http://foo.com/aa/bb/foo.html"); normalizeTest("http://foo.com/aa//bb/foo.html", - "http://foo.com/aa/bb/foo.html" ); + "http://foo.com/aa/bb/foo.html"); normalizeTest("http://foo.com/aa/bb//foo.html", - "http://foo.com/aa/bb/foo.html" ); + "http://foo.com/aa/bb/foo.html"); normalizeTest("http://foo.com//aa//bb//foo.html", - "http://foo.com/aa/bb/foo.html" ); + "http://foo.com/aa/bb/foo.html"); normalizeTest("http://foo.com////aa////bb////foo.html", - "http://foo.com/aa/bb/foo.html" ); + "http://foo.com/aa/bb/foo.html"); } private void normalizeTest(String weird, String normal) throws Exception { - assertEquals(normal, normalizer.normalize(weird, URLNormalizers.SCOPE_DEFAULT)); + assertEquals(normal, + normalizer.normalize(weird, URLNormalizers.SCOPE_DEFAULT)); } } \ No newline at end of file Modified: nutch/branches/2.x/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java (original) +++ nutch/branches/2.x/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java Fri Jan 9 06:34:33 2015 @@ -24,15 +24,17 @@ import org.apache.nutch.net.URLNormalize /** * This URLNormalizer doesn't change urls. It is sometimes useful if for a given - * scope at least one normalizer must be defined but no transformations are required. + * scope at least one normalizer must be defined but no transformations are + * required. * * @author Andrzej Bialecki */ public class PassURLNormalizer implements URLNormalizer { private Configuration conf; - - public String normalize(String urlString, String scope) throws MalformedURLException { + + public String normalize(String urlString, String scope) + throws MalformedURLException { return urlString; } @@ -41,7 +43,7 @@ public class PassURLNormalizer implement } public void setConf(Configuration conf) { - this.conf = conf; + this.conf = conf; } } Modified: nutch/branches/2.x/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/package-info.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/package-info.java (original) +++ nutch/branches/2.x/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/package-info.java Fri Jan 9 06:34:33 2015 @@ -20,3 +20,4 @@ * one URL normalizer must be defined in any scope. */ package org.apache.nutch.net.urlnormalizer.pass; + Modified: nutch/branches/2.x/src/plugin/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java (original) +++ nutch/branches/2.x/src/plugin/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java Fri Jan 9 06:34:33 2015 @@ -16,7 +16,6 @@ */ package org.apache.nutch.net.urlnormalizer.pass; - import java.net.MalformedURLException; import org.apache.hadoop.conf.Configuration; @@ -31,7 +30,7 @@ public class TestPassURLNormalizer { @Test public void testPassURLNormalizer() { Configuration conf = NutchConfiguration.create(); - + PassURLNormalizer normalizer = new PassURLNormalizer(); normalizer.setConf(conf); String url = "http://www.example.com/test/..//"; @@ -41,7 +40,7 @@ public class TestPassURLNormalizer { } catch (MalformedURLException mue) { fail(mue.toString()); } - + assertEquals(url, result); } } Modified: nutch/branches/2.x/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java (original) +++ nutch/branches/2.x/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java Fri Jan 9 06:34:33 2015 @@ -51,19 +51,23 @@ import org.xml.sax.InputSource; * Allows users to do regex substitutions on all/any URLs that are encountered, * which is useful for stripping session IDs from URLs. * - * <p>This class uses the <tt>urlnormalizer.regex.file</tt> property. - * It should be set to the file name of an xml file which should contain the - * patterns and substitutions to be done on encountered URLs. + * <p> + * This class uses the <tt>urlnormalizer.regex.file</tt> property. It should be + * set to the file name of an xml file which should contain the patterns and + * substitutions to be done on encountered URLs. + * </p> + * <p> + * This class also supports different rules depending on the scope. Please see + * the javadoc in {@link org.apache.nutch.net.URLNormalizers} for more details. * </p> - * <p>This class also supports different rules depending on the scope. Please see - * the javadoc in {@link org.apache.nutch.net.URLNormalizers} for more details.</p> * * @author Luke Baker * @author Andrzej Bialecki */ public class RegexURLNormalizer extends Configured implements URLNormalizer { - private static final Logger LOG = LoggerFactory.getLogger(RegexURLNormalizer.class); + private static final Logger LOG = LoggerFactory + .getLogger(RegexURLNormalizer.class); /** * Class which holds a compiled pattern and its corresponding substition @@ -75,19 +79,18 @@ public class RegexURLNormalizer extends public String substitution; } - private ThreadLocal<HashMap<String, List<Rule>>> scopedRulesThreadLocal = - new ThreadLocal<HashMap<String,List<Rule>>>() { - protected java.util.HashMap<String,java.util.List<Rule>> initialValue() { + private ThreadLocal<HashMap<String, List<Rule>>> scopedRulesThreadLocal = new ThreadLocal<HashMap<String, List<Rule>>>() { + protected java.util.HashMap<String, java.util.List<Rule>> initialValue() { return new HashMap<String, List<Rule>>(); }; }; - + public HashMap<String, List<Rule>> getScopedRules() { return scopedRulesThreadLocal.get(); } - - private List<Rule> defaultRules; - + + private List<Rule> defaultRules; + private static final List<Rule> EMPTY_RULES = Collections.emptyList(); /** @@ -107,7 +110,7 @@ public class RegexURLNormalizer extends * configuration files for it. */ public RegexURLNormalizer(Configuration conf, String filename) - throws IOException, PatternSyntaxException { + throws IOException, PatternSyntaxException { super(conf); List<Rule> rules = readConfigurationFile(filename); if (rules != null) { @@ -117,7 +120,8 @@ public class RegexURLNormalizer extends public void setConf(Configuration conf) { super.setConf(conf); - if (conf == null) return; + if (conf == null) + return; // the default constructor was called String filename = getConf().get("urlnormalizer.regex.file"); @@ -147,9 +151,10 @@ public class RegexURLNormalizer extends void setConfiguration(Reader reader, String scope) { List<Rule> rules = readConfiguration(reader); getScopedRules().put(scope, rules); - LOG.debug("Set config for scope '" + scope + "': " + rules.size() + " rules."); + LOG.debug("Set config for scope '" + scope + "': " + rules.size() + + " rules."); } - + /** * This function does the replacements by iterating through all the regex * patterns. It accepts a string url as input and returns the altered string. @@ -190,7 +195,7 @@ public class RegexURLNormalizer extends } public String normalize(String urlString, String scope) - throws MalformedURLException { + throws MalformedURLException { return regexNormalize(urlString, scope); } @@ -207,17 +212,17 @@ public class RegexURLNormalizer extends return EMPTY_RULES; } } - + private List<Rule> readConfiguration(Reader reader) { List<Rule> rules = new ArrayList<Rule>(); try { // borrowed heavily from code in Configuration.java Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder() - .parse(new InputSource(reader)); + .parse(new InputSource(reader)); Element root = doc.getDocumentElement(); if ((!"regex-normalize".equals(root.getTagName())) - && (LOG.isErrorEnabled())) { + && (LOG.isErrorEnabled())) { LOG.error("bad conf file: top-level element not <regex-normalize>"); } NodeList regexes = root.getChildNodes(); @@ -240,7 +245,7 @@ public class RegexURLNormalizer extends if ("pattern".equals(field.getTagName()) && field.hasChildNodes()) patternValue = ((Text) field.getFirstChild()).getData(); if ("substitution".equals(field.getTagName()) - && field.hasChildNodes()) + && field.hasChildNodes()) subValue = ((Text) field.getFirstChild()).getData(); if (!field.hasChildNodes()) subValue = ""; @@ -251,7 +256,8 @@ public class RegexURLNormalizer extends rule.pattern = Pattern.compile(patternValue); } catch (PatternSyntaxException e) { if (LOG.isErrorEnabled()) { - LOG.error("skipped rule: " + patternValue + " -> " + subValue + " : invalid regular expression pattern: " + e); + LOG.error("skipped rule: " + patternValue + " -> " + subValue + + " : invalid regular expression pattern: " + e); } continue; } @@ -265,13 +271,14 @@ public class RegexURLNormalizer extends } return EMPTY_RULES; } - if (rules.size() == 0) return EMPTY_RULES; + if (rules.size() == 0) + return EMPTY_RULES; return rules; } /** Spits out patterns and substitutions that are in the configuration file. */ public static void main(String args[]) throws PatternSyntaxException, - IOException { + IOException { RegexURLNormalizer normalizer = new RegexURLNormalizer(); normalizer.setConf(NutchConfiguration.create()); HashMap<String, List<Rule>> scopedRules = normalizer.getScopedRules(); @@ -290,9 +297,10 @@ public class RegexURLNormalizer extends Iterator<String> it = scopedRules.keySet().iterator(); while (it.hasNext()) { String scope = it.next(); - if (URLNormalizers.SCOPE_DEFAULT.equals(scope)) continue; + if (URLNormalizers.SCOPE_DEFAULT.equals(scope)) + continue; System.out.println("* Rules for '" + scope + "' scope:"); - i = ((List<Rule>)scopedRules.get(scope)).iterator(); + i = ((List<Rule>) scopedRules.get(scope)).iterator(); while (i.hasNext()) { Rule r = (Rule) i.next(); System.out.print(" " + r.pattern.pattern() + " -> "); @@ -303,10 +311,12 @@ public class RegexURLNormalizer extends if (args.length > 0) { System.out.println("\n---------- Normalizer test -----------"); String scope = URLNormalizers.SCOPE_DEFAULT; - if (args.length > 1) scope = args[1]; + if (args.length > 1) + scope = args[1]; System.out.println("Scope: " + scope); System.out.println("Input url: '" + args[0] + "'"); - System.out.println("Output url: '" + normalizer.normalize(args[0], scope) + "'"); + System.out.println("Output url: '" + normalizer.normalize(args[0], scope) + + "'"); } System.exit(0); } Modified: nutch/branches/2.x/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/package-info.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/package-info.java (original) +++ nutch/branches/2.x/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/package-info.java Fri Jan 9 06:34:33 2015 @@ -20,3 +20,4 @@ * ({@link java.util.regex.Pattern}). */ package org.apache.nutch.net.urlnormalizer.regex; + Modified: nutch/branches/2.x/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java (original) +++ nutch/branches/2.x/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java Fri Jan 9 06:34:33 2015 @@ -40,17 +40,19 @@ import org.apache.nutch.util.NutchConfig /** Unit tests for RegexUrlNormalizer. */ public class TestRegexURLNormalizer { - private static final Logger LOG = LoggerFactory.getLogger(TestRegexURLNormalizer.class); - + private static final Logger LOG = LoggerFactory + .getLogger(TestRegexURLNormalizer.class); + private RegexURLNormalizer normalizer; private Configuration conf; private HashMap<String, NormalizedURL[]> testData = new HashMap<String, NormalizedURL[]>(); - + // This system property is defined in ./src/plugin/build-plugin.xml private String sampleDir = System.getProperty("test.data", "."); + // Make sure sample files are copied to "test.data" as specified in // ./src/plugin/urlnormalizer-regex/build.xml during plugin compilation. - + @Before public void setUp() throws IOException { normalizer = new RegexURLNormalizer(); @@ -58,7 +60,8 @@ public class TestRegexURLNormalizer { normalizer.setConf(conf); File[] configs = new File(sampleDir).listFiles(new FileFilter() { public boolean accept(File f) { - if (f.getName().endsWith(".xml") && f.getName().startsWith("regex-normalize-")) + if (f.getName().endsWith(".xml") + && f.getName().startsWith("regex-normalize-")) return true; return false; } @@ -79,8 +82,8 @@ public class TestRegexURLNormalizer { @Test public void testNormalizerDefault() throws Exception { - normalizeTest((NormalizedURL[])testData.get(URLNormalizers.SCOPE_DEFAULT), - URLNormalizers.SCOPE_DEFAULT); + normalizeTest((NormalizedURL[]) testData.get(URLNormalizers.SCOPE_DEFAULT), + URLNormalizers.SCOPE_DEFAULT); } @Test @@ -88,36 +91,31 @@ public class TestRegexURLNormalizer { Iterator<String> it = testData.keySet().iterator(); while (it.hasNext()) { String scope = it.next(); - normalizeTest((NormalizedURL[])testData.get(scope), scope); + normalizeTest((NormalizedURL[]) testData.get(scope), scope); } } - private void normalizeTest(NormalizedURL[] urls, String scope) throws Exception { + private void normalizeTest(NormalizedURL[] urls, String scope) + throws Exception { for (int i = 0; i < urls.length; i++) { String url = urls[i].url; String normalized = normalizer.normalize(urls[i].url, scope); String expected = urls[i].expectedURL; - LOG.info("scope: " + scope + " url: " + url + " | normalized: " + normalized + " | expected: " + expected); + LOG.info("scope: " + scope + " url: " + url + " | normalized: " + + normalized + " | expected: " + expected); assertEquals(urls[i].expectedURL, normalized); } } - - /** Currently this is not being used in this class - private void bench(int loops, String scope) { - long start = System.currentTimeMillis(); - try { - NormalizedURL[] expected = (NormalizedURL[])testData.get(scope); - if (expected == null) return; - for (int i = 0; i < loops; i++) { - normalizeTest(expected, scope); - } - } catch (Exception e) { - fail(e.toString()); - } - LOG.info("bench time (" + loops + ") " + - (System.currentTimeMillis() - start) + "ms"); - } - */ + + /** + * Currently this is not being used in this class private void bench(int + * loops, String scope) { long start = System.currentTimeMillis(); try { + * NormalizedURL[] expected = (NormalizedURL[])testData.get(scope); if + * (expected == null) return; for (int i = 0; i < loops; i++) { + * normalizeTest(expected, scope); } } catch (Exception e) { + * fail(e.toString()); } LOG.info("bench time (" + loops + ") " + + * (System.currentTimeMillis() - start) + "ms"); } + */ private static class NormalizedURL { String url; @@ -132,16 +130,17 @@ public class TestRegexURLNormalizer { private NormalizedURL[] readTestFile(String scope) throws IOException { File f = new File(sampleDir, "regex-normalize-" + scope + ".test"); - BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8")); + BufferedReader in = new BufferedReader(new InputStreamReader( + new FileInputStream(f), "UTF-8")); List<NormalizedURL> list = new ArrayList<NormalizedURL>(); String line; - while((line = in.readLine()) != null) { - if ( line.trim().length() == 0 || - line.startsWith("#") || - line.startsWith(" ")) continue; + while ((line = in.readLine()) != null) { + if (line.trim().length() == 0 || line.startsWith("#") + || line.startsWith(" ")) + continue; list.add(new NormalizedURL(line)); } in.close(); return (NormalizedURL[]) list.toArray(new NormalizedURL[list.size()]); - } + } } Modified: nutch/branches/2.x/src/test/org/apache/nutch/api/TestAPI.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/api/TestAPI.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/test/org/apache/nutch/api/TestAPI.java (original) +++ nutch/branches/2.x/src/test/org/apache/nutch/api/TestAPI.java Fri Jan 9 06:34:33 2015 @@ -37,186 +37,189 @@ import org.junit.Test; public class TestAPI { @Test - public void test() throws Exception {} -// -// private static NutchServer server; -// ClientResource cli; -// -// private static String baseUrl = "http://localhost:8192/nutch/"; -// -// @BeforeClass -// public static void before() throws Exception { -// server = new NutchServer(8192); -// server.start(); -// } -// -// @AfterClass -// public static void after() throws Exception { -// if (!server.stop(false)) { -// for (int i = 1; i < 11; i++) { -// System.err.println("Waiting for jobs to complete - " + i + "s"); -// try { -// Thread.sleep(1000); -// } catch (Exception e) {}; -// server.stop(false); -// if (!server.isRunning()) { -// break; -// } -// } -// } -// if (server.isRunning()) { -// System.err.println("Forcibly stopping server..."); -// server.stop(true); -// } -// } -// -// @Test -// public void testInfoAPI() throws Exception { -// ClientResource cli = new ClientResource(baseUrl); -// String expected = "[[\"admin\",\"Service admin actions\"],[\"confs\",\"Configuration manager\"],[\"db\",\"DB data streaming\"],[\"jobs\",\"Job manager\"]]"; -// String got = cli.get().getText(); -// assertEquals(expected, got); -// } -// -// @SuppressWarnings("rawtypes") -// @Test -// public void testConfsAPI() throws Exception { -// ClientResource cli = new ClientResource(baseUrl + ConfResource.PATH); -// assertEquals("[\"default\"]", cli.get().getText()); -// // create -// Map<String,Object> map = new HashMap<String,Object>(); -// map.put(Params.CONF_ID, "test"); -// HashMap<String,String> props = new HashMap<String,String>(); -// props.put("testProp", "blurfl"); -// map.put(Params.PROPS, props); -// JacksonRepresentation<Map<String,Object>> jr = -// new JacksonRepresentation<Map<String,Object>>(map); -// System.out.println(cli.put(jr).getText()); -// assertEquals("[\"default\",\"test\"]", cli.get().getText()); -// cli = new ClientResource(baseUrl + ConfResource.PATH + "/test"); -// Map res = cli.get(Map.class); -// assertEquals("blurfl", res.get("testProp")); -// // delete -// cli.delete(); -// cli = new ClientResource(baseUrl + ConfResource.PATH); -// assertEquals("[\"default\"]", cli.get().getText()); -// } -// -// @SuppressWarnings("rawtypes") -// @Test -// public void testJobsAPI() throws Exception { -// ClientResource cli = new ClientResource(baseUrl + JobResource.PATH); -// assertEquals("[]", cli.get().getText()); -// // create -// Map<String,Object> map = new HashMap<String,Object>(); -// map.put(Params.JOB_TYPE, JobType.READDB.toString()); -// map.put(Params.CONF_ID, "default"); -// Representation r = cli.put(map); -// String jobId = r.getText(); -// assertNotNull(jobId); -// assertTrue(jobId.startsWith("default-READDB-")); -// // list -// Map[] list = cli.get(Map[].class); -// assertEquals(1, list.length); -// String id = (String)list[0].get("id"); -// String state = (String)list[0].get("state"); -// assertEquals(jobId, id); -// assertEquals(state, "RUNNING"); -// int cnt = 10; -// do { -// try { -// Thread.sleep(2000); -// } catch (Exception e) {}; -// list = cli.get(Map[].class); -// state = (String)list[0].get("state"); -// if (!state.equals("RUNNING")) { -// break; -// } -// } while (--cnt > 0); -// assertTrue(cnt > 0); -// if (list == null) return; -// for (Map m : list) { -// System.out.println(m); -// } -// } -// -// @SuppressWarnings("unchecked") -// @Test -// public void testStopKill() throws Exception { -// ClientResource cli = new ClientResource(baseUrl + JobResource.PATH); -// // create -// Map<String,Object> map = new HashMap<String,Object>(); -// map.put(Params.JOB_TYPE, JobType.CLASS.toString()); -// Map<String,Object> args = new HashMap<String,Object>(); -// map.put(Params.ARGS, args); -// args.put(Nutch.ARG_CLASS, SpinningJob.class.getName()); -// map.put(Params.CONF_ID, "default"); -// Representation r = cli.put(map); -// String jobId = r.getText(); -// cli.release(); -// assertNotNull(jobId); -// System.out.println(jobId); -// assertTrue(jobId.startsWith("default-CLASS-")); -// ClientResource stopCli = new ClientResource(baseUrl + JobResource.PATH + -// "?job=" + jobId + "&cmd=stop"); -// r = stopCli.get(); -// assertEquals("true", r.getText()); -// stopCli.release(); -// Thread.sleep(2000); // wait for the job to finish -// ClientResource jobCli = new ClientResource(baseUrl + JobResource.PATH + "/" + jobId); -// Map<String,Object> res = jobCli.get(Map.class); -// res = (Map<String,Object>)res.get("result"); -// assertEquals("stopped", res.get("res")); -// jobCli.release(); -// // restart and kill -// r = cli.put(map); -// jobId = r.getText(); -// cli.release(); -// assertNotNull(jobId); -// System.out.println(jobId); -// assertTrue(jobId.startsWith("default-CLASS-")); -// ClientResource killCli = new ClientResource(baseUrl + JobResource.PATH + -// "?job=" + jobId + "&cmd=abort"); -// r = killCli.get(); -// assertEquals("true", r.getText()); -// killCli.release(); -// Thread.sleep(2000); // wait for the job to finish -// jobCli = new ClientResource(baseUrl + JobResource.PATH + "/" + jobId); -// res = jobCli.get(Map.class); -// res = (Map<String,Object>)res.get("result"); -// assertEquals("killed", res.get("res")); -// jobCli.release(); -// } -// -// public static class SpinningJob extends NutchTool { -// volatile boolean shouldStop = false; -// -// @Override -// public Map<String, Object> run(Map<String, Object> args) throws Exception { -// status.put(Nutch.STAT_MESSAGE, "running"); -// int cnt = 60; -// while (!shouldStop && cnt-- > 0) { -// Thread.sleep(1000); -// } -// if (cnt == 0) { -// results.put("res", "failed"); -// } -// return results; -// } -// -// @Override -// public boolean stopJob() throws Exception { -// results.put("res", "stopped"); -// shouldStop = true; -// return true; -// } -// -// @Override -// public boolean killJob() throws Exception { -// results.put("res", "killed"); -// shouldStop = true; -// return true; -// } -// -// } + public void test() throws Exception { + } + // + // private static NutchServer server; + // ClientResource cli; + // + // private static String baseUrl = "http://localhost:8192/nutch/"; + // + // @BeforeClass + // public static void before() throws Exception { + // server = new NutchServer(8192); + // server.start(); + // } + // + // @AfterClass + // public static void after() throws Exception { + // if (!server.stop(false)) { + // for (int i = 1; i < 11; i++) { + // System.err.println("Waiting for jobs to complete - " + i + "s"); + // try { + // Thread.sleep(1000); + // } catch (Exception e) {}; + // server.stop(false); + // if (!server.isRunning()) { + // break; + // } + // } + // } + // if (server.isRunning()) { + // System.err.println("Forcibly stopping server..."); + // server.stop(true); + // } + // } + // + // @Test + // public void testInfoAPI() throws Exception { + // ClientResource cli = new ClientResource(baseUrl); + // String expected = + // "[[\"admin\",\"Service admin actions\"],[\"confs\",\"Configuration manager\"],[\"db\",\"DB data streaming\"],[\"jobs\",\"Job manager\"]]"; + // String got = cli.get().getText(); + // assertEquals(expected, got); + // } + // + // @SuppressWarnings("rawtypes") + // @Test + // public void testConfsAPI() throws Exception { + // ClientResource cli = new ClientResource(baseUrl + ConfResource.PATH); + // assertEquals("[\"default\"]", cli.get().getText()); + // // create + // Map<String,Object> map = new HashMap<String,Object>(); + // map.put(Params.CONF_ID, "test"); + // HashMap<String,String> props = new HashMap<String,String>(); + // props.put("testProp", "blurfl"); + // map.put(Params.PROPS, props); + // JacksonRepresentation<Map<String,Object>> jr = + // new JacksonRepresentation<Map<String,Object>>(map); + // System.out.println(cli.put(jr).getText()); + // assertEquals("[\"default\",\"test\"]", cli.get().getText()); + // cli = new ClientResource(baseUrl + ConfResource.PATH + "/test"); + // Map res = cli.get(Map.class); + // assertEquals("blurfl", res.get("testProp")); + // // delete + // cli.delete(); + // cli = new ClientResource(baseUrl + ConfResource.PATH); + // assertEquals("[\"default\"]", cli.get().getText()); + // } + // + // @SuppressWarnings("rawtypes") + // @Test + // public void testJobsAPI() throws Exception { + // ClientResource cli = new ClientResource(baseUrl + JobResource.PATH); + // assertEquals("[]", cli.get().getText()); + // // create + // Map<String,Object> map = new HashMap<String,Object>(); + // map.put(Params.JOB_TYPE, JobType.READDB.toString()); + // map.put(Params.CONF_ID, "default"); + // Representation r = cli.put(map); + // String jobId = r.getText(); + // assertNotNull(jobId); + // assertTrue(jobId.startsWith("default-READDB-")); + // // list + // Map[] list = cli.get(Map[].class); + // assertEquals(1, list.length); + // String id = (String)list[0].get("id"); + // String state = (String)list[0].get("state"); + // assertEquals(jobId, id); + // assertEquals(state, "RUNNING"); + // int cnt = 10; + // do { + // try { + // Thread.sleep(2000); + // } catch (Exception e) {}; + // list = cli.get(Map[].class); + // state = (String)list[0].get("state"); + // if (!state.equals("RUNNING")) { + // break; + // } + // } while (--cnt > 0); + // assertTrue(cnt > 0); + // if (list == null) return; + // for (Map m : list) { + // System.out.println(m); + // } + // } + // + // @SuppressWarnings("unchecked") + // @Test + // public void testStopKill() throws Exception { + // ClientResource cli = new ClientResource(baseUrl + JobResource.PATH); + // // create + // Map<String,Object> map = new HashMap<String,Object>(); + // map.put(Params.JOB_TYPE, JobType.CLASS.toString()); + // Map<String,Object> args = new HashMap<String,Object>(); + // map.put(Params.ARGS, args); + // args.put(Nutch.ARG_CLASS, SpinningJob.class.getName()); + // map.put(Params.CONF_ID, "default"); + // Representation r = cli.put(map); + // String jobId = r.getText(); + // cli.release(); + // assertNotNull(jobId); + // System.out.println(jobId); + // assertTrue(jobId.startsWith("default-CLASS-")); + // ClientResource stopCli = new ClientResource(baseUrl + JobResource.PATH + + // "?job=" + jobId + "&cmd=stop"); + // r = stopCli.get(); + // assertEquals("true", r.getText()); + // stopCli.release(); + // Thread.sleep(2000); // wait for the job to finish + // ClientResource jobCli = new ClientResource(baseUrl + JobResource.PATH + "/" + // + jobId); + // Map<String,Object> res = jobCli.get(Map.class); + // res = (Map<String,Object>)res.get("result"); + // assertEquals("stopped", res.get("res")); + // jobCli.release(); + // // restart and kill + // r = cli.put(map); + // jobId = r.getText(); + // cli.release(); + // assertNotNull(jobId); + // System.out.println(jobId); + // assertTrue(jobId.startsWith("default-CLASS-")); + // ClientResource killCli = new ClientResource(baseUrl + JobResource.PATH + + // "?job=" + jobId + "&cmd=abort"); + // r = killCli.get(); + // assertEquals("true", r.getText()); + // killCli.release(); + // Thread.sleep(2000); // wait for the job to finish + // jobCli = new ClientResource(baseUrl + JobResource.PATH + "/" + jobId); + // res = jobCli.get(Map.class); + // res = (Map<String,Object>)res.get("result"); + // assertEquals("killed", res.get("res")); + // jobCli.release(); + // } + // + // public static class SpinningJob extends NutchTool { + // volatile boolean shouldStop = false; + // + // @Override + // public Map<String, Object> run(Map<String, Object> args) throws Exception { + // status.put(Nutch.STAT_MESSAGE, "running"); + // int cnt = 60; + // while (!shouldStop && cnt-- > 0) { + // Thread.sleep(1000); + // } + // if (cnt == 0) { + // results.put("res", "failed"); + // } + // return results; + // } + // + // @Override + // public boolean stopJob() throws Exception { + // results.put("res", "stopped"); + // shouldStop = true; + // return true; + // } + // + // @Override + // public boolean killJob() throws Exception { + // results.put("res", "killed"); + // shouldStop = true; + // return true; + // } + // + // } } Modified: nutch/branches/2.x/src/test/org/apache/nutch/crawl/DummyWritable.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/crawl/DummyWritable.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/test/org/apache/nutch/crawl/DummyWritable.java (original) +++ nutch/branches/2.x/src/test/org/apache/nutch/crawl/DummyWritable.java Fri Jan 9 06:34:33 2015 @@ -21,12 +21,12 @@ import org.apache.hadoop.io.IntWritable; public class DummyWritable extends IntWritable { - public DummyWritable() { + public DummyWritable() { - } + } - public DummyWritable(int i) { - super(i); - } + public DummyWritable(int i) { + super(i); + } } Modified: nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java (original) +++ nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java Fri Jan 9 06:34:33 2015 @@ -40,32 +40,30 @@ import static org.junit.Assert.assertEqu * Basic generator test. 1. Insert entries in webtable 2. Generates entries to * fetch 3. Verifies that number of generated urls match 4. Verifies that * highest scoring urls are generated - * + * */ public class TestGenerator extends AbstractNutchTest { public static final Logger LOG = LoggerFactory.getLogger(TestGenerator.class); private static String[] FIELDS = new String[] { - WebPage.Field.MARKERS.getName(), - WebPage.Field.SCORE.getName() - }; - + WebPage.Field.MARKERS.getName(), WebPage.Field.SCORE.getName() }; + @Override @Before - public void setUp() throws Exception{ + public void setUp() throws Exception { super.setUp(); } - + @Override @After - public void tearDown()throws Exception { + public void tearDown() throws Exception { super.tearDown(); } /** * Test that generator generates fetchlist ordered by score (desc). - * + * * @throws Exception */ @Test @@ -87,7 +85,8 @@ public class TestGenerator extends Abstr generateFetchlist(NUM_RESULTS, conf, false); - ArrayList<URLWebPage> l = CrawlTestUtil.readContents(webPageStore, Mark.GENERATE_MARK, FIELDS); + ArrayList<URLWebPage> l = CrawlTestUtil.readContents(webPageStore, + Mark.GENERATE_MARK, FIELDS); // sort urls by score desc Collections.sort(l, new ScoreComparator()); @@ -125,8 +124,9 @@ public class TestGenerator extends Abstr } /** - * Test that generator obeys the property "generate.max.count" and "generate.count.mode". - * + * Test that generator obeys the property "generate.max.count" and + * "generate.count.mode". + * * @throws Exception */ @Test @@ -145,10 +145,12 @@ public class TestGenerator extends Abstr Configuration myConfiguration = new Configuration(conf); myConfiguration.setInt(GeneratorJob.GENERATOR_MAX_COUNT, 1); - myConfiguration.set(GeneratorJob.GENERATOR_COUNT_MODE, GeneratorJob.GENERATOR_COUNT_VALUE_HOST); + myConfiguration.set(GeneratorJob.GENERATOR_COUNT_MODE, + GeneratorJob.GENERATOR_COUNT_VALUE_HOST); generateFetchlist(Integer.MAX_VALUE, myConfiguration, false); - ArrayList<URLWebPage> fetchList = CrawlTestUtil.readContents(webPageStore, Mark.GENERATE_MARK, FIELDS); + ArrayList<URLWebPage> fetchList = CrawlTestUtil.readContents(webPageStore, + Mark.GENERATE_MARK, FIELDS); // verify we got right amount of records assertEquals(1, fetchList.size()); @@ -157,25 +159,27 @@ public class TestGenerator extends Abstr myConfiguration.setInt(GeneratorJob.GENERATOR_MAX_COUNT, 2); generateFetchlist(Integer.MAX_VALUE, myConfiguration, false); - fetchList = CrawlTestUtil.readContents(webPageStore, Mark.GENERATE_MARK, FIELDS); + fetchList = CrawlTestUtil.readContents(webPageStore, Mark.GENERATE_MARK, + FIELDS); // verify we got right amount of records - assertEquals(3, fetchList.size()); //3 as 2 + 1 skipped (already generated) + assertEquals(3, fetchList.size()); // 3 as 2 + 1 skipped (already generated) myConfiguration = new Configuration(conf); myConfiguration.setInt(GeneratorJob.GENERATOR_MAX_COUNT, 3); generateFetchlist(Integer.MAX_VALUE, myConfiguration, false); - fetchList = CrawlTestUtil.readContents(webPageStore, Mark.GENERATE_MARK, FIELDS); + fetchList = CrawlTestUtil.readContents(webPageStore, Mark.GENERATE_MARK, + FIELDS); // verify we got right amount of records - assertEquals(3, fetchList.size()); //3 as now all have generate mark + assertEquals(3, fetchList.size()); // 3 as now all have generate mark } /** * Test that generator obeys the property "generator.max.count" and * "generator.count.value=domain". - * + * * @throws Exception */ @Test @@ -197,11 +201,13 @@ public class TestGenerator extends Abstr Configuration myConfiguration = new Configuration(conf); myConfiguration.setInt(GeneratorJob.GENERATOR_MAX_COUNT, 1); - myConfiguration.set(GeneratorJob.GENERATOR_COUNT_MODE, GeneratorJob.GENERATOR_COUNT_VALUE_DOMAIN); + myConfiguration.set(GeneratorJob.GENERATOR_COUNT_MODE, + GeneratorJob.GENERATOR_COUNT_VALUE_DOMAIN); generateFetchlist(Integer.MAX_VALUE, myConfiguration, false); - ArrayList<URLWebPage> fetchList = CrawlTestUtil.readContents(webPageStore, Mark.GENERATE_MARK, FIELDS); + ArrayList<URLWebPage> fetchList = CrawlTestUtil.readContents(webPageStore, + Mark.GENERATE_MARK, FIELDS); // verify we got right amount of records assertEquals(1, fetchList.size()); @@ -210,7 +216,8 @@ public class TestGenerator extends Abstr myConfiguration.setInt(GeneratorJob.GENERATOR_MAX_COUNT, 2); generateFetchlist(Integer.MAX_VALUE, myConfiguration, false); - fetchList = CrawlTestUtil.readContents(webPageStore, Mark.GENERATE_MARK, FIELDS); + fetchList = CrawlTestUtil.readContents(webPageStore, Mark.GENERATE_MARK, + FIELDS); // verify we got right amount of records assertEquals(3, fetchList.size()); // 2 + 1 skipped (already generated) @@ -219,7 +226,8 @@ public class TestGenerator extends Abstr myConfiguration.setInt(GeneratorJob.GENERATOR_MAX_COUNT, 3); generateFetchlist(Integer.MAX_VALUE, myConfiguration, false); - fetchList = CrawlTestUtil.readContents(webPageStore, Mark.GENERATE_MARK, FIELDS); + fetchList = CrawlTestUtil.readContents(webPageStore, Mark.GENERATE_MARK, + FIELDS); // verify we got right amount of records assertEquals(6, fetchList.size()); // 3 + 3 skipped (already generated) @@ -227,7 +235,7 @@ public class TestGenerator extends Abstr /** * Test generator obeys the filter setting. - * + * * @throws Exception * @throws IOException */ @@ -251,13 +259,15 @@ public class TestGenerator extends Abstr generateFetchlist(Integer.MAX_VALUE, myConfiguration, true); - ArrayList<URLWebPage> fetchList = CrawlTestUtil.readContents(webPageStore, Mark.GENERATE_MARK, FIELDS); + ArrayList<URLWebPage> fetchList = CrawlTestUtil.readContents(webPageStore, + Mark.GENERATE_MARK, FIELDS); assertEquals(0, fetchList.size()); generateFetchlist(Integer.MAX_VALUE, myConfiguration, false); - fetchList = CrawlTestUtil.readContents(webPageStore, Mark.GENERATE_MARK, FIELDS); + fetchList = CrawlTestUtil.readContents(webPageStore, Mark.GENERATE_MARK, + FIELDS); // verify nothing got filtered assertEquals(list.size(), fetchList.size()); @@ -266,7 +276,7 @@ public class TestGenerator extends Abstr /** * Generate Fetchlist. - * + * * @param numResults * number of results to generate * @param config @@ -279,14 +289,15 @@ public class TestGenerator extends Abstr // generate batch GeneratorJob g = new GeneratorJob(); g.setConf(config); - String batchId = g.generate(numResults, System.currentTimeMillis(), filter, false); + String batchId = g.generate(numResults, System.currentTimeMillis(), filter, + false); if (batchId == null) throw new RuntimeException("Generator failed"); } /** * Constructs new {@link URLWebPage} from submitted parameters. - * + * * @param url * url to use * @param fetchInterval @@ -298,7 +309,7 @@ public class TestGenerator extends Abstr WebPage page = WebPage.newBuilder().build(); page.setFetchInterval(fetchInterval); page.setScore(score); - page.setStatus((int)CrawlStatus.STATUS_UNFETCHED); + page.setStatus((int) CrawlStatus.STATUS_UNFETCHED); return new URLWebPage(url, page); } Modified: nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java (original) +++ nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java Fri Jan 9 06:34:33 2015 @@ -38,7 +38,7 @@ import static org.junit.Assert.assertTru * Basic injector test: 1. Creates a text file with urls 2. Injects them into * crawldb 3. Reads crawldb entries and verifies contents 4. Injects more urls * into webdb 5. Reads crawldb entries and verifies contents - * + * */ public class TestInjector extends AbstractNutchTest { Path urlPath; @@ -101,13 +101,12 @@ public class TestInjector extends Abstra } private static final String[] fields = new String[] { - WebPage.Field.MARKERS.getName(), - WebPage.Field.METADATA.getName(), - WebPage.Field.SCORE.getName() - }; + WebPage.Field.MARKERS.getName(), WebPage.Field.METADATA.getName(), + WebPage.Field.SCORE.getName() }; private List<String> readDb() throws Exception { - List<URLWebPage> pages = CrawlTestUtil.readContents(webPageStore, null, fields); + List<URLWebPage> pages = CrawlTestUtil.readContents(webPageStore, null, + fields); ArrayList<String> read = new ArrayList<String>(); for (URLWebPage up : pages) { WebPage page = up.getDatum(); Modified: nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestSignatureFactory.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestSignatureFactory.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestSignatureFactory.java (original) +++ nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestSignatureFactory.java Fri Jan 9 06:34:33 2015 @@ -26,9 +26,9 @@ public class TestSignatureFactory { @Test public void testGetSignature() { - Configuration conf=NutchConfiguration.create(); - Signature signature1=SignatureFactory.getSignature(conf); - Signature signature2=SignatureFactory.getSignature(conf); + Configuration conf = NutchConfiguration.create(); + Signature signature1 = SignatureFactory.getSignature(conf); + Signature signature2 = SignatureFactory.getSignature(conf); assertNotNull(signature1); assertNotNull(signature2); assertEquals(signature1, signature2); Modified: nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestURLPartitioner.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestURLPartitioner.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestURLPartitioner.java (original) +++ nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestURLPartitioner.java Fri Jan 9 06:34:33 2015 @@ -38,21 +38,25 @@ import static org.junit.Assert.assertNot public class TestURLPartitioner { /** - * tests one reducer, everything goes into one partition, using host partitioner. + * tests one reducer, everything goes into one partition, using host + * partitioner. */ @Test public void testOneReducer() { URLPartitioner partitioner = new URLPartitioner(); Configuration conf = NutchConfiguration.create(); - conf.set(URLPartitioner.PARTITION_MODE_KEY, URLPartitioner.PARTITION_MODE_HOST); + conf.set(URLPartitioner.PARTITION_MODE_KEY, + URLPartitioner.PARTITION_MODE_HOST); partitioner.setConf(conf); - + int numReduceTasks = 1; - - assertEquals(0, partitioner.getPartition("http://example.org", numReduceTasks)); - assertEquals(0, partitioner.getPartition("http://www.apache.org", numReduceTasks)); + + assertEquals(0, + partitioner.getPartition("http://example.org", numReduceTasks)); + assertEquals(0, + partitioner.getPartition("http://www.apache.org", numReduceTasks)); } - + /** * tests partitioning by host */ @@ -60,22 +64,27 @@ public class TestURLPartitioner { public void testModeHost() { URLPartitioner partitioner = new URLPartitioner(); Configuration conf = NutchConfiguration.create(); - conf.set(URLPartitioner.PARTITION_MODE_KEY, URLPartitioner.PARTITION_MODE_HOST); + conf.set(URLPartitioner.PARTITION_MODE_KEY, + URLPartitioner.PARTITION_MODE_HOST); partitioner.setConf(conf); - + int numReduceTasks = 100; - - int partitionWithoutWWW = partitioner.getPartition("http://example.org/", numReduceTasks); - int partitionWithWWW = partitioner.getPartition("http://www.example.org/", numReduceTasks); - assertNotSame("partitions should differ because of different host", + + int partitionWithoutWWW = partitioner.getPartition("http://example.org/", + numReduceTasks); + int partitionWithWWW = partitioner.getPartition("http://www.example.org/", + numReduceTasks); + assertNotSame("partitions should differ because of different host", partitionWithoutWWW, partitionWithWWW); - - int partitionSame1 = partitioner.getPartition("http://www.example.org/paris", numReduceTasks); - int partitionSame2 = partitioner.getPartition("http://www.example.org/london", numReduceTasks); - assertEquals("partitions should be same because of same host", + + int partitionSame1 = partitioner.getPartition( + "http://www.example.org/paris", numReduceTasks); + int partitionSame2 = partitioner.getPartition( + "http://www.example.org/london", numReduceTasks); + assertEquals("partitions should be same because of same host", partitionSame1, partitionSame2); } - + /** * tests partitioning by domain */ @@ -83,22 +92,27 @@ public class TestURLPartitioner { public void testModeDomain() { URLPartitioner partitioner = new URLPartitioner(); Configuration conf = NutchConfiguration.create(); - conf.set(URLPartitioner.PARTITION_MODE_KEY, URLPartitioner.PARTITION_MODE_DOMAIN); + conf.set(URLPartitioner.PARTITION_MODE_KEY, + URLPartitioner.PARTITION_MODE_DOMAIN); partitioner.setConf(conf); - + int numReduceTasks = 100; - - int partitionExample = partitioner.getPartition("http://www.example.org/", numReduceTasks); - int partitionApache = partitioner.getPartition("http://www.apache.org/", numReduceTasks); - assertNotSame("partitions should differ because of different domain", + + int partitionExample = partitioner.getPartition("http://www.example.org/", + numReduceTasks); + int partitionApache = partitioner.getPartition("http://www.apache.org/", + numReduceTasks); + assertNotSame("partitions should differ because of different domain", partitionExample, partitionApache); - - int partitionWithoutWWW = partitioner.getPartition("http://example.org/", numReduceTasks); - int partitionWithWWW = partitioner.getPartition("http://www.example.org/", numReduceTasks); - assertEquals("partitions should be same because of same domain", + + int partitionWithoutWWW = partitioner.getPartition("http://example.org/", + numReduceTasks); + int partitionWithWWW = partitioner.getPartition("http://www.example.org/", + numReduceTasks); + assertEquals("partitions should be same because of same domain", partitionWithoutWWW, partitionWithWWW); } - + /** * tests partitioning by IP */ @@ -106,23 +120,29 @@ public class TestURLPartitioner { public void testModeIP() { URLPartitioner partitioner = new URLPartitioner(); Configuration conf = NutchConfiguration.create(); - conf.set(URLPartitioner.PARTITION_MODE_KEY, URLPartitioner.PARTITION_MODE_IP); + conf.set(URLPartitioner.PARTITION_MODE_KEY, + URLPartitioner.PARTITION_MODE_IP); partitioner.setConf(conf); - + int numReduceTasks = 100; - - int partitionExample = partitioner.getPartition("http://www.example.org/", numReduceTasks); - int partitionApache = partitioner.getPartition("http://www.apache.org/", numReduceTasks); - assertNotSame("partitions should differ because of different ip", + + int partitionExample = partitioner.getPartition("http://www.example.org/", + numReduceTasks); + int partitionApache = partitioner.getPartition("http://www.apache.org/", + numReduceTasks); + assertNotSame("partitions should differ because of different ip", partitionExample, partitionApache); - - int partitionWithoutWWW = partitioner.getPartition("http://example.org/", numReduceTasks); - int partitionWithWWW = partitioner.getPartition("http://www.example.org/", numReduceTasks); - //the following has dependendy on example.org (that is has the same ip as www.example.org) - assertEquals("partitions should be same because of same ip", + + int partitionWithoutWWW = partitioner.getPartition("http://example.org/", + numReduceTasks); + int partitionWithWWW = partitioner.getPartition("http://www.example.org/", + numReduceTasks); + // the following has dependendy on example.org (that is has the same ip as + // www.example.org) + assertEquals("partitions should be same because of same ip", partitionWithoutWWW, partitionWithWWW); } - + /** * Test the seed functionality, using host partitioner. */ @@ -130,84 +150,92 @@ public class TestURLPartitioner { public void testSeed() { URLPartitioner partitioner = new URLPartitioner(); Configuration conf = NutchConfiguration.create(); - conf.set(URLPartitioner.PARTITION_MODE_KEY, URLPartitioner.PARTITION_MODE_HOST); + conf.set(URLPartitioner.PARTITION_MODE_KEY, + URLPartitioner.PARTITION_MODE_HOST); partitioner.setConf(conf); - + int numReduceTasks = 100; - int partitionNoSeed = partitioner.getPartition("http://example.org/", numReduceTasks); - + int partitionNoSeed = partitioner.getPartition("http://example.org/", + numReduceTasks); + conf.setInt(URLPartitioner.PARTITION_URL_SEED, 1); partitioner.setConf(conf); - - int partitionWithSeed = partitioner.getPartition("http://example.org/", numReduceTasks); - - assertNotSame("partitions should differ because of different seed", + + int partitionWithSeed = partitioner.getPartition("http://example.org/", + numReduceTasks); + + assertNotSame("partitions should differ because of different seed", partitionNoSeed, partitionWithSeed); } - /** * Tests the {@link SelectorEntryPartitioner}. */ @Test public void testSelectorEntryPartitioner() { - //The reference partitioner + // The reference partitioner URLPartitioner refPartitioner = new URLPartitioner(); - - //The to be tested partitioner with specific signature - URLPartitioner.SelectorEntryPartitioner sigPartitioner = - new URLPartitioner.SelectorEntryPartitioner(); - + + // The to be tested partitioner with specific signature + URLPartitioner.SelectorEntryPartitioner sigPartitioner = new URLPartitioner.SelectorEntryPartitioner(); + Configuration conf = NutchConfiguration.create(); - conf.set(URLPartitioner.PARTITION_MODE_KEY, URLPartitioner.PARTITION_MODE_HOST); - + conf.set(URLPartitioner.PARTITION_MODE_KEY, + URLPartitioner.PARTITION_MODE_HOST); + refPartitioner.setConf(conf); sigPartitioner.setConf(conf); - + int numReduceTasks = 100; - - int partitionFromRef = refPartitioner.getPartition("http://www.example.org/", numReduceTasks); - //init selector entry (score shouldn't matter) - SelectorEntry selectorEntry = new SelectorEntry("http://www.example.org/", 1337); + + int partitionFromRef = refPartitioner.getPartition( + "http://www.example.org/", numReduceTasks); + // init selector entry (score shouldn't matter) + SelectorEntry selectorEntry = new SelectorEntry("http://www.example.org/", + 1337); WebPage page = WebPage.newBuilder().build(); - int partitionFromSig = sigPartitioner.getPartition(selectorEntry, page, numReduceTasks); - - assertEquals("partitions should be same", - partitionFromRef, partitionFromSig); - + int partitionFromSig = sigPartitioner.getPartition(selectorEntry, page, + numReduceTasks); + + assertEquals("partitions should be same", partitionFromRef, + partitionFromSig); + } - + /** * Tests the {@link FetchEntryPartitioner} - * @throws MalformedURLException + * + * @throws MalformedURLException */ @Test public void testFetchEntryPartitioner() throws MalformedURLException { - //The reference partitioner + // The reference partitioner URLPartitioner refPartitioner = new URLPartitioner(); - - //The to be tested partitioner with specific signature - URLPartitioner.FetchEntryPartitioner sigPartitioner = - new URLPartitioner.FetchEntryPartitioner(); - + + // The to be tested partitioner with specific signature + URLPartitioner.FetchEntryPartitioner sigPartitioner = new URLPartitioner.FetchEntryPartitioner(); + Configuration conf = NutchConfiguration.create(); - conf.set(URLPartitioner.PARTITION_MODE_KEY, URLPartitioner.PARTITION_MODE_HOST); - + conf.set(URLPartitioner.PARTITION_MODE_KEY, + URLPartitioner.PARTITION_MODE_HOST); + refPartitioner.setConf(conf); sigPartitioner.setConf(conf); - + int numReduceTasks = 100; - - int partitionFromRef = refPartitioner.getPartition("http://www.example.org/", numReduceTasks); - IntWritable intWritable = new IntWritable(1337); //doesn't matter + + int partitionFromRef = refPartitioner.getPartition( + "http://www.example.org/", numReduceTasks); + IntWritable intWritable = new IntWritable(1337); // doesn't matter WebPage page = WebPage.newBuilder().build(); String key = TableUtil.reverseUrl("http://www.example.org/"); FetchEntry fetchEntry = new FetchEntry(conf, key, page); - int partitionFromSig = sigPartitioner.getPartition(intWritable, fetchEntry, numReduceTasks); - - assertEquals("partitions should be same", - partitionFromRef, partitionFromSig); - + int partitionFromSig = sigPartitioner.getPartition(intWritable, fetchEntry, + numReduceTasks); + + assertEquals("partitions should be same", partitionFromRef, + partitionFromSig); + } - + } Modified: nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestUrlWithScore.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestUrlWithScore.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestUrlWithScore.java (original) +++ nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestUrlWithScore.java Fri Jan 9 06:34:33 2015 @@ -42,7 +42,7 @@ public class TestUrlWithScore { UrlWithScore keyOut = new UrlWithScore("http://example.org/", 1f); assertEquals("http://example.org/", keyOut.getUrl().toString()); assertEquals(1f, keyOut.getScore().get(), 0.001); - + // write to out ByteArrayOutputStream bos = new ByteArrayOutputStream(); DataOutputStream out = new DataOutputStream(bos); @@ -59,45 +59,45 @@ public class TestUrlWithScore { in.close(); out.close(); } - + @Test public void testPartitioner() throws IOException { UrlOnlyPartitioner part = new UrlOnlyPartitioner(); - + UrlWithScore k1 = new UrlWithScore("http://example.org/1", 1f); UrlWithScore k2 = new UrlWithScore("http://example.org/1", 2f); UrlWithScore k3 = new UrlWithScore("http://example.org/2", 1f); UrlWithScore k4 = new UrlWithScore("http://example.org/2", 2f); UrlWithScore k5 = new UrlWithScore("http://example.org/2", 3f); - + int numReduces = 7; - + // keys 1 and 2 should be partitioned together int partForKey1 = part.getPartition(k1, null, numReduces); assertEquals(partForKey1, part.getPartition(k2, null, numReduces)); assertEquals(partForKey1, part.getPartition(k2, null, numReduces)); - + // keys 3, 4 and 5 should be partitioned together int partForKey3 = part.getPartition(k3, null, numReduces); assertEquals(partForKey3, part.getPartition(k4, null, numReduces)); assertEquals(partForKey3, part.getPartition(k5, null, numReduces)); } - + @Test public void testUrlOnlySorting() throws IOException { UrlOnlyComparator comp = new UrlOnlyComparator(); - + UrlWithScore k1 = new UrlWithScore("http://example.org/1", 1f); UrlWithScore k2 = new UrlWithScore("http://example.org/1", 2f); UrlWithScore k3 = new UrlWithScore("http://example.org/2", 1f); UrlWithScore k4 = new UrlWithScore("http://example.org/2", 2f); UrlWithScore k5 = new UrlWithScore("http://example.org/2", 3f); - + // k1 should be equal to k2 assertEquals(0, compareBothRegularAndRaw(comp, k1, k2)); // test symmetry assertEquals(0, compareBothRegularAndRaw(comp, k2, k1)); - + // k1 is before k3, k4 and k5 assertEquals(-1, compareBothRegularAndRaw(comp, k1, k3)); assertEquals(-1, compareBothRegularAndRaw(comp, k1, k4)); @@ -107,22 +107,22 @@ public class TestUrlWithScore { assertEquals(1, compareBothRegularAndRaw(comp, k4, k1)); assertEquals(1, compareBothRegularAndRaw(comp, k5, k1)); } - + @Test public void testUrlScoreSorting() throws IOException { UrlScoreComparator comp = new UrlScoreComparator(); - + UrlWithScore k1 = new UrlWithScore("http://example.org/1", 1f); UrlWithScore k2 = new UrlWithScore("http://example.org/1", 2f); UrlWithScore k3 = new UrlWithScore("http://example.org/2", 1f); UrlWithScore k4 = new UrlWithScore("http://example.org/2", 2f); UrlWithScore k5 = new UrlWithScore("http://example.org/2", 3f); - + // k1 is after k2, because score is lower assertEquals(1, comp.compare(k1, k2)); // test symmetry assertEquals(-1, comp.compare(k2, k1)); - + // k1 is before k3, k4 and k5, because url is lower assertEquals(-1, compareBothRegularAndRaw(comp, k1, k3)); assertEquals(-1, compareBothRegularAndRaw(comp, k1, k4)); @@ -131,7 +131,7 @@ public class TestUrlWithScore { assertEquals(1, compareBothRegularAndRaw(comp, k3, k1)); assertEquals(1, compareBothRegularAndRaw(comp, k4, k1)); assertEquals(1, compareBothRegularAndRaw(comp, k5, k1)); - + // k3 after k4 and k4 after k5 and therefore k3 after k5 (transitivity) assertEquals(1, compareBothRegularAndRaw(comp, k3, k4)); assertEquals(1, compareBothRegularAndRaw(comp, k4, k5)); @@ -150,19 +150,19 @@ public class TestUrlWithScore { * @param k1 * @param k2 * @return The compare result. (When k1 != k2, assert failure kicks in) - * @throws IOException + * @throws IOException */ - private Object compareBothRegularAndRaw(RawComparator<UrlWithScore> comp, + private Object compareBothRegularAndRaw(RawComparator<UrlWithScore> comp, UrlWithScore k1, UrlWithScore k2) throws IOException { int regular = comp.compare(k1, k2); - + byte[] bytes1 = extractBytes(k1); byte[] bytes2 = extractBytes(k2); - + int raw = comp.compare(bytes1, 0, bytes1.length, bytes2, 0, bytes2.length); - + assertEquals("Regular compare should equal raw compare", regular, raw); - + return regular; } @@ -181,5 +181,5 @@ public class TestUrlWithScore { out.close(); return bytes; } - + }