Author: siren Date: Sun Nov 19 10:48:39 2006 New Revision: 476879 URL: http://svn.apache.org/viewvc?view=rev&rev=476879 Log: NUTCH-403 Make URL filtering optional in Generator
Added: lucene/nutch/trunk/src/test/filter-all.txt Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=476879&r1=476878&r2=476879 ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Sun Nov 19 10:48:39 2006 @@ -78,6 +78,8 @@ 25. NUTCH-404 - Fix LinkDB Usage - implementation mismatch (siren) +26. NUTCH-403 - Make URL filtering optional in Generator (siren) + Release 0.8 - 2006-07-25 0. Totally new architecture, based on hadoop Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?view=diff&rev=476879&r1=476878&r2=476879 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Sun Nov 19 10:48:39 2006 @@ -115,9 +115,8 @@ injector.inject(crawlDb, rootUrlDir); for (int i = 0; i < depth; i++) { // generate new segment - Path segment = - generator.generate(crawlDb, segments, -1, - topN, System.currentTimeMillis()); + Path segment = generator.generate(crawlDb, segments, -1, topN, System + .currentTimeMillis(), false); fetcher.fetch(segment, threads); // fetch it if (!Fetcher.isParsing(job)) { parseSegment.parse(segment); // parse it, if needed Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?view=diff&rev=476879&r1=476878&r2=476879 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Sun Nov 19 10:48:39 2006 @@ -44,6 +44,7 @@ /** Generates a subset of a crawl db to fetch. */ public class Generator extends ToolBase { + public static final String CRAWL_GENERATE_FILTER = "crawl.generate.filter"; public static final String GENERATE_MAX_PER_HOST_BY_IP = "generate.max.per.host.by.ip"; public static final String GENERATE_MAX_PER_HOST = "generate.max.per.host"; public static final String CRAWL_TOP_N = "crawl.topN"; @@ -89,6 +90,7 @@ private FloatWritable sortValue = new FloatWritable(); private boolean byIP; private long dnsFailure = 0L; + private boolean filter; public void configure(JobConf job) { curTime = job.getLong(CRAWL_GEN_CUR_TIME, System.currentTimeMillis()); @@ -99,6 +101,7 @@ normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_GENERATE_HOST_COUNT); scfilters = new ScoringFilters(job); hostPartitioner.configure(job); + filter = job.getBoolean(CRAWL_GENERATE_FILTER, true); } public void close() {} @@ -108,13 +111,16 @@ OutputCollector output, Reporter reporter) throws IOException { Text url = (Text)key; - // don't generate URLs that don't pass URLFilters - try { - if (filters.filter(url.toString()) == null) - return; - } catch (URLFilterException e) { - if (LOG.isWarnEnabled()) { - LOG.warn("Couldn't filter url: " + url + " (" + e.getMessage() + ")"); + if (filter) { + // If filtering is on don't generate URLs that don't pass URLFilters + try { + if (filters.filter(url.toString()) == null) + return; + } catch (URLFilterException e) { + if (LOG.isWarnEnabled()) { + LOG.warn("Couldn't filter url: " + url + " (" + e.getMessage() + + ")"); + } } } CrawlDatum crawlDatum = (CrawlDatum)value; @@ -291,13 +297,13 @@ /** Generate fetchlists in a segment. */ public Path generate(Path dbDir, Path segments) throws IOException { - return generate(dbDir, segments, - -1, Long.MAX_VALUE, System.currentTimeMillis()); + return generate(dbDir, segments, -1, Long.MAX_VALUE, System + .currentTimeMillis(), true); } /** Generate fetchlists in a segment. */ public Path generate(Path dbDir, Path segments, - int numLists, long topN, long curTime) + int numLists, long topN, long curTime, boolean filter) throws IOException { Path tempDir = @@ -308,10 +314,12 @@ Path segment = new Path(segments, generateSegmentName()); Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME); - if (LOG.isInfoEnabled()) { - LOG.info("Generator: starting"); - LOG.info("Generator: segment: " + segment); - LOG.info("Generator: Selecting best-scoring urls due for fetch."); + LOG.info("Generator: Selecting best-scoring urls due for fetch."); + LOG.info("Generator: starting"); + LOG.info("Generator: segment: " + segment); + LOG.info("Generator: filtering: " + filter); + if (topN != Long.MAX_VALUE) { + LOG.info("Generator: topN: " + topN); } // map to inverted subset due for fetch, sort by link count @@ -326,8 +334,9 @@ LOG.info("Generator: jobtracker is 'local', generating exactly one partition."); numLists = 1; } - job.setLong("crawl.gen.curTime", curTime); - job.setLong("crawl.topN", topN); + job.setLong(CRAWL_GEN_CUR_TIME, curTime); + job.setLong(CRAWL_TOP_N, topN); + job.setBoolean(CRAWL_GENERATE_FILTER, filter); job.setInputPath(new Path(dbDir, CrawlDatum.DB_DIR_NAME)); job.setInputFormat(SequenceFileInputFormat.class); @@ -393,7 +402,7 @@ public int run(String[] args) throws Exception { if (args.length < 2) { - System.out.println("Usage: Generator <crawldb> <segments_dir> [-topN N] [-numFetchers numFetchers] [-adddays numDays]"); + System.out.println("Usage: Generator <crawldb> <segments_dir> [-topN N] [-numFetchers numFetchers] [-adddays numDays] [-noFilter]"); return -1; } @@ -402,6 +411,7 @@ long curTime = System.currentTimeMillis(); long topN = Long.MAX_VALUE; int numFetchers = -1; + boolean filter = true; for (int i = 2; i < args.length; i++) { if ("-topN".equals(args[i])) { @@ -413,14 +423,14 @@ } else if ("-adddays".equals(args[i])) { long numDays = Integer.parseInt(args[i+1]); curTime += numDays * 1000L * 60 * 60 * 24; + } else if ("-noFilter".equals(args[i])) { + filter = false; } + } - if ((LOG.isInfoEnabled()) && (topN != Long.MAX_VALUE)) { - LOG.info("topN: " + topN); - } try { - generate(dbDir, segmentsDir, numFetchers, topN, curTime); + generate(dbDir, segmentsDir, numFetchers, topN, curTime, filter); return 0; } catch (Exception e) { LOG.fatal("Generator: " + StringUtils.stringifyException(e)); Added: lucene/nutch/trunk/src/test/filter-all.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/filter-all.txt?view=auto&rev=476879 ============================================================================== --- lucene/nutch/trunk/src/test/filter-all.txt (added) +++ lucene/nutch/trunk/src/test/filter-all.txt Sun Nov 19 10:48:39 2006 @@ -0,0 +1,7 @@ +# Config file for urlfilter-suffix plugin +# Filter away all urls + +# case-insensitive, disallow unknown suffixes +-I + +# allow these Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java?view=diff&rev=476879&r1=476878&r2=476879 ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java Sun Nov 19 10:48:39 2006 @@ -85,7 +85,7 @@ createCrawlDB(list); - Path generatedSegment = generateFetchlist(NUM_RESULTS, conf); + Path generatedSegment = generateFetchlist(NUM_RESULTS, conf, false); Path fetchlist = new Path(new Path(generatedSegment, CrawlDatum.GENERATE_DIR_NAME), "part-00000"); @@ -145,7 +145,8 @@ Configuration myConfiguration = new Configuration(conf); myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 1); - Path generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration); + Path generatedSegment = generateFetchlist(Integer.MAX_VALUE, + myConfiguration, false); Path fetchlistPath = new Path(new Path(generatedSegment, CrawlDatum.GENERATE_DIR_NAME), "part-00000"); @@ -155,10 +156,10 @@ // verify we got right amount of records assertEquals(1, fetchList.size()); - myConfiguration = new Configuration(conf); myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 2); - generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration); + generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration, + false); fetchlistPath = new Path(new Path(generatedSegment, CrawlDatum.GENERATE_DIR_NAME), "part-00000"); @@ -170,7 +171,8 @@ myConfiguration = new Configuration(conf); myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 3); - generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration); + generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration, + false); fetchlistPath = new Path(new Path(generatedSegment, CrawlDatum.GENERATE_DIR_NAME), "part-00000"); @@ -180,7 +182,7 @@ // verify we got right amount of records assertEquals(3, fetchList.size()); } - + /** * Test that generator obeys the property "generate.max.per.host" and * "generate.max.per.host.by.ip". @@ -189,12 +191,9 @@ public void testGenerateHostIPLimit() throws Exception{ ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>(); - list.add(createURLCrawlDatum("http://www.example.com/index.html", - 1, 1)); - list.add(createURLCrawlDatum("http://www.example.net/index.html", - 1, 1)); - list.add(createURLCrawlDatum("http://www.example.org/index.html", - 1, 1)); + list.add(createURLCrawlDatum("http://www.example.com/index.html", 1, 1)); + list.add(createURLCrawlDatum("http://www.example.net/index.html", 1, 1)); + list.add(createURLCrawlDatum("http://www.example.org/index.html", 1, 1)); createCrawlDB(list); @@ -202,7 +201,8 @@ myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 1); myConfiguration.setBoolean(Generator.GENERATE_MAX_PER_HOST_BY_IP, true); - Path generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration); + Path generatedSegment = generateFetchlist(Integer.MAX_VALUE, + myConfiguration, false); Path fetchlistPath = new Path(new Path(generatedSegment, CrawlDatum.GENERATE_DIR_NAME), "part-00000"); @@ -214,7 +214,7 @@ myConfiguration = new Configuration(myConfiguration); myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 2); - generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration); + generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration, false); fetchlistPath = new Path(new Path(generatedSegment, CrawlDatum.GENERATE_DIR_NAME), "part-00000"); @@ -226,7 +226,8 @@ myConfiguration = new Configuration(myConfiguration); myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 3); - generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration); + generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration, + false); fetchlistPath = new Path(new Path(generatedSegment, CrawlDatum.GENERATE_DIR_NAME), "part-00000"); @@ -237,6 +238,47 @@ assertEquals(3, fetchList.size()); } + /** + * Test generator obeys the filter setting. + * @throws Exception + * @throws IOException + */ + public void testFilter() throws IOException, Exception{ + + ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>(); + + list.add(createURLCrawlDatum("http://www.example.com/index.html", 1, 1)); + list.add(createURLCrawlDatum("http://www.example.net/index.html", 1, 1)); + list.add(createURLCrawlDatum("http://www.example.org/index.html", 1, 1)); + + createCrawlDB(list); + + Configuration myConfiguration = new Configuration(conf); + myConfiguration.set("urlfilter.suffix.file", "filter-all.txt"); + + Path generatedSegment = generateFetchlist(Integer.MAX_VALUE, + myConfiguration, true); + + Path fetchlistPath = new Path(new Path(generatedSegment, + CrawlDatum.GENERATE_DIR_NAME), "part-00000"); + + ArrayList<URLCrawlDatum> fetchList = readContents(fetchlistPath); + + // verify all got filtered out + assertEquals(0, fetchList.size()); + + generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration, false); + + fetchlistPath = new Path(new Path(generatedSegment, + CrawlDatum.GENERATE_DIR_NAME), "part-00000"); + + fetchList = readContents(fetchlistPath); + + // verify nothing got filtered + assertEquals(list.size(), fetchList.size()); + + } + /** * Read contents of fetchlist. @@ -270,11 +312,12 @@ * @return path to generated segment * @throws IOException */ - private Path generateFetchlist(int numResults, Configuration config) throws IOException { + private Path generateFetchlist(int numResults, Configuration config, + boolean filter) throws IOException { // generate segment Generator g = new Generator(config); Path generatedSegment = g.generate(dbDir, segmentsDir, -1, numResults, - Long.MAX_VALUE); + Long.MAX_VALUE, filter); return generatedSegment; } Modified: lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java?view=diff&rev=476879&r1=476878&r2=476879 ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java Sun Nov 19 10:48:39 2006 @@ -87,7 +87,8 @@ //generate Generator g=new Generator(conf); - Path generatedSegment=g.generate(crawldbPath, segmentsPath, 1, Long.MAX_VALUE, Long.MAX_VALUE); + Path generatedSegment = g.generate(crawldbPath, segmentsPath, 1, + Long.MAX_VALUE, Long.MAX_VALUE, false); long time=System.currentTimeMillis(); //fetch