Author: ab
Date: Thu Feb 15 15:45:22 2007
New Revision: 508238

URL: http://svn.apache.org/viewvc?view=rev&rev=508238
Log:
Enhance FreeGenerator to use host partitioning and hash-based sorting, to ensure that the same model for politeness checking works here too.
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java?view=diff&rev=508238&r1=508237&r2=508238 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java Thu Feb 15 15:45:22 2007 @@ -39,6 +39,7 @@ import org.apache.hadoop.util.ToolBase; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.Generator; +import org.apache.nutch.crawl.PartitionUrlByHost; import org.apache.nutch.net.URLFilters; import org.apache.nutch.net.URLNormalizers; import org.apache.nutch.scoring.ScoringFilters; @@ -141,12 +142,14 @@ job.addInputPath(new Path(args[0])); job.setInputFormat(TextInputFormat.class); job.setMapperClass(FG.class); - job.setCombinerClass(FG.class); + job.setPartitionerClass(PartitionUrlByHost.class); job.setReducerClass(FG.class); String segName = Generator.generateSegmentName(); + job.setNumReduceTasks(job.getNumMapTasks()); job.setOutputFormat(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(CrawlDatum.class); + job.setOutputKeyComparatorClass(Generator.HashComparator.class); job.setOutputPath(new Path(args[1], new Path(segName, CrawlDatum.GENERATE_DIR_NAME))); try { JobClient.runJob(job); ------------------------------------------------------------------------- Take Surveys. Earn Cash. 
Influence the Future of IT.
Join SourceForge.net's Techsay panel and you'll get the chance to share your opinions on IT & business topics through brief surveys -- and earn cash.
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
_______________________________________________
Nutch-cvs mailing list
Nutch-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nutch-cvs