Author: ab
Date: Thu Feb 15 15:45:22 2007
New Revision: 508238

URL: http://svn.apache.org/viewvc?view=rev&rev=508238
Log:
Enhance FreeGenerator to use host partitioning and hash-based sorting,
to ensure that the same model for politeness checking works here too.

Modified:
    lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java?view=diff&rev=508238&r1=508237&r2=508238
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java Thu Feb 15 15:45:22 2007
@@ -39,6 +39,7 @@
 import org.apache.hadoop.util.ToolBase;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.Generator;
+import org.apache.nutch.crawl.PartitionUrlByHost;
 import org.apache.nutch.net.URLFilters;
 import org.apache.nutch.net.URLNormalizers;
 import org.apache.nutch.scoring.ScoringFilters;
@@ -141,12 +142,14 @@
     job.addInputPath(new Path(args[0]));
     job.setInputFormat(TextInputFormat.class);
     job.setMapperClass(FG.class);
-    job.setCombinerClass(FG.class);
+    job.setPartitionerClass(PartitionUrlByHost.class);
     job.setReducerClass(FG.class);
     String segName = Generator.generateSegmentName();
+    job.setNumReduceTasks(job.getNumMapTasks());
     job.setOutputFormat(SequenceFileOutputFormat.class);
     job.setOutputKeyClass(Text.class);
     job.setOutputValueClass(CrawlDatum.class);
+    job.setOutputKeyComparatorClass(Generator.HashComparator.class);
     job.setOutputPath(new Path(args[1], new Path(segName, CrawlDatum.GENERATE_DIR_NAME)));
     try {
       JobClient.runJob(job);



-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
_______________________________________________
Nutch-cvs mailing list
Nutch-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

Reply via email to