Author: markus
Date: Tue Apr 21 07:43:32 2015
New Revision: 1675058
URL: http://svn.apache.org/r1675058
Log: NUTCH-1697 SegmentMerger to implement Tool
Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1675058&r1=1675057&r2=1675058&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Apr 21 07:43:32 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.10-SNAPSHOT +* NUTCH-1697 SegmentMerger to implement Tool (markus, snagel) + * NUTCH-1987 - Make bin/crawl indexer agnostic (Michael Joyce, snagel via mattmann) * NUTCH-1854 bin/crawl fails with a parsing fetcher (Asitang Mishra via snagel) Modified: nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java?rev=1675058&r1=1675057&r2=1675058&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java (original) +++ nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java Tue Apr 21 07:43:32 2015 @@ -51,6 +51,8 @@ import org.apache.hadoop.mapred.Sequence import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.mapred.SequenceFileRecordReader; import org.apache.hadoop.util.Progressable; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.Generator; import org.apache.nutch.metadata.MetaWrapper; @@ -118,7 +120,7 @@ import org.apache.nutch.util.NutchJob; * * @author Andrzej Bialecki */ -public class SegmentMerger extends Configured implements +public class SegmentMerger extends Configured implements Tool, Mapper<Text, MetaWrapper, Text, MetaWrapper>, Reducer<Text, MetaWrapper, Text, MetaWrapper> { private static final Logger LOG = LoggerFactory @@ -691,7 +693,7 @@ public class 
SegmentMerger extends Confi /** * @param args */ - public static void main(String[] args) throws Exception { + public int run(String[] args) throws Exception { if (args.length < 2) { System.err .println("SegmentMerger output_dir (-dir segments | seg1 seg2 ...) [-filter] [-slice NNNN]"); @@ -706,7 +708,7 @@ public class SegmentMerger extends Confi .println("\t-normalize\t\tnormalize URL via current URLNormalizers"); System.err .println("\t-slice NNNN\tcreate many output segments, each containing NNNN URLs"); - return; + return -1; } Configuration conf = NutchConfiguration.create(); final FileSystem fs = FileSystem.get(conf); @@ -734,11 +736,18 @@ public class SegmentMerger extends Confi } if (segs.size() == 0) { System.err.println("ERROR: No input segments."); - return; + return -1; } - SegmentMerger merger = new SegmentMerger(conf); - merger.merge(out, segs.toArray(new Path[segs.size()]), filter, normalize, + + merge(out, segs.toArray(new Path[segs.size()]), filter, normalize, sliceSize); + return 0; + } + + public static void main(String[] args) throws Exception { + int result = ToolRunner.run(NutchConfiguration.create(), + new SegmentMerger(), args); + System.exit(result); } }