Author: cutting
Date: Sun Jul 10 14:11:20 2005
New Revision: 210034

URL: http://svn.apache.org/viewcvs?rev=210034&view=rev
Log:
Fix so that fetcher does not split its input files, since they're already split by host and should not be subdivided.

Modified:
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java?rev=210034&r1=210033&r2=210034&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java Sun Jul 10 14:11:20 2005
@@ -38,6 +38,19 @@
 
   public static final String DIGEST_KEY = "nutch.content.digest";
 
+  public class InputFormat extends SequenceFileInputFormat {
+    /** Don't split inputs, to keep things polite. */
+    public FileSplit[] getSplits(NutchFileSystem fs, JobConf job, int nSplits)
+      throws IOException {
+      File[] files = listFiles(fs, job);
+      FileSplit[] splits = new FileSplit[files.length];
+      for (int i = 0; i < files.length; i++) {
+        splits[i] = new FileSplit(files[i], 0, fs.getLength(files[i]));
+      }
+      return splits;
+    }
+  }
+
   private RecordReader input;
   private OutputCollector output;
 