[ https://issues.apache.org/jira/browse/NUTCH-2544?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16453830#comment-16453830 ]
ASF GitHub Bot commented on NUTCH-2544:
---------------------------------------

sebastian-nagel closed pull request #320: NUTCH-2544 Nutch 1.15 no longer compatible with AWS EMR and S3
URL: https://github.com/apache/nutch/pull/320

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic):

diff --git a/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java b/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
index 56b24e4a7..9feb7458d 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
@@ -47,15 +47,11 @@
   @Override
   public void checkOutputSpecs(JobContext job) throws IOException {
     Configuration conf = job.getConfiguration();
-    FileSystem fs = FileSystem.get(conf);
     Path out = FileOutputFormat.getOutputPath(job);
     if ((out == null) && (job.getNumReduceTasks() != 0)) {
       throw new InvalidJobConfException("Output directory not set in conf.");
     }
-
-    if (fs == null) {
-      fs = out.getFileSystem(conf);
-    }
+    FileSystem fs = out.getFileSystem(conf);
     if (fs.exists(new Path(out, CrawlDatum.FETCH_DIR_NAME))) {
       throw new IOException("Segment already fetched!");
     }
diff --git a/src/java/org/apache/nutch/util/SitemapProcessor.java b/src/java/org/apache/nutch/util/SitemapProcessor.java
index ea28550bd..0762ae4a7 100644
--- a/src/java/org/apache/nutch/util/SitemapProcessor.java
+++ b/src/java/org/apache/nutch/util/SitemapProcessor.java
@@ -336,7 +336,7 @@ public void sitemap(Path crawldb, Path hostdb, Path sitemapUrlDir, boolean stric
       LOG.info("SitemapProcessor: Starting at {}", sdf.format(start));
     }
-    FileSystem fs = FileSystem.get(getConf());
+    FileSystem fs = crawldb.getFileSystem(getConf());
     Path old = new Path(crawldb, "old");
     Path current = new Path(crawldb, "current");
     Path
tempCrawlDb = new Path(crawldb, "crawldb-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

> Nutch 1.15 no longer compatible with AWS EMR and S3
> ---------------------------------------------------
>
> Key: NUTCH-2544
> URL: https://issues.apache.org/jira/browse/NUTCH-2544
> Project: Nutch
> Issue Type: Bug
> Components: fetcher, generator
> Affects Versions: 1.15
> Reporter: Steven W
> Assignee: Sebastian Nagel
> Priority: Critical
> Fix For: 1.15
>
>
> Nutch 1.14 is working OK with AWS EMR and S3 storage, but NUTCH-2375 appears
> to have broken this.
> Generator partitioning fails with Error: java.lang.NullPointerException at
> org.apache.nutch.crawl.URLPartitioner.getPartition(URLPartitioner.java:75)

--
This message was sent by Atlassian JIRA
(v7.6.3#76005)