This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
commit 814f8b997ddbdc670d7ffba59ee33f1b3cc3ab96 Author: Sebastian Nagel <sna...@apache.org> AuthorDate: Thu Apr 23 15:55:32 2020 +0200 NUTCH-1194 Generator: CrawlDB lock should be released earlier - release the CrawlDb lock after the select step if generated items are not marked in CrawlDb (generate.update.crawldb is false) --- src/java/org/apache/nutch/crawl/Generator.java | 18 +++++++++++++----- src/java/org/apache/nutch/util/NutchJob.java | 14 ++++++++++++-- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java index 5dcd2ea..04c2ae8 100644 --- a/src/java/org/apache/nutch/crawl/Generator.java +++ b/src/java/org/apache/nutch/crawl/Generator.java @@ -841,6 +841,14 @@ public class Generator extends NutchTool implements Tool { String.format(Locale.ROOT, "%6d", counter.getValue()), counter.getName()); } + if (!getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) { + /* + * generated items are not marked in CrawlDb, and CrawlDb will not + * accessed anymore: we already can release the lock + */ + LockUtil.removeLockFile(getConf(), lock); + lock = null; + } // read the subdirectories generated in the temp // output and turn them into segments @@ -858,15 +866,13 @@ public class Generator extends NutchTool implements Tool { } } catch (Exception e) { LOG.warn("Generator: exception while partitioning segments, exiting ..."); - LockUtil.removeLockFile(getConf(), lock); - fs.delete(tempDir, true); + NutchJob.cleanupAfterFailure(tempDir, lock, fs); return null; } if (generatedSegments.size() == 0) { LOG.warn("Generator: 0 records selected for fetching, exiting ..."); - LockUtil.removeLockFile(getConf(), lock); - fs.delete(tempDir, true); + NutchJob.cleanupAfterFailure(tempDir, lock, fs); return null; } @@ -913,7 +919,9 @@ public class Generator extends NutchTool implements Tool { fs.delete(tempDir2, true); } - LockUtil.removeLockFile(getConf(), lock); + if (lock != null) { 
+ LockUtil.removeLockFile(getConf(), lock); + } fs.delete(tempDir, true); long end = System.currentTimeMillis(); diff --git a/src/java/org/apache/nutch/util/NutchJob.java b/src/java/org/apache/nutch/util/NutchJob.java index 991e506..13257d2 100644 --- a/src/java/org/apache/nutch/util/NutchJob.java +++ b/src/java/org/apache/nutch/util/NutchJob.java @@ -41,7 +41,15 @@ public class NutchJob extends Job { return Job.getInstance(conf); } - /* + /** + * Clean up the file system in case of a job failure. + */ + public static void cleanupAfterFailure(Path tempDir, FileSystem fs) + throws IOException { + cleanupAfterFailure(tempDir, null, fs); + } + + /** * Clean up the file system in case of a job failure. */ public static void cleanupAfterFailure(Path tempDir, Path lock, FileSystem fs) @@ -50,7 +58,9 @@ public class NutchJob extends Job { if (fs.exists(tempDir)) { fs.delete(tempDir, true); } - LockUtil.removeLockFile(fs, lock); + if (lock != null) { + LockUtil.removeLockFile(fs, lock); + } } catch (IOException e) { LOG.error("NutchJob cleanup failed: {}", e.getMessage()); throw e;