hadoop git commit: HDFS-9107. Prevent NN's unrecoverable death spiral after full GC (Daryn Sharp via Colin P. McCabe)

2017-08-30 Thread shv
Repository: hadoop
Updated Branches:
  refs/heads/branch-2.7 70df729a1 -> c2350ec42


HDFS-9107. Prevent NN's unrecoverable death spiral after full GC (Daryn Sharp 
via Colin P. McCabe)

(cherry-picked from 4e7c6a653f108d44589f84d78a03d92ee0e8a3c3)

Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/c2350ec4
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/c2350ec4
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/c2350ec4

Branch: refs/heads/branch-2.7
Commit: c2350ec42d9389fc6374a395574f4bdd646209c0
Parents: 70df729
Author: Colin Patrick Mccabe 
Authored: Fri Sep 25 15:25:42 2015 -0700
Committer: Konstantin V Shvachko 
Committed: Wed Aug 30 16:15:03 2017 -0700

--
 hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt |  3 ++
 .../blockmanagement/HeartbeatManager.java   | 30 ++--
 .../blockmanagement/TestHeartbeatHandling.java  | 27 ++
 3 files changed, 58 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/hadoop/blob/c2350ec4/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
--
diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt 
b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
index 845a964..d1ebdcd 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
+++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
@@ -19,6 +19,9 @@ Release 2.7.5 - UNRELEASED
     HDFS-12157. Do fsyncDirectory(..) outside of FSDataset lock.
     (Vinayakumar B. via kihwal)
 
+    HDFS-9107. Prevent NN's unrecoverable death spiral after full GC (Daryn
+    Sharp via Colin P. McCabe)
+
 Release 2.7.4 - 2017-08-04
 
   INCOMPATIBLE CHANGES

http://git-wip-us.apache.org/repos/asf/hadoop/blob/c2350ec4/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
--
diff --git 
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
 
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
index dfc3c60..93e9b32 100644
--- 
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
+++ 
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
@@ -19,6 +19,7 @@ package org.apache.hadoop.hdfs.server.blockmanagement;
 
 import java.util.ArrayList;
 import java.util.List;
+import java.util.concurrent.TimeUnit;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hdfs.DFSConfigKeys;
@@ -28,10 +29,13 @@ import org.apache.hadoop.hdfs.server.namenode.Namesystem;
 import org.apache.hadoop.hdfs.server.protocol.StorageReport;
 import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary;
 import org.apache.hadoop.util.Daemon;
+import org.apache.hadoop.util.StopWatch;
 import org.apache.hadoop.util.Time;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.google.common.annotations.VisibleForTesting;
+
 /**
  * Manage the heartbeats received from datanodes.
  * The datanode list and statistics are synchronized
@@ -56,8 +60,8 @@ class HeartbeatManager implements DatanodeStatistics {
   private final long heartbeatRecheckInterval;
   /** Heartbeat monitor thread */
   private final Daemon heartbeatThread = new Daemon(new Monitor());
+  private final StopWatch heartbeatStopWatch = new StopWatch();
 
-
   final Namesystem namesystem;
   final BlockManager blockManager;
 
@@ -245,7 +249,18 @@ class HeartbeatManager implements DatanodeStatistics {
       stats.add(node);
     }
   }
-  
+
+  @VisibleForTesting
+  void restartHeartbeatStopWatch() {
+    heartbeatStopWatch.reset().start();
+  }
+
+  @VisibleForTesting
+  boolean shouldAbortHeartbeatCheck(long offset) {
+    long elapsed = heartbeatStopWatch.now(TimeUnit.MILLISECONDS);
+    return elapsed + offset > heartbeatRecheckInterval;
+  }
+
   /**
    * Check if there are any expired heartbeats, and if so,
    * whether any blocks have to be re-replicated.
@@ -292,6 +307,10 @@ class HeartbeatManager implements DatanodeStatistics {
       int numOfStaleStorages = 0;
       synchronized(this) {
         for (DatanodeDescriptor d : datanodes) {
+          // check if an excessive GC pause has occurred
+          if (shouldAbortHeartbeatCheck(0)) {
+            return;
+          }
           if (dead == null && dm.isDatanodeDead(d)) {
             stats.incrExpiredHeartbeats();
             dead = d;
@@ -360,6 +379,7 @@ class HeartbeatManager implements DatanodeStatistics {
     @Override
     public void run() {
       while(namesystem.isRunning()) {
+        restartHeartbeatStopWatch();

[06/50] [abbrv] hadoop git commit: HDFS-9107. Prevent NN's unrecoverable death spiral after full GC (Daryn Sharp via Colin P. McCabe)

2015-09-29 Thread aengineer
HDFS-9107. Prevent NN's unrecoverable death spiral after full GC (Daryn Sharp 
via Colin P. McCabe)

Change-Id: Ib8420310e515bb98091de86ea5c4be354878d43c


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/4e7c6a65
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/4e7c6a65
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/4e7c6a65

Branch: refs/heads/HDFS-7240
Commit: 4e7c6a653f108d44589f84d78a03d92ee0e8a3c3
Parents: d3c49e7
Author: Colin Patrick Mccabe 
Authored: Fri Sep 25 15:25:42 2015 -0700
Committer: Colin Patrick Mccabe 
Committed: Fri Sep 25 15:25:42 2015 -0700

--
 .../blockmanagement/HeartbeatManager.java   | 30 ++--
 .../blockmanagement/TestHeartbeatHandling.java  | 27 ++
 2 files changed, 55 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/hadoop/blob/4e7c6a65/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
--
diff --git 
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
 
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
index cc9365d..f2e9827 100644
--- 
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
+++ 
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
@@ -24,6 +24,7 @@ import java.util.IdentityHashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.concurrent.TimeUnit;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.StorageType;
@@ -34,10 +35,13 @@ import org.apache.hadoop.hdfs.server.namenode.Namesystem;
 import org.apache.hadoop.hdfs.server.protocol.StorageReport;
 import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary;
 import org.apache.hadoop.util.Daemon;
+import org.apache.hadoop.util.StopWatch;
 import org.apache.hadoop.util.Time;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.google.common.annotations.VisibleForTesting;
+
 /**
  * Manage the heartbeats received from datanodes.
  * The datanode list and statistics are synchronized
@@ -62,8 +66,8 @@ class HeartbeatManager implements DatanodeStatistics {
   private final long heartbeatRecheckInterval;
   /** Heartbeat monitor thread */
   private final Daemon heartbeatThread = new Daemon(new Monitor());
+  private final StopWatch heartbeatStopWatch = new StopWatch();
 
-
   final Namesystem namesystem;
   final BlockManager blockManager;
 
@@ -260,7 +264,18 @@ class HeartbeatManager implements DatanodeStatistics {
       stats.add(node);
     }
   }
-  
+
+  @VisibleForTesting
+  void restartHeartbeatStopWatch() {
+    heartbeatStopWatch.reset().start();
+  }
+
+  @VisibleForTesting
+  boolean shouldAbortHeartbeatCheck(long offset) {
+    long elapsed = heartbeatStopWatch.now(TimeUnit.MILLISECONDS);
+    return elapsed + offset > heartbeatRecheckInterval;
+  }
+
   /**
    * Check if there are any expired heartbeats, and if so,
    * whether any blocks have to be re-replicated.
@@ -307,6 +322,10 @@ class HeartbeatManager implements DatanodeStatistics {
       int numOfStaleStorages = 0;
       synchronized(this) {
         for (DatanodeDescriptor d : datanodes) {
+          // check if an excessive GC pause has occurred
+          if (shouldAbortHeartbeatCheck(0)) {
+            return;
+          }
           if (dead == null && dm.isDatanodeDead(d)) {
             stats.incrExpiredHeartbeats();
             dead = d;
@@ -375,6 +394,7 @@ class HeartbeatManager implements DatanodeStatistics {
     @Override
     public void run() {
       while(namesystem.isRunning()) {
+        restartHeartbeatStopWatch();
         try {
           final long now = Time.monotonicNow();
           if (lastHeartbeatCheck + heartbeatRecheckInterval < now) {
@@ -396,6 +416,12 @@ class HeartbeatManager implements DatanodeStatistics {
           Thread.sleep(5000);  // 5 seconds
         } catch (InterruptedException ie) {
         }
+        // avoid declaring nodes dead for another cycle if a GC pause lasts
+        // longer than the node recheck interval
+        if (shouldAbortHeartbeatCheck(-5000)) {
+          LOG.warn("Skipping next heartbeat scan due to excessive pause");
+          lastHeartbeatCheck = Time.monotonicNow();
+        }
       }
     }
   }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/4e7c6a65/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestHeartbeatHandling.java
-
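
The change is easier to follow outside diff form. The following is a minimal, self-contained sketch of the pattern HDFS-9107 applies, not the HeartbeatManager code itself: the class name PauseAwareMonitor, the scanNodes() stand-in, and the plain System.nanoTime stopwatch are illustrative substitutes for the real Namesystem/StopWatch plumbing. The idea: restart a stopwatch at the top of every monitor iteration, abort the dead-node scan if the elapsed time already exceeds the recheck interval, and after the 5-second sleep push the last-check marker forward whenever the iteration overran, so a single full-GC pause cannot make every datanode look expired at once.

import java.util.concurrent.TimeUnit;

/** Illustrative sketch only; names and the nanoTime-based stopwatch are stand-ins. */
public class PauseAwareMonitor implements Runnable {

  private static final long SLEEP_MS = 5000;   // same 5 second sleep as the Monitor
  private final long recheckIntervalMs;        // cf. dfs.namenode.heartbeat.recheck-interval
  private long iterationStartNanos;
  private long lastCheckMs;
  private volatile boolean running = true;

  public PauseAwareMonitor(long recheckIntervalMs) {
    this.recheckIntervalMs = recheckIntervalMs;
  }

  /** Plays the role of restartHeartbeatStopWatch() in the patch. */
  private void restartStopWatch() {
    iterationStartNanos = System.nanoTime();
  }

  /** Plays the role of shouldAbortHeartbeatCheck(offset) in the patch. */
  private boolean shouldAbort(long offsetMs) {
    long elapsedMs =
        TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - iterationStartNanos);
    return elapsedMs + offsetMs > recheckIntervalMs;
  }

  @Override
  public void run() {
    while (running) {
      restartStopWatch();
      long now = System.currentTimeMillis();   // the real code uses Time.monotonicNow()
      if (lastCheckMs + recheckIntervalMs < now) {
        scanNodes();
        lastCheckMs = now;
      }
      try {
        Thread.sleep(SLEEP_MS);
      } catch (InterruptedException ignored) {
      }
      // If the iteration overran the recheck interval even after discounting the
      // intended sleep, a long pause (e.g. a full GC) happened; move the last-check
      // marker forward so the next scan is skipped rather than run on stale data.
      if (shouldAbort(-SLEEP_MS)) {
        lastCheckMs = System.currentTimeMillis();
      }
    }
  }

  private void scanNodes() {
    for (int i = 0; i < 10_000; i++) {   // stand-in for walking the datanode list
      if (shouldAbort(0)) {
        return;   // the pause already consumed the interval: abort, do not expire nodes
      }
      // ... per-node liveness check would go here ...
    }
  }

  public void stop() {
    running = false;
  }
}

Note the negative offset passed after the sleep: the 5 seconds of intentional sleeping are discounted, so only an unexpected stall beyond the recheck interval triggers the skip.
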

hadoop git commit: HDFS-9107. Prevent NN's unrecoverable death spiral after full GC (Daryn Sharp via Colin P. McCabe)

2015-09-25 Thread cmccabe
Repository: hadoop
Updated Branches:
  refs/heads/branch-2 d8a5d2b2f -> a6166aa55


HDFS-9107. Prevent NN's unrecoverable death spiral after full GC (Daryn Sharp 
via Colin P. McCabe)

(cherry-picked from 4e7c6a653f108d44589f84d78a03d92ee0e8a3c3)


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/a6166aa5
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/a6166aa5
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/a6166aa5

Branch: refs/heads/branch-2
Commit: a6166aa5523916f002b70ec9c731fcfe3389228f
Parents: d8a5d2b
Author: Colin Patrick Mccabe 
Authored: Fri Sep 25 15:25:42 2015 -0700
Committer: Colin Patrick Mccabe 
Committed: Fri Sep 25 15:33:02 2015 -0700

--
 hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt |  3 ++
 .../blockmanagement/HeartbeatManager.java   | 30 ++--
 .../blockmanagement/TestHeartbeatHandling.java  | 27 ++
 3 files changed, 58 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/hadoop/blob/a6166aa5/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
--
diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt 
b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
index 3df32dc00..682f37c 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
+++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
@@ -1071,6 +1071,9 @@ Release 2.8.0 - UNRELEASED
     HDFS-9123. Copying from the root to a subdirectory should be forbidden.
     (Wei-Chiu Chuang via Yongjun Zhang)
 
+    HDFS-9107. Prevent NN's unrecoverable death spiral after full GC (Daryn
+    Sharp via Colin P. McCabe)
+
 Release 2.7.2 - UNRELEASED
 
   INCOMPATIBLE CHANGES

http://git-wip-us.apache.org/repos/asf/hadoop/blob/a6166aa5/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
--
diff --git 
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
 
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
index cc9365d..f2e9827 100644
--- 
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
+++ 
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
@@ -24,6 +24,7 @@ import java.util.IdentityHashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.concurrent.TimeUnit;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.StorageType;
@@ -34,10 +35,13 @@ import org.apache.hadoop.hdfs.server.namenode.Namesystem;
 import org.apache.hadoop.hdfs.server.protocol.StorageReport;
 import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary;
 import org.apache.hadoop.util.Daemon;
+import org.apache.hadoop.util.StopWatch;
 import org.apache.hadoop.util.Time;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.google.common.annotations.VisibleForTesting;
+
 /**
  * Manage the heartbeats received from datanodes.
  * The datanode list and statistics are synchronized
@@ -62,8 +66,8 @@ class HeartbeatManager implements DatanodeStatistics {
   private final long heartbeatRecheckInterval;
   /** Heartbeat monitor thread */
   private final Daemon heartbeatThread = new Daemon(new Monitor());
+  private final StopWatch heartbeatStopWatch = new StopWatch();
 
-
   final Namesystem namesystem;
   final BlockManager blockManager;
 
@@ -260,7 +264,18 @@ class HeartbeatManager implements DatanodeStatistics {
       stats.add(node);
     }
   }
-  
+
+  @VisibleForTesting
+  void restartHeartbeatStopWatch() {
+    heartbeatStopWatch.reset().start();
+  }
+
+  @VisibleForTesting
+  boolean shouldAbortHeartbeatCheck(long offset) {
+    long elapsed = heartbeatStopWatch.now(TimeUnit.MILLISECONDS);
+    return elapsed + offset > heartbeatRecheckInterval;
+  }
+
   /**
    * Check if there are any expired heartbeats, and if so,
    * whether any blocks have to be re-replicated.
@@ -307,6 +322,10 @@ class HeartbeatManager implements DatanodeStatistics {
       int numOfStaleStorages = 0;
       synchronized(this) {
         for (DatanodeDescriptor d : datanodes) {
+          // check if an excessive GC pause has occurred
+          if (shouldAbortHeartbeatCheck(0)) {
+            return;
+          }
           if (dead == null && dm.isDatanodeDead(d)) {
             stats.incrExpiredHeartbeats();
             dead = d;
@@ -375,6 +394,7 @@ class HeartbeatManager implements DatanodeStatistics {
     @Override
     public void run() {
       while(namesystem.isRunning()) {
+        restartHeartbeatStopWatch();

hadoop git commit: HDFS-9107. Prevent NN's unrecoverable death spiral after full GC (Daryn Sharp via Colin P. McCabe)

2015-09-25 Thread cmccabe
Repository: hadoop
Updated Branches:
  refs/heads/trunk d3c49e766 -> 4e7c6a653


HDFS-9107. Prevent NN's unrecoverable death spiral after full GC (Daryn Sharp 
via Colin P. McCabe)

Change-Id: Ib8420310e515bb98091de86ea5c4be354878d43c


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/4e7c6a65
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/4e7c6a65
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/4e7c6a65

Branch: refs/heads/trunk
Commit: 4e7c6a653f108d44589f84d78a03d92ee0e8a3c3
Parents: d3c49e7
Author: Colin Patrick Mccabe 
Authored: Fri Sep 25 15:25:42 2015 -0700
Committer: Colin Patrick Mccabe 
Committed: Fri Sep 25 15:25:42 2015 -0700

--
 .../blockmanagement/HeartbeatManager.java   | 30 ++--
 .../blockmanagement/TestHeartbeatHandling.java  | 27 ++
 2 files changed, 55 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/hadoop/blob/4e7c6a65/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
--
diff --git 
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
 
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
index cc9365d..f2e9827 100644
--- 
a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
+++ 
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java
@@ -24,6 +24,7 @@ import java.util.IdentityHashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.concurrent.TimeUnit;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.StorageType;
@@ -34,10 +35,13 @@ import org.apache.hadoop.hdfs.server.namenode.Namesystem;
 import org.apache.hadoop.hdfs.server.protocol.StorageReport;
 import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary;
 import org.apache.hadoop.util.Daemon;
+import org.apache.hadoop.util.StopWatch;
 import org.apache.hadoop.util.Time;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.google.common.annotations.VisibleForTesting;
+
 /**
  * Manage the heartbeats received from datanodes.
  * The datanode list and statistics are synchronized
@@ -62,8 +66,8 @@ class HeartbeatManager implements DatanodeStatistics {
   private final long heartbeatRecheckInterval;
   /** Heartbeat monitor thread */
   private final Daemon heartbeatThread = new Daemon(new Monitor());
+  private final StopWatch heartbeatStopWatch = new StopWatch();
 
-
   final Namesystem namesystem;
   final BlockManager blockManager;
 
@@ -260,7 +264,18 @@ class HeartbeatManager implements DatanodeStatistics {
       stats.add(node);
     }
   }
-  
+
+  @VisibleForTesting
+  void restartHeartbeatStopWatch() {
+    heartbeatStopWatch.reset().start();
+  }
+
+  @VisibleForTesting
+  boolean shouldAbortHeartbeatCheck(long offset) {
+    long elapsed = heartbeatStopWatch.now(TimeUnit.MILLISECONDS);
+    return elapsed + offset > heartbeatRecheckInterval;
+  }
+
   /**
    * Check if there are any expired heartbeats, and if so,
    * whether any blocks have to be re-replicated.
@@ -307,6 +322,10 @@ class HeartbeatManager implements DatanodeStatistics {
       int numOfStaleStorages = 0;
       synchronized(this) {
         for (DatanodeDescriptor d : datanodes) {
+          // check if an excessive GC pause has occurred
+          if (shouldAbortHeartbeatCheck(0)) {
+            return;
+          }
           if (dead == null && dm.isDatanodeDead(d)) {
             stats.incrExpiredHeartbeats();
             dead = d;
@@ -375,6 +394,7 @@ class HeartbeatManager implements DatanodeStatistics {
     @Override
     public void run() {
       while(namesystem.isRunning()) {
+        restartHeartbeatStopWatch();
         try {
           final long now = Time.monotonicNow();
           if (lastHeartbeatCheck + heartbeatRecheckInterval < now) {
@@ -396,6 +416,12 @@ class HeartbeatManager implements DatanodeStatistics {
           Thread.sleep(5000);  // 5 seconds
         } catch (InterruptedException ie) {
         }
+        // avoid declaring nodes dead for another cycle if a GC pause lasts
+        // longer than the node recheck interval
+        if (shouldAbortHeartbeatCheck(-5000)) {
+          LOG.warn("Skipping next heartbeat scan due to excessive pause");
+          lastHeartbeatCheck = Time.monotonicNow();
+        }
       }
     }
   }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/4e7c6a65/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blo
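
The TestHeartbeatHandling.java hunk referenced by the URL above is cut off in this archive. As a rough idea of how the two @VisibleForTesting hooks can be exercised, here is an illustrative JUnit sketch; it assumes the three-argument HeartbeatManager(Namesystem, BlockManager, Configuration) constructor used in this code base, builds the manager against Mockito mocks, and is not a reproduction of the committed test.

package org.apache.hadoop.hdfs.server.blockmanagement;
// Must live in this package: HeartbeatManager and its test hooks are package-private.

import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.server.namenode.Namesystem;
import org.junit.Test;
import org.mockito.Mockito;

// Illustrative sketch only -- not the committed TestHeartbeatHandling change.
public class HeartbeatStopWatchSketchTest {

  @Test
  public void abortsAfterSimulatedPause() throws Exception {
    Configuration conf = new Configuration();
    final long recheck = 2000;  // ms; dfs.namenode.heartbeat.recheck-interval
    conf.setLong(
        DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY, recheck);
    HeartbeatManager monitor = new HeartbeatManager(
        Mockito.mock(Namesystem.class), Mockito.mock(BlockManager.class), conf);

    monitor.restartHeartbeatStopWatch();
    assertFalse(monitor.shouldAbortHeartbeatCheck(0));   // freshly restarted

    Thread.sleep(recheck + 100);                         // stand in for a GC pause
    assertTrue(monitor.shouldAbortHeartbeatCheck(0));    // elapsed > recheck interval

    monitor.restartHeartbeatStopWatch();                 // restart clears the condition
    assertFalse(monitor.shouldAbortHeartbeatCheck(0));
  }
}
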