hadoop git commit: HDFS-9107. Prevent NN's unrecoverable death spiral after full GC (Daryn Sharp via Colin P. McCabe)
Repository: hadoop Updated Branches: refs/heads/branch-2.7 70df729a1 -> c2350ec42 HDFS-9107. Prevent NN's unrecoverable death spiral after full GC (Daryn Sharp via Colin P. McCabe) (cherry-picked from 4e7c6a653f108d44589f84d78a03d92ee0e8a3c3) Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/c2350ec4 Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/c2350ec4 Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/c2350ec4 Branch: refs/heads/branch-2.7 Commit: c2350ec42d9389fc6374a395574f4bdd646209c0 Parents: 70df729 Author: Colin Patrick Mccabe Authored: Fri Sep 25 15:25:42 2015 -0700 Committer: Konstantin V Shvachko Committed: Wed Aug 30 16:15:03 2017 -0700 -- hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt | 3 ++ .../blockmanagement/HeartbeatManager.java | 30 ++-- .../blockmanagement/TestHeartbeatHandling.java | 27 ++ 3 files changed, 58 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/hadoop/blob/c2350ec4/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt -- diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index 845a964..d1ebdcd 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -19,6 +19,9 @@ Release 2.7.5 - UNRELEASED HDFS-12157. Do fsyncDirectory(..) outside of FSDataset lock. (Vinayakumar B. via kihwal) +HDFS-9107. Prevent NN's unrecoverable death spiral after full GC (Daryn +Sharp via Colin P. McCabe) + Release 2.7.4 - 2017-08-04 INCOMPATIBLE CHANGES http://git-wip-us.apache.org/repos/asf/hadoop/blob/c2350ec4/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java -- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java index dfc3c60..93e9b32 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hdfs.server.blockmanagement; import java.util.ArrayList; import java.util.List; +import java.util.concurrent.TimeUnit; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hdfs.DFSConfigKeys; @@ -28,10 +29,13 @@ import org.apache.hadoop.hdfs.server.namenode.Namesystem; import org.apache.hadoop.hdfs.server.protocol.StorageReport; import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary; import org.apache.hadoop.util.Daemon; +import org.apache.hadoop.util.StopWatch; import org.apache.hadoop.util.Time; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.common.annotations.VisibleForTesting; + /** * Manage the heartbeats received from datanodes. * The datanode list and statistics are synchronized @@ -56,8 +60,8 @@ class HeartbeatManager implements DatanodeStatistics { private final long heartbeatRecheckInterval; /** Heartbeat monitor thread */ private final Daemon heartbeatThread = new Daemon(new Monitor()); + private final StopWatch heartbeatStopWatch = new StopWatch(); - final Namesystem namesystem; final BlockManager blockManager; @@ -245,7 +249,18 @@ class HeartbeatManager implements DatanodeStatistics { stats.add(node); } } - + + @VisibleForTesting + void restartHeartbeatStopWatch() { +heartbeatStopWatch.reset().start(); + } + + @VisibleForTesting + boolean shouldAbortHeartbeatCheck(long offset) { +long elapsed = heartbeatStopWatch.now(TimeUnit.MILLISECONDS); +return elapsed + offset > heartbeatRecheckInterval; + } + /** * Check if there are any expired heartbeats, and if so, * whether any blocks have to be re-replicated. @@ -292,6 +307,10 @@ class HeartbeatManager implements DatanodeStatistics { int numOfStaleStorages = 0; synchronized(this) { for (DatanodeDescriptor d : datanodes) { + // check if an excessive GC pause has occurred + if (shouldAbortHeartbeatCheck(0)) { +return; + } if (dead == null && dm.isDatanodeDead(d)) { stats.incrExpiredHeartbeats(); dead = d; @@ -360,6 +379,7 @@ class HeartbeatManager implements DatanodeStatistics { @Override public void run() { while(namesystem.isRunning()) { +restartHea
[06/50] [abbrv] hadoop git commit: HDFS-9107. Prevent NN's unrecoverable death spiral after full GC (Daryn Sharp via Colin P. McCabe)
HDFS-9107. Prevent NN's unrecoverable death spiral after full GC (Daryn Sharp via Colin P. McCabe) Change-Id: Ib8420310e515bb98091de86ea5c4be354878d43c Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/4e7c6a65 Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/4e7c6a65 Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/4e7c6a65 Branch: refs/heads/HDFS-7240 Commit: 4e7c6a653f108d44589f84d78a03d92ee0e8a3c3 Parents: d3c49e7 Author: Colin Patrick Mccabe Authored: Fri Sep 25 15:25:42 2015 -0700 Committer: Colin Patrick Mccabe Committed: Fri Sep 25 15:25:42 2015 -0700 -- .../blockmanagement/HeartbeatManager.java | 30 ++-- .../blockmanagement/TestHeartbeatHandling.java | 27 ++ 2 files changed, 55 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/hadoop/blob/4e7c6a65/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java -- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java index cc9365d..f2e9827 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java @@ -24,6 +24,7 @@ import java.util.IdentityHashMap; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.concurrent.TimeUnit; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.StorageType; @@ -34,10 +35,13 @@ import org.apache.hadoop.hdfs.server.namenode.Namesystem; import org.apache.hadoop.hdfs.server.protocol.StorageReport; import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary; import org.apache.hadoop.util.Daemon; +import org.apache.hadoop.util.StopWatch; import org.apache.hadoop.util.Time; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.common.annotations.VisibleForTesting; + /** * Manage the heartbeats received from datanodes. * The datanode list and statistics are synchronized @@ -62,8 +66,8 @@ class HeartbeatManager implements DatanodeStatistics { private final long heartbeatRecheckInterval; /** Heartbeat monitor thread */ private final Daemon heartbeatThread = new Daemon(new Monitor()); + private final StopWatch heartbeatStopWatch = new StopWatch(); - final Namesystem namesystem; final BlockManager blockManager; @@ -260,7 +264,18 @@ class HeartbeatManager implements DatanodeStatistics { stats.add(node); } } - + + @VisibleForTesting + void restartHeartbeatStopWatch() { +heartbeatStopWatch.reset().start(); + } + + @VisibleForTesting + boolean shouldAbortHeartbeatCheck(long offset) { +long elapsed = heartbeatStopWatch.now(TimeUnit.MILLISECONDS); +return elapsed + offset > heartbeatRecheckInterval; + } + /** * Check if there are any expired heartbeats, and if so, * whether any blocks have to be re-replicated. @@ -307,6 +322,10 @@ class HeartbeatManager implements DatanodeStatistics { int numOfStaleStorages = 0; synchronized(this) { for (DatanodeDescriptor d : datanodes) { + // check if an excessive GC pause has occurred + if (shouldAbortHeartbeatCheck(0)) { +return; + } if (dead == null && dm.isDatanodeDead(d)) { stats.incrExpiredHeartbeats(); dead = d; @@ -375,6 +394,7 @@ class HeartbeatManager implements DatanodeStatistics { @Override public void run() { while(namesystem.isRunning()) { +restartHeartbeatStopWatch(); try { final long now = Time.monotonicNow(); if (lastHeartbeatCheck + heartbeatRecheckInterval < now) { @@ -396,6 +416,12 @@ class HeartbeatManager implements DatanodeStatistics { Thread.sleep(5000); // 5 seconds } catch (InterruptedException ie) { } +// avoid declaring nodes dead for another cycle if a GC pause lasts +// longer than the node recheck interval +if (shouldAbortHeartbeatCheck(-5000)) { + LOG.warn("Skipping next heartbeat scan due to excessive pause"); + lastHeartbeatCheck = Time.monotonicNow(); +} } } } http://git-wip-us.apache.org/repos/asf/hadoop/blob/4e7c6a65/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestHeartbeatHandling.java -
hadoop git commit: HDFS-9107. Prevent NN's unrecoverable death spiral after full GC (Daryn Sharp via Colin P. McCabe)
Repository: hadoop Updated Branches: refs/heads/branch-2 d8a5d2b2f -> a6166aa55 HDFS-9107. Prevent NN's unrecoverable death spiral after full GC (Daryn Sharp via Colin P. McCabe) (cherry-picked from 4e7c6a653f108d44589f84d78a03d92ee0e8a3c3) Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/a6166aa5 Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/a6166aa5 Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/a6166aa5 Branch: refs/heads/branch-2 Commit: a6166aa5523916f002b70ec9c731fcfe3389228f Parents: d8a5d2b Author: Colin Patrick Mccabe Authored: Fri Sep 25 15:25:42 2015 -0700 Committer: Colin Patrick Mccabe Committed: Fri Sep 25 15:33:02 2015 -0700 -- hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt | 3 ++ .../blockmanagement/HeartbeatManager.java | 30 ++-- .../blockmanagement/TestHeartbeatHandling.java | 27 ++ 3 files changed, 58 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/hadoop/blob/a6166aa5/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt -- diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index 3df32dc00..682f37c 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -1071,6 +1071,9 @@ Release 2.8.0 - UNRELEASED HDFS-9123. Copying from the root to a subdirectory should be forbidden. (Wei-Chiu Chuang via Yongjun Zhang) +HDFS-9107. Prevent NN's unrecoverable death spiral after full GC (Daryn +Sharp via Colin P. McCabe) + Release 2.7.2 - UNRELEASED INCOMPATIBLE CHANGES http://git-wip-us.apache.org/repos/asf/hadoop/blob/a6166aa5/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java -- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java index cc9365d..f2e9827 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java @@ -24,6 +24,7 @@ import java.util.IdentityHashMap; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.concurrent.TimeUnit; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.StorageType; @@ -34,10 +35,13 @@ import org.apache.hadoop.hdfs.server.namenode.Namesystem; import org.apache.hadoop.hdfs.server.protocol.StorageReport; import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary; import org.apache.hadoop.util.Daemon; +import org.apache.hadoop.util.StopWatch; import org.apache.hadoop.util.Time; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.common.annotations.VisibleForTesting; + /** * Manage the heartbeats received from datanodes. * The datanode list and statistics are synchronized @@ -62,8 +66,8 @@ class HeartbeatManager implements DatanodeStatistics { private final long heartbeatRecheckInterval; /** Heartbeat monitor thread */ private final Daemon heartbeatThread = new Daemon(new Monitor()); + private final StopWatch heartbeatStopWatch = new StopWatch(); - final Namesystem namesystem; final BlockManager blockManager; @@ -260,7 +264,18 @@ class HeartbeatManager implements DatanodeStatistics { stats.add(node); } } - + + @VisibleForTesting + void restartHeartbeatStopWatch() { +heartbeatStopWatch.reset().start(); + } + + @VisibleForTesting + boolean shouldAbortHeartbeatCheck(long offset) { +long elapsed = heartbeatStopWatch.now(TimeUnit.MILLISECONDS); +return elapsed + offset > heartbeatRecheckInterval; + } + /** * Check if there are any expired heartbeats, and if so, * whether any blocks have to be re-replicated. @@ -307,6 +322,10 @@ class HeartbeatManager implements DatanodeStatistics { int numOfStaleStorages = 0; synchronized(this) { for (DatanodeDescriptor d : datanodes) { + // check if an excessive GC pause has occurred + if (shouldAbortHeartbeatCheck(0)) { +return; + } if (dead == null && dm.isDatanodeDead(d)) { stats.incrExpiredHeartbeats(); dead = d; @@ -375,6 +394,7 @@ class HeartbeatManager implements DatanodeStatistics { @Override public void run() { while(namesystem.isRunning()) { +
hadoop git commit: HDFS-9107. Prevent NN's unrecoverable death spiral after full GC (Daryn Sharp via Colin P. McCabe)
Repository: hadoop Updated Branches: refs/heads/trunk d3c49e766 -> 4e7c6a653 HDFS-9107. Prevent NN's unrecoverable death spiral after full GC (Daryn Sharp via Colin P. McCabe) Change-Id: Ib8420310e515bb98091de86ea5c4be354878d43c Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/4e7c6a65 Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/4e7c6a65 Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/4e7c6a65 Branch: refs/heads/trunk Commit: 4e7c6a653f108d44589f84d78a03d92ee0e8a3c3 Parents: d3c49e7 Author: Colin Patrick Mccabe Authored: Fri Sep 25 15:25:42 2015 -0700 Committer: Colin Patrick Mccabe Committed: Fri Sep 25 15:25:42 2015 -0700 -- .../blockmanagement/HeartbeatManager.java | 30 ++-- .../blockmanagement/TestHeartbeatHandling.java | 27 ++ 2 files changed, 55 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/hadoop/blob/4e7c6a65/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java -- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java index cc9365d..f2e9827 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java @@ -24,6 +24,7 @@ import java.util.IdentityHashMap; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.concurrent.TimeUnit; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.StorageType; @@ -34,10 +35,13 @@ import org.apache.hadoop.hdfs.server.namenode.Namesystem; import org.apache.hadoop.hdfs.server.protocol.StorageReport; import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary; import org.apache.hadoop.util.Daemon; +import org.apache.hadoop.util.StopWatch; import org.apache.hadoop.util.Time; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.common.annotations.VisibleForTesting; + /** * Manage the heartbeats received from datanodes. * The datanode list and statistics are synchronized @@ -62,8 +66,8 @@ class HeartbeatManager implements DatanodeStatistics { private final long heartbeatRecheckInterval; /** Heartbeat monitor thread */ private final Daemon heartbeatThread = new Daemon(new Monitor()); + private final StopWatch heartbeatStopWatch = new StopWatch(); - final Namesystem namesystem; final BlockManager blockManager; @@ -260,7 +264,18 @@ class HeartbeatManager implements DatanodeStatistics { stats.add(node); } } - + + @VisibleForTesting + void restartHeartbeatStopWatch() { +heartbeatStopWatch.reset().start(); + } + + @VisibleForTesting + boolean shouldAbortHeartbeatCheck(long offset) { +long elapsed = heartbeatStopWatch.now(TimeUnit.MILLISECONDS); +return elapsed + offset > heartbeatRecheckInterval; + } + /** * Check if there are any expired heartbeats, and if so, * whether any blocks have to be re-replicated. @@ -307,6 +322,10 @@ class HeartbeatManager implements DatanodeStatistics { int numOfStaleStorages = 0; synchronized(this) { for (DatanodeDescriptor d : datanodes) { + // check if an excessive GC pause has occurred + if (shouldAbortHeartbeatCheck(0)) { +return; + } if (dead == null && dm.isDatanodeDead(d)) { stats.incrExpiredHeartbeats(); dead = d; @@ -375,6 +394,7 @@ class HeartbeatManager implements DatanodeStatistics { @Override public void run() { while(namesystem.isRunning()) { +restartHeartbeatStopWatch(); try { final long now = Time.monotonicNow(); if (lastHeartbeatCheck + heartbeatRecheckInterval < now) { @@ -396,6 +416,12 @@ class HeartbeatManager implements DatanodeStatistics { Thread.sleep(5000); // 5 seconds } catch (InterruptedException ie) { } +// avoid declaring nodes dead for another cycle if a GC pause lasts +// longer than the node recheck interval +if (shouldAbortHeartbeatCheck(-5000)) { + LOG.warn("Skipping next heartbeat scan due to excessive pause"); + lastHeartbeatCheck = Time.monotonicNow(); +} } } } http://git-wip-us.apache.org/repos/asf/hadoop/blob/4e7c6a65/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blo