Repository: hbase Updated Branches: refs/heads/branch-1 113554ded -> 6e0ee4efa refs/heads/branch-1.4 964a1d614 -> b22095eea refs/heads/branch-2 79ec1a1fd -> 8a5537b5f refs/heads/branch-2.1 f9d7ac2d5 -> 3df8b6f7b refs/heads/master 79fe878a3 -> 42aa3dd46
HBASE-18549 Add metrics for failed replication queue recovery Signed-off-by: Andrew Purtell <apurt...@apache.org> Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/42aa3dd4 Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/42aa3dd4 Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/42aa3dd4 Branch: refs/heads/master Commit: 42aa3dd463c0d30a9b940d296b87316b5c67e1f5 Parents: 79fe878 Author: Xu Cang <xc...@salesforce.com> Authored: Wed Aug 29 15:49:51 2018 -0700 Committer: Andrew Purtell <apurt...@apache.org> Committed: Mon Oct 1 18:38:55 2018 -0700 ---------------------------------------------------------------------- .../regionserver/MetricsReplicationSourceSource.java | 2 ++ .../MetricsReplicationGlobalSourceSource.java | 8 +++++++- .../MetricsReplicationSourceSourceImpl.java | 3 +++ .../hbase/replication/ReplicationQueueStorage.java | 7 +++++++ .../hbase/replication/ZKReplicationQueueStorage.java | 3 ++- .../hbase/replication/regionserver/MetricsSource.java | 4 ++++ .../regionserver/ReplicationSourceManager.java | 10 +++++++++- .../hbase/replication/TestReplicationEndpoint.java | 14 +++++++++++++- 8 files changed, 47 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hbase/blob/42aa3dd4/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java ---------------------------------------------------------------------- diff --git a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java index 1045113..61e9431 100644 --- a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java +++ b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java @@ -53,6 +53,7 @@ public interface MetricsReplicationSourceSource extends BaseSource { public static final String SOURCE_REPEATED_LOG_FILE_BYTES = "source.repeatedLogFileBytes"; public static final String SOURCE_COMPLETED_LOGS = "source.completedLogs"; public static final String SOURCE_COMPLETED_RECOVERY_QUEUES = "source.completedRecoverQueues"; + public static final String SOURCE_FAILED_RECOVERY_QUEUES = "source.failedRecoverQueues"; void setLastShippedAge(long age); void incrSizeOfLogQueue(int size); @@ -76,4 +77,5 @@ public interface MetricsReplicationSourceSource extends BaseSource { void incrRepeatedFileBytes(final long bytes); void incrCompletedWAL(); void incrCompletedRecoveryQueue(); + void incrFailedRecoveryQueue(); } http://git-wip-us.apache.org/repos/asf/hbase/blob/42aa3dd4/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationGlobalSourceSource.java ---------------------------------------------------------------------- diff --git a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationGlobalSourceSource.java b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationGlobalSourceSource.java index 9a86cf2..4e8c810 100644 --- a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationGlobalSourceSource.java +++ b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationGlobalSourceSource.java @@ -52,6 +52,7 @@ public class MetricsReplicationGlobalSourceSource implements MetricsReplicationS private final MutableFastCounter repeatedFileBytes; private final MutableFastCounter completedWAL; private final MutableFastCounter completedRecoveryQueue; + private final MutableFastCounter failedRecoveryQueue; public MetricsReplicationGlobalSourceSource(MetricsReplicationSourceImpl rms) { this.rms = rms; @@ -89,6 +90,8 @@ public class MetricsReplicationGlobalSourceSource implements MetricsReplicationS completedWAL = rms.getMetricsRegistry().getCounter(SOURCE_COMPLETED_LOGS, 0L); completedRecoveryQueue = rms.getMetricsRegistry() .getCounter(SOURCE_COMPLETED_RECOVERY_QUEUES, 0L); + failedRecoveryQueue = rms.getMetricsRegistry() + .getCounter(SOURCE_FAILED_RECOVERY_QUEUES, 0L); } @Override public void setLastShippedAge(long age) { @@ -199,7 +202,10 @@ public class MetricsReplicationGlobalSourceSource implements MetricsReplicationS public void incrCompletedRecoveryQueue() { completedRecoveryQueue.incr(1L); } - + @Override + public void incrFailedRecoveryQueue() { + failedRecoveryQueue.incr(1L); + } @Override public void init() { rms.init(); http://git-wip-us.apache.org/repos/asf/hbase/blob/42aa3dd4/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSourceImpl.java ---------------------------------------------------------------------- diff --git a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSourceImpl.java b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSourceImpl.java index 719c916..0ad5052 100644 --- a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSourceImpl.java +++ b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSourceImpl.java @@ -258,6 +258,9 @@ public class MetricsReplicationSourceSourceImpl implements MetricsReplicationSou } @Override + public void incrFailedRecoveryQueue() {/*no op*/} + + @Override public void init() { rms.init(); } http://git-wip-us.apache.org/repos/asf/hbase/blob/42aa3dd4/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationQueueStorage.java ---------------------------------------------------------------------- diff --git a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationQueueStorage.java b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationQueueStorage.java index 84653ad..59278e9 100644 --- a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationQueueStorage.java +++ b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ReplicationQueueStorage.java @@ -202,4 +202,11 @@ public interface ReplicationQueueStorage { * created hfile references during the call may not be included. */ Set<String> getAllHFileRefs() throws ReplicationException; + + /** + * Get full znode name for given region server + * @param serverName the name of the region server + * @return full znode name + */ + String getRsNode(ServerName serverName); } http://git-wip-us.apache.org/repos/asf/hbase/blob/42aa3dd4/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ZKReplicationQueueStorage.java ---------------------------------------------------------------------- diff --git a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ZKReplicationQueueStorage.java b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ZKReplicationQueueStorage.java index cca8bfc..68f2adc 100644 --- a/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ZKReplicationQueueStorage.java +++ b/hbase-replication/src/main/java/org/apache/hadoop/hbase/replication/ZKReplicationQueueStorage.java @@ -118,7 +118,8 @@ class ZKReplicationQueueStorage extends ZKReplicationStorageBase .get(ZOOKEEPER_ZNODE_REPLICATION_REGIONS_KEY, ZOOKEEPER_ZNODE_REPLICATION_REGIONS_DEFAULT)); } - private String getRsNode(ServerName serverName) { + @Override + public String getRsNode(ServerName serverName) { return ZNodePaths.joinZNode(queuesZNode, serverName.getServerName()); } http://git-wip-us.apache.org/repos/asf/hbase/blob/42aa3dd4/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsSource.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsSource.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsSource.java index 906f0c6..830ebe1 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsSource.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsSource.java @@ -326,6 +326,10 @@ public class MetricsSource implements BaseSource { globalSourceSource.incrCompletedRecoveryQueue(); } + public void incrFailedRecoveryQueue() { + globalSourceSource.incrFailedRecoveryQueue(); + } + @Override public void init() { singleSourceSource.init(); http://git-wip-us.apache.org/repos/asf/hbase/blob/42aa3dd4/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java index 428ec98..5756cbc 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java @@ -45,6 +45,7 @@ import java.util.stream.Collectors; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.CompatibilitySingletonFactory; import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.Server; import org.apache.hadoop.hbase.ServerName; @@ -820,6 +821,8 @@ public class ReplicationSourceManager implements ReplicationListener { try { this.executor.execute(transfer); } catch (RejectedExecutionException ex) { + CompatibilitySingletonFactory.getInstance(MetricsReplicationSourceFactory.class) + .getGlobalSource().incrFailedRecoveryQueue(); LOG.info("Cancelling the transfer of " + deadRS + " because of " + ex.getMessage()); } } @@ -891,7 +894,12 @@ public class ReplicationSourceManager implements ReplicationListener { queueStorage.removeReplicatorIfQueueIsEmpty(deadRS); } } catch (ReplicationException e) { - server.abort("Failed to claim queue from dead regionserver", e); + LOG.error(String.format("ReplicationException: cannot claim dead region (%s)'s " + + "replication queue. Znode : (%s)" + + " Possible solution: check if znode size exceeds jute.maxBuffer value. " + + " If so, increase it for both client and server side." + e), deadRS, + queueStorage.getRsNode(deadRS)); + server.abort("Failed to claim queue from dead regionserver.", e); return; } // Copying over the failed queue is completed. http://git-wip-us.apache.org/repos/asf/hbase/blob/42aa3dd4/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationEndpoint.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationEndpoint.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationEndpoint.java index 5d833cc..03fbb59 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationEndpoint.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationEndpoint.java @@ -17,7 +17,9 @@ */ package org.apache.hadoop.hbase.replication; +import static org.mockito.Mockito.doNothing; import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.spy; import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; @@ -316,11 +318,17 @@ public class TestReplicationEndpoint extends TestReplicationBase { MetricsReplicationSourceImpl globalRms = mock(MetricsReplicationSourceImpl.class); when(globalRms.getMetricsRegistry()).thenReturn(mockRegistry); + MetricsReplicationSourceSource singleSourceSource = new MetricsReplicationSourceSourceImpl(singleRms, id); MetricsReplicationSourceSource globalSourceSource = new MetricsReplicationGlobalSourceSource(globalRms); + MetricsReplicationSourceSource spyglobalSourceSource = spy(globalSourceSource); + doNothing().when(spyglobalSourceSource).incrFailedRecoveryQueue(); + Map<String, MetricsReplicationSourceSource> singleSourceSourceByTable = new HashMap<>(); - MetricsSource source = new MetricsSource(id, singleSourceSource, globalSourceSource, + MetricsSource source = new MetricsSource(id, singleSourceSource, spyglobalSourceSource, singleSourceSourceByTable); + + String gaugeName = "gauge"; String singleGaugeName = "source.id." + gaugeName; String globalGaugeName = "source." + gaugeName; @@ -340,6 +348,8 @@ public class TestReplicationEndpoint extends TestReplicationBase { source.removeMetric(gaugeName); source.setGauge(gaugeName, delta); source.updateHistogram(counterName, count); + source.incrFailedRecoveryQueue(); + verify(singleRms).decGauge(singleGaugeName, delta); verify(globalRms).decGauge(globalGaugeName, delta); @@ -357,6 +367,7 @@ public class TestReplicationEndpoint extends TestReplicationBase { verify(globalRms).setGauge(globalGaugeName, delta); verify(singleRms).updateHistogram(singleCounterName, count); verify(globalRms).updateHistogram(globalCounterName, count); + verify(spyglobalSourceSource).incrFailedRecoveryQueue(); //check singleSourceSourceByTable metrics. // singleSourceSourceByTable map entry will be created only @@ -373,6 +384,7 @@ public class TestReplicationEndpoint extends TestReplicationBase { // cannot put more concreate value here to verify because the age is arbitrary. // as long as it's greater than 0, we see it as correct answer. Assert.assertTrue(msr.getLastShippedAge() > 0); + } private void doPut(byte[] row) throws IOException {