HBASE-21126 Configurable number of allowed failures for ZooKeeper Canary Signed-off-by: Josh Elser <els...@apache.org>
Conflicts: hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/09069df2 Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/09069df2 Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/09069df2 Branch: refs/heads/branch-1.3 Commit: 09069df2f8a9cb19ce368a54770d333f0e36fe5d Parents: 1ecfca3 Author: David Manning <david.mann...@salesforce.com> Authored: Fri Aug 31 18:32:15 2018 -0700 Committer: Andrew Purtell <apurt...@apache.org> Committed: Wed Dec 12 19:25:40 2018 -0800 ---------------------------------------------------------------------- .../org/apache/hadoop/hbase/tool/Canary.java | 51 +++++++++++++++----- .../hadoop/hbase/tool/TestCanaryTool.java | 35 +++++++++----- 2 files changed, 62 insertions(+), 24 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hbase/blob/09069df2/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java index dcaa057..081ef90 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/Canary.java @@ -580,6 +580,7 @@ public final class Canary implements Tool { private boolean failOnError = true; private boolean regionServerMode = false; private boolean zookeeperMode = false; + private long permittedFailures = 0; private boolean regionServerAllRegions = false; private boolean writeSniffing = false; private long configuredWriteTableTimeout = DEFAULT_TIMEOUT; @@ -723,6 +724,19 @@ public final class Canary implements Tool { } this.configuredReadTableTimeouts.put(nameTimeout[0], timeoutVal); } + } else if (cmd.equals("-permittedZookeeperFailures")) { + i++; + + if (i == args.length) { + System.err.println("-permittedZookeeperFailures needs a numeric value argument."); + printUsageAndExit(); + } + try { + this.permittedFailures = Long.parseLong(args[i]); + } catch (NumberFormatException e) { + System.err.println("-permittedZookeeperFailures needs a numeric value argument."); + printUsageAndExit(); + } } else { // no options match System.err.println(cmd + " options is invalid."); @@ -744,6 +758,10 @@ public final class Canary implements Tool { printUsageAndExit(); } } + if (this.permittedFailures != 0 && !this.zookeeperMode) { + System.err.println("-permittedZookeeperFailures requires -zookeeper mode."); + printUsageAndExit(); + } if (!this.configuredReadTableTimeouts.isEmpty() && (this.regionServerMode || this.zookeeperMode)) { System.err.println("-readTableTimeouts can only be configured in region mode."); printUsageAndExit(); @@ -842,6 +860,8 @@ public final class Canary implements Tool { System.err.println(" only works in regionserver mode."); System.err.println(" -zookeeper Tries to grab zookeeper.znode.parent "); System.err.println(" on each zookeeper instance"); + System.err.println(" -permittedZookeeperFailures <N> Ignore first N failures when attempting to "); + System.err.println(" connect to individual zookeeper nodes in the ensemble"); System.err.println(" -daemon Continuous check at defined intervals."); System.err.println(" -interval <N> Interval between checks (sec)"); System.err.println(" -e Use table/regionserver as regular expression"); @@ -884,17 +904,18 @@ public final class Canary implements Tool { monitor = new RegionServerMonitor(connection, monitorTargets, this.useRegExp, (StdOutSink) this.sink, this.executor, this.regionServerAllRegions, - this.treatFailureAsError); + this.treatFailureAsError, this.permittedFailures); } else if (this.sink instanceof ZookeeperStdOutSink || this.zookeeperMode) { monitor = new ZookeeperMonitor(connection, monitorTargets, this.useRegExp, - (StdOutSink) this.sink, this.executor, this.treatFailureAsError); + (StdOutSink) this.sink, this.executor, this.treatFailureAsError, + this.permittedFailures); } else { monitor = new RegionMonitor(connection, monitorTargets, this.useRegExp, (StdOutSink) this.sink, this.executor, this.writeSniffing, this.writeTableName, this.treatFailureAsError, this.configuredReadTableTimeouts, - this.configuredWriteTableTimeout); + this.configuredWriteTableTimeout, this.permittedFailures); } return monitor; } @@ -911,6 +932,7 @@ public final class Canary implements Tool { protected boolean done = false; protected int errorCode = 0; + protected long allowedFailures = 0; protected Sink sink; protected ExecutorService executor; @@ -927,7 +949,8 @@ public final class Canary implements Tool { return true; } if (treatFailureAsError && - (sink.getReadFailureCount() > 0 || sink.getWriteFailureCount() > 0)) { + (sink.getReadFailureCount() > allowedFailures || sink.getWriteFailureCount() > allowedFailures)) { + LOG.error("Too many failures detected, treating failure as error, failing the Canary."); errorCode = FAILURE_EXIT_CODE; return true; } @@ -940,7 +963,7 @@ public final class Canary implements Tool { } protected Monitor(Connection connection, String[] monitorTargets, boolean useRegExp, Sink sink, - ExecutorService executor, boolean treatFailureAsError) { + ExecutorService executor, boolean treatFailureAsError, long allowedFailures) { if (null == connection) throw new IllegalArgumentException("connection shall not be null"); this.connection = connection; @@ -949,6 +972,7 @@ public final class Canary implements Tool { this.treatFailureAsError = treatFailureAsError; this.sink = sink; this.executor = executor; + this.allowedFailures = allowedFailures; } @Override @@ -991,8 +1015,8 @@ public final class Canary implements Tool { public RegionMonitor(Connection connection, String[] monitorTargets, boolean useRegExp, StdOutSink sink, ExecutorService executor, boolean writeSniffing, TableName writeTableName, boolean treatFailureAsError, HashMap<String, Long> configuredReadTableTimeouts, - long configuredWriteTableTimeout) { - super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError); + long configuredWriteTableTimeout, long allowedFailures) { + super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError, allowedFailures); Configuration conf = connection.getConfiguration(); this.writeSniffing = writeSniffing; this.writeTableName = writeTableName; @@ -1286,8 +1310,8 @@ public final class Canary implements Tool { private final int timeout; protected ZookeeperMonitor(Connection connection, String[] monitorTargets, boolean useRegExp, - StdOutSink sink, ExecutorService executor, boolean treatFailureAsError) { - super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError); + StdOutSink sink, ExecutorService executor, boolean treatFailureAsError, long allowedFailures) { + super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError, allowedFailures); Configuration configuration = connection.getConfiguration(); znode = configuration.get(ZOOKEEPER_ZNODE_PARENT, @@ -1300,6 +1324,11 @@ public final class Canary implements Tool { for (InetSocketAddress server : parser.getServerAddresses()) { hosts.add(server.toString()); } + if (allowedFailures > (hosts.size() - 1) / 2) { + LOG.warn(String.format("Confirm allowable number of failed ZooKeeper nodes, as quorum will " + + "already be lost. Setting of %d failures is unexpected for %d ensemble size.", + allowedFailures, hosts.size())); + } } @Override public void run() { @@ -1348,8 +1377,8 @@ public final class Canary implements Tool { public RegionServerMonitor(Connection connection, String[] monitorTargets, boolean useRegExp, StdOutSink sink, ExecutorService executor, boolean allRegions, - boolean treatFailureAsError) { - super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError); + boolean treatFailureAsError, long allowedFailures) { + super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError, allowedFailures); this.allRegions = allRegions; } http://git-wip-us.apache.org/repos/asf/hbase/blob/09069df2/hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java index b89561f..8654a51 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java @@ -83,20 +83,14 @@ public class TestCanaryTool { @Test public void testBasicZookeeperCanaryWorks() throws Exception { - Integer port = - Iterables.getOnlyElement(testingUtility.getZkCluster().getClientPortList(), null); - testingUtility.getConfiguration().set(HConstants.ZOOKEEPER_QUORUM, - "localhost:" + port + "/hbase"); - ExecutorService executor = new ScheduledThreadPoolExecutor(2); - Canary.ZookeeperStdOutSink sink = spy(new Canary.ZookeeperStdOutSink()); - Canary canary = new Canary(executor, sink); - String[] args = { "-t", "10000", "-zookeeper" }; - ToolRunner.run(testingUtility.getConfiguration(), canary, args); + final String[] args = { "-t", "10000", "-zookeeper" }; + testZookeeperCanaryWithArgs(args); + } - String baseZnode = testingUtility.getConfiguration() - .get(HConstants.ZOOKEEPER_ZNODE_PARENT, HConstants.DEFAULT_ZOOKEEPER_ZNODE_PARENT); - verify(sink, atLeastOnce()) - .publishReadTiming(eq(baseZnode), eq("localhost:" + port), anyLong()); + @Test + public void testZookeeperCanaryPermittedFailuresArgumentWorks() throws Exception { + final String[] args = { "-t", "10000", "-zookeeper", "-treatFailureAsError", "-permittedZookeeperFailures", "1" }; + testZookeeperCanaryWithArgs(args); } @Test @@ -237,4 +231,19 @@ public class TestCanaryTool { assertEquals("verify no read error count", 0, canary.getReadFailures().size()); } + private void testZookeeperCanaryWithArgs(String[] args) throws Exception { + Integer port = + Iterables.getOnlyElement(testingUtility.getZkCluster().getClientPortList(), null); + testingUtility.getConfiguration().set(HConstants.ZOOKEEPER_QUORUM, + "localhost:" + port + "/hbase"); + ExecutorService executor = new ScheduledThreadPoolExecutor(2); + Canary.ZookeeperStdOutSink sink = spy(new Canary.ZookeeperStdOutSink()); + Canary canary = new Canary(executor, sink); + assertEquals(0, ToolRunner.run(testingUtility.getConfiguration(), canary, args)); + + String baseZnode = testingUtility.getConfiguration() + .get(HConstants.ZOOKEEPER_ZNODE_PARENT, HConstants.DEFAULT_ZOOKEEPER_ZNODE_PARENT); + verify(sink, atLeastOnce()) + .publishReadTiming(eq(baseZnode), eq("localhost:" + port), anyLong()); + } } \ No newline at end of file