HBASE-21494 NPE when loading RecoverStandByProcedure
Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/b329e6e3 Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/b329e6e3 Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/b329e6e3 Branch: refs/heads/HBASE-20952 Commit: b329e6e3f271bc22ec4a6f4dd71a8e8b422db3d0 Parents: f555258 Author: zhangduo <zhang...@apache.org> Authored: Sun Nov 18 15:58:53 2018 +0800 Committer: Duo Zhang <zhang...@apache.org> Committed: Mon Nov 19 09:35:18 2018 +0800 ---------------------------------------------------------------------- .../org/apache/hadoop/hbase/master/HMaster.java | 4 +- .../SyncReplicationReplayWALManager.java | 4 +- .../TestRegisterPeerWorkerWhenRestarting.java | 127 +++++++++++++++++++ ...tPeerSyncReplicationStateProcedureRetry.java | 5 + 4 files changed, 137 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hbase/blob/b329e6e3/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index 31dc208..e1d3740 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -778,7 +778,6 @@ public class HMaster extends HRegionServer implements MasterServices { this.splitOrMergeTracker.start(); this.replicationPeerManager = ReplicationPeerManager.create(zooKeeper, conf); - this.syncReplicationReplayWALManager = new SyncReplicationReplayWALManager(this); this.drainingServerTracker = new DrainingServerTracker(zooKeeper, this, this.serverManager); this.drainingServerTracker.start(); @@ -949,7 +948,10 @@ public class HMaster extends HRegionServer implements MasterServices { } status.setStatus("Initialize ServerManager and schedule SCP for crash servers"); + // The below two managers must be created before loading procedures, as they will be used during + // loading. this.serverManager = createServerManager(this); + this.syncReplicationReplayWALManager = new SyncReplicationReplayWALManager(this); createProcedureExecutor(); @SuppressWarnings("rawtypes") Map<Class<? extends Procedure>, List<Procedure<MasterProcedureEnv>>> procsByType = http://git-wip-us.apache.org/repos/asf/hbase/blob/b329e6e3/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/SyncReplicationReplayWALManager.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/SyncReplicationReplayWALManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/SyncReplicationReplayWALManager.java index 89e97bb..ae624b1 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/SyncReplicationReplayWALManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/SyncReplicationReplayWALManager.java @@ -146,12 +146,12 @@ public class SyncReplicationReplayWALManager { this.fs = services.getMasterFileSystem().getWALFileSystem(); this.walRootDir = services.getMasterFileSystem().getWALRootDir(); this.remoteWALDir = new Path(this.walRootDir, ReplicationUtils.REMOTE_WAL_DIR_NAME); - MasterProcedureScheduler scheduler = - services.getMasterProcedureExecutor().getEnvironment().getProcedureScheduler(); serverManager.registerListener(new ServerListener() { @Override public void serverAdded(ServerName serverName) { + MasterProcedureScheduler scheduler = + services.getMasterProcedureExecutor().getEnvironment().getProcedureScheduler(); for (UsedReplayWorkersForPeer usedWorkers : usedWorkersByPeer.values()) { synchronized (usedWorkers) { usedWorkers.wake(scheduler); http://git-wip-us.apache.org/repos/asf/hbase/blob/b329e6e3/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestRegisterPeerWorkerWhenRestarting.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestRegisterPeerWorkerWhenRestarting.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestRegisterPeerWorkerWhenRestarting.java new file mode 100644 index 0000000..72aa32d --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestRegisterPeerWorkerWhenRestarting.java @@ -0,0 +1,127 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.master.replication; + +import static org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RecoverStandbyState.DISPATCH_WALS_VALUE; +import static org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RecoverStandbyState.UNREGISTER_PEER_FROM_WORKER_STORAGE_VALUE; +import static org.junit.Assert.assertEquals; + +import java.io.IOException; +import java.io.UncheckedIOException; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.master.HMaster; +import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; +import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; +import org.apache.hadoop.hbase.replication.SyncReplicationState; +import org.apache.hadoop.hbase.replication.SyncReplicationTestBase; +import org.apache.hadoop.hbase.testclassification.LargeTests; +import org.apache.hadoop.hbase.testclassification.MasterTests; +import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread; +import org.apache.zookeeper.KeeperException; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +/** + * Testcase for HBASE-21494. + */ +@Category({ MasterTests.class, LargeTests.class }) +public class TestRegisterPeerWorkerWhenRestarting extends SyncReplicationTestBase { + + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestRegisterPeerWorkerWhenRestarting.class); + + private static volatile boolean FAIL = false; + + public static final class HMasterForTest extends HMaster { + + public HMasterForTest(Configuration conf) throws IOException, KeeperException { + super(conf); + } + + @Override + public void remoteProcedureCompleted(long procId) { + if (FAIL && getMasterProcedureExecutor() + .getProcedure(procId) instanceof SyncReplicationReplayWALRemoteProcedure) { + throw new RuntimeException("Inject error"); + } + super.remoteProcedureCompleted(procId); + } + } + + @BeforeClass + public static void setUp() throws Exception { + UTIL2.getConfiguration().setClass(HConstants.MASTER_IMPL, HMasterForTest.class, HMaster.class); + SyncReplicationTestBase.setUp(); + } + + @Test + public void testRestart() throws Exception { + UTIL2.getAdmin().transitReplicationPeerSyncReplicationState(PEER_ID, + SyncReplicationState.STANDBY); + UTIL1.getAdmin().transitReplicationPeerSyncReplicationState(PEER_ID, + SyncReplicationState.ACTIVE); + + UTIL1.getAdmin().disableReplicationPeer(PEER_ID); + write(UTIL1, 0, 100); + Thread.sleep(2000); + // peer is disabled so no data have been replicated + verifyNotReplicatedThroughRegion(UTIL2, 0, 100); + + // transit the A to DA first to avoid too many error logs. + UTIL1.getAdmin().transitReplicationPeerSyncReplicationState(PEER_ID, + SyncReplicationState.DOWNGRADE_ACTIVE); + HMaster master = UTIL2.getHBaseCluster().getMaster(); + // make sure the transiting can not succeed + FAIL = true; + ProcedureExecutor<MasterProcedureEnv> procExec = master.getMasterProcedureExecutor(); + Thread t = new Thread() { + + @Override + public void run() { + try { + UTIL2.getAdmin().transitReplicationPeerSyncReplicationState(PEER_ID, + SyncReplicationState.DOWNGRADE_ACTIVE); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + }; + t.start(); + // wait until we are in the states where we need to register peer worker when restarting + UTIL2.waitFor(60000, + () -> procExec.getProcedures().stream().filter(p -> p instanceof RecoverStandbyProcedure) + .map(p -> (RecoverStandbyProcedure) p) + .anyMatch(p -> p.getCurrentStateId() == DISPATCH_WALS_VALUE || + p.getCurrentStateId() == UNREGISTER_PEER_FROM_WORKER_STORAGE_VALUE)); + // failover to another master + MasterThread mt = UTIL2.getMiniHBaseCluster().getMasterThread(); + mt.getMaster().abort("for testing"); + mt.join(); + FAIL = false; + t.join(); + // make sure the new master can finish the transiting + assertEquals(SyncReplicationState.DOWNGRADE_ACTIVE, + UTIL2.getAdmin().getReplicationPeerSyncReplicationState(PEER_ID)); + verify(UTIL2, 0, 100); + } +} http://git-wip-us.apache.org/repos/asf/hbase/blob/b329e6e3/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestTransitPeerSyncReplicationStateProcedureRetry.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestTransitPeerSyncReplicationStateProcedureRetry.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestTransitPeerSyncReplicationStateProcedureRetry.java index 1c4a819..9b73039 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestTransitPeerSyncReplicationStateProcedureRetry.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/replication/TestTransitPeerSyncReplicationStateProcedureRetry.java @@ -17,6 +17,8 @@ */ package org.apache.hadoop.hbase.master.replication; +import static org.junit.Assert.assertEquals; + import java.io.IOException; import java.io.UncheckedIOException; import org.apache.hadoop.hbase.HBaseClassTestRule; @@ -90,5 +92,8 @@ public class TestTransitPeerSyncReplicationStateProcedureRetry extends SyncRepli .mapToLong(Procedure::getProcId).min().getAsLong(); MasterProcedureTestingUtility.testRecoveryAndDoubleExecution(procExec, procId); ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(procExec, false); + assertEquals(SyncReplicationState.DOWNGRADE_ACTIVE, + UTIL2.getAdmin().getReplicationPeerSyncReplicationState(PEER_ID)); + verify(UTIL2, 0, 100); } }