SwaraliJoshi commented on code in PR #8427: URL: https://github.com/apache/hbase/pull/8427#discussion_r3497004058
########## hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestOpenRegionProcedureRestoreFailedOpen.java: ########## @@ -0,0 +1,245 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.master.assignment; + +import static org.junit.Assert.assertNotEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.io.IOException; +import java.util.List; +import java.util.Optional; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import org.apache.hadoop.hbase.Cell; +import org.apache.hadoop.hbase.CellUtil; +import org.apache.hadoop.hbase.HBaseTestingUtil; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.SingleProcessHBaseCluster; +import org.apache.hadoop.hbase.StartTestingClusterOption; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.Put; +import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.coprocessor.CoprocessorHost; +import org.apache.hadoop.hbase.coprocessor.ObserverContext; +import org.apache.hadoop.hbase.coprocessor.RegionCoprocessor; +import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment; +import org.apache.hadoop.hbase.coprocessor.RegionObserver; +import org.apache.hadoop.hbase.master.HMaster; +import org.apache.hadoop.hbase.master.RegionState; +import org.apache.hadoop.hbase.master.RegionState.State; +import org.apache.hadoop.hbase.master.procedure.MasterProcedureTestingUtility; +import org.apache.hadoop.hbase.regionserver.HRegionServer; +import org.apache.hadoop.hbase.testclassification.LargeTests; +import org.apache.hadoop.hbase.testclassification.MasterTests; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.wal.WALEdit; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +/** + * End-to-end reproduction of HBASE-29364 on a real mini cluster. + * Narrative (mirrors the incident): + * A region is open and serving. + * Its RegionServer is killed; ServerCrashProcedure reassigns the region (TRSP). + * The reassignment open fails (FAILED_OPEN); the OpenRegionProcedure + * records REPORT_SUCCEED + transitionCode=FAILED_OPEN in the procedure store. + * The active Master "dies" before the procedure can be wrapped up, and a backup Master takes + * over (real masterFailover). + * The new Master replays the procedure during meta-load and -- because of the bug -- + * marks the region OPEN instead of retrying it. + * The only non-organic part is the freeze: to deterministically keep the procedure in + * REPORT_SUCCEED across the failover, a coprocessor on hbase:meta fails the + * persistToMeta write that runs immediately after the FAILED_OPEN (this is exactly + * the JIRA condition, where hbase:meta was unavailable at that moment). + * This test asserts the buggy behaviour, so it PASSES against the unfixed code. After the + * fix lands it must be inverted (the region must NOT come back OPEN). + */ +@Tag(MasterTests.TAG) +@Tag(LargeTests.TAG) +public class TestOpenRegionProcedureRestoreFailedOpen { + + private static final Logger LOG = + LoggerFactory.getLogger(TestOpenRegionProcedureRestoreFailedOpen.class); + + private static final HBaseTestingUtil UTIL = new HBaseTestingUtil(); + private static final TableName TABLE_NAME = TableName.valueOf("t2-failed-open"); + private static final byte[] CF = Bytes.toBytes("cf"); + + @BeforeAll + public static void setUp() throws Exception { + UTIL.getConfiguration().setStrings(CoprocessorHost.REGION_COPROCESSOR_CONF_KEY, + FailOpenAndBlockMetaCP.class.getName()); + // Do not give up on the open; we want it stuck retrying, not failed-for-good. + UTIL.getConfiguration().setInt(AssignmentManager.ASSIGN_MAX_ATTEMPTS, 1000); + // We kill an RS mid-test; the failover master must not block its init waiting for the full + // RegionServer count to come back. + UTIL.getConfiguration().setInt("hbase.master.wait.on.regionservers.mintostart", 1); + StartTestingClusterOption option = + StartTestingClusterOption.builder().numMasters(2).numRegionServers(3).build(); + UTIL.startMiniCluster(option); + UTIL.getAdmin().balancerSwitch(false, true); + } + + @AfterAll + public static void tearDown() throws Exception { + // Stop injecting failures so the cluster can shut down cleanly. + FailOpenAndBlockMetaCP.armed = false; + UTIL.shutdownMiniCluster(); + } + + @Test + public void testFailedOpenReplayedAsOpen() throws Exception { + SingleProcessHBaseCluster cluster = UTIL.getMiniHBaseCluster(); + + UTIL.createTable(TABLE_NAME, CF); + UTIL.waitTableAvailable(TABLE_NAME); + RegionInfo hri = UTIL.getAdmin().getRegions(TABLE_NAME).get(0); + + // Arm the injection: from now on, opens of this region FAIL, and the 2nd OPENING write to this + // region's hbase:meta row (the post-FAILED_OPEN persistToMeta) is blocked. + FailOpenAndBlockMetaCP.targetRegionMetaRow = hri.getRegionName(); + FailOpenAndBlockMetaCP.failOpenTable = TABLE_NAME; + FailOpenAndBlockMetaCP.armed = true; + + // Kill the RegionServer hosting the region -> ServerCrashProcedure -> reassignment (TRSP). + ServerName rsWithRegion = findRegionServer(cluster, hri); + LOG.info("Killing RS hosting {}: {}", hri.getEncodedName(), rsWithRegion); + cluster.killRegionServer(rsWithRegion); + cluster.waitForRegionServerToStop(rsWithRegion, 60_000); + + // Wait until the reassignment open has failed and the procedure is frozen in REPORT_SUCCEED + // (its post-FAILED_OPEN persistToMeta is blocked in the meta coprocessor). + LOG.info("Waiting for the FAILED_OPEN procedure to freeze in REPORT_SUCCEED..."); + org.junit.jupiter.api.Assertions.assertTrue( + FailOpenAndBlockMetaCP.reachedBlock.await(120, TimeUnit.SECONDS), + "Timed out waiting for the FAILED_OPEN persistToMeta to be reached"); + + // The region must NOT be OPEN at this point - the open failed. + State beforeFailover = regionState(cluster.getMaster(), hri); + LOG.info("State before failover (active master): {}", beforeFailover); Review Comment: @virajjasani Updated sir, could you please check now? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
