[ https://issues.apache.org/jira/browse/HBASE-28241?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17793243#comment-17793243 ]
ruanhui commented on HBASE-28241: --------------------------------- I wrote a UT and reproduced the problem; here is the UT code: {code:java} @Test public void testMergingRegionWhileTakingSnapshot() throws Exception { // take snapshot ProcedureExecutor<MasterProcedureEnv> procExec = master.getMasterProcedureExecutor(); // long snapshotProcId = procExec.submitProcedure( // new DelaySnapshotProcedure(procExec.getEnvironment(), snapshot)); long snapshotProcId = master.getSnapshotManager().takeSnapshot(snapshotProto, HConstants.NO_NONCE, HConstants.NO_NONCE); // merge region List<RegionInfo> regions = master.getAssignmentManager().getTableRegions(TABLE_NAME, true) .stream().sorted(RegionInfo.COMPARATOR).collect(Collectors.toList()); MergeTableRegionsProcedure mergeProc = new MergeTableRegionsProcedure(procExec.getEnvironment(), new RegionInfo[] { regions.get(0), regions.get(1) }, false); long mergeProcId = procExec.submitProcedure(mergeProc); // wait procedure finish ProcedureTestingUtility.waitProcedure(master.getMasterProcedureExecutor(), mergeProcId); ProcedureTestingUtility.waitProcedure(master.getMasterProcedureExecutor(), snapshotProcId); } // to make the problem easier to reproduce, this class will wait 10 seconds before scheduling // snapshot region procedure to make sure that we have merge the region and remove the parent regionNode public static final class DelaySnapshotProcedure extends SnapshotProcedure { public DelaySnapshotProcedure() { } public DelaySnapshotProcedure(final MasterProcedureEnv env, final SnapshotProtos.SnapshotDescription snapshot) { super(env, snapshot); } @Override protected Flow executeFromState(MasterProcedureEnv env, MasterProcedureProtos.SnapshotState state) throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException { Flow flow = super.executeFromState(env, state); if (state == SNAPSHOT_SNAPSHOT_ONLINE_REGIONS) { TimeUnit.SECONDS.sleep(10); } return flow; } }{code} and got the following exception: {code:java} 
2023-12-05T19:29:08,270 ERROR [PEWorker-3 {}] procedure2.ProcedureExecutor(1784): CODE-BUG: Uncaught runtime exception: pid=33, ppid=25, state=RUNNABLE, hasLock=true; SnapshotRegionProcedure 18f01fba77da8a1e1ad836abf49a62a2 java.lang.NullPointerException: null at org.apache.hadoop.hbase.master.procedure.SnapshotRegionProcedure.execute(SnapshotRegionProcedure.java:160) ~[classes/:?] at org.apache.hadoop.hbase.master.procedure.SnapshotRegionProcedure.execute(SnapshotRegionProcedure.java:58) ~[classes/:?] at org.apache.hadoop.hbase.procedure2.Procedure.doExecute(Procedure.java:941) ~[classes/:?] at org.apache.hadoop.hbase.procedure2.ProcedureExecutor.execProcedure(ProcedureExecutor.java:1765) ~[classes/:?] at org.apache.hadoop.hbase.procedure2.ProcedureExecutor.executeProcedure(ProcedureExecutor.java:1443) ~[classes/:?] at org.apache.hadoop.hbase.procedure2.ProcedureExecutor.access$1000(ProcedureExecutor.java:77) ~[classes/:?] at org.apache.hadoop.hbase.procedure2.ProcedureExecutor$WorkerThread.runProcedure(ProcedureExecutor.java:2091) ~[classes/:?] at org.apache.hadoop.hbase.trace.TraceUtil.trace(TraceUtil.java:216) ~[classes/:?] at org.apache.hadoop.hbase.procedure2.ProcedureExecutor$WorkerThread.run(ProcedureExecutor.java:2118) ~[classes/:?] {code} After correcting the method, I got the expected merge procedure failure: {code:java} 2023-12-05T19:44:58,207 ERROR [PEWorker-4 {}] assignment.MergeTableRegionsProcedure(249): Error trying to merge [6421d004d9e61c8c30e49b3af4b923de, 7ef0155567c0060bad76b0c82990f010] in SPTestTable (in state=MERGE_TABLE_REGIONS_PREPARE) org.apache.hadoop.hbase.exceptions.MergeRegionException: Skip merging regions [6421d004d9e61c8c30e49b3af4b923de, 7ef0155567c0060bad76b0c82990f010], because we are snapshotting SPTestTable at org.apache.hadoop.hbase.master.assignment.MergeTableRegionsProcedure.prepareMergeRegion(MergeTableRegionsProcedure.java:447) ~[classes/:?] 
at org.apache.hadoop.hbase.master.assignment.MergeTableRegionsProcedure.executeFromState(MergeTableRegionsProcedure.java:193) ~[classes/:?] at org.apache.hadoop.hbase.master.assignment.MergeTableRegionsProcedure.executeFromState(MergeTableRegionsProcedure.java:78) ~[classes/:?] at org.apache.hadoop.hbase.procedure2.StateMachineProcedure.execute(StateMachineProcedure.java:188) ~[classes/:?] at org.apache.hadoop.hbase.procedure2.Procedure.doExecute(Procedure.java:941) ~[classes/:?] at org.apache.hadoop.hbase.procedure2.ProcedureExecutor.execProcedure(ProcedureExecutor.java:1765) ~[classes/:?] at org.apache.hadoop.hbase.procedure2.ProcedureExecutor.executeProcedure(ProcedureExecutor.java:1443) ~[classes/:?] at org.apache.hadoop.hbase.procedure2.ProcedureExecutor.access$1000(ProcedureExecutor.java:77) ~[classes/:?] at org.apache.hadoop.hbase.procedure2.ProcedureExecutor$WorkerThread.runProcedure(ProcedureExecutor.java:2091) ~[classes/:?] at org.apache.hadoop.hbase.trace.TraceUtil.trace(TraceUtil.java:216) ~[classes/:?] at org.apache.hadoop.hbase.procedure2.ProcedureExecutor$WorkerThread.run(ProcedureExecutor.java:2118) ~[classes/:?] {code} > The snapshot operation encountered an NPE and failed. > ----------------------------------------------------- > > Key: HBASE-28241 > URL: https://issues.apache.org/jira/browse/HBASE-28241 > Project: HBase > Issue Type: Bug > Affects Versions: 3.0.0-beta-1 > Reporter: Haiping lv > Assignee: Haiping lv > Priority: Major > > Executing a merge region before the snapshot operation is completed will > result in an NPE error for the snapshot operation and cause it to fail > Triggering logic > # Execute the snapshot command. > # Merge regions before the snapshot is completed. > # After the merge is completed, the two parent regions will be cleaned up. > # An NPE will be reported when the snapshot operation is performed on that > parent region. 
The log is as follows: > {code:java} > 2023-11-20T23:26:07,061 ERROR [PEWorker-13] procedure2.ProcedureExecutor: > CODE-BUG: Uncaught runtime exception: pid=238720, ppid=238714, > state=RUNNABLE, hasLock=true; SnapshotRegionProcedure > c47539f1d6032ba1a037f5279c22baa0 > java.lang.NullPointerException: null > at > org.apache.hadoop.hbase.master.procedure.SnapshotRegionProcedure.execute(SnapshotRegionProcedure.java:160) > ~[hbase-server-3.0.0-beta-1-SNAPSHOT.jar:3.0.0-beta-1-SNAPSHOT] > at > org.apache.hadoop.hbase.master.procedure.SnapshotRegionProcedure.execute(SnapshotRegionProcedure.java:58) > ~[hbase-server-3.0.0-beta-1-SNAPSHOT.jar:3.0.0-beta-1-SNAPSHOT] > at > org.apache.hadoop.hbase.procedure2.Procedure.doExecute(Procedure.java:921) > ~[hbase-procedure-3.0.0-beta-1-SNAPSHOT.jar:3.0.0-beta-1-SNAPSHOT] > at > org.apache.hadoop.hbase.procedure2.ProcedureExecutor.execProcedure(ProcedureExecutor.java:1649) > ~[hbase-procedure-3.0.0-beta-1-SNAPSHOT.jar:3.0.0-beta-1-SNAPSHOT] > at > org.apache.hadoop.hbase.procedure2.ProcedureExecutor.executeProcedure(ProcedureExecutor.java:1395) > ~[hbase-procedure-3.0.0-beta-1-SNAPSHOT.jar:3.0.0-beta-1-SNAPSHOT] > at > org.apache.hadoop.hbase.procedure2.ProcedureExecutor.access$1000(ProcedureExecutor.java:75) > ~[hbase-procedure-3.0.0-beta-1-SNAPSHOT.jar:3.0.0-beta-1-SNAPSHOT] > at > org.apache.hadoop.hbase.procedure2.ProcedureExecutor$WorkerThread.runProcedure(ProcedureExecutor.java:1961) > ~[hbase-procedure-3.0.0-beta-1-SNAPSHOT.jar:3.0.0-beta-1-SNAPSHOT] > at org.apache.hadoop.hbase.trace.TraceUtil.trace(TraceUtil.java:216) > ~[hbase-common-3.0.0-beta-1-SNAPSHOT.jar:3.0.0-beta-1-SNAPSHOT] > at > org.apache.hadoop.hbase.procedure2.ProcedureExecutor$WorkerThread.run(ProcedureExecutor.java:1988) > ~[hbase-procedure-3.0.0-beta-1-SNAPSHOT.jar:3.0.0-beta-1-SNAPSHOT] {code} -- This message was sent by Atlassian Jira (v8.20.10#820010)