DieterDePaepe commented on code in PR #7582:
URL: https://github.com/apache/hbase/pull/7582#discussion_r2906666835
##########
hbase-backup/src/main/java/org/apache/hadoop/hbase/backup/impl/FullTableBackupClient.java:
##########
@@ -181,6 +189,50 @@ public void execute() throws IOException {
// set overall backup status: complete. Here we make sure to complete
the backup.
// After this checkpoint, even if entering cancel process, will let the
backup finished
backupInfo.setState(BackupState.COMPLETE);
+
+ // Scan oldlogs for dead/decommissioned hosts and add their max WAL
timestamps
+ // to newTimestamps. This ensures subsequent incremental backups won't
try to back up
+ // WALs that are already covered by this full backup's snapshot.
+ Path walRootDir = CommonFSUtils.getWALRootDir(conf);
+ Path logDir = new Path(walRootDir, HConstants.HREGION_LOGDIR_NAME);
+ Path oldLogDir = new Path(walRootDir, HConstants.HREGION_OLDLOGDIR_NAME);
+ FileSystem fs = walRootDir.getFileSystem(conf);
+
+ List<FileStatus> allLogs = new ArrayList<>();
+ for (FileStatus hostLogDir : fs.listStatus(logDir)) {
+ String host =
BackupUtils.parseHostNameFromLogFile(hostLogDir.getPath());
+ if (host == null) {
Review Comment:
This check also occurs in the code below, on both the new and old logs.
So you can just drop this check here.
##########
hbase-backup/src/test/java/org/apache/hadoop/hbase/backup/TestBackupOfflineRS.java:
##########
@@ -0,0 +1,324 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.backup;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import java.util.List;
+import java.util.Map;
+import org.apache.hadoop.hbase.HBaseClassTestRule;
+import org.apache.hadoop.hbase.HBaseTestingUtil;
+import org.apache.hadoop.hbase.SingleProcessHBaseCluster;
+import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.backup.impl.BackupSystemTable;
+import org.apache.hadoop.hbase.client.Admin;
+import org.apache.hadoop.hbase.client.Connection;
+import org.apache.hadoop.hbase.client.ConnectionFactory;
+import org.apache.hadoop.hbase.client.RegionInfo;
+import org.apache.hadoop.hbase.regionserver.HRegionServer;
+import org.apache.hadoop.hbase.testclassification.LargeTests;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.ClassRule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hbase.thirdparty.com.google.common.collect.Lists;
+
+/**
+ * Tests that WAL files from offline/inactive RegionServers are handled
correctly during backup.
+ * Specifically verifies that WALs from an offline RS are:
+ * <ol>
+ * <li>Backed up once in the first backup after the RS goes offline</li>
+ * <li>NOT re-backed up in subsequent backups</li>
+ * </ol>
+ */
+@Category(LargeTests.class)
+public class TestBackupOfflineRS extends TestBackupBase {
+
+ @ClassRule
+ public static final HBaseClassTestRule CLASS_RULE =
+ HBaseClassTestRule.forClass(TestBackupOfflineRS.class);
+
+ private static final Logger LOG =
LoggerFactory.getLogger(TestBackupOfflineRS.class);
+
+ @BeforeClass
+ public static void setUp() throws Exception {
+ TEST_UTIL = new HBaseTestingUtil();
+ conf1 = TEST_UTIL.getConfiguration();
+ conf1.setInt("hbase.regionserver.info.port", -1);
+ autoRestoreOnFailure = true;
+ useSecondCluster = false;
+ setUpHelper();
+ // Start an additional RS so we have at least 2
+ TEST_UTIL.getMiniHBaseCluster().startRegionServer();
+ TEST_UTIL.waitTableAvailable(table1);
+ }
+
+ @Before
+ public void cleanupBetweenTests() throws Exception {
+ SingleProcessHBaseCluster cluster = TEST_UTIL.getMiniHBaseCluster();
+ while (cluster.getNumLiveRegionServers() < 2) {
+ cluster.startRegionServer();
+ Thread.sleep(2000);
+ }
+ TEST_UTIL.waitTableAvailable(table1);
+ try (BackupSystemTable sysTable = new
BackupSystemTable(TEST_UTIL.getConnection())) {
+ try {
+ sysTable.finishBackupExclusiveOperation();
+ } catch (Exception ignored) {
+ }
+ for (BackupInfo info : sysTable.getBackupHistory()) {
+ try {
+ sysTable.deleteBackupInfo(info.getBackupId());
+ } catch (Exception ignored) {
+ }
+ }
+ try {
+ sysTable.deleteIncrementalBackupTableSet(BACKUP_ROOT_DIR);
+ } catch (Exception ignored) {
+ }
+ }
+ }
+
+ /**
+ * Tests that when a full backup is taken while an RS is offline (with WALs
in oldlogs), the
+ * offline host's timestamps are recorded so subsequent incremental backups
don't re-include those
+ * WALs.
+ */
+ @Test
+ public void testBackupWithOfflineRS() throws Exception {
+ LOG.info("Starting testFullBackupWithOfflineRS");
+
+ SingleProcessHBaseCluster cluster = TEST_UTIL.getMiniHBaseCluster();
+ List<TableName> tables = Lists.newArrayList(table1);
+
+ if (cluster.getNumLiveRegionServers() < 2) {
+ cluster.startRegionServer();
+ Thread.sleep(2000);
+ }
+
+ LOG.info("Inserting data to generate WAL entries");
+ try (Connection conn = ConnectionFactory.createConnection(conf1)) {
+ insertIntoTable(conn, table1, famName, 2, 100);
+ }
+
+ int rsToStop = 0;
+ HRegionServer rsBeforeStop = cluster.getRegionServer(rsToStop);
+ String offlineHost =
+ rsBeforeStop.getServerName().getHostname() + ":" +
rsBeforeStop.getServerName().getPort();
+ LOG.info("Stopping RS: {}", offlineHost);
+
+ cluster.stopRegionServer(rsToStop);
+ // Wait for WALs to be moved to oldlogs
+ Thread.sleep(5000);
+
+ LOG.info("Taking full backup (with offline RS WALs in oldlogs)");
+ String fullBackupId = fullTableBackup(tables);
+ assertTrue("Full backup should succeed", checkSucceeded(fullBackupId));
+
+ try (BackupSystemTable sysTable = new
BackupSystemTable(TEST_UTIL.getConnection())) {
+ Map<TableName, Map<String, Long>> timestamps =
sysTable.readLogTimestampMap(BACKUP_ROOT_DIR);
+ Map<String, Long> rsTimestamps = timestamps.get(table1);
+ LOG.info("RS timestamps after full backup: {}", rsTimestamps);
+
+ Long tsAfterFullBackup = rsTimestamps.get(offlineHost);
+ assertNotNull("Offline host should have timestamp recorded in trslm
after full backup",
+ tsAfterFullBackup);
+
+ LOG.info("Taking incremental backup (should NOT include offline RS
WALs)");
+ String incrBackupId = incrementalTableBackup(tables);
+ assertTrue("Incremental backup should succeed",
checkSucceeded(incrBackupId));
+
+ timestamps = sysTable.readLogTimestampMap(BACKUP_ROOT_DIR);
+ rsTimestamps = timestamps.get(table1);
+ assertFalse("Offline host should not have a boundary ",
+ rsTimestamps.containsKey(offlineHost));
+ }
+ }
+
+ /**
+ * Tests that WALs written to an RS after a full backup are correctly
included in the subsequent
+ * incremental backup, even if that RS has gone offline before the
incremental runs.
+ */
+ @Test
+ public void testRSGoesOfflineAfterFullBackupBeforeIncremental() throws
Exception {
+ SingleProcessHBaseCluster cluster = TEST_UTIL.getMiniHBaseCluster();
+ List<TableName> tables = Lists.newArrayList(table1);
+
+ if (cluster.getNumLiveRegionServers() < 2) {
+ cluster.startRegionServer();
+ Thread.sleep(2000);
+ }
+
+ try (Connection conn = ConnectionFactory.createConnection(conf1)) {
+ insertIntoTable(conn, table1, famName, 3, 50);
+ }
+
+ String fullBackupId = fullTableBackup(tables);
+ assertTrue("Full backup should succeed", checkSucceeded(fullBackupId));
+
+ // Insert more data after the full backup to generate WALs that belong in
the incremental
+ try (Connection conn = ConnectionFactory.createConnection(conf1)) {
+ insertIntoTable(conn, table1, famName, 4, 50);
+ }
+
+ // Pick a live RS, force a roll to ensure WAL files exist after the full
backup's boundary,
+ // then stop it before the incremental runs
+ HRegionServer rsBeforeStop =
cluster.getLiveRegionServerThreads().get(0).getRegionServer();
+ rsBeforeStop.getWalRoller().requestRollAll();
+ rsBeforeStop.getWalRoller().waitUntilWalRollFinished();
+ String offlineHost =
+ rsBeforeStop.getServerName().getHostname() + ":" +
rsBeforeStop.getServerName().getPort();
+
+ cluster.stopRegionServer(rsBeforeStop.getServerName());
+ Thread.sleep(5000);
+
+ String incrBackupId = incrementalTableBackup(tables);
+ assertTrue("Incremental backup should succeed",
checkSucceeded(incrBackupId));
+
+ try (BackupSystemTable sysTable = new
BackupSystemTable(TEST_UTIL.getConnection())) {
+ Map<TableName, Map<String, Long>> timestamps =
sysTable.readLogTimestampMap(BACKUP_ROOT_DIR);
+ Map<String, Long> rsTimestamps = timestamps.get(table1);
+ assertNotNull(
+ "Offline RS should have a timestamp boundary after the incremental
that backed up its WALs",
+ rsTimestamps.get(offlineHost));
+
+ String incrBackupId2 = incrementalTableBackup(tables);
+ assertTrue("Second incremental backup should succeed",
checkSucceeded(incrBackupId2));
+
+ timestamps = sysTable.readLogTimestampMap(BACKUP_ROOT_DIR);
+ rsTimestamps = timestamps.get(table1);
+ assertFalse("Offline RS should not have a boundary after all its WALs
have been backed up",
+ rsTimestamps.containsKey(offlineHost));
+ }
+ }
+
+ /**
+ * Tests that a brand-new RS that comes online and goes offline before any
backup correctly has
+ * its WALs covered by the full backup. This prevents subsequent incremental
backups from
+ * re-including those WALs.
+ */
+ @Test
+ public void testTransientRSBeforeFullBackup() throws Exception {
+ SingleProcessHBaseCluster cluster = TEST_UTIL.getMiniHBaseCluster();
+ List<TableName> tables = Lists.newArrayList(table1);
+
+ // Start a new RS that will be online briefly and then go offline before
the full backup
+ HRegionServer transientRS =
cluster.startRegionServerAndWait(10000).getRegionServer();
+ try (Admin admin = TEST_UTIL.getConnection().getAdmin()) {
+ List<RegionInfo> regions = admin.getRegions(table1);
+ if (!regions.isEmpty()) {
Review Comment:
I think this list cannot be empty - a default region gets created when a
table is created. Having this check here hints that it would be a valid option
to have no regions - but in that case there would be no guarantee of an active
region being present on transientRS.
I suggest replacing this check with an `assertFalse`.
##########
hbase-backup/src/test/java/org/apache/hadoop/hbase/backup/TestBackupOfflineRS.java:
##########
@@ -0,0 +1,324 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.backup;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import java.util.List;
+import java.util.Map;
+import org.apache.hadoop.hbase.HBaseClassTestRule;
+import org.apache.hadoop.hbase.HBaseTestingUtil;
+import org.apache.hadoop.hbase.SingleProcessHBaseCluster;
+import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.backup.impl.BackupSystemTable;
+import org.apache.hadoop.hbase.client.Admin;
+import org.apache.hadoop.hbase.client.Connection;
+import org.apache.hadoop.hbase.client.ConnectionFactory;
+import org.apache.hadoop.hbase.client.RegionInfo;
+import org.apache.hadoop.hbase.regionserver.HRegionServer;
+import org.apache.hadoop.hbase.testclassification.LargeTests;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.ClassRule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hbase.thirdparty.com.google.common.collect.Lists;
+
+/**
+ * Tests that WAL files from offline/inactive RegionServers are handled
correctly during backup.
+ * Specifically verifies that WALs from an offline RS are:
+ * <ol>
+ * <li>Backed up once in the first backup after the RS goes offline</li>
+ * <li>NOT re-backed up in subsequent backups</li>
+ * </ol>
+ */
+@Category(LargeTests.class)
+public class TestBackupOfflineRS extends TestBackupBase {
+
+ @ClassRule
+ public static final HBaseClassTestRule CLASS_RULE =
+ HBaseClassTestRule.forClass(TestBackupOfflineRS.class);
+
+ private static final Logger LOG =
LoggerFactory.getLogger(TestBackupOfflineRS.class);
+
+ @BeforeClass
+ public static void setUp() throws Exception {
+ TEST_UTIL = new HBaseTestingUtil();
+ conf1 = TEST_UTIL.getConfiguration();
+ conf1.setInt("hbase.regionserver.info.port", -1);
+ autoRestoreOnFailure = true;
+ useSecondCluster = false;
+ setUpHelper();
+ // Start an additional RS so we have at least 2
+ TEST_UTIL.getMiniHBaseCluster().startRegionServer();
+ TEST_UTIL.waitTableAvailable(table1);
+ }
+
+ @Before
+ public void cleanupBetweenTests() throws Exception {
+ SingleProcessHBaseCluster cluster = TEST_UTIL.getMiniHBaseCluster();
+ while (cluster.getNumLiveRegionServers() < 2) {
+ cluster.startRegionServer();
+ Thread.sleep(2000);
+ }
+ TEST_UTIL.waitTableAvailable(table1);
+ try (BackupSystemTable sysTable = new
BackupSystemTable(TEST_UTIL.getConnection())) {
+ try {
+ sysTable.finishBackupExclusiveOperation();
+ } catch (Exception ignored) {
+ }
+ for (BackupInfo info : sysTable.getBackupHistory()) {
+ try {
+ sysTable.deleteBackupInfo(info.getBackupId());
+ } catch (Exception ignored) {
+ }
+ }
+ try {
+ sysTable.deleteIncrementalBackupTableSet(BACKUP_ROOT_DIR);
+ } catch (Exception ignored) {
+ }
+ }
+ }
+
+ /**
+ * Tests that when a full backup is taken while an RS is offline (with WALs
in oldlogs), the
+ * offline host's timestamps are recorded so subsequent incremental backups
don't re-include those
+ * WALs.
+ */
+ @Test
+ public void testBackupWithOfflineRS() throws Exception {
+ LOG.info("Starting testFullBackupWithOfflineRS");
+
+ SingleProcessHBaseCluster cluster = TEST_UTIL.getMiniHBaseCluster();
+ List<TableName> tables = Lists.newArrayList(table1);
+
+ if (cluster.getNumLiveRegionServers() < 2) {
+ cluster.startRegionServer();
+ Thread.sleep(2000);
+ }
+
+ LOG.info("Inserting data to generate WAL entries");
+ try (Connection conn = ConnectionFactory.createConnection(conf1)) {
+ insertIntoTable(conn, table1, famName, 2, 100);
+ }
+
+ int rsToStop = 0;
+ HRegionServer rsBeforeStop = cluster.getRegionServer(rsToStop);
+ String offlineHost =
+ rsBeforeStop.getServerName().getHostname() + ":" +
rsBeforeStop.getServerName().getPort();
+ LOG.info("Stopping RS: {}", offlineHost);
+
+ cluster.stopRegionServer(rsToStop);
+ // Wait for WALs to be moved to oldlogs
+ Thread.sleep(5000);
+
+ LOG.info("Taking full backup (with offline RS WALs in oldlogs)");
+ String fullBackupId = fullTableBackup(tables);
+ assertTrue("Full backup should succeed", checkSucceeded(fullBackupId));
+
+ try (BackupSystemTable sysTable = new
BackupSystemTable(TEST_UTIL.getConnection())) {
+ Map<TableName, Map<String, Long>> timestamps =
sysTable.readLogTimestampMap(BACKUP_ROOT_DIR);
+ Map<String, Long> rsTimestamps = timestamps.get(table1);
+ LOG.info("RS timestamps after full backup: {}", rsTimestamps);
+
+ Long tsAfterFullBackup = rsTimestamps.get(offlineHost);
+ assertNotNull("Offline host should have timestamp recorded in trslm
after full backup",
+ tsAfterFullBackup);
+
+ LOG.info("Taking incremental backup (should NOT include offline RS
WALs)");
+ String incrBackupId = incrementalTableBackup(tables);
+ assertTrue("Incremental backup should succeed",
checkSucceeded(incrBackupId));
+
+ timestamps = sysTable.readLogTimestampMap(BACKUP_ROOT_DIR);
+ rsTimestamps = timestamps.get(table1);
+ assertFalse("Offline host should not have a boundary ",
+ rsTimestamps.containsKey(offlineHost));
+ }
+ }
+
+ /**
+ * Tests that WALs written to an RS after a full backup are correctly
included in the subsequent
+ * incremental backup, even if that RS has gone offline before the
incremental runs.
+ */
+ @Test
+ public void testRSGoesOfflineAfterFullBackupBeforeIncremental() throws
Exception {
+ SingleProcessHBaseCluster cluster = TEST_UTIL.getMiniHBaseCluster();
+ List<TableName> tables = Lists.newArrayList(table1);
+
+ if (cluster.getNumLiveRegionServers() < 2) {
+ cluster.startRegionServer();
+ Thread.sleep(2000);
+ }
+
+ try (Connection conn = ConnectionFactory.createConnection(conf1)) {
+ insertIntoTable(conn, table1, famName, 3, 50);
+ }
+
+ String fullBackupId = fullTableBackup(tables);
+ assertTrue("Full backup should succeed", checkSucceeded(fullBackupId));
+
+ // Insert more data after the full backup to generate WALs that belong in
the incremental
+ try (Connection conn = ConnectionFactory.createConnection(conf1)) {
+ insertIntoTable(conn, table1, famName, 4, 50);
+ }
+
+ // Pick a live RS, force a roll to ensure WAL files exist after the full
backup's boundary,
+ // then stop it before the incremental runs
+ HRegionServer rsBeforeStop =
cluster.getLiveRegionServerThreads().get(0).getRegionServer();
+ rsBeforeStop.getWalRoller().requestRollAll();
+ rsBeforeStop.getWalRoller().waitUntilWalRollFinished();
+ String offlineHost =
+ rsBeforeStop.getServerName().getHostname() + ":" +
rsBeforeStop.getServerName().getPort();
+
+ cluster.stopRegionServer(rsBeforeStop.getServerName());
+ Thread.sleep(5000);
+
+ String incrBackupId = incrementalTableBackup(tables);
+ assertTrue("Incremental backup should succeed",
checkSucceeded(incrBackupId));
+
+ try (BackupSystemTable sysTable = new
BackupSystemTable(TEST_UTIL.getConnection())) {
+ Map<TableName, Map<String, Long>> timestamps =
sysTable.readLogTimestampMap(BACKUP_ROOT_DIR);
+ Map<String, Long> rsTimestamps = timestamps.get(table1);
+ assertNotNull(
+ "Offline RS should have a timestamp boundary after the incremental
that backed up its WALs",
+ rsTimestamps.get(offlineHost));
+
+ String incrBackupId2 = incrementalTableBackup(tables);
+ assertTrue("Second incremental backup should succeed",
checkSucceeded(incrBackupId2));
+
+ timestamps = sysTable.readLogTimestampMap(BACKUP_ROOT_DIR);
+ rsTimestamps = timestamps.get(table1);
+ assertFalse("Offline RS should not have a boundary after all its WALs
have been backed up",
+ rsTimestamps.containsKey(offlineHost));
+ }
+ }
+
+ /**
+ * Tests that a brand-new RS that comes online and goes offline before any
backup correctly has
+ * its WALs covered by the full backup. This prevents subsequent incremental
backups from
+ * re-including those WALs.
+ */
+ @Test
+ public void testTransientRSBeforeFullBackup() throws Exception {
+ SingleProcessHBaseCluster cluster = TEST_UTIL.getMiniHBaseCluster();
+ List<TableName> tables = Lists.newArrayList(table1);
+
+ // Start a new RS that will be online briefly and then go offline before
the full backup
+ HRegionServer transientRS =
cluster.startRegionServerAndWait(10000).getRegionServer();
+ try (Admin admin = TEST_UTIL.getConnection().getAdmin()) {
+ List<RegionInfo> regions = admin.getRegions(table1);
+ if (!regions.isEmpty()) {
+ admin.move(regions.get(0).getEncodedNameAsBytes(),
transientRS.getServerName());
+ TEST_UTIL.waitUntilAllRegionsAssigned(table1);
+ }
+ }
+ try (Connection conn = ConnectionFactory.createConnection(conf1)) {
+ insertIntoTable(conn, table1, famName, 5, 50);
+ }
+ transientRS.getWalRoller().requestRollAll();
+ transientRS.getWalRoller().waitUntilWalRollFinished();
+ String transientHost =
+ transientRS.getServerName().getHostname() + ":" +
transientRS.getServerName().getPort();
+
+ cluster.stopRegionServer(transientRS.getServerName());
+ Thread.sleep(5000);
+
+ String fullBackupId = fullTableBackup(tables);
+ assertTrue("Full backup should succeed", checkSucceeded(fullBackupId));
+
+ try (BackupSystemTable sysTable = new
BackupSystemTable(TEST_UTIL.getConnection())) {
+ Map<TableName, Map<String, Long>> timestamps =
sysTable.readLogTimestampMap(BACKUP_ROOT_DIR);
+ Map<String, Long> rsTimestamps = timestamps.get(table1);
+ assertNotNull(
+ "Transient RS that went offline before full backup should have its WAL
boundary recorded",
+ rsTimestamps.get(transientHost));
+
+ String incrBackupId = incrementalTableBackup(tables);
+ assertTrue("Incremental backup after transient RS went offline should
succeed",
+ checkSucceeded(incrBackupId));
+
+ timestamps = sysTable.readLogTimestampMap(BACKUP_ROOT_DIR);
+ rsTimestamps = timestamps.get(table1);
+ assertFalse(
+ "Transient RS should not have a boundary after the full backup covered
all its WALs",
+ rsTimestamps.containsKey(transientHost));
+ }
+ }
+
+ /**
+ * Tests that WALs from an RS that comes online and goes offline between a
full backup and an
+ * incremental backup are correctly included in the incremental backup and
not re-included in
+ * subsequent incremental backups.
+ */
+ @Test
+ public void testTransientRSAfterFullBackupBeforeIncremental() throws
Exception {
+ SingleProcessHBaseCluster cluster = TEST_UTIL.getMiniHBaseCluster();
+ List<TableName> tables = Lists.newArrayList(table1);
+
+ try (Connection conn = ConnectionFactory.createConnection(conf1)) {
+ insertIntoTable(conn, table1, famName, 6, 50);
+ }
+
+ String fullBackupId = fullTableBackup(tables);
+ assertTrue("Full backup should succeed", checkSucceeded(fullBackupId));
+
+ // Start a new RS after the full backup, move a region to it, let it
accumulate WALs, then stop
+ HRegionServer transientRS =
cluster.startRegionServerAndWait(10000).getRegionServer();
+ try (Admin admin = TEST_UTIL.getConnection().getAdmin()) {
+ List<RegionInfo> regions = admin.getRegions(table1);
+ if (!regions.isEmpty()) {
Review Comment:
Same remark as line 230.
##########
hbase-backup/src/main/java/org/apache/hadoop/hbase/backup/impl/FullTableBackupClient.java:
##########
@@ -181,6 +189,50 @@ public void execute() throws IOException {
// set overall backup status: complete. Here we make sure to complete
the backup.
// After this checkpoint, even if entering cancel process, will let the
backup finished
backupInfo.setState(BackupState.COMPLETE);
+
+ // Scan oldlogs for dead/decommissioned hosts and add their max WAL
timestamps
+ // to newTimestamps. This ensures subsequent incremental backups won't
try to back up
+ // WALs that are already covered by this full backup's snapshot.
+ Path walRootDir = CommonFSUtils.getWALRootDir(conf);
+ Path logDir = new Path(walRootDir, HConstants.HREGION_LOGDIR_NAME);
+ Path oldLogDir = new Path(walRootDir, HConstants.HREGION_OLDLOGDIR_NAME);
+ FileSystem fs = walRootDir.getFileSystem(conf);
+
+ List<FileStatus> allLogs = new ArrayList<>();
+ for (FileStatus hostLogDir : fs.listStatus(logDir)) {
+ String host =
BackupUtils.parseHostNameFromLogFile(hostLogDir.getPath());
+ if (host == null) {
+ continue;
+ }
+ allLogs.addAll(Arrays.asList(fs.listStatus(hostLogDir.getPath())));
+ }
+ allLogs.addAll(Arrays.asList(fs.listStatus(oldLogDir)));
Review Comment:
The first part of my comment still needs to be addressed.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]