(hbase) branch branch-3 updated: HBASE-29039 Seek past delete markers instead of skipping one at a time (#8001) (#8040)

junegunn Tue, 07 Apr 2026 23:14:48 -0700

This is an automated email from the ASF dual-hosted git repository.

junegunn pushed a commit to branch branch-3
in repository https://gitbox.apache.org/repos/asf/hbase.git



The following commit(s) were added to refs/heads/branch-3 by this push:
     new d938d3b9aaa HBASE-29039 Seek past delete markers instead of skipping 
one at a time (#8001) (#8040)
d938d3b9aaa is described below

commit d938d3b9aaa1495f704829f58a3a5f962bf5473a
Author: Junegunn Choi <[email protected]>
AuthorDate: Wed Apr 8 15:14:33 2026 +0900

    HBASE-29039 Seek past delete markers instead of skipping one at a time 
(#8001) (#8040)
    
    When a DeleteColumn or DeleteFamily marker is encountered during a normal
    user scan, the matcher currently returns SKIP, forcing the scanner to
    advance one cell at a time. This causes read latency to degrade linearly
    with the number of accumulated delete markers for the same row or column.
    
    Since these are range deletes that mask all remaining versions of the
    column, seek past the entire column immediately via
    columns.getNextRowOrNextColumn(). This is safe because cells arrive in
    timestamp descending order, so any puts newer than the delete have
    already been processed.
    
    For DeleteFamily, also fix getKeyForNextColumn in ScanQueryMatcher to
    bypass the empty-qualifier guard (HBASE-18471) when the cell is a
    DeleteFamily marker. Without this, the seek barely advances past the
    current cell instead of jumping to the first real qualified column.
    
    The optimization is only applied with plain ScanDeleteTracker, and
    skipped when:
    - seePastDeleteMarkers is true (KEEP_DELETED_CELLS)
    - newVersionBehavior is enabled (sequence IDs determine visibility)
    - visibility labels are in use (delete/put label mismatch)
    
    ---
    
    Seeking is more expensive than skipping. When each row has only one
    DeleteFamily or DeleteColumn marker (common case), the seek overhead
    adds up across many rows, causing performance regression.
    
    Introduce a counter that tracks consecutive range delete markers per row.
    Only switch from SKIP to SEEK after seeing SEEK_ON_DELETE_MARKER_THRESHOLD
    (default 10) markers, indicating actual accumulation. This preserves skip
    performance for the common case while still optimizing the accumulation
    case.
    
    Signed-off-by: Charles Connell <[email protected]>
---
 .../querymatcher/NormalUserScanQueryMatcher.java   |  50 ++++++++
 .../querymatcher/ScanQueryMatcher.java             |  11 +-
 .../querymatcher/TestUserScanQueryMatcher.java     | 141 +++++++++++++++++++++
 3 files changed, 196 insertions(+), 6 deletions(-)

diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/NormalUserScanQueryMatcher.java
 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/NormalUserScanQueryMatcher.java
index 9ad3c792345..2dd5594475b 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/NormalUserScanQueryMatcher.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/NormalUserScanQueryMatcher.java
@@ -18,8 +18,11 @@
 package org.apache.hadoop.hbase.regionserver.querymatcher;
 
 import java.io.IOException;
+import org.apache.hadoop.hbase.CellUtil;
 import org.apache.hadoop.hbase.ExtendedCell;
 import org.apache.hadoop.hbase.KeepDeletedCells;
+import org.apache.hadoop.hbase.KeyValue;
+import org.apache.hadoop.hbase.KeyValueUtil;
 import org.apache.hadoop.hbase.PrivateCellUtil;
 import org.apache.hadoop.hbase.client.Scan;
 import org.apache.hadoop.hbase.regionserver.ScanInfo;
@@ -31,6 +34,14 @@ import org.apache.yetus.audience.InterfaceAudience;
 @InterfaceAudience.Private
 public abstract class NormalUserScanQueryMatcher extends UserScanQueryMatcher {
 
+  /**
+   * Number of consecutive range delete markers (DeleteColumn/DeleteFamily) to 
skip before switching
+   * to seek. Seeking is more expensive than skipping for a single marker, but 
much faster when
+   * markers accumulate. This threshold avoids the seek overhead for the 
common case (one delete per
+   * row/column) while still kicking in when markers pile up.
+   */
+  static final int SEEK_ON_DELETE_MARKER_THRESHOLD = 10;
+
   /** Keeps track of deletes */
   private final DeleteTracker deletes;
 
@@ -40,18 +51,32 @@ public abstract class NormalUserScanQueryMatcher extends 
UserScanQueryMatcher {
   /** whether time range queries can see rows "behind" a delete */
   protected final boolean seePastDeleteMarkers;
 
+  /** Whether seek optimization for range delete markers is applicable */
+  private final boolean canSeekOnDeleteMarker;
+
+  /** Count of consecutive range delete markers seen for the same column */
+  private int rangeDeleteCount;
+
+  /** Last range delete cell, for qualifier comparison across consecutive 
markers */
+  private ExtendedCell lastDelete;
+
   protected NormalUserScanQueryMatcher(Scan scan, ScanInfo scanInfo, 
ColumnTracker columns,
     boolean hasNullColumn, DeleteTracker deletes, long oldestUnexpiredTS, long 
now) {
     super(scan, scanInfo, columns, hasNullColumn, oldestUnexpiredTS, now);
     this.deletes = deletes;
     this.get = scan.isGetScan();
     this.seePastDeleteMarkers = scanInfo.getKeepDeletedCells() != 
KeepDeletedCells.FALSE;
+    this.canSeekOnDeleteMarker =
+      !seePastDeleteMarkers && deletes.getClass() == ScanDeleteTracker.class;
   }
 
   @Override
   public void beforeShipped() throws IOException {
     super.beforeShipped();
     deletes.beforeShipped();
+    if (lastDelete != null) {
+      lastDelete = KeyValueUtil.toNewKeyCell(lastDelete);
+    }
   }
 
   @Override
@@ -71,8 +96,31 @@ public abstract class NormalUserScanQueryMatcher extends 
UserScanQueryMatcher {
       if (includeDeleteMarker) {
         this.deletes.add(cell);
       }
+
+      // A DeleteColumn or DeleteFamily masks all remaining cells for this 
column/family.
+      // Seek past them instead of skipping one cell at a time, but only after 
seeing
+      // enough consecutive markers for the same column to justify the seek 
overhead.
+      // Only safe with plain ScanDeleteTracker. Not safe with 
newVersionBehavior (sequence
+      // IDs determine visibility), visibility labels (delete/put label 
mismatch), or
+      // seePastDeleteMarkers (KEEP_DELETED_CELLS).
+      if (
+        canSeekOnDeleteMarker && (typeByte == 
KeyValue.Type.DeleteFamily.getCode()
+          || (typeByte == KeyValue.Type.DeleteColumn.getCode() && 
cell.getQualifierLength() > 0))
+      ) {
+        if (lastDelete != null && !CellUtil.matchingQualifier(cell, 
lastDelete)) {
+          rangeDeleteCount = 0;
+        }
+        lastDelete = cell;
+        if (++rangeDeleteCount >= SEEK_ON_DELETE_MARKER_THRESHOLD) {
+          rangeDeleteCount = 0;
+          return columns.getNextRowOrNextColumn(cell);
+        }
+      } else {
+        rangeDeleteCount = 0;
+      }
       return MatchCode.SKIP;
     }
+    rangeDeleteCount = 0;
     returnCode = checkDeleted(deletes, cell);
     if (returnCode != null) {
       return returnCode;
@@ -83,6 +131,8 @@ public abstract class NormalUserScanQueryMatcher extends 
UserScanQueryMatcher {
   @Override
   protected void reset() {
     deletes.reset();
+    rangeDeleteCount = 0;
+    lastDelete = null;
   }
 
   @Override
diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/ScanQueryMatcher.java
 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/ScanQueryMatcher.java
index 4f6168b1e92..4478ccf4a74 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/ScanQueryMatcher.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/ScanQueryMatcher.java
@@ -292,12 +292,11 @@ public abstract class ScanQueryMatcher implements 
ShipperListener {
   public abstract boolean moreRowsMayExistAfter(ExtendedCell cell);
 
   public ExtendedCell getKeyForNextColumn(ExtendedCell cell) {
-    // We aren't sure whether any DeleteFamily cells exist, so we can't skip 
to next column.
-    // TODO: Current way disable us to seek to next column quickly. Is there 
any better solution?
-    // see HBASE-18471 for more details
-    // see TestFromClientSide3#testScanAfterDeletingSpecifiedRow
-    // see TestFromClientSide3#testScanAfterDeletingSpecifiedRowV2
-    if (cell.getQualifierLength() == 0) {
+    // For cells with empty qualifier, we generally can't skip to the next 
column because
+    // DeleteFamily cells might exist that we haven't seen yet (see 
HBASE-18471).
+    // However, if the cell itself IS a DeleteFamily marker, we know we've 
already processed it,
+    // so we can safely seek to the next real column.
+    if (cell.getQualifierLength() == 0 && 
!PrivateCellUtil.isDeleteFamily(cell)) {
       ExtendedCell nextKey = PrivateCellUtil.createNextOnRowCol(cell);
       if (nextKey != cell) {
         return nextKey;
diff --git 
a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/querymatcher/TestUserScanQueryMatcher.java
 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/querymatcher/TestUserScanQueryMatcher.java
index efbb4c77d47..36dbd213bbe 100644
--- 
a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/querymatcher/TestUserScanQueryMatcher.java
+++ 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/querymatcher/TestUserScanQueryMatcher.java
@@ -30,6 +30,7 @@ import org.apache.hadoop.hbase.HBaseClassTestRule;
 import org.apache.hadoop.hbase.HConstants;
 import org.apache.hadoop.hbase.KeepDeletedCells;
 import org.apache.hadoop.hbase.KeyValue;
+import org.apache.hadoop.hbase.KeyValue.Type;
 import org.apache.hadoop.hbase.PrivateCellUtil;
 import org.apache.hadoop.hbase.client.Scan;
 import org.apache.hadoop.hbase.filter.FilterBase;
@@ -396,4 +397,144 @@ public class TestUserScanQueryMatcher extends 
AbstractTestScanQueryMatcher {
     Cell nextCell = qm.getKeyForNextColumn(lastCell);
     assertArrayEquals(nextCell.getQualifierArray(), col4);
   }
+
+  /**
+   * After enough consecutive range delete markers, the matcher should switch 
from SKIP to
+   * SEEK_NEXT_COL. Point deletes and KEEP_DELETED_CELLS always SKIP.
+   */
+  @Test
+  public void testSeekOnRangeDelete() throws IOException {
+    int n = NormalUserScanQueryMatcher.SEEK_ON_DELETE_MARKER_THRESHOLD;
+
+    // DeleteColumn: first N-1 SKIP, N-th triggers SEEK_NEXT_COL
+    assertSeekAfterThreshold(KeepDeletedCells.FALSE, Type.DeleteColumn, n);
+
+    // DeleteFamily: same threshold behavior
+    assertSeekAfterThreshold(KeepDeletedCells.FALSE, Type.DeleteFamily, n);
+
+    // Delete (version): always SKIP (point delete, not range)
+    assertAllSkip(KeepDeletedCells.FALSE, Type.Delete, n + 1);
+
+    // KEEP_DELETED_CELLS=TRUE: always SKIP
+    assertAllSkip(KeepDeletedCells.TRUE, Type.DeleteColumn, n + 1);
+  }
+
+  /**
+   * DeleteColumn with empty qualifier must not cause seeking past a 
subsequent DeleteFamily.
+   * DeleteFamily masks all columns, so it must be tracked by the delete 
tracker.
+   */
+  @Test
+  public void testDeleteColumnEmptyQualifierDoesNotSkipDeleteFamily() throws 
IOException {
+    long now = EnvironmentEdgeManager.currentTime();
+    byte[] e = HConstants.EMPTY_BYTE_ARRAY;
+    UserScanQueryMatcher qm = UserScanQueryMatcher.create(scan, new 
ScanInfo(this.conf, fam1, 0, 1,
+      ttl, KeepDeletedCells.FALSE, HConstants.DEFAULT_BLOCKSIZE, 0, 
rowComparator, false), null,
+      now - ttl, now, null);
+
+    int n = NormalUserScanQueryMatcher.SEEK_ON_DELETE_MARKER_THRESHOLD;
+    // Feed DCs with empty qualifier past the threshold, then a DF.
+    // The DF must NOT be seeked past -- it must be SKIP'd so the tracker 
picks it up.
+    qm.setToNewRow(new KeyValue(row1, fam1, e, now, Type.DeleteColumn));
+    for (int i = 0; i < n + 1; i++) {
+      // Empty qualifier DCs should never trigger seek, regardless of threshold
+      assertEquals("DC at i=" + i, MatchCode.SKIP,
+        qm.match(new KeyValue(row1, fam1, e, now - i, Type.DeleteColumn)));
+    }
+    KeyValue df = new KeyValue(row1, fam1, e, now - n - 1, Type.DeleteFamily);
+    KeyValue put = new KeyValue(row1, fam1, col1, now - n - 1, Type.Put, data);
+    // DF must be processed (SKIP), not seeked past
+    assertEquals(MatchCode.SKIP, qm.match(df));
+    // Put in col1 at t=now-3 should be masked by DF@t=now-3
+    MatchCode putCode = qm.match(put);
+    assertEquals(MatchCode.SEEK_NEXT_COL, putCode);
+  }
+
+  /**
+   * DeleteColumn markers for different qualifiers should not accumulate the 
seek counter. Only
+   * consecutive markers for the same qualifier should trigger seeking.
+   */
+  @Test
+  public void testDeleteColumnDifferentQualifiersDoNotSeek() throws 
IOException {
+    long now = EnvironmentEdgeManager.currentTime();
+    UserScanQueryMatcher qm = UserScanQueryMatcher.create(scan, new 
ScanInfo(this.conf, fam1, 0, 1,
+      ttl, KeepDeletedCells.FALSE, HConstants.DEFAULT_BLOCKSIZE, 0, 
rowComparator, false), null,
+      now - ttl, now, null);
+
+    // DCs for different qualifiers: counter resets on qualifier change, never 
seeks
+    qm.setToNewRow(new KeyValue(row1, fam1, col1, now, Type.DeleteColumn));
+    assertEquals(MatchCode.SKIP, qm.match(new KeyValue(row1, fam1, col1, now, 
Type.DeleteColumn)));
+    assertEquals(MatchCode.SKIP,
+      qm.match(new KeyValue(row1, fam1, col2, now - 1, Type.DeleteColumn)));
+    assertEquals(MatchCode.SKIP,
+      qm.match(new KeyValue(row1, fam1, col3, now - 2, Type.DeleteColumn)));
+    assertEquals(MatchCode.SKIP,
+      qm.match(new KeyValue(row1, fam1, col4, now - 3, Type.DeleteColumn)));
+    assertEquals(MatchCode.SKIP,
+      qm.match(new KeyValue(row1, fam1, col5, now - 4, Type.DeleteColumn)));
+  }
+
+  /**
+   * Delete markers outside the scan's time range (includeDeleteMarker=false) 
should still
+   * accumulate the seek counter and trigger SEEK_NEXT_COL after the threshold.
+   */
+  @Test
+  public void testSeekOnRangeDeleteOutsideTimeRange() throws IOException {
+    long now = EnvironmentEdgeManager.currentTime();
+    long futureTs = now + 1_000_000;
+    Scan scanWithTimeRange = new Scan(scan).setTimeRange(futureTs, 
Long.MAX_VALUE);
+
+    UserScanQueryMatcher qm = UserScanQueryMatcher.create(scanWithTimeRange,
+      new ScanInfo(this.conf, fam1, 0, 1, ttl, KeepDeletedCells.FALSE, 
HConstants.DEFAULT_BLOCKSIZE,
+        0, rowComparator, false),
+      null, now - ttl, now, null);
+
+    int n = NormalUserScanQueryMatcher.SEEK_ON_DELETE_MARKER_THRESHOLD;
+    qm.setToNewRow(new KeyValue(row1, fam1, col1, now, Type.DeleteColumn));
+    // All DCs have timestamps below the time range, so includeDeleteMarker is 
false.
+    // The seek counter should still accumulate.
+    for (int i = 0; i < n - 1; i++) {
+      assertEquals("DC at i=" + i, MatchCode.SKIP,
+        qm.match(new KeyValue(row1, fam1, col1, now - i, Type.DeleteColumn)));
+    }
+    assertEquals(MatchCode.SEEK_NEXT_COL,
+      qm.match(new KeyValue(row1, fam1, col1, now - n + 1, 
Type.DeleteColumn)));
+  }
+
+  private UserScanQueryMatcher createDeleteMatcher(KeepDeletedCells 
keepDeletedCells)
+    throws IOException {
+    long now = EnvironmentEdgeManager.currentTime();
+    return UserScanQueryMatcher.create(scan, new ScanInfo(this.conf, fam1, 0, 
1, ttl,
+      keepDeletedCells, HConstants.DEFAULT_BLOCKSIZE, 0, rowComparator, 
false), null, now - ttl,
+      now, null);
+  }
+
+  /** First n-1 markers SKIP, n-th triggers SEEK_NEXT_COL. */
+  private void assertSeekAfterThreshold(KeepDeletedCells keepDeletedCells, 
Type type, int n)
+    throws IOException {
+    long now = EnvironmentEdgeManager.currentTime();
+    UserScanQueryMatcher qm = createDeleteMatcher(keepDeletedCells);
+    boolean familyLevel = type == Type.DeleteFamily || type == 
Type.DeleteFamilyVersion;
+    byte[] qual = familyLevel ? HConstants.EMPTY_BYTE_ARRAY : col1;
+    qm.setToNewRow(new KeyValue(row1, fam1, qual, now, type));
+    for (int i = 0; i < n - 1; i++) {
+      assertEquals("Mismatch at index " + i, MatchCode.SKIP,
+        qm.match(new KeyValue(row1, fam1, qual, now - i, type)));
+    }
+    assertEquals("Expected SEEK_NEXT_COL at index " + (n - 1), 
MatchCode.SEEK_NEXT_COL,
+      qm.match(new KeyValue(row1, fam1, qual, now - n + 1, type)));
+  }
+
+  /** All markers should SKIP regardless of count. */
+  private void assertAllSkip(KeepDeletedCells keepDeletedCells, Type type, int 
count)
+    throws IOException {
+    long now = EnvironmentEdgeManager.currentTime();
+    UserScanQueryMatcher qm = createDeleteMatcher(keepDeletedCells);
+    boolean familyLevel = type == Type.DeleteFamily || type == 
Type.DeleteFamilyVersion;
+    byte[] qual = familyLevel ? HConstants.EMPTY_BYTE_ARRAY : col1;
+    qm.setToNewRow(new KeyValue(row1, fam1, qual, now, type));
+    for (int i = 0; i < count; i++) {
+      assertEquals("Mismatch at index " + i, MatchCode.SKIP,
+        qm.match(new KeyValue(row1, fam1, qual, now - i, type)));
+    }
+  }
 }

(hbase) branch branch-3 updated: HBASE-29039 Seek past delete markers instead of skipping one at a time (#8001) (#8040)

Reply via email to