This is an automated email from the ASF dual-hosted git repository.
junegunn pushed a commit to branch branch-3
in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/branch-3 by this push:
new d938d3b9aaa HBASE-29039 Seek past delete markers instead of skipping
one at a time (#8001) (#8040)
d938d3b9aaa is described below
commit d938d3b9aaa1495f704829f58a3a5f962bf5473a
Author: Junegunn Choi <[email protected]>
AuthorDate: Wed Apr 8 15:14:33 2026 +0900
HBASE-29039 Seek past delete markers instead of skipping one at a time
(#8001) (#8040)
When a DeleteColumn or DeleteFamily marker is encountered during a normal
user scan, the matcher currently returns SKIP, forcing the scanner to
advance one cell at a time. This causes read latency to degrade linearly
with the number of accumulated delete markers for the same row or column.
Since these are range deletes that mask all remaining versions of the
column, seek past the entire column immediately via
columns.getNextRowOrNextColumn(). This is safe because cells arrive in
timestamp descending order, so any puts newer than the delete have
already been processed.
For DeleteFamily, also fix getKeyForNextColumn in ScanQueryMatcher to
bypass the empty-qualifier guard (HBASE-18471) when the cell is a
DeleteFamily marker. Without this, the seek barely advances past the
current cell instead of jumping to the first real qualified column.
The optimization is only applied with plain ScanDeleteTracker, and
skipped when:
- seePastDeleteMarkers is true (KEEP_DELETED_CELLS)
- newVersionBehavior is enabled (sequence IDs determine visibility)
- visibility labels are in use (delete/put label mismatch)
---
Seeking is more expensive than skipping. When each row has only one
DeleteFamily or DeleteColumn marker (common case), the seek overhead
adds up across many rows, causing performance regression.
Introduce a counter that tracks consecutive range delete markers per row.
Only switch from SKIP to SEEK after seeing SEEK_ON_DELETE_MARKER_THRESHOLD
(default 10) markers, indicating actual accumulation. This preserves skip
performance for the common case while still optimizing the accumulation
case.
Signed-off-by: Charles Connell <[email protected]>
---
.../querymatcher/NormalUserScanQueryMatcher.java | 50 ++++++++
.../querymatcher/ScanQueryMatcher.java | 11 +-
.../querymatcher/TestUserScanQueryMatcher.java | 141 +++++++++++++++++++++
3 files changed, 196 insertions(+), 6 deletions(-)
diff --git
a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/NormalUserScanQueryMatcher.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/NormalUserScanQueryMatcher.java
index 9ad3c792345..2dd5594475b 100644
---
a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/NormalUserScanQueryMatcher.java
+++
b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/NormalUserScanQueryMatcher.java
@@ -18,8 +18,11 @@
package org.apache.hadoop.hbase.regionserver.querymatcher;
import java.io.IOException;
+import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.ExtendedCell;
import org.apache.hadoop.hbase.KeepDeletedCells;
+import org.apache.hadoop.hbase.KeyValue;
+import org.apache.hadoop.hbase.KeyValueUtil;
import org.apache.hadoop.hbase.PrivateCellUtil;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.regionserver.ScanInfo;
@@ -31,6 +34,14 @@ import org.apache.yetus.audience.InterfaceAudience;
@InterfaceAudience.Private
public abstract class NormalUserScanQueryMatcher extends UserScanQueryMatcher {
+ /**
+ * Number of consecutive range delete markers (DeleteColumn/DeleteFamily) to
skip before switching
+ * to seek. Seeking is more expensive than skipping for a single marker, but
much faster when
+ * markers accumulate. This threshold avoids the seek overhead for the
common case (one delete per
+ * row/column) while still kicking in when markers pile up.
+ */
+ static final int SEEK_ON_DELETE_MARKER_THRESHOLD = 10;
+
/** Keeps track of deletes */
private final DeleteTracker deletes;
@@ -40,18 +51,32 @@ public abstract class NormalUserScanQueryMatcher extends
UserScanQueryMatcher {
/** whether time range queries can see rows "behind" a delete */
protected final boolean seePastDeleteMarkers;
+ /** Whether seek optimization for range delete markers is applicable */
+ private final boolean canSeekOnDeleteMarker;
+
+ /** Count of consecutive range delete markers seen for the same column */
+ private int rangeDeleteCount;
+
+ /** Last range delete cell, for qualifier comparison across consecutive
markers */
+ private ExtendedCell lastDelete;
+
protected NormalUserScanQueryMatcher(Scan scan, ScanInfo scanInfo,
ColumnTracker columns,
boolean hasNullColumn, DeleteTracker deletes, long oldestUnexpiredTS, long
now) {
super(scan, scanInfo, columns, hasNullColumn, oldestUnexpiredTS, now);
this.deletes = deletes;
this.get = scan.isGetScan();
this.seePastDeleteMarkers = scanInfo.getKeepDeletedCells() !=
KeepDeletedCells.FALSE;
+ this.canSeekOnDeleteMarker =
+ !seePastDeleteMarkers && deletes.getClass() == ScanDeleteTracker.class;
}
@Override
public void beforeShipped() throws IOException {
super.beforeShipped();
deletes.beforeShipped();
+ if (lastDelete != null) {
+ lastDelete = KeyValueUtil.toNewKeyCell(lastDelete);
+ }
}
@Override
@@ -71,8 +96,31 @@ public abstract class NormalUserScanQueryMatcher extends
UserScanQueryMatcher {
if (includeDeleteMarker) {
this.deletes.add(cell);
}
+
+ // A DeleteColumn or DeleteFamily masks all remaining cells for this
column/family.
+ // Seek past them instead of skipping one cell at a time, but only after
seeing
+ // enough consecutive markers for the same column to justify the seek
overhead.
+ // Only safe with plain ScanDeleteTracker. Not safe with
newVersionBehavior (sequence
+ // IDs determine visibility), visibility labels (delete/put label
mismatch), or
+ // seePastDeleteMarkers (KEEP_DELETED_CELLS).
+ if (
+ canSeekOnDeleteMarker && (typeByte ==
KeyValue.Type.DeleteFamily.getCode()
+ || (typeByte == KeyValue.Type.DeleteColumn.getCode() &&
cell.getQualifierLength() > 0))
+ ) {
+ if (lastDelete != null && !CellUtil.matchingQualifier(cell,
lastDelete)) {
+ rangeDeleteCount = 0;
+ }
+ lastDelete = cell;
+ if (++rangeDeleteCount >= SEEK_ON_DELETE_MARKER_THRESHOLD) {
+ rangeDeleteCount = 0;
+ return columns.getNextRowOrNextColumn(cell);
+ }
+ } else {
+ rangeDeleteCount = 0;
+ }
return MatchCode.SKIP;
}
+ rangeDeleteCount = 0;
returnCode = checkDeleted(deletes, cell);
if (returnCode != null) {
return returnCode;
@@ -83,6 +131,8 @@ public abstract class NormalUserScanQueryMatcher extends
UserScanQueryMatcher {
@Override
protected void reset() {
deletes.reset();
+ rangeDeleteCount = 0;
+ lastDelete = null;
}
@Override
diff --git
a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/ScanQueryMatcher.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/ScanQueryMatcher.java
index 4f6168b1e92..4478ccf4a74 100644
---
a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/ScanQueryMatcher.java
+++
b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/ScanQueryMatcher.java
@@ -292,12 +292,11 @@ public abstract class ScanQueryMatcher implements
ShipperListener {
public abstract boolean moreRowsMayExistAfter(ExtendedCell cell);
public ExtendedCell getKeyForNextColumn(ExtendedCell cell) {
- // We aren't sure whether any DeleteFamily cells exist, so we can't skip
to next column.
- // TODO: Current way disable us to seek to next column quickly. Is there
any better solution?
- // see HBASE-18471 for more details
- // see TestFromClientSide3#testScanAfterDeletingSpecifiedRow
- // see TestFromClientSide3#testScanAfterDeletingSpecifiedRowV2
- if (cell.getQualifierLength() == 0) {
+ // For cells with empty qualifier, we generally can't skip to the next
column because
+ // DeleteFamily cells might exist that we haven't seen yet (see
HBASE-18471).
+ // However, if the cell itself IS a DeleteFamily marker, we know we've
already processed it,
+ // so we can safely seek to the next real column.
+ if (cell.getQualifierLength() == 0 &&
!PrivateCellUtil.isDeleteFamily(cell)) {
ExtendedCell nextKey = PrivateCellUtil.createNextOnRowCol(cell);
if (nextKey != cell) {
return nextKey;
diff --git
a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/querymatcher/TestUserScanQueryMatcher.java
b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/querymatcher/TestUserScanQueryMatcher.java
index efbb4c77d47..36dbd213bbe 100644
---
a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/querymatcher/TestUserScanQueryMatcher.java
+++
b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/querymatcher/TestUserScanQueryMatcher.java
@@ -30,6 +30,7 @@ import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.KeepDeletedCells;
import org.apache.hadoop.hbase.KeyValue;
+import org.apache.hadoop.hbase.KeyValue.Type;
import org.apache.hadoop.hbase.PrivateCellUtil;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.FilterBase;
@@ -396,4 +397,144 @@ public class TestUserScanQueryMatcher extends
AbstractTestScanQueryMatcher {
Cell nextCell = qm.getKeyForNextColumn(lastCell);
assertArrayEquals(nextCell.getQualifierArray(), col4);
}
+
+ /**
+ * After enough consecutive range delete markers, the matcher should switch
from SKIP to
+ * SEEK_NEXT_COL. Point deletes and KEEP_DELETED_CELLS always SKIP.
+ */
+ @Test
+ public void testSeekOnRangeDelete() throws IOException {
+ int n = NormalUserScanQueryMatcher.SEEK_ON_DELETE_MARKER_THRESHOLD;
+
+ // DeleteColumn: first N-1 SKIP, N-th triggers SEEK_NEXT_COL
+ assertSeekAfterThreshold(KeepDeletedCells.FALSE, Type.DeleteColumn, n);
+
+ // DeleteFamily: same threshold behavior
+ assertSeekAfterThreshold(KeepDeletedCells.FALSE, Type.DeleteFamily, n);
+
+ // Delete (version): always SKIP (point delete, not range)
+ assertAllSkip(KeepDeletedCells.FALSE, Type.Delete, n + 1);
+
+ // KEEP_DELETED_CELLS=TRUE: always SKIP
+ assertAllSkip(KeepDeletedCells.TRUE, Type.DeleteColumn, n + 1);
+ }
+
+ /**
+ * DeleteColumn with empty qualifier must not cause seeking past a
subsequent DeleteFamily.
+ * DeleteFamily masks all columns, so it must be tracked by the delete
tracker.
+ */
+ @Test
+ public void testDeleteColumnEmptyQualifierDoesNotSkipDeleteFamily() throws
IOException {
+ long now = EnvironmentEdgeManager.currentTime();
+ byte[] e = HConstants.EMPTY_BYTE_ARRAY;
+ UserScanQueryMatcher qm = UserScanQueryMatcher.create(scan, new
ScanInfo(this.conf, fam1, 0, 1,
+ ttl, KeepDeletedCells.FALSE, HConstants.DEFAULT_BLOCKSIZE, 0,
rowComparator, false), null,
+ now - ttl, now, null);
+
+ int n = NormalUserScanQueryMatcher.SEEK_ON_DELETE_MARKER_THRESHOLD;
+ // Feed DCs with empty qualifier past the threshold, then a DF.
+ // The DF must NOT be seeked past -- it must be SKIP'd so the tracker
picks it up.
+ qm.setToNewRow(new KeyValue(row1, fam1, e, now, Type.DeleteColumn));
+ for (int i = 0; i < n + 1; i++) {
+ // Empty qualifier DCs should never trigger seek, regardless of threshold
+ assertEquals("DC at i=" + i, MatchCode.SKIP,
+ qm.match(new KeyValue(row1, fam1, e, now - i, Type.DeleteColumn)));
+ }
+ KeyValue df = new KeyValue(row1, fam1, e, now - n - 1, Type.DeleteFamily);
+ KeyValue put = new KeyValue(row1, fam1, col1, now - n - 1, Type.Put, data);
+ // DF must be processed (SKIP), not seeked past
+ assertEquals(MatchCode.SKIP, qm.match(df));
+ // Put in col1 at t=now-3 should be masked by DF@t=now-3
+ MatchCode putCode = qm.match(put);
+ assertEquals(MatchCode.SEEK_NEXT_COL, putCode);
+ }
+
+ /**
+ * DeleteColumn markers for different qualifiers should not accumulate the
seek counter. Only
+ * consecutive markers for the same qualifier should trigger seeking.
+ */
+ @Test
+ public void testDeleteColumnDifferentQualifiersDoNotSeek() throws
IOException {
+ long now = EnvironmentEdgeManager.currentTime();
+ UserScanQueryMatcher qm = UserScanQueryMatcher.create(scan, new
ScanInfo(this.conf, fam1, 0, 1,
+ ttl, KeepDeletedCells.FALSE, HConstants.DEFAULT_BLOCKSIZE, 0,
rowComparator, false), null,
+ now - ttl, now, null);
+
+ // DCs for different qualifiers: counter resets on qualifier change, never
seeks
+ qm.setToNewRow(new KeyValue(row1, fam1, col1, now, Type.DeleteColumn));
+ assertEquals(MatchCode.SKIP, qm.match(new KeyValue(row1, fam1, col1, now,
Type.DeleteColumn)));
+ assertEquals(MatchCode.SKIP,
+ qm.match(new KeyValue(row1, fam1, col2, now - 1, Type.DeleteColumn)));
+ assertEquals(MatchCode.SKIP,
+ qm.match(new KeyValue(row1, fam1, col3, now - 2, Type.DeleteColumn)));
+ assertEquals(MatchCode.SKIP,
+ qm.match(new KeyValue(row1, fam1, col4, now - 3, Type.DeleteColumn)));
+ assertEquals(MatchCode.SKIP,
+ qm.match(new KeyValue(row1, fam1, col5, now - 4, Type.DeleteColumn)));
+ }
+
+ /**
+ * Delete markers outside the scan's time range (includeDeleteMarker=false)
should still
+ * accumulate the seek counter and trigger SEEK_NEXT_COL after the threshold.
+ */
+ @Test
+ public void testSeekOnRangeDeleteOutsideTimeRange() throws IOException {
+ long now = EnvironmentEdgeManager.currentTime();
+ long futureTs = now + 1_000_000;
+ Scan scanWithTimeRange = new Scan(scan).setTimeRange(futureTs,
Long.MAX_VALUE);
+
+ UserScanQueryMatcher qm = UserScanQueryMatcher.create(scanWithTimeRange,
+ new ScanInfo(this.conf, fam1, 0, 1, ttl, KeepDeletedCells.FALSE,
HConstants.DEFAULT_BLOCKSIZE,
+ 0, rowComparator, false),
+ null, now - ttl, now, null);
+
+ int n = NormalUserScanQueryMatcher.SEEK_ON_DELETE_MARKER_THRESHOLD;
+ qm.setToNewRow(new KeyValue(row1, fam1, col1, now, Type.DeleteColumn));
+ // All DCs have timestamps below the time range, so includeDeleteMarker is
false.
+ // The seek counter should still accumulate.
+ for (int i = 0; i < n - 1; i++) {
+ assertEquals("DC at i=" + i, MatchCode.SKIP,
+ qm.match(new KeyValue(row1, fam1, col1, now - i, Type.DeleteColumn)));
+ }
+ assertEquals(MatchCode.SEEK_NEXT_COL,
+ qm.match(new KeyValue(row1, fam1, col1, now - n + 1,
Type.DeleteColumn)));
+ }
+
+ private UserScanQueryMatcher createDeleteMatcher(KeepDeletedCells
keepDeletedCells)
+ throws IOException {
+ long now = EnvironmentEdgeManager.currentTime();
+ return UserScanQueryMatcher.create(scan, new ScanInfo(this.conf, fam1, 0,
1, ttl,
+ keepDeletedCells, HConstants.DEFAULT_BLOCKSIZE, 0, rowComparator,
false), null, now - ttl,
+ now, null);
+ }
+
+ /** First n-1 markers SKIP, n-th triggers SEEK_NEXT_COL. */
+ private void assertSeekAfterThreshold(KeepDeletedCells keepDeletedCells,
Type type, int n)
+ throws IOException {
+ long now = EnvironmentEdgeManager.currentTime();
+ UserScanQueryMatcher qm = createDeleteMatcher(keepDeletedCells);
+ boolean familyLevel = type == Type.DeleteFamily || type ==
Type.DeleteFamilyVersion;
+ byte[] qual = familyLevel ? HConstants.EMPTY_BYTE_ARRAY : col1;
+ qm.setToNewRow(new KeyValue(row1, fam1, qual, now, type));
+ for (int i = 0; i < n - 1; i++) {
+ assertEquals("Mismatch at index " + i, MatchCode.SKIP,
+ qm.match(new KeyValue(row1, fam1, qual, now - i, type)));
+ }
+ assertEquals("Expected SEEK_NEXT_COL at index " + (n - 1),
MatchCode.SEEK_NEXT_COL,
+ qm.match(new KeyValue(row1, fam1, qual, now - n + 1, type)));
+ }
+
+ /** All markers should SKIP regardless of count. */
+ private void assertAllSkip(KeepDeletedCells keepDeletedCells, Type type, int
count)
+ throws IOException {
+ long now = EnvironmentEdgeManager.currentTime();
+ UserScanQueryMatcher qm = createDeleteMatcher(keepDeletedCells);
+ boolean familyLevel = type == Type.DeleteFamily || type ==
Type.DeleteFamilyVersion;
+ byte[] qual = familyLevel ? HConstants.EMPTY_BYTE_ARRAY : col1;
+ qm.setToNewRow(new KeyValue(row1, fam1, qual, now, type));
+ for (int i = 0; i < count; i++) {
+ assertEquals("Mismatch at index " + i, MatchCode.SKIP,
+ qm.match(new KeyValue(row1, fam1, qual, now - i, type)));
+ }
+ }
}