This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.1 by this push:
new b1a563a5c1b branch-4.1: [fix](cloud) skip stale tablet cache check for STOP_TOKEN #63520 (#63786)
b1a563a5c1b is described below
commit b1a563a5c1b163c40de548c53f1ed89592deda74
Author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Fri May 29 14:26:19 2026 +0800
branch-4.1: [fix](cloud) skip stale tablet cache check for STOP_TOKEN #63520 (#63786)
Cherry-picked from #63520
Co-authored-by: Gavin Chou <[email protected]>
Co-authored-by: Siyang Tang <[email protected]>
---
cloud/src/meta-service/meta_service_job.cpp | 12 ++++--
cloud/test/meta_service_job_test.cpp | 65 +++++++++++++++++++++++++++++
2 files changed, 74 insertions(+), 3 deletions(-)
diff --git a/cloud/src/meta-service/meta_service_job.cpp b/cloud/src/meta-service/meta_service_job.cpp
index e335c0a9b40..dad84f6a36a 100644
--- a/cloud/src/meta-service/meta_service_job.cpp
+++ b/cloud/src/meta-service/meta_service_job.cpp
@@ -162,9 +162,15 @@ void start_compaction_job(MetaServiceCode& code, std::string& msg, std::stringst
return;
}
}
-
- if (compaction.base_compaction_cnt() < stats.base_compaction_cnt() ||
- compaction.cumulative_compaction_cnt() <
stats.cumulative_compaction_cnt()) {
+ // STOP_TOKEN is a lock marker used by schema change to block concurrent compactions during
+ // delete bitmap recalculation on MOW tables. It does not perform actual compaction, so the
+ // stale tablet cache check (which guards against compacting on outdated rowset metadata) is
+ // not meaningful for it and must be skipped to avoid spurious failures when the BE's cached
+ // compaction counts lag behind the meta-service due to a concurrent compaction completing
+ // on another BE node (see CORE-5964).
+ if (compaction.type() != TabletCompactionJobPB::STOP_TOKEN &&
+ (compaction.base_compaction_cnt() < stats.base_compaction_cnt() ||
+ compaction.cumulative_compaction_cnt() <
stats.cumulative_compaction_cnt())) {
code = MetaServiceCode::STALE_TABLET_CACHE;
SS << "could not perform compaction on expired tablet cache."
<< " req_base_compaction_cnt=" << compaction.base_compaction_cnt()
diff --git a/cloud/test/meta_service_job_test.cpp b/cloud/test/meta_service_job_test.cpp
index d5c837e8711..1926f6c600a 100644
--- a/cloud/test/meta_service_job_test.cpp
+++ b/cloud/test/meta_service_job_test.cpp
@@ -1645,6 +1645,71 @@ void check_job_key(MetaServiceProxy* meta_service, std::string instance_id, int6
}
}
+// Regression test for CORE-5964: STOP_TOKEN should not be rejected by the stale tablet
+// cache check even when the BE's cached compaction counts lag behind the meta-service.
+// STOP_TOKEN is a lock marker used by schema change (MOW table) to block concurrent
+// compactions during delete bitmap recalculation -- it does not perform actual compaction
+// work, so verifying compaction count freshness is meaningless for it.
+TEST(MetaServiceJobTest, StopTokenSkipsStaleTabletCacheCheck) {
+ auto meta_service = get_meta_service();
+
+ auto sp = SyncPoint::get_instance();
+ DORIS_CLOUD_DEFER {
+ SyncPoint::get_instance()->clear_all_call_backs();
+ };
+ sp->set_call_back("get_instance_id", [&](auto&& args) {
+ auto* ret = try_any_cast_ret<std::string>(args);
+ ret->first = instance_id;
+ ret->second = true;
+ });
+ sp->enable_processing();
+
+ int64_t table_id = 1, index_id = 2, partition_id = 3, tablet_id = 101;
+
+ // Set up tablet index
+ auto index_key = meta_tablet_idx_key({instance_id, tablet_id});
+ TabletIndexPB idx_pb;
+ idx_pb.set_table_id(table_id);
+ idx_pb.set_index_id(index_id);
+ idx_pb.set_partition_id(partition_id);
+ idx_pb.set_tablet_id(tablet_id);
+ std::unique_ptr<Transaction> txn;
+ ASSERT_EQ(meta_service->txn_kv()->create_txn(&txn), TxnErrorCode::TXN_OK);
+ txn->put(index_key, idx_pb.SerializeAsString());
+
+ // Simulate meta-service state where cumulative_compaction_cnt=9 (advanced by another BE)
+ std::string stats_key =
+ stats_tablet_key({instance_id, table_id, index_id, partition_id,
tablet_id});
+ TabletStatsPB stats;
+ stats.set_base_compaction_cnt(0);
+ stats.set_cumulative_compaction_cnt(9);
+ txn->put(stats_key, stats.SerializeAsString());
+ ASSERT_EQ(txn->commit(), TxnErrorCode::TXN_OK);
+
+ // A regular CUMULATIVE compaction with stale counts (req=8 < actual=9) must be rejected.
+ {
+ StartTabletJobResponse res;
+ start_compaction_job(meta_service.get(), tablet_id, "cumu_job",
"ip:port",
+ /*base_cnt=*/0, /*cumu_cnt=*/8,
TabletCompactionJobPB::CUMULATIVE,
+ res);
+ ASSERT_EQ(res.status().code(), MetaServiceCode::STALE_TABLET_CACHE)
+ << "CUMULATIVE with stale counts should be rejected";
+ }
+
+ // A STOP_TOKEN with the same stale counts must NOT be rejected (CORE-5964 regression).
+ // The BE's cached cumulative_compaction_cnt=8 lags behind the actual value=9 on the
+ // meta-service side, but STOP_TOKEN registration must still succeed.
+ {
+ StartTabletJobResponse res;
+ start_compaction_job(meta_service.get(), tablet_id, "stop_token_job",
"ip:port",
+ /*base_cnt=*/0, /*cumu_cnt=*/8,
TabletCompactionJobPB::STOP_TOKEN,
+ res);
+ ASSERT_EQ(res.status().code(), MetaServiceCode::OK)
+ << "STOP_TOKEN with stale counts should NOT be rejected; got: "
+ << res.status().msg();
+ }
+}
+
TEST(MetaServiceJobTest, DeleteBitmapUpdateLockCompatibilityTest) {
auto meta_service = get_meta_service();
auto sp = SyncPoint::get_instance();
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]