This is an automated email from the ASF dual-hosted git repository.
hellostephen pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new f7ed94624e9 branch-3.0-pick: [Fix](cloud) Should consider tablet state
change whether to skip `sync_rowsets` in publish phase (#48400) (#48667)
f7ed94624e9 is described below
commit f7ed94624e9625948acc7e7b272df3e54149f6a1
Author: bobhan1 <[email protected]>
AuthorDate: Mon Mar 10 15:13:15 2025 +0800
branch-3.0-pick: [Fix](cloud) Should consider tablet state change whether
to skip `sync_rowsets` in publish phase (#48400) (#48667)
pick https://github.com/apache/doris/pull/48400
---
.../cloud/cloud_engine_calc_delete_bitmap_task.cpp | 20 ++-
.../cloud/cloud_engine_calc_delete_bitmap_task.h | 3 +
cloud/src/meta-service/meta_service.cpp | 34 ++++-
cloud/test/meta_service_test.cpp | 23 ++-
.../transaction/CloudGlobalTransactionMgr.java | 18 ++-
.../transaction/DeleteBitmapUpdateLockContext.java | 6 +
.../org/apache/doris/master/ReportHandler.java | 16 ++
gensrc/proto/cloud.proto | 1 +
gensrc/thrift/AgentService.thrift | 2 +
.../test_tablet_state_change_in_publish_phase.out | Bin 0 -> 227 bytes
...est_tablet_state_change_in_publish_phase.groovy | 161 +++++++++++++++++++++
11 files changed, 270 insertions(+), 14 deletions(-)
diff --git a/be/src/cloud/cloud_engine_calc_delete_bitmap_task.cpp
b/be/src/cloud/cloud_engine_calc_delete_bitmap_task.cpp
index e85b160cf2f..c4ae5513001 100644
--- a/be/src/cloud/cloud_engine_calc_delete_bitmap_task.cpp
+++ b/be/src/cloud/cloud_engine_calc_delete_bitmap_task.cpp
@@ -22,6 +22,7 @@
#include <memory>
#include <random>
#include <thread>
+#include <type_traits>
#include "cloud/cloud_meta_mgr.h"
#include "cloud/cloud_tablet.h"
@@ -75,6 +76,7 @@ Status CloudEngineCalcDeleteBitmapTask::execute() {
bool has_compaction_stats = partition.__isset.base_compaction_cnts &&
partition.__isset.cumulative_compaction_cnts &&
partition.__isset.cumulative_points;
+ bool has_tablet_states = partition.__isset.tablet_states;
for (size_t i = 0; i < partition.tablet_ids.size(); i++) {
auto tablet_id = partition.tablet_ids[i];
auto tablet_calc_delete_bitmap_ptr =
std::make_shared<CloudTabletCalcDeleteBitmapTask>(
@@ -84,6 +86,9 @@ Status CloudEngineCalcDeleteBitmapTask::execute() {
partition.base_compaction_cnts[i],
partition.cumulative_compaction_cnts[i],
partition.cumulative_points[i]);
}
+ if (has_tablet_states) {
+
tablet_calc_delete_bitmap_ptr->set_tablet_state(partition.tablet_states[i]);
+ }
auto submit_st = token->submit_func([=]() {
auto st = tablet_calc_delete_bitmap_ptr->handle();
if (!st.ok()) {
@@ -128,6 +133,9 @@ void
CloudTabletCalcDeleteBitmapTask::set_compaction_stats(int64_t ms_base_compa
_ms_cumulative_compaction_cnt = ms_cumulative_compaction_cnt;
_ms_cumulative_point = ms_cumulative_point;
}
+// Remembers the tablet state value supplied by the caller (taken from the request's
+// tablet_states list in execute()). handle() later compares it with the local tablet's
+// state to decide whether rowsets must be synced; the member stays empty when this is never called.
+void CloudTabletCalcDeleteBitmapTask::set_tablet_state(int64_t tablet_state) {
+    _ms_tablet_state.emplace(tablet_state);
+}
Status CloudTabletCalcDeleteBitmapTask::handle() const {
VLOG_DEBUG << "start calculate delete bitmap on tablet " << _tablet_id;
@@ -146,7 +154,10 @@ Status CloudTabletCalcDeleteBitmapTask::handle() const {
int64_t max_version = tablet->max_version_unlocked();
int64_t t2 = MonotonicMicros();
- auto should_sync_rowsets_produced_by_compaction = [&]() {
+ auto should_sync_rowsets = [&]() {
+ if (_version != max_version + 1) {
+ return true;
+ }
if (_ms_base_compaction_cnt == -1) {
return true;
}
@@ -156,9 +167,12 @@ Status CloudTabletCalcDeleteBitmapTask::handle() const {
std::shared_lock rlock(tablet->get_header_lock());
return _ms_base_compaction_cnt > tablet->base_compaction_cnt() ||
_ms_cumulative_compaction_cnt >
tablet->cumulative_compaction_cnt() ||
- _ms_cumulative_point > tablet->cumulative_layer_point();
+ _ms_cumulative_point > tablet->cumulative_layer_point() ||
+ (_ms_tablet_state.has_value() &&
+ _ms_tablet_state.value() != // an SC job finished on other BEs during this load job
+ static_cast<std::underlying_type_t<TabletState>>(tablet->tablet_state()));
};
- if (_version != max_version + 1 ||
should_sync_rowsets_produced_by_compaction()) {
+ if (should_sync_rowsets()) {
auto sync_st = tablet->sync_rowsets();
if (!sync_st.ok()) {
LOG(WARNING) << "failed to sync rowsets. tablet_id=" << _tablet_id
diff --git a/be/src/cloud/cloud_engine_calc_delete_bitmap_task.h
b/be/src/cloud/cloud_engine_calc_delete_bitmap_task.h
index e3733d3e696..62bd91b0a8a 100644
--- a/be/src/cloud/cloud_engine_calc_delete_bitmap_task.h
+++ b/be/src/cloud/cloud_engine_calc_delete_bitmap_task.h
@@ -18,6 +18,7 @@
#pragma once
#include <memory>
+#include <optional>
#include "cloud/cloud_storage_engine.h"
#include "cloud/cloud_tablet.h"
@@ -39,6 +40,7 @@ public:
void set_compaction_stats(int64_t ms_base_compaction_cnt, int64_t
ms_cumulative_compaction_cnt,
int64_t ms_cumulative_point);
+ void set_tablet_state(int64_t tablet_state);
Status handle() const;
@@ -53,6 +55,7 @@ private:
int64_t _ms_base_compaction_cnt {-1};
int64_t _ms_cumulative_compaction_cnt {-1};
int64_t _ms_cumulative_point {-1};
+ std::optional<int64_t> _ms_tablet_state;
std::shared_ptr<MemTrackerLimiter> _mem_tracker;
};
diff --git a/cloud/src/meta-service/meta_service.cpp
b/cloud/src/meta-service/meta_service.cpp
index bc8af94496a..cc74384decf 100644
--- a/cloud/src/meta-service/meta_service.cpp
+++ b/cloud/src/meta-service/meta_service.cpp
@@ -2310,6 +2310,7 @@ void
MetaServiceImpl::get_delete_bitmap_update_lock(google::protobuf::RpcControl
}
for (const auto& tablet_idx : request->tablet_indexes()) {
+ // 1. get compaction cnts
TabletStatsPB tablet_stat;
std::string stats_key =
stats_tablet_key({instance_id, tablet_idx.table_id(),
tablet_idx.index_id(),
@@ -2343,16 +2344,43 @@ void
MetaServiceImpl::get_delete_bitmap_update_lock(google::protobuf::RpcControl
response->add_base_compaction_cnts(tablet_stat.base_compaction_cnt());
response->add_cumulative_compaction_cnts(tablet_stat.cumulative_compaction_cnt());
response->add_cumulative_points(tablet_stat.cumulative_point());
+
+ // 2. get tablet states
+ std::string tablet_meta_key =
+ meta_tablet_key({instance_id, tablet_idx.table_id(),
tablet_idx.index_id(),
+ tablet_idx.partition_id(),
tablet_idx.tablet_id()});
+ std::string tablet_meta_val;
+ err = txn->get(tablet_meta_key, &tablet_meta_val);
+ if (err != TxnErrorCode::TXN_OK) {
+ ss << "failed to get tablet meta"
+ << (err == TxnErrorCode::TXN_KEY_NOT_FOUND ? " (not found)" :
"")
+ << " instance_id=" << instance_id << " tablet_id=" <<
tablet_idx.tablet_id()
+ << " key=" << hex(tablet_meta_key) << " err=" << err;
+ msg = ss.str();
+ code = err == TxnErrorCode::TXN_KEY_NOT_FOUND ?
MetaServiceCode::TABLET_NOT_FOUND
+ :
cast_as<ErrCategory::READ>(err);
+ return;
+ }
+ doris::TabletMetaCloudPB tablet_meta;
+ if (!tablet_meta.ParseFromString(tablet_meta_val)) {
+ code = MetaServiceCode::PROTOBUF_PARSE_ERR;
+ msg = "malformed tablet meta";
+ return;
+ }
+ response->add_tablet_states(
+
static_cast<std::underlying_type_t<TabletStatePB>>(tablet_meta.tablet_state()));
}
read_stats_sw.pause();
- LOG(INFO) << fmt::format("tablet_idxes.size()={}, read tablet compaction
cnts cost={} ms",
- request->tablet_indexes().size(),
read_stats_sw.elapsed_us() / 1000);
+ LOG(INFO) << fmt::format(
+ "tablet_idxes.size()={}, read tablet compaction cnts and tablet
states cost={} ms",
+ request->tablet_indexes().size(), read_stats_sw.elapsed_us() /
1000);
DeleteBitmapUpdateLockPB lock_info_tmp;
if (!check_delete_bitmap_lock(code, msg, ss, txn, table_id,
request->lock_id(),
request->initiator(), lock_key,
lock_info_tmp)) {
- LOG(WARNING) << "failed to check delete bitmap lock after get tablet
stats, table_id="
+ LOG(WARNING) << "failed to check delete bitmap lock after get tablet
stats and tablet "
+ "states, table_id="
<< table_id << " request lock_id=" << request->lock_id()
<< " request initiator=" << request->initiator() << "
code=" << code
<< " msg=" << msg;
diff --git a/cloud/test/meta_service_test.cpp b/cloud/test/meta_service_test.cpp
index 9eed77271f2..10a5b3c6f18 100644
--- a/cloud/test/meta_service_test.cpp
+++ b/cloud/test/meta_service_test.cpp
@@ -277,7 +277,7 @@ static void insert_rowset(MetaServiceProxy* meta_service,
int64_t db_id, const s
commit_txn(meta_service, db_id, txn_id, label);
}
-static void add_tablet_stats(MetaServiceProxy* meta_service, std::string
instance_id,
+static void add_tablet_metas(MetaServiceProxy* meta_service, std::string
instance_id,
int64_t table_id, int64_t index_id,
const std::vector<std::array<int64_t, 2>>&
tablet_idxes) {
std::unique_ptr<Transaction> txn;
@@ -293,6 +293,17 @@ static void add_tablet_stats(MetaServiceProxy*
meta_service, std::string instanc
stats.set_cumulative_compaction_cnt(20);
stats.set_cumulative_point(30);
txn->put(stats_key, stats.SerializeAsString());
+
+ doris::TabletMetaCloudPB tablet_pb;
+ tablet_pb.set_table_id(table_id);
+ tablet_pb.set_index_id(index_id);
+ tablet_pb.set_partition_id(partition_id);
+ tablet_pb.set_tablet_id(tablet_id);
+ tablet_pb.set_tablet_state(doris::TabletStatePB::PB_RUNNING);
+ auto tablet_meta_key =
+ meta_tablet_key({instance_id, table_id, index_id,
partition_id, tablet_id});
+ auto tablet_meta_val = tablet_pb.SerializeAsString();
+ txn->put(tablet_meta_key, tablet_meta_val);
}
ASSERT_EQ(txn->commit(), TxnErrorCode::TXN_OK);
}
@@ -4659,7 +4670,7 @@ TEST(MetaServiceTest,
GetDeleteBitmapUpdateLockTabletStatsNormal) {
// [(partition_id, tablet_id)]
std::vector<std::array<int64_t, 2>> tablet_idxes {{70001, 12345}, {80001,
3456}, {90001, 6789}};
- add_tablet_stats(meta_service.get(), instance_id, table_id, index_id,
tablet_idxes);
+ add_tablet_metas(meta_service.get(), instance_id, table_id, index_id,
tablet_idxes);
GetDeleteBitmapUpdateLockResponse res;
get_delete_bitmap_update_lock(meta_service.get(), res, db_id, table_id,
index_id, tablet_idxes,
@@ -4713,7 +4724,7 @@ TEST(MetaServiceTest,
GetDeleteBitmapUpdateLockTabletStatsLockExpired) {
std::vector<std::array<int64_t, 2>> tablet_idxes {
{70001, 12345}, {80001, 3456}, {90001, 6789}};
- add_tablet_stats(meta_service.get(), instance_id, table_id, index_id,
tablet_idxes);
+ add_tablet_metas(meta_service.get(), instance_id, table_id, index_id,
tablet_idxes);
GetDeleteBitmapUpdateLockResponse res;
get_delete_bitmap_update_lock(meta_service.get(), res, db_id,
table_id, index_id,
@@ -4754,7 +4765,7 @@ TEST(MetaServiceTest,
GetDeleteBitmapUpdateLockTabletStatsLockExpired) {
std::vector<std::array<int64_t, 2>> tablet_idxes {
{70001, 12345}, {80001, 3456}, {90001, 6789}};
- add_tablet_stats(meta_service.get(), instance_id, table_id, index_id,
tablet_idxes);
+ add_tablet_metas(meta_service.get(), instance_id, table_id, index_id,
tablet_idxes);
GetDeleteBitmapUpdateLockResponse res;
get_delete_bitmap_update_lock(meta_service.get(), res, db_id,
table_id, index_id,
@@ -4796,7 +4807,7 @@ TEST(MetaServiceTest,
GetDeleteBitmapUpdateLockTabletStatsError) {
std::vector<std::array<int64_t, 2>> tablet_idxes {
{70001, 12345}, {80001, 3456}, {90001, 6789}};
- add_tablet_stats(meta_service.get(), instance_id, table_id, index_id,
tablet_idxes);
+ add_tablet_metas(meta_service.get(), instance_id, table_id, index_id,
tablet_idxes);
GetDeleteBitmapUpdateLockResponse res;
get_delete_bitmap_update_lock(meta_service.get(), res, db_id,
table_id, index_id,
@@ -4841,7 +4852,7 @@ TEST(MetaServiceTest,
GetDeleteBitmapUpdateLockTabletStatsError) {
tablet_idxes.push_back({partition_id, tablet_id});
}
- add_tablet_stats(meta_service.get(), instance_id, table_id, index_id,
tablet_idxes);
+ add_tablet_metas(meta_service.get(), instance_id, table_id, index_id,
tablet_idxes);
GetDeleteBitmapUpdateLockResponse res;
get_delete_bitmap_update_lock(meta_service.get(), res, db_id,
table_id, index_id,
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/cloud/transaction/CloudGlobalTransactionMgr.java
b/fe/fe-core/src/main/java/org/apache/doris/cloud/transaction/CloudGlobalTransactionMgr.java
index bee81365628..188173b32a9 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/cloud/transaction/CloudGlobalTransactionMgr.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/cloud/transaction/CloudGlobalTransactionMgr.java
@@ -780,17 +780,26 @@ public class CloudGlobalTransactionMgr implements
GlobalTransactionMgrIface {
if (!lockContext.getBaseCompactionCnts().isEmpty()
&& !lockContext.getCumulativeCompactionCnts().isEmpty()
&& !lockContext.getCumulativePoints().isEmpty()) {
+ boolean hasTabletStats =
!lockContext.getTabletStates().isEmpty();
+
List<Long> reqBaseCompactionCnts = Lists.newArrayList();
List<Long> reqCumulativeCompactionCnts =
Lists.newArrayList();
List<Long> reqCumulativePoints = Lists.newArrayList();
+ List<Long> reqTabletStates = Lists.newArrayList();
for (long tabletId : tabletList) {
reqBaseCompactionCnts.add(lockContext.getBaseCompactionCnts().get(tabletId));
reqCumulativeCompactionCnts.add(lockContext.getCumulativeCompactionCnts().get(tabletId));
reqCumulativePoints.add(lockContext.getCumulativePoints().get(tabletId));
+ if (hasTabletStats) {
+
reqTabletStates.add(lockContext.getTabletStates().get(tabletId));
+ }
}
partitionInfo.setBaseCompactionCnts(reqBaseCompactionCnts);
partitionInfo.setCumulativeCompactionCnts(reqCumulativeCompactionCnts);
partitionInfo.setCumulativePoints(reqCumulativePoints);
+ if (hasTabletStats) {
+ partitionInfo.setTabletStates(reqTabletStates);
+ }
}
partitionInfos.add(partitionInfo);
}
@@ -917,19 +926,24 @@ public class CloudGlobalTransactionMgr implements
GlobalTransactionMgrIface {
List<Long> respBaseCompactionCnts =
response.getBaseCompactionCntsList();
List<Long> respCumulativeCompactionCnts =
response.getCumulativeCompactionCntsList();
List<Long> respCumulativePoints =
response.getCumulativePointsList();
+ List<Long> respTabletStates = response.getTabletStatesList();
int size1 = respBaseCompactionCnts.size();
int size2 = respCumulativeCompactionCnts.size();
int size3 = respCumulativePoints.size();
- if (size1 != tabletList.size() || size2 != tabletList.size() ||
size3 != tabletList.size()) {
+ int size4 = respTabletStates.size();
+ if (size1 != tabletList.size() || size2 != tabletList.size() ||
size3 != tabletList.size()
+ || (size4 > 0 && size4 != tabletList.size())) {
throw new UserException("The size of returned compaction cnts
can't match the size of tabletList, "
+ "tabletList.size()=" + tabletList.size() + ",
respBaseCompactionCnts.size()=" + size1
- + ", respCumulativeCompactionCnts.size()=" + size2 +
", respCumulativePoints.size()=" + size3);
+ + ", respCumulativeCompactionCnts.size()=" + size2 +
", respCumulativePoints.size()=" + size3
+ + ", respTabletStates.size()=" + size4);
}
for (int i = 0; i < tabletList.size(); i++) {
long tabletId = tabletList.get(i);
lockContext.getBaseCompactionCnts().put(tabletId,
respBaseCompactionCnts.get(i));
lockContext.getCumulativeCompactionCnts().put(tabletId,
respCumulativeCompactionCnts.get(i));
lockContext.getCumulativePoints().put(tabletId,
respCumulativePoints.get(i));
+                // The size validation before this loop accepts an empty tablet_states list
+                // (size4 == 0), so indexing it unconditionally would throw
+                // IndexOutOfBoundsException. Only record states when they were returned.
+                if (!respTabletStates.isEmpty()) {
+                    lockContext.getTabletStates().put(tabletId, respTabletStates.get(i));
+                }
}
totalRetryTime += retryTime;
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/cloud/transaction/DeleteBitmapUpdateLockContext.java
b/fe/fe-core/src/main/java/org/apache/doris/cloud/transaction/DeleteBitmapUpdateLockContext.java
index 02886f63427..120715d6276 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/cloud/transaction/DeleteBitmapUpdateLockContext.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/cloud/transaction/DeleteBitmapUpdateLockContext.java
@@ -30,6 +30,7 @@ public class DeleteBitmapUpdateLockContext {
private Map<Long, Long> baseCompactionCnts;
private Map<Long, Long> cumulativeCompactionCnts;
private Map<Long, Long> cumulativePoints;
+ private Map<Long, Long> tabletStates;
private Map<Long, Set<Long>> tableToPartitions;
private Map<Long, Partition> partitions;
private Map<Long, Map<Long, List<Long>>> backendToPartitionTablets;
@@ -40,6 +41,7 @@ public class DeleteBitmapUpdateLockContext {
baseCompactionCnts = Maps.newHashMap();
cumulativeCompactionCnts = Maps.newHashMap();
cumulativePoints = Maps.newHashMap();
+ tabletStates = Maps.newHashMap();
tableToPartitions = Maps.newHashMap();
partitions = Maps.newHashMap();
backendToPartitionTablets = Maps.newHashMap();
@@ -63,6 +65,10 @@ public class DeleteBitmapUpdateLockContext {
return cumulativePoints;
}
+    /**
+     * Returns the internal tablet-state map itself (not a copy), so callers can
+     * populate or read it directly. It is created empty by the constructor.
+     */
+    public Map<Long, Long> getTabletStates() {
+        return this.tabletStates;
+    }
+
public Map<Long, Map<Long, List<Long>>> getBackendToPartitionTablets() {
return backendToPartitionTablets;
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java
b/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java
index 06047e2cf16..e104ed288b3 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java
@@ -44,6 +44,7 @@ import org.apache.doris.common.Config;
import org.apache.doris.common.MetaNotFoundException;
import org.apache.doris.common.Pair;
import org.apache.doris.common.util.Daemon;
+import org.apache.doris.common.util.DebugPointUtil;
import org.apache.doris.common.util.NetUtils;
import org.apache.doris.common.util.TimeUtils;
import org.apache.doris.cooldown.CooldownConf;
@@ -593,7 +594,22 @@ public class ReportHandler extends Daemon {
LOG.info("finished to handle tablet report from backend[{}] cost: {}
ms", backendId, (end - start));
}
+    /**
+     * Debug-point hook: while the debug point "ReportHandler.block" is enabled, holds the
+     * calling thread here, re-checking the debug point once per second, and returns as soon
+     * as it is disabled. Returns immediately when the debug point is not enabled.
+     *
+     * If the thread is interrupted while waiting, the interrupt flag is restored and the
+     * wait ends, so the interrupt request is not silently discarded.
+     */
+    private static void debugBlock() {
+        if (!DebugPointUtil.isEnable("ReportHandler.block")) {
+            return;
+        }
+        LOG.info("debug point: block at ReportHandler.block");
+        while (DebugPointUtil.isEnable("ReportHandler.block")) {
+            try {
+                Thread.sleep(1000);
+            } catch (InterruptedException e) {
+                LOG.info("error ", e);
+                // Thread.sleep() clears the interrupt status when it throws; set it again
+                // for the caller and stop blocking instead of going back to sleep.
+                Thread.currentThread().interrupt();
+                break;
+            }
+        }
+        LOG.info("debug point: leave ReportHandler.block");
+    }
+
private static void taskReport(long backendId, Map<TTaskType, Set<Long>>
runningTasks) {
+ debugBlock();
if (LOG.isDebugEnabled()) {
LOG.debug("begin to handle task report from backend {}",
backendId);
}
diff --git a/gensrc/proto/cloud.proto b/gensrc/proto/cloud.proto
index dd85f6ec2a2..ff0279990ee 100644
--- a/gensrc/proto/cloud.proto
+++ b/gensrc/proto/cloud.proto
@@ -1470,6 +1470,7 @@ message GetDeleteBitmapUpdateLockResponse {
repeated int64 base_compaction_cnts = 2;
repeated int64 cumulative_compaction_cnts = 3;
repeated int64 cumulative_points = 4;
+ repeated int64 tablet_states = 5;
}
message RemoveDeleteBitmapUpdateLockRequest {
diff --git a/gensrc/thrift/AgentService.thrift
b/gensrc/thrift/AgentService.thrift
index abffd176ef8..aa1ee2f5b9f 100644
--- a/gensrc/thrift/AgentService.thrift
+++ b/gensrc/thrift/AgentService.thrift
@@ -440,6 +440,8 @@ struct TCalcDeleteBitmapPartitionInfo {
4: optional list<i64> base_compaction_cnts
5: optional list<i64> cumulative_compaction_cnts
6: optional list<i64> cumulative_points
+ 7: optional list<i64> sub_txn_ids
+ 8: optional list<i64> tablet_states
}
struct TCalcDeleteBitmapRequest {
diff --git
a/regression-test/data/fault_injection_p0/cloud/test_tablet_state_change_in_publish_phase.out
b/regression-test/data/fault_injection_p0/cloud/test_tablet_state_change_in_publish_phase.out
new file mode 100644
index 00000000000..0ece86d0fb4
Binary files /dev/null and
b/regression-test/data/fault_injection_p0/cloud/test_tablet_state_change_in_publish_phase.out
differ
diff --git
a/regression-test/suites/fault_injection_p0/cloud/test_tablet_state_change_in_publish_phase.groovy
b/regression-test/suites/fault_injection_p0/cloud/test_tablet_state_change_in_publish_phase.groovy
new file mode 100644
index 00000000000..6b7102ed243
--- /dev/null
+++
b/regression-test/suites/fault_injection_p0/cloud/test_tablet_state_change_in_publish_phase.groovy
@@ -0,0 +1,161 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import org.apache.doris.regression.suite.ClusterOptions
+
+suite("test_tablet_state_change_in_publish_phase", "docker") {
+ def options = new ClusterOptions()
+ options.setFeNum(1)
+ options.setBeNum(2)
+ options.cloudMode = true
+ options.feConfigs += [
+ 'cloud_cluster_check_interval_second=1',
+ 'sys_log_verbose_modules=org',
+ 'heartbeat_interval_second=1'
+ ]
+ options.enableDebugPoints()
+
+ docker(options) {
+ GetDebugPoint().clearDebugPointsForAllFEs()
+ GetDebugPoint().clearDebugPointsForAllBEs()
+
+ def table1 = "test_tablet_state_change_in_publish_phase"
+ sql "DROP TABLE IF EXISTS ${table1} FORCE;"
+ sql """ CREATE TABLE IF NOT EXISTS ${table1} (
+ `k1` int NOT NULL,
+ `c1` int,
+ `c2` int
+ )UNIQUE KEY(k1)
+ DISTRIBUTED BY HASH(k1) BUCKETS 1
+ PROPERTIES (
+ "enable_unique_key_merge_on_write" = "true",
+ "disable_auto_compaction" = "true",
+ "replication_num" = "1"); """
+
+ sql "insert into ${table1} values(1,1,1);"
+ sql "insert into ${table1} values(2,2,2);"
+ sql "insert into ${table1} values(3,3,3);"
+ sql "sync;"
+ qt_sql "select * from ${table1} order by k1;"
+
+ def beNodes = sql_return_maparray("show backends;")
+ def tabletStat = sql_return_maparray("show tablets from
${table1};").get(0)
+ def tabletBackendId = tabletStat.BackendId
+ def tabletId = tabletStat.TabletId
+ def be1
+ for (def be : beNodes) {
+ if (be.BackendId == tabletBackendId) {
+ be1 = be
+ }
+ }
+ logger.info("tablet ${tabletId} on backend ${be1.Host} with
backendId=${be1.BackendId}");
+ logger.info("backends: ${cluster.getBackends()}")
+ int beIndex = 1
+ for (def backend : cluster.getBackends()) {
+ if (backend.host == be1.Host) {
+ beIndex = backend.index
+ break
+ }
+ }
+ assert cluster.getBeByIndex(beIndex).backendId as String ==
tabletBackendId
+
+ try {
+
GetDebugPoint().enableDebugPointForAllBEs("CloudSchemaChangeJob::_convert_historical_rowsets.block")
+ sql "alter table ${table1} modify column c1 varchar(100);"
+ Thread.sleep(1000)
+
+ cluster.stopBackends(beIndex)
+
+ Thread.sleep(1000)
+
+ // let tablet be on another BE
+ sql "insert into ${table1} values(10,88,88);"
+ qt_sql "select * from ${table1} order by k1;"
+ assert sql_return_maparray("show tablets from
${table1};").get(0).BackendId as String != tabletBackendId
+
+ // block FE's task report handler so the alter task is not re-sent to the BE before we enable the debug points for SC
+ GetDebugPoint().enableDebugPointForAllFEs("ReportHandler.block")
+ cluster.startBackends(beIndex)
+
GetDebugPoint().enableDebugPointForAllBEs("CloudSchemaChangeJob::_convert_historical_rowsets.block")
+
GetDebugPoint().enableDebugPointForAllBEs("CloudSchemaChangeJob.process_alter_tablet.sleep")
+ GetDebugPoint().disableDebugPointForAllFEs("ReportHandler.block")
+
+ def newThreadInDocker = { Closure actionSupplier ->
+ def connInfo = context.threadLocalConn.get()
+ return Thread.start {
+ connect(connInfo.username, connInfo.password,
connInfo.conn.getMetaData().getURL(), actionSupplier)
+ }
+ }
+
+ // let load 1 block before publish
+
GetDebugPoint().enableDebugPointForAllFEs("CloudGlobalTransactionMgr.getDeleteBitmapUpdateLock.enable_spin_wait")
+
GetDebugPoint().enableDebugPointForAllFEs("CloudGlobalTransactionMgr.getDeleteBitmapUpdateLock.block")
+ def t1 = newThreadInDocker {
+ // load 1 will not see any historical data when it flushes
+ // and will skip calculating delete bitmaps in the later phase because the tablet's state is NOT_READY
+ sql "insert into ${table1} values(1,88,88);"
+ }
+ Thread.sleep(800)
+
+ // let sc finish converting historical data
+
GetDebugPoint().disableDebugPointForAllBEs("CloudSchemaChangeJob::_convert_historical_rowsets.block")
+ Thread.sleep(1000)
+
+ // let load 1 publish
+
GetDebugPoint().disableDebugPointForAllFEs("CloudGlobalTransactionMgr.getDeleteBitmapUpdateLock.enable_spin_wait")
+
GetDebugPoint().disableDebugPointForAllFEs("CloudGlobalTransactionMgr.getDeleteBitmapUpdateLock.block")
+ t1.join()
+
+ // load 2
+ sql "insert into ${table1} values(1,77,77);"
+
+
+ // let load 3 block before publish
+
GetDebugPoint().enableDebugPointForAllFEs("CloudGlobalTransactionMgr.getDeleteBitmapUpdateLock.enable_spin_wait")
+
GetDebugPoint().enableDebugPointForAllFEs("CloudGlobalTransactionMgr.getDeleteBitmapUpdateLock.block")
+ def t2 = newThreadInDocker {
+ sql "insert into ${table1} values(1,99,99);"
+ }
+ Thread.sleep(1000)
+
+ // let sc finish
+
GetDebugPoint().disableDebugPointForAllBEs("CloudSchemaChangeJob.process_alter_tablet.sleep")
+
+ dockerAwaitUntil(30) {
+ def res = sql_return_maparray """ SHOW ALTER TABLE COLUMN
WHERE TableName='${table1}' ORDER BY createtime DESC LIMIT 1 """
+ logger.info("alter status: ${res}")
+ res[0].State as String == "FINISHED"
+ }
+ // tablet state has changed to NORMAL in MS
+
+ // let load 3 publish
+
GetDebugPoint().disableDebugPointForAllFEs("CloudGlobalTransactionMgr.getDeleteBitmapUpdateLock.enable_spin_wait")
+
GetDebugPoint().disableDebugPointForAllFEs("CloudGlobalTransactionMgr.getDeleteBitmapUpdateLock.block")
+ t2.join()
+
+ qt_dup_key_count "select k1,count() as cnt from ${table1} group by
k1 having cnt>1;"
+ qt_sql "select * from ${table1} order by k1;"
+
+ } catch(Exception e) {
+ logger.info(e.getMessage())
+ throw e
+ } finally {
+ GetDebugPoint().clearDebugPointsForAllFEs()
+ GetDebugPoint().clearDebugPointsForAllBEs()
+ }
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]