This is an automated email from the ASF dual-hosted git repository.
zhangchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new a3de177da07 [fix](cloud) Adjust rowset state check in
`CloudTablet::create_transient_rowset_writer` (#45496)
a3de177da07 is described below
commit a3de177da0799b93c02a173c659800fde7f5278d
Author: bobhan1 <[email protected]>
AuthorDate: Thu Dec 19 17:17:28 2024 +0800
[fix](cloud) Adjust rowset state check in
`CloudTablet::create_transient_rowset_writer` (#45496)
https://github.com/apache/doris/pull/32257 checks whether the current rowset
state is `BEGIN_PARTIAL_UPDATE` in
`CloudTablet::create_transient_rowset_writer`. But if this is a retried
delete-bitmap calculation task, the rowset's state may have already been
changed to `COMMITTED` during the first attempt. This PR adjusts the check
to also accept `COMMITTED`, avoiding spurious DCHECK failures.
---
be/src/cloud/cloud_tablet.cpp | 23 +++--
.../cloud/test_cloud_mow_partial_update_retry.out | 16 ++++
.../test_cloud_mow_partial_update_retry.groovy | 100 +++++++++++++++++++++
3 files changed, 131 insertions(+), 8 deletions(-)
diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp
index 93c71287567..c7d3170726b 100644
--- a/be/src/cloud/cloud_tablet.cpp
+++ b/be/src/cloud/cloud_tablet.cpp
@@ -54,6 +54,7 @@ namespace doris {
using namespace ErrorCode;
static constexpr int COMPACTION_DELETE_BITMAP_LOCK_ID = -1;
+static constexpr int LOAD_INITIATOR_ID = -1;
CloudTablet::CloudTablet(CloudStorageEngine& engine, TabletMetaSharedPtr
tablet_meta)
: BaseTablet(std::move(tablet_meta)), _engine(engine) {}
@@ -504,13 +505,19 @@ Result<std::unique_ptr<RowsetWriter>>
CloudTablet::create_rowset_writer(
Result<std::unique_ptr<RowsetWriter>>
CloudTablet::create_transient_rowset_writer(
const Rowset& rowset, std::shared_ptr<PartialUpdateInfo>
partial_update_info,
int64_t txn_expiration) {
- if (rowset.rowset_meta()->rowset_state() !=
RowsetStatePB::BEGIN_PARTIAL_UPDATE) [[unlikely]] {
- // May cause the segment files generated by the transient rowset
writer unable to be
- // recycled, see `CloudRowsetWriter::build` for detail.
- LOG(WARNING) << "Wrong rowset state: " <<
rowset.rowset_meta()->rowset_state();
- DCHECK(false) << rowset.rowset_meta()->rowset_state();
+ if (rowset.rowset_meta_state() != RowsetStatePB::BEGIN_PARTIAL_UPDATE &&
+ rowset.rowset_meta_state() != RowsetStatePB::COMMITTED) [[unlikely]] {
+ auto msg = fmt::format(
+ "wrong rowset state when create_transient_rowset_writer,
rowset state should be "
+ "BEGIN_PARTIAL_UPDATE or COMMITTED, but found {},
rowset_id={}, tablet_id={}",
+ RowsetStatePB_Name(rowset.rowset_meta_state()),
rowset.rowset_id().to_string(),
+ tablet_id());
+ // see `CloudRowsetWriter::build` for detail.
+ // if this is in a retry task, the rowset state may have been changed
to RowsetStatePB::COMMITTED
+ // in `RowsetMeta::merge_rowset_meta()` in previous trials.
+ LOG(WARNING) << msg;
+ DCHECK(false) << msg;
}
-
RowsetWriterContext context;
context.rowset_state = PREPARED;
context.segments_overlap = OVERLAPPING;
@@ -719,8 +726,8 @@ Status CloudTablet::save_delete_bitmap(const TabletTxnInfo*
txn_info, int64_t tx
}
auto ms_lock_id = lock_id == -1 ? txn_id : lock_id;
- RETURN_IF_ERROR(_engine.meta_mgr().update_delete_bitmap(
- *this, ms_lock_id, COMPACTION_DELETE_BITMAP_LOCK_ID,
new_delete_bitmap.get()));
+ RETURN_IF_ERROR(_engine.meta_mgr().update_delete_bitmap(*this, ms_lock_id,
LOAD_INITIATOR_ID,
+
new_delete_bitmap.get()));
// store the delete bitmap with sentinel marks in txn_delete_bitmap_cache
because if the txn is retried for some reason,
// it will use the delete bitmap from txn_delete_bitmap_cache when
re-calculating the delete bitmap, during which it will do
diff --git
a/regression-test/data/fault_injection_p0/cloud/test_cloud_mow_partial_update_retry.out
b/regression-test/data/fault_injection_p0/cloud/test_cloud_mow_partial_update_retry.out
new file mode 100644
index 00000000000..3b24419bdc6
--- /dev/null
+++
b/regression-test/data/fault_injection_p0/cloud/test_cloud_mow_partial_update_retry.out
@@ -0,0 +1,16 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !sql --
+1 1 1 1
+2 2 2 2
+3 3 3 2
+
+-- !sql --
+1 1 888 1
+2 2 777 2
+3 3 3 2
+
+-- !sql --
+1 999 888 1
+2 666 777 2
+3 3 3 2
+
diff --git
a/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_partial_update_retry.groovy
b/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_partial_update_retry.groovy
new file mode 100644
index 00000000000..13abaf1ffca
--- /dev/null
+++
b/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_partial_update_retry.groovy
@@ -0,0 +1,100 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_cloud_mow_partial_update_retry", "nonConcurrent") {
+ if (!isCloudMode()) {
+ return
+ }
+
+ GetDebugPoint().clearDebugPointsForAllFEs()
+ GetDebugPoint().clearDebugPointsForAllBEs()
+
+ def customFeConfig = [
+ delete_bitmap_lock_expiration_seconds : 10,
+ calculate_delete_bitmap_task_timeout_seconds : 15,
+ ]
+
+ setFeConfigTemporary(customFeConfig) {
+
+ def table1 = "test_cloud_mow_partial_update_retry"
+ sql "DROP TABLE IF EXISTS ${table1} FORCE;"
+ sql """ CREATE TABLE IF NOT EXISTS ${table1} (
+ `k1` int NOT NULL,
+ `c1` int,
+ `c2` int,
+ `c3` int
+ )UNIQUE KEY(k1)
+ DISTRIBUTED BY HASH(k1) BUCKETS 1
+ PROPERTIES (
+ "enable_unique_key_merge_on_write" = "true",
+ "disable_auto_compaction" = "true",
+ "replication_num" = "1"); """
+
+ sql "insert into ${table1} values(1,1,1,1);"
+ sql "insert into ${table1} values(2,2,2,2);"
+ sql "insert into ${table1} values(3,3,3,2);"
+ sql "sync;"
+ qt_sql "select * from ${table1} order by k1;"
+
+ try {
+ // block the first load
+
GetDebugPoint().enableDebugPointForAllBEs("BaseTablet::update_delete_bitmap.enable_spin_wait",
[token: "token1"])
+
GetDebugPoint().enableDebugPointForAllBEs("BaseTablet::update_delete_bitmap.block",
[wait_token: "token1"])
+
+ // the first load
+ t1 = Thread.start {
+ sql "set enable_unique_key_partial_update=true;"
+ sql "sync;"
+ sql "insert into ${table1}(k1,c1) values(1,999),(2,666);"
+ }
+
+ // wait util the first partial update load's delete bitmap update
lock expired
+ // to ensure that the second load can take the delete bitmap
update lock
+ // Config.delete_bitmap_lock_expiration_seconds = 10s
+ Thread.sleep(11 * 1000)
+
+ // the second load
+
GetDebugPoint().enableDebugPointForAllBEs("BaseTablet::update_delete_bitmap.enable_spin_wait",
[token: "token2"])
+ Thread.sleep(200)
+
+ sql "set enable_unique_key_partial_update=true;"
+ sql "sync;"
+ sql "insert into ${table1}(k1,c2) values(1,888),(2,777);"
+
+ qt_sql "select * from ${table1} order by k1;"
+
+
+ // keep waiting util the delete bitmap calculation
timeout(Config.calculate_delete_bitmap_task_timeout_seconds = 15s)
+ // and the first load will retry the calculation of delete bitmap
+ Thread.sleep(15 * 1000)
+
+ // let the first partial update load finish
+
GetDebugPoint().enableDebugPointForAllBEs("BaseTablet::update_delete_bitmap.block")
+ t1.join()
+
+ Thread.sleep(1000)
+
+ qt_sql "select * from ${table1} order by k1;"
+
+ } catch(Exception e) {
+ logger.info(e.getMessage())
+ throw e
+ } finally {
+ GetDebugPoint().clearDebugPointsForAllBEs()
+ }
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]