This is an automated email from the ASF dual-hosted git repository.
hellostephen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new bf27b1845ee [feat](cloud) Add lock wait and bthread schedule delay
metrics to SyncRowset profile (#61036)
bf27b1845ee is described below
commit bf27b1845eececa8df4e28afe86d811c8ee613dd
Author: bobhan1 <[email protected]>
AuthorDate: Wed Mar 18 10:34:45 2026 +0800
[feat](cloud) Add lock wait and bthread schedule delay metrics to
SyncRowset profile (#61036)
## Proposed changes
This PR adds visibility into "hidden" time sinks in SyncRowset that are
not covered by existing RPC metrics.
---
be/src/cloud/cloud_meta_mgr.cpp | 14 +++++++++++
be/src/cloud/cloud_tablet.cpp | 36 +++++++++++++++++++++++++++++
be/src/cloud/cloud_tablet.h | 5 ++++
be/src/exec/operator/olap_scan_operator.cpp | 22 +++++++++++++++++-
be/src/exec/operator/olap_scan_operator.h | 3 +++
5 files changed, 79 insertions(+), 1 deletion(-)
diff --git a/be/src/cloud/cloud_meta_mgr.cpp b/be/src/cloud/cloud_meta_mgr.cpp
index f7bc86a7308..f51b58bb757 100644
--- a/be/src/cloud/cloud_meta_mgr.cpp
+++ b/be/src/cloud/cloud_meta_mgr.cpp
@@ -590,7 +590,14 @@ Status
CloudMetaMgr::sync_tablet_rowsets_unlocked(CloudTablet* tablet,
idx->set_index_id(index_id);
idx->set_partition_id(tablet->partition_id());
{
+ auto lock_start = std::chrono::steady_clock::now();
std::shared_lock rlock(tablet->get_header_lock());
+ if (sync_stats) {
+ sync_stats->meta_lock_wait_ns +=
+ std::chrono::duration_cast<std::chrono::nanoseconds>(
+ std::chrono::steady_clock::now() - lock_start)
+ .count();
+ }
if (options.full_sync) {
req.set_start_version(0);
} else {
@@ -693,7 +700,14 @@ Status
CloudMetaMgr::sync_tablet_rowsets_unlocked(CloudTablet* tablet,
});
{
const auto& stats = resp.stats();
+ auto lock_start = std::chrono::steady_clock::now();
std::unique_lock wlock(tablet->get_header_lock());
+ if (sync_stats) {
+ sync_stats->meta_lock_wait_ns +=
+ std::chrono::duration_cast<std::chrono::nanoseconds>(
+ std::chrono::steady_clock::now() - lock_start)
+ .count();
+ }
// ATTN: we are facing following data race
//
diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp
index eb07def6006..9936ac374f6 100644
--- a/be/src/cloud/cloud_tablet.cpp
+++ b/be/src/cloud/cloud_tablet.cpp
@@ -275,16 +275,34 @@ Status CloudTablet::sync_rowsets(const SyncOptions&
options, SyncRowsetStats* st
RETURN_IF_ERROR(sync_if_not_running(stats));
if (options.query_version > 0) {
+ auto lock_start = std::chrono::steady_clock::now();
std::shared_lock rlock(_meta_lock);
+ if (stats) {
+ stats->meta_lock_wait_ns +=
std::chrono::duration_cast<std::chrono::nanoseconds>(
+
std::chrono::steady_clock::now() - lock_start)
+ .count();
+ }
if (_max_version >= options.query_version) {
return Status::OK();
}
}
// serially execute sync to reduce unnecessary network overhead
+ auto sync_lock_start = std::chrono::steady_clock::now();
std::unique_lock lock(_sync_meta_lock);
+ if (stats) {
+ stats->sync_meta_lock_wait_ns +=
std::chrono::duration_cast<std::chrono::nanoseconds>(
+
std::chrono::steady_clock::now() - sync_lock_start)
+ .count();
+ }
if (options.query_version > 0) {
+ auto lock_start = std::chrono::steady_clock::now();
std::shared_lock rlock(_meta_lock);
+ if (stats) {
+ stats->meta_lock_wait_ns +=
std::chrono::duration_cast<std::chrono::nanoseconds>(
+
std::chrono::steady_clock::now() - lock_start)
+ .count();
+ }
if (_max_version >= options.query_version) {
return Status::OK();
}
@@ -307,10 +325,22 @@ Status CloudTablet::sync_if_not_running(SyncRowsetStats*
stats) {
}
// Serially execute sync to reduce unnecessary network overhead
+ auto sync_lock_start = std::chrono::steady_clock::now();
std::unique_lock lock(_sync_meta_lock);
+ if (stats) {
+ stats->sync_meta_lock_wait_ns +=
std::chrono::duration_cast<std::chrono::nanoseconds>(
+
std::chrono::steady_clock::now() - sync_lock_start)
+ .count();
+ }
{
+ auto lock_start = std::chrono::steady_clock::now();
std::shared_lock rlock(_meta_lock);
+ if (stats) {
+ stats->meta_lock_wait_ns +=
std::chrono::duration_cast<std::chrono::nanoseconds>(
+
std::chrono::steady_clock::now() - lock_start)
+ .count();
+ }
if (tablet_state() == TABLET_RUNNING) {
return Status::OK();
}
@@ -332,7 +362,13 @@ Status CloudTablet::sync_if_not_running(SyncRowsetStats*
stats) {
TimestampedVersionTracker empty_tracker;
{
+ auto lock_start = std::chrono::steady_clock::now();
std::lock_guard wlock(_meta_lock);
+ if (stats) {
+ stats->meta_lock_wait_ns +=
std::chrono::duration_cast<std::chrono::nanoseconds>(
+
std::chrono::steady_clock::now() - lock_start)
+ .count();
+ }
RETURN_IF_ERROR(set_tablet_state(TABLET_RUNNING));
_rs_version_map.clear();
_stale_rs_version_map.clear();
diff --git a/be/src/cloud/cloud_tablet.h b/be/src/cloud/cloud_tablet.h
index 38675681db7..e7a9b13e851 100644
--- a/be/src/cloud/cloud_tablet.h
+++ b/be/src/cloud/cloud_tablet.h
@@ -53,6 +53,11 @@ struct SyncRowsetStats {
int64_t get_remote_tablet_meta_rpc_ns {0};
int64_t tablet_meta_cache_hit {0};
int64_t tablet_meta_cache_miss {0};
+
+ int64_t bthread_schedule_delay_ns {0};
+ int64_t meta_lock_wait_ns {0}; // _meta_lock (std::shared_mutex) wait
across all acquisitions
+ int64_t sync_meta_lock_wait_ns {
+ 0}; // _sync_meta_lock (bthread::Mutex) wait across all
acquisitions
};
struct SyncOptions {
diff --git a/be/src/exec/operator/olap_scan_operator.cpp
b/be/src/exec/operator/olap_scan_operator.cpp
index 0c6dc914a09..083d1326f9e 100644
--- a/be/src/exec/operator/olap_scan_operator.cpp
+++ b/be/src/exec/operator/olap_scan_operator.cpp
@@ -171,6 +171,12 @@ Status OlapScanLocalState::_init_profile() {
TUnit::BYTES, sync_rowset_timer_name);
_sync_rowset_get_remote_delete_bitmap_rpc_timer = ADD_CHILD_TIMER(
_scanner_profile, "SyncRowsetGetRemoteDeleteBitmapRpcTime",
sync_rowset_timer_name);
+ _sync_rowset_bthread_schedule_wait_timer = ADD_CHILD_TIMER(
+ _scanner_profile, "SyncRowsetBthreadScheduleWaitTime",
sync_rowset_timer_name);
+ _sync_rowset_meta_lock_wait_timer = ADD_CHILD_TIMER(
+ _scanner_profile, "SyncRowsetMetaLockWaitTime",
sync_rowset_timer_name);
+ _sync_rowset_sync_meta_lock_wait_timer = ADD_CHILD_TIMER(
+ _scanner_profile, "SyncRowsetSyncMetaLockWaitTime",
sync_rowset_timer_name);
}
_block_init_timer = ADD_TIMER(_segment_profile, "BlockInitTime");
_block_init_seek_timer = ADD_TIMER(_segment_profile, "BlockInitSeekTime");
@@ -616,7 +622,16 @@ Status
OlapScanLocalState::_sync_cloud_tablets(RuntimeState* state) {
_scan_ranges[i]->version.data() +
_scan_ranges[i]->version.size(),
version);
auto task_ctx = state->get_task_execution_context();
- tasks.emplace_back([this, sync_stats, version, i, task_ctx]() {
+ auto task_create_time = std::chrono::steady_clock::now();
+ tasks.emplace_back([this, sync_stats, version, i, task_ctx,
task_create_time]() {
+ // Record bthread scheduling delay
+ auto task_start_time = std::chrono::steady_clock::now();
+ if (sync_stats) {
+ sync_stats->bthread_schedule_delay_ns +=
+
std::chrono::duration_cast<std::chrono::nanoseconds>(
+ task_start_time - task_create_time)
+ .count();
+ }
auto task_lock = task_ctx.lock();
if (task_lock == nullptr) {
return Status::OK();
@@ -690,6 +705,11 @@ Status OlapScanLocalState::prepare(RuntimeState* state) {
sync_stats.get_remote_delete_bitmap_bytes);
COUNTER_UPDATE(_sync_rowset_get_remote_delete_bitmap_rpc_timer,
sync_stats.get_remote_delete_bitmap_rpc_ns);
+ COUNTER_UPDATE(_sync_rowset_bthread_schedule_wait_timer,
+ sync_stats.bthread_schedule_delay_ns);
+ COUNTER_UPDATE(_sync_rowset_meta_lock_wait_timer,
sync_stats.meta_lock_wait_ns);
+ COUNTER_UPDATE(_sync_rowset_sync_meta_lock_wait_timer,
+ sync_stats.sync_meta_lock_wait_ns);
}
auto time_ms = _sync_cloud_tablets_watcher.elapsed_time_microseconds();
if (time_ms >= config::sync_rowsets_slow_threshold_ms) {
diff --git a/be/src/exec/operator/olap_scan_operator.h
b/be/src/exec/operator/olap_scan_operator.h
index 2ccf7e3b65e..e38d1eaf8f8 100644
--- a/be/src/exec/operator/olap_scan_operator.h
+++ b/be/src/exec/operator/olap_scan_operator.h
@@ -180,6 +180,9 @@ private:
RuntimeProfile::Counter* _sync_rowset_get_remote_delete_bitmap_key_count =
nullptr;
RuntimeProfile::Counter* _sync_rowset_get_remote_delete_bitmap_bytes =
nullptr;
RuntimeProfile::Counter* _sync_rowset_get_remote_delete_bitmap_rpc_timer =
nullptr;
+ RuntimeProfile::Counter* _sync_rowset_bthread_schedule_wait_timer =
nullptr;
+ RuntimeProfile::Counter* _sync_rowset_meta_lock_wait_timer = nullptr;
+ RuntimeProfile::Counter* _sync_rowset_sync_meta_lock_wait_timer = nullptr;
RuntimeProfile::Counter* _block_load_timer = nullptr;
RuntimeProfile::Counter* _block_load_counter = nullptr;
// Add more detail seek timer and counter profile
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]