This is an automated email from the ASF dual-hosted git repository. laiyingchun pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push: new 7133b894e [server] add 'slow_scans' metric for tablet server 7133b894e is described below commit 7133b894e87c705353753c25e83c56b683e6d805 Author: kedeng <kdeng...@gmail.com> AuthorDate: Mon Feb 27 16:07:16 2023 +0800 [server] add 'slow_scans' metric for tablet server As a supplement to the slow scan display, I added a new metric 'slow_scans' to facilitate our monitoring. At the same time, I added a test to ensure that the new metric will take effect. Change-Id: I51e64bc4602f0e1dd99207f7d4cf8883085eca9a Reviewed-on: http://gerrit.cloudera.org:8080/19545 Tested-by: Kudu Jenkins Reviewed-by: Yingchun Lai <laiyingc...@apache.org> --- src/kudu/tserver/scanners.cc | 29 ++++++++++++++++++ src/kudu/tserver/scanners.h | 4 +++ src/kudu/tserver/tablet_server-test.cc | 54 ++++++++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+) diff --git a/src/kudu/tserver/scanners.cc b/src/kudu/tserver/scanners.cc index d28f1a4aa..c53b331a1 100644 --- a/src/kudu/tserver/scanners.cc +++ b/src/kudu/tserver/scanners.cc @@ -91,6 +91,13 @@ METRIC_DEFINE_gauge_size(server, active_scanners, "Number of scanners that are currently active", kudu::MetricLevel::kInfo); +METRIC_DEFINE_gauge_size(server, slow_scans, + "Slow Scans", + kudu::MetricUnit::kScanners, + "Number of slow scanners that are defined by --slow_scanner_threshold_ms " + "if --show_slow_scans set to 'true'.", + kudu::MetricLevel::kWarn); + using kudu::rpc::RemoteUser; using kudu::tablet::TabletReplica; using std::string; @@ -133,6 +140,9 @@ ScannerManager::ScannerManager(const scoped_refptr<MetricEntity>& metric_entity) METRIC_active_scanners.InstantiateFunctionGauge( metric_entity, [this]() { return this->CountActiveScanners(); }) ->AutoDetach(&metric_detacher_); + METRIC_slow_scans.InstantiateFunctionGauge( + metric_entity, [this]() { return this->CountSlowScans(); }) + ->AutoDetach(&metric_detacher_); } for (size_t i = 0; i < kNumScannerMapStripes; 
i++) { scanner_maps_.push_back(new ScannerMapStripe()); @@ -279,6 +289,25 @@ size_t ScannerManager::CountActiveScanners() const { return total; } +size_t ScannerManager::CountSlowScans() const { + size_t total = 0; + const MonoTime now = MonoTime::Now(); + const MonoDelta slow_threshold = MonoDelta::FromMilliseconds(FLAGS_slow_scanner_threshold_ms); + for (const auto* stripe : scanner_maps_) { + shared_lock<RWMutex> l(stripe->lock_); + for (const auto& it : stripe->scanners_by_id_) { + const SharedScanner& scanner = it.second; + const MonoTime start_time = scanner->start_time(); + if (start_time + slow_threshold >= now) { + continue; + } + total++; + } + } + + return total; +} + void ScannerManager::ListScanners(std::vector<SharedScanner>* scanners) const { for (const ScannerMapStripe* stripe : scanner_maps_) { shared_lock<RWMutex> l(stripe->lock_); diff --git a/src/kudu/tserver/scanners.h b/src/kudu/tserver/scanners.h index 531a33f68..0fbaa8ec5 100644 --- a/src/kudu/tserver/scanners.h +++ b/src/kudu/tserver/scanners.h @@ -107,6 +107,10 @@ class ScannerManager { // if under concurrent modifications. size_t CountActiveScanners() const; + // Return the number of slow scans that have been defined as + // slow by --slow_scanner_threshold_ms. + size_t CountSlowScans() const; + // List all active scanners. // Note this method will not return a consistent view // of all active scanners if under concurrent modifications. 
diff --git a/src/kudu/tserver/tablet_server-test.cc b/src/kudu/tserver/tablet_server-test.cc index f023f9fac..4916d796a 100644 --- a/src/kudu/tserver/tablet_server-test.cc +++ b/src/kudu/tserver/tablet_server-test.cc @@ -181,6 +181,7 @@ DECLARE_bool(enable_workload_score_for_perf_improvement_ops); DECLARE_bool(fail_dns_resolution); DECLARE_bool(rowset_metadata_store_keys); DECLARE_bool(scanner_unregister_on_invalid_seq_id); +DECLARE_bool(show_slow_scans); DECLARE_double(cfile_inject_corruption); DECLARE_double(env_inject_eio); DECLARE_double(env_inject_full); @@ -199,7 +200,9 @@ DECLARE_int32(metrics_retirement_age_ms); DECLARE_int32(rpc_service_queue_length); DECLARE_int32(scanner_batch_size_rows); DECLARE_int32(scanner_gc_check_interval_us); +DECLARE_int32(scanner_inject_latency_on_each_batch_ms); DECLARE_int32(scanner_ttl_ms); +DECLARE_int32(slow_scanner_threshold_ms); DECLARE_int32(tablet_bootstrap_inject_latency_ms); DECLARE_int32(tablet_inject_latency_on_apply_write_op_ms); DECLARE_int32(workload_stats_rate_collection_min_interval_ms); @@ -226,6 +229,7 @@ METRIC_DECLARE_gauge_uint64(log_block_manager_containers); METRIC_DECLARE_gauge_size(active_scanners); METRIC_DECLARE_gauge_size(tablet_active_scanners); METRIC_DECLARE_gauge_size(num_rowsets_on_disk); +METRIC_DECLARE_gauge_size(slow_scans); METRIC_DECLARE_histogram(flush_dms_duration); METRIC_DECLARE_histogram(op_apply_queue_length); METRIC_DECLARE_histogram(op_apply_queue_time); @@ -559,6 +563,7 @@ TEST_F(TabletServerTest, TestWebPages) { // bugs in the past. 
ASSERT_STR_CONTAINS(buf.ToString(), "hybrid_clock_timestamp"); ASSERT_STR_CONTAINS(buf.ToString(), "active_scanners"); + ASSERT_STR_CONTAINS(buf.ToString(), "slow_scans"); ASSERT_STR_CONTAINS(buf.ToString(), "threads_started"); ASSERT_STR_CONTAINS(buf.ToString(), "code_cache_queries"); #ifdef TCMALLOC_ENABLED @@ -2283,6 +2288,55 @@ static const ReadMode kReadModes[] = { INSTANTIATE_TEST_SUITE_P(Params, ExpiredScannerParamTest, testing::ValuesIn(kReadModes)); +class SlowScansParamTest : + public TabletServerTest, + public ::testing::WithParamInterface<ReadMode> { +}; + +TEST_P(SlowScansParamTest, Test) { + const ReadMode mode = GetParam(); + // Slow scanners aren't shown by default. + ASSERT_FALSE(FLAGS_show_slow_scans); + FLAGS_show_slow_scans = true; + FLAGS_scanner_ttl_ms = 500; + // Create slow scan scenarios. + FLAGS_scanner_inject_latency_on_each_batch_ms = 50; + FLAGS_slow_scanner_threshold_ms = 1; + + int num_rows = 10000; + InsertTestRowsDirect(0, num_rows); + + // Instantiate slow scans metric. + ASSERT_TRUE(mini_server_->server()->metric_entity()); + auto slow_scans = METRIC_slow_scans.InstantiateFunctionGauge( + mini_server_->server()->metric_entity(), [this]() { + return this->mini_server_->server()->scanner_manager()->CountSlowScans(); }); + + // Initially, there've been no scanners, so none is slow. + ASSERT_EQ(0, slow_scans->value()); + + ScanResponsePB resp; + NO_FATALS(OpenScannerWithAllColumns(&resp, mode)); + ScanRequestPB req; + RpcController rpc; + req.set_scanner_id(resp.scanner_id()); + req.set_call_seq_id(1); + resp.Clear(); + ASSERT_OK(proxy_->Scan(req, &resp, &rpc)); + // The scanner should be slow after a while, which is defined by '--slow_scanner_threshold_ms'. + ASSERT_EQ(1, slow_scans->value()); + + // Make scanners expire quickly. + FLAGS_scanner_ttl_ms = 1; + // Ensure that the metrics have been updated now. 
+ ASSERT_EVENTUALLY([&]() { + ASSERT_EQ(0, slow_scans->value()); + }); +} + +INSTANTIATE_TEST_SUITE_P(Params, SlowScansParamTest, + testing::ValuesIn(kReadModes)); + class ScanCorruptedDeltasParamTest : public TabletServerTest, public ::testing::WithParamInterface<ReadMode> {