[kudu] branch master updated: [tools] update schema if needed when rebuilding master
This is an automated email from the ASF dual-hosted git repository. alexey pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/kudu.git The following commit(s) were added to refs/heads/master by this push: new 5f29e0423 [tools] update schema if needed when rebuilding master 5f29e0423 is described below commit 5f29e04230dadd840b12887423fb76ac62d96538 Author: Yingchun Lai AuthorDate: Fri May 6 00:47:36 2022 +0800 [tools] update schema if needed when rebuilding master Before this patch, if tservers had outdated schemas with lower version for some tables (e.g. when the cluster wasn't healthy), the 'kudu master unsafe_rebuild' tool might rebuild the system catalog with outdated schemas for the unhealthy tables. This patch optimizes the rebuild logic, so the tool now picks up schema of the highest version for an unhealthy table when rebuilding the system catalog. Change-Id: Iec99d57115228b521ba645b8e19c7057a4bb5d3d Reviewed-on: http://gerrit.cloudera.org:8080/18496 Tested-by: Yingchun Lai Reviewed-by: Alexey Serbin --- src/kudu/tools/kudu-admin-test.cc | 102 - src/kudu/tools/master_rebuilder.cc | 67 +--- 2 files changed, 125 insertions(+), 44 deletions(-) diff --git a/src/kudu/tools/kudu-admin-test.cc b/src/kudu/tools/kudu-admin-test.cc index f5314ca3e..120b0da84 100644 --- a/src/kudu/tools/kudu-admin-test.cc +++ b/src/kudu/tools/kudu-admin-test.cc @@ -3037,7 +3037,10 @@ namespace { constexpr const char* kPrincipal = "oryx"; vector RebuildMasterCmd(const ExternalMiniCluster& cluster, +int tserver_num, bool is_secure, bool log_to_stderr = false) { + CHECK_GT(tserver_num, 0); + CHECK_LE(tserver_num, cluster.num_tablet_servers()); vector command = { "master", "unsafe_rebuild", @@ -3052,7 +3055,7 @@ vector RebuildMasterCmd(const ExternalMiniCluster& cluster, if (is_secure) { command.emplace_back(Substitute("--sasl_protocol_name=$0", kPrincipal)); } - for (int i = 0; i < cluster.num_tablet_servers(); i++) { + for (int i = 0; i < tserver_num; i++) { auto* ts = cluster.tablet_server(i); command.emplace_back(ts->bound_rpc_hostport().ToString()); } @@ -3085,7 +3088,8 @@ TEST_F(AdminCliTest, TestRebuildMasterWhenNonEmpty) { NO_FATALS(cluster_->master()->Shutdown()); string stdout; string stderr; - Status s = RunKuduTool(RebuildMasterCmd(*cluster_, /*is_secure*/false, /*log_to_stderr*/true), + Status s = RunKuduTool(RebuildMasterCmd(*cluster_, FLAGS_num_tablet_servers, + /*is_secure*/false, /*log_to_stderr*/true), &stdout, &stderr); ASSERT_TRUE(s.IsRuntimeError()) << s.ToString(); ASSERT_STR_CONTAINS(stderr, "must be empty"); @@ -3093,7 +3097,8 @@ TEST_F(AdminCliTest, TestRebuildMasterWhenNonEmpty) { // Delete the contents of the old master from disk. This should allow the // tool to run. ASSERT_OK(cluster_->master()->DeleteFromDisk()); - ASSERT_OK(RunKuduTool(RebuildMasterCmd(*cluster_, /*is_secure*/false, /*log_to_stderr*/true), + ASSERT_OK(RunKuduTool(RebuildMasterCmd(*cluster_, FLAGS_num_tablet_servers, + /*is_secure*/false, /*log_to_stderr*/true), &stdout, &stderr)); ASSERT_STR_NOT_CONTAINS(stderr, "must be empty"); ASSERT_STR_CONTAINS(stdout, @@ -3136,7 +3141,8 @@ TEST_F(AdminCliTest, TestRebuildMasterWithTombstones) { ASSERT_OK(cluster_->master()->DeleteFromDisk()); string stdout; string stderr; - ASSERT_OK(RunKuduTool(RebuildMasterCmd(*cluster_, /*is_secure*/false, /*log_to_stderr*/true), + ASSERT_OK(RunKuduTool(RebuildMasterCmd(*cluster_, FLAGS_num_tablet_servers, + /*is_secure*/false, /*log_to_stderr*/true), &stdout, &stderr)); ASSERT_STR_CONTAINS(stderr, Substitute("Skipping replica of tablet $0 of table $1", tablet_id, kTable)); @@ -3165,39 +3171,56 @@ TEST_F(AdminCliTest, TestRebuildMasterWithTombstones) { NO_FATALS(cv.CheckCluster()); } -TEST_F(AdminCliTest, TestRebuildMasterAndAddColumns) { - FLAGS_num_tablet_servers = 1; - FLAGS_num_replicas = 1; +TEST_F(AdminCliTest, TestAddColumnsAndRebuildMaster) { + FLAGS_num_tablet_servers = 3; + FLAGS_num_replicas = 3; NO_FATALS(BuildAndStart()); + // Add a column and shutdown a tserver, the tserver holds a schema with a lower version. { unique_ptr table_alterer(client_->NewTableAlterer(kTableId)); table_alterer->AddColumn("old_column_0")->Type(KuduColumnSchema::INT32); ASSERT_OK(table_alterer->Alter()); } + NO_FATALS(cluster_->tablet_server(0)->Shutdown()); + + // Add another column, the latest schema has a higher version. + { +unique_ptr
[kudu] branch master updated: [test] fix ASAN test failures in master-stress-test
This is an automated email from the ASF dual-hosted git repository. alexey pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/kudu.git The following commit(s) were added to refs/heads/master by this push: new e2f0d8fb0 [test] fix ASAN test failures in master-stress-test e2f0d8fb0 is described below commit e2f0d8fb056d9834199ec2feaba30daefae7 Author: zhangyifan27 AuthorDate: Thu May 5 20:30:10 2022 +0800 [test] fix ASAN test failures in master-stress-test The MasterStressTest occasionally failed in ASAN builds [1] [2]. I also built the test in ASAN mode on a CentOS 7.9 machine, ran it in slow mode 3 times and all failed. It seems that some RetryingTSRpcTask is still running after 'TableInfo' object destroyed if we set --enable_metadata_cleanup_for_deleted_tables_and_tablets=true. To fix heap-use-after-free errors, we should abort and wait all pending tasks completed before we destroy a TabletInfo object. With this fix the ASAN test can pass 10 times. [1] http://jenkins.kudu.apache.org/job/kudu-gerrit/25414/BUILD_TYPE=ASAN/ [2] http://jenkins.kudu.apache.org/job/kudu-gerrit/25448/BUILD_TYPE=ASAN/ Change-Id: I4eff45fbf05644362169485cd32678509eab1b07 Reviewed-on: http://gerrit.cloudera.org:8080/18501 Tested-by: Kudu Jenkins Reviewed-by: Alexey Serbin --- src/kudu/master/catalog_manager.cc | 49 ++ 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/src/kudu/master/catalog_manager.cc b/src/kudu/master/catalog_manager.cc index 7af3873ba..a1cf87ac8 100644 --- a/src/kudu/master/catalog_manager.cc +++ b/src/kudu/master/catalog_manager.cc @@ -3845,14 +3845,17 @@ class PickLeaderReplica : public TSPicker { // // The target tablet server is refreshed before each RPC by consulting the provided // TSPicker implementation. +// Each created RetryingTSRpcTask should be added to TableInfo::pending_tasks_ by +// calling TableInfo::AddTask(), so 'table' must remain valid for the lifetime of +// this class. class RetryingTSRpcTask : public MonitoredTask { public: - RetryingTSRpcTask(Master *master, + RetryingTSRpcTask(Master* master, unique_ptr replica_picker, -scoped_refptr table) +TableInfo* table) : master_(master), replica_picker_(std::move(replica_picker)), - table_(std::move(table)), + table_(table), start_ts_(MonoTime::Now()), deadline_(start_ts_ + MonoDelta::FromMilliseconds(FLAGS_unresponsive_ts_rpc_timeout_ms)), attempt_(0), @@ -3876,7 +3879,7 @@ class RetryingTSRpcTask : public MonitoredTask { MonoTime start_timestamp() const override { return start_ts_; } MonoTime completion_timestamp() const override { return end_ts_; } - const scoped_refptr& table() const { return table_ ; } + TableInfo* table() const { return table_; } protected: // Send an RPC request and register a callback. @@ -3919,7 +3922,8 @@ class RetryingTSRpcTask : public MonitoredTask { Master * const master_; const unique_ptr replica_picker_; - const scoped_refptr table_; + // RetryingTSRpcTask is owned by 'TableInfo', so the backpointer should be raw. + TableInfo* table_; MonoTime start_ts_; MonoTime end_ts_; @@ -4091,7 +4095,7 @@ class RetrySpecificTSRpcTask : public RetryingTSRpcTask { public: RetrySpecificTSRpcTask(Master* master, const string& permanent_uuid, - const scoped_refptr& table) + TableInfo* table) : RetryingTSRpcTask(master, unique_ptr(new PickSpecificUUID(permanent_uuid)), table), @@ -4113,7 +4117,7 @@ class AsyncCreateReplica : public RetrySpecificTSRpcTask { const string& permanent_uuid, const scoped_refptr& tablet, const TabletMetadataLock& tablet_lock) -: RetrySpecificTSRpcTask(master, permanent_uuid, tablet->table()), +: RetrySpecificTSRpcTask(master, permanent_uuid, tablet->table().get()), tablet_id_(tablet->id()) { deadline_ = start_ts_ + MonoDelta::FromMilliseconds(FLAGS_tablet_creation_timeout_ms); @@ -4179,17 +4183,17 @@ class AsyncCreateReplica : public RetrySpecificTSRpcTask { // Send a DeleteTablet() RPC request. class AsyncDeleteReplica : public RetrySpecificTSRpcTask { public: - AsyncDeleteReplica( - Master* master, const string& permanent_uuid, - const scoped_refptr& table, string tablet_id, - TabletDataState delete_type, - optional cas_config_opid_index_less_or_equal, - string reason) + AsyncDeleteReplica(Master* master, + const string& permanent_uuid, + TableInfo* table, + string tablet_id, + TabletDataState delete_type, + optional cas_co