This is an automated email from the ASF dual-hosted git repository. alexey pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push: new e2f0d8fb0 [test] fix ASAN test failures in master-stress-test e2f0d8fb0 is described below commit e2f0d8fb056d9834199ec2feab8888a30daefae7 Author: zhangyifan27 <chinazhangyi...@163.com> AuthorDate: Thu May 5 20:30:10 2022 +0800 [test] fix ASAN test failures in master-stress-test The MasterStressTest occasionally failed in ASAN builds [1] [2]. I also built the test in ASAN mode on a CentOS 7.9 machine, ran it in slow mode 3 times and all failed. It seems that some RetryingTSRpcTask is still running after 'TableInfo' object destroyed if we set --enable_metadata_cleanup_for_deleted_tables_and_tablets=true. To fix heap-use-after-free errors, we should abort and wait all pending tasks completed before we destroy a TabletInfo object. With this fix the ASAN test can pass 10 times. [1] http://jenkins.kudu.apache.org/job/kudu-gerrit/25414/BUILD_TYPE=ASAN/ [2] http://jenkins.kudu.apache.org/job/kudu-gerrit/25448/BUILD_TYPE=ASAN/ Change-Id: I4eff45fbf05644362169485cd32678509eab1b07 Reviewed-on: http://gerrit.cloudera.org:8080/18501 Tested-by: Kudu Jenkins Reviewed-by: Alexey Serbin <ale...@apache.org> --- src/kudu/master/catalog_manager.cc | 49 ++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/src/kudu/master/catalog_manager.cc b/src/kudu/master/catalog_manager.cc index 7af3873ba..a1cf87ac8 100644 --- a/src/kudu/master/catalog_manager.cc +++ b/src/kudu/master/catalog_manager.cc @@ -3845,14 +3845,17 @@ class PickLeaderReplica : public TSPicker { // // The target tablet server is refreshed before each RPC by consulting the provided // TSPicker implementation. +// Each created RetryingTSRpcTask should be added to TableInfo::pending_tasks_ by +// calling TableInfo::AddTask(), so 'table' must remain valid for the lifetime of +// this class. class RetryingTSRpcTask : public MonitoredTask { public: - RetryingTSRpcTask(Master *master, + RetryingTSRpcTask(Master* master, unique_ptr<TSPicker> replica_picker, - scoped_refptr<TableInfo> table) + TableInfo* table) : master_(master), replica_picker_(std::move(replica_picker)), - table_(std::move(table)), + table_(table), start_ts_(MonoTime::Now()), deadline_(start_ts_ + MonoDelta::FromMilliseconds(FLAGS_unresponsive_ts_rpc_timeout_ms)), attempt_(0), @@ -3876,7 +3879,7 @@ class RetryingTSRpcTask : public MonitoredTask { MonoTime start_timestamp() const override { return start_ts_; } MonoTime completion_timestamp() const override { return end_ts_; } - const scoped_refptr<TableInfo>& table() const { return table_ ; } + TableInfo* table() const { return table_; } protected: // Send an RPC request and register a callback. @@ -3919,7 +3922,8 @@ class RetryingTSRpcTask : public MonitoredTask { Master * const master_; const unique_ptr<TSPicker> replica_picker_; - const scoped_refptr<TableInfo> table_; + // RetryingTSRpcTask is owned by 'TableInfo', so the backpointer should be raw. + TableInfo* table_; MonoTime start_ts_; MonoTime end_ts_; @@ -4091,7 +4095,7 @@ class RetrySpecificTSRpcTask : public RetryingTSRpcTask { public: RetrySpecificTSRpcTask(Master* master, const string& permanent_uuid, - const scoped_refptr<TableInfo>& table) + TableInfo* table) : RetryingTSRpcTask(master, unique_ptr<TSPicker>(new PickSpecificUUID(permanent_uuid)), table), @@ -4113,7 +4117,7 @@ class AsyncCreateReplica : public RetrySpecificTSRpcTask { const string& permanent_uuid, const scoped_refptr<TabletInfo>& tablet, const TabletMetadataLock& tablet_lock) - : RetrySpecificTSRpcTask(master, permanent_uuid, tablet->table()), + : RetrySpecificTSRpcTask(master, permanent_uuid, tablet->table().get()), tablet_id_(tablet->id()) { deadline_ = start_ts_ + MonoDelta::FromMilliseconds(FLAGS_tablet_creation_timeout_ms); @@ -4179,17 +4183,17 @@ class AsyncCreateReplica : public RetrySpecificTSRpcTask { // Send a DeleteTablet() RPC request. class AsyncDeleteReplica : public RetrySpecificTSRpcTask { public: - AsyncDeleteReplica( - Master* master, const string& permanent_uuid, - const scoped_refptr<TableInfo>& table, string tablet_id, - TabletDataState delete_type, - optional<int64_t> cas_config_opid_index_less_or_equal, - string reason) + AsyncDeleteReplica(Master* master, + const string& permanent_uuid, + TableInfo* table, + string tablet_id, + TabletDataState delete_type, + optional<int64_t> cas_config_opid_index_less_or_equal, + string reason) : RetrySpecificTSRpcTask(master, permanent_uuid, table), tablet_id_(std::move(tablet_id)), delete_type_(delete_type), - cas_config_opid_index_less_or_equal_( - std::move(cas_config_opid_index_less_or_equal)), + cas_config_opid_index_less_or_equal_(std::move(cas_config_opid_index_less_or_equal)), reason_(std::move(reason)) {} string type_name() const override { @@ -4294,7 +4298,7 @@ class AsyncAlterTable : public RetryingTSRpcTask { scoped_refptr<TabletInfo> tablet) : RetryingTSRpcTask(master, unique_ptr<TSPicker>(new PickLeaderReplica(tablet)), - tablet->table()), + tablet->table().get()), tablet_(std::move(tablet)) { } @@ -4394,7 +4398,7 @@ AsyncChangeConfigTask::AsyncChangeConfigTask(Master* master, consensus::ChangeConfigType change_config_type) : RetryingTSRpcTask(master, unique_ptr<TSPicker>(new PickLeaderReplica(tablet)), - tablet->table()), + tablet->table().get()), tablet_(std::move(tablet)), cstate_(std::move(cstate)), change_config_type_(change_config_type) { @@ -4759,7 +4763,7 @@ Status CatalogManager::ProcessTabletReport( // TODO(unknown): Cancel tablet creation, instead of deleting, in cases // where that might be possible (tablet creation timeout & replacement). rpcs.emplace_back(new AsyncDeleteReplica( - master_, ts_desc->permanent_uuid(), table, tablet_id, + master_, ts_desc->permanent_uuid(), table.get(), tablet_id, TABLET_DATA_DELETED, none, msg)); continue; } @@ -4786,7 +4790,7 @@ Status CatalogManager::ProcessTabletReport( "Replica has no consensus available" : Substitute("Replica with old config index $0", report_opid_index); rpcs.emplace_back(new AsyncDeleteReplica( - master_, ts_desc->permanent_uuid(), table, tablet_id, + master_, ts_desc->permanent_uuid(), table.get(), tablet_id, TABLET_DATA_TOMBSTONED, prev_opid_index, Substitute("$0 (current committed config index is $1)", delete_msg, prev_opid_index))); @@ -4909,7 +4913,7 @@ Status CatalogManager::ProcessTabletReport( const string& peer_uuid = p.permanent_uuid(); if (!ContainsKey(current_member_uuids, peer_uuid)) { rpcs.emplace_back(new AsyncDeleteReplica( - master_, peer_uuid, table, tablet_id, + master_, peer_uuid, table.get(), tablet_id, TABLET_DATA_TOMBSTONED, prev_cstate.committed_config().opid_index(), Substitute("TS $0 not found in new config with opid_index $1", peer_uuid, cstate.committed_config().opid_index()))); @@ -5166,7 +5170,7 @@ void CatalogManager::SendDeleteTabletRequest(const scoped_refptr<TabletInfo>& ta << " replicas of tablet " << tablet->id(); for (const auto& peer : cstate.committed_config().peers()) { scoped_refptr<AsyncDeleteReplica> task = new AsyncDeleteReplica( - master_, peer.permanent_uuid(), tablet->table(), tablet->id(), + master_, peer.permanent_uuid(), tablet->table().get(), tablet->id(), TABLET_DATA_DELETED, none, deletion_msg); tablet->table()->AddTask(tablet->id(), task); WARN_NOT_OK(task->Run(), Substitute( @@ -6513,6 +6517,9 @@ void PersistentTabletInfo::set_state(SysTabletsEntryPB::State state, const strin TableInfo::TableInfo(string table_id) : table_id_(std::move(table_id)) {} TableInfo::~TableInfo() { + // Abort and wait for all pending tasks completed. + AbortTasks(); + WaitTasksCompletion(); } string TableInfo::ToString() const {