[kudu] branch master updated: [tools] update schema if needed when rebuilding master

2022-05-16 Thread alexey
This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
 new 5f29e0423 [tools] update schema if needed when rebuilding master
5f29e0423 is described below

commit 5f29e04230dadd840b12887423fb76ac62d96538
Author: Yingchun Lai 
AuthorDate: Fri May 6 00:47:36 2022 +0800

[tools] update schema if needed when rebuilding master

Before this patch, if tservers had outdated schemas with lower
version for some tables (e.g. when the cluster wasn't healthy), the
'kudu master unsafe_rebuild' tool might rebuild the system catalog
with outdated schemas for the unhealthy tables.

This patch optimizes the rebuild logic, so the tool now picks up
schema of the highest version for an unhealthy table when rebuilding
the system catalog.

Change-Id: Iec99d57115228b521ba645b8e19c7057a4bb5d3d
Reviewed-on: http://gerrit.cloudera.org:8080/18496
Tested-by: Yingchun Lai 
Reviewed-by: Alexey Serbin 
---
 src/kudu/tools/kudu-admin-test.cc  | 102 -
 src/kudu/tools/master_rebuilder.cc |  67 +---
 2 files changed, 125 insertions(+), 44 deletions(-)

diff --git a/src/kudu/tools/kudu-admin-test.cc 
b/src/kudu/tools/kudu-admin-test.cc
index f5314ca3e..120b0da84 100644
--- a/src/kudu/tools/kudu-admin-test.cc
+++ b/src/kudu/tools/kudu-admin-test.cc
@@ -3037,7 +3037,10 @@ namespace {
 constexpr const char* kPrincipal = "oryx";
 
 vector RebuildMasterCmd(const ExternalMiniCluster& cluster,
+int tserver_num,
 bool is_secure, bool log_to_stderr = false) {
+  CHECK_GT(tserver_num, 0);
+  CHECK_LE(tserver_num, cluster.num_tablet_servers());
   vector command = {
 "master",
 "unsafe_rebuild",
@@ -3052,7 +3055,7 @@ vector RebuildMasterCmd(const 
ExternalMiniCluster& cluster,
   if (is_secure) {
 command.emplace_back(Substitute("--sasl_protocol_name=$0", kPrincipal));
   }
-  for (int i = 0; i < cluster.num_tablet_servers(); i++) {
+  for (int i = 0; i < tserver_num; i++) {
 auto* ts = cluster.tablet_server(i);
 command.emplace_back(ts->bound_rpc_hostport().ToString());
   }
@@ -3085,7 +3088,8 @@ TEST_F(AdminCliTest, TestRebuildMasterWhenNonEmpty) {
   NO_FATALS(cluster_->master()->Shutdown());
   string stdout;
   string stderr;
-  Status s = RunKuduTool(RebuildMasterCmd(*cluster_, /*is_secure*/false, 
/*log_to_stderr*/true),
+  Status s = RunKuduTool(RebuildMasterCmd(*cluster_, FLAGS_num_tablet_servers,
+  /*is_secure*/false, 
/*log_to_stderr*/true),
  &stdout, &stderr);
   ASSERT_TRUE(s.IsRuntimeError()) << s.ToString();
   ASSERT_STR_CONTAINS(stderr, "must be empty");
@@ -3093,7 +3097,8 @@ TEST_F(AdminCliTest, TestRebuildMasterWhenNonEmpty) {
   // Delete the contents of the old master from disk. This should allow the
   // tool to run.
   ASSERT_OK(cluster_->master()->DeleteFromDisk());
-  ASSERT_OK(RunKuduTool(RebuildMasterCmd(*cluster_, /*is_secure*/false, 
/*log_to_stderr*/true),
+  ASSERT_OK(RunKuduTool(RebuildMasterCmd(*cluster_, FLAGS_num_tablet_servers,
+ /*is_secure*/false, 
/*log_to_stderr*/true),
 &stdout, &stderr));
   ASSERT_STR_NOT_CONTAINS(stderr, "must be empty");
   ASSERT_STR_CONTAINS(stdout,
@@ -3136,7 +3141,8 @@ TEST_F(AdminCliTest, TestRebuildMasterWithTombstones) {
   ASSERT_OK(cluster_->master()->DeleteFromDisk());
   string stdout;
   string stderr;
-  ASSERT_OK(RunKuduTool(RebuildMasterCmd(*cluster_, /*is_secure*/false, 
/*log_to_stderr*/true),
+  ASSERT_OK(RunKuduTool(RebuildMasterCmd(*cluster_, FLAGS_num_tablet_servers,
+ /*is_secure*/false, 
/*log_to_stderr*/true),
  &stdout, &stderr));
   ASSERT_STR_CONTAINS(stderr, Substitute("Skipping replica of tablet $0 of 
table $1",
  tablet_id, kTable));
@@ -3165,39 +3171,56 @@ TEST_F(AdminCliTest, TestRebuildMasterWithTombstones) {
   NO_FATALS(cv.CheckCluster());
 }
 
-TEST_F(AdminCliTest, TestRebuildMasterAndAddColumns) {
-  FLAGS_num_tablet_servers = 1;
-  FLAGS_num_replicas = 1;
+TEST_F(AdminCliTest, TestAddColumnsAndRebuildMaster) {
+  FLAGS_num_tablet_servers = 3;
+  FLAGS_num_replicas = 3;
 
   NO_FATALS(BuildAndStart());
 
+  // Add a column and shutdown a tserver, the tserver holds a schema with a 
lower version.
   {
 unique_ptr 
table_alterer(client_->NewTableAlterer(kTableId));
 table_alterer->AddColumn("old_column_0")->Type(KuduColumnSchema::INT32);
 ASSERT_OK(table_alterer->Alter());
   }
+  NO_FATALS(cluster_->tablet_server(0)->Shutdown());
+
+  // Add another column, the latest schema has a higher version.
+  {
+unique_ptr 

[kudu] branch master updated: [test] fix ASAN test failures in master-stress-test

2022-05-16 Thread alexey
This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
 new e2f0d8fb0 [test] fix ASAN test failures in master-stress-test
e2f0d8fb0 is described below

commit e2f0d8fb056d9834199ec2feaba30daefae7
Author: zhangyifan27 
AuthorDate: Thu May 5 20:30:10 2022 +0800

[test] fix ASAN test failures in master-stress-test

The MasterStressTest occasionally failed in ASAN builds [1] [2]. I also 
built
the test in ASAN mode on a CentOS 7.9 machine, ran it in slow mode 3 times 
and
all failed.

It seems that some RetryingTSRpcTask is still running after 'TableInfo' 
object
destroyed if we set 
--enable_metadata_cleanup_for_deleted_tables_and_tablets=true.
To fix heap-use-after-free errors, we should abort and wait all pending 
tasks
completed before we destroy a TabletInfo object. With this fix the ASAN test
can pass 10 times.

[1] http://jenkins.kudu.apache.org/job/kudu-gerrit/25414/BUILD_TYPE=ASAN/
[2] http://jenkins.kudu.apache.org/job/kudu-gerrit/25448/BUILD_TYPE=ASAN/

Change-Id: I4eff45fbf05644362169485cd32678509eab1b07
Reviewed-on: http://gerrit.cloudera.org:8080/18501
Tested-by: Kudu Jenkins
Reviewed-by: Alexey Serbin 
---
 src/kudu/master/catalog_manager.cc | 49 ++
 1 file changed, 28 insertions(+), 21 deletions(-)

diff --git a/src/kudu/master/catalog_manager.cc 
b/src/kudu/master/catalog_manager.cc
index 7af3873ba..a1cf87ac8 100644
--- a/src/kudu/master/catalog_manager.cc
+++ b/src/kudu/master/catalog_manager.cc
@@ -3845,14 +3845,17 @@ class PickLeaderReplica : public TSPicker {
 //
 // The target tablet server is refreshed before each RPC by consulting the 
provided
 // TSPicker implementation.
+// Each created RetryingTSRpcTask should be added to TableInfo::pending_tasks_ 
by
+// calling TableInfo::AddTask(), so 'table' must remain valid for the lifetime 
of
+// this class.
 class RetryingTSRpcTask : public MonitoredTask {
  public:
-  RetryingTSRpcTask(Master *master,
+  RetryingTSRpcTask(Master* master,
 unique_ptr replica_picker,
-scoped_refptr table)
+TableInfo* table)
 : master_(master),
   replica_picker_(std::move(replica_picker)),
-  table_(std::move(table)),
+  table_(table),
   start_ts_(MonoTime::Now()),
   deadline_(start_ts_ + 
MonoDelta::FromMilliseconds(FLAGS_unresponsive_ts_rpc_timeout_ms)),
   attempt_(0),
@@ -3876,7 +3879,7 @@ class RetryingTSRpcTask : public MonitoredTask {
 
   MonoTime start_timestamp() const override { return start_ts_; }
   MonoTime completion_timestamp() const override { return end_ts_; }
-  const scoped_refptr& table() const { return table_ ; }
+  TableInfo* table() const { return table_; }
 
  protected:
   // Send an RPC request and register a callback.
@@ -3919,7 +3922,8 @@ class RetryingTSRpcTask : public MonitoredTask {
 
   Master * const master_;
   const unique_ptr replica_picker_;
-  const scoped_refptr table_;
+  // RetryingTSRpcTask is owned by 'TableInfo', so the backpointer should be 
raw.
+  TableInfo* table_;
 
   MonoTime start_ts_;
   MonoTime end_ts_;
@@ -4091,7 +4095,7 @@ class RetrySpecificTSRpcTask : public RetryingTSRpcTask {
  public:
   RetrySpecificTSRpcTask(Master* master,
  const string& permanent_uuid,
- const scoped_refptr& table)
+ TableInfo* table)
 : RetryingTSRpcTask(master,
 unique_ptr(new 
PickSpecificUUID(permanent_uuid)),
 table),
@@ -4113,7 +4117,7 @@ class AsyncCreateReplica : public RetrySpecificTSRpcTask {
  const string& permanent_uuid,
  const scoped_refptr& tablet,
  const TabletMetadataLock& tablet_lock)
-: RetrySpecificTSRpcTask(master, permanent_uuid, tablet->table()),
+: RetrySpecificTSRpcTask(master, permanent_uuid, tablet->table().get()),
   tablet_id_(tablet->id()) {
 deadline_ = start_ts_ + 
MonoDelta::FromMilliseconds(FLAGS_tablet_creation_timeout_ms);
 
@@ -4179,17 +4183,17 @@ class AsyncCreateReplica : public 
RetrySpecificTSRpcTask {
 // Send a DeleteTablet() RPC request.
 class AsyncDeleteReplica : public RetrySpecificTSRpcTask {
  public:
-  AsyncDeleteReplica(
-  Master* master, const string& permanent_uuid,
-  const scoped_refptr& table, string tablet_id,
-  TabletDataState delete_type,
-  optional cas_config_opid_index_less_or_equal,
-  string reason)
+  AsyncDeleteReplica(Master* master,
+ const string& permanent_uuid,
+ TableInfo* table,
+ string tablet_id,
+ TabletDataState delete_type,
+ optional cas_co