Repository: kudu
Updated Branches:
  refs/heads/master 745ebab2f -> 03d1fcb14


KUDU-2293 fix cleanup after failed copies

Before, when a tablet server failed to tablet copy, Kudu would perform a
best-effort cleanup of the partially-copied replica and leave the tablet
tombstoned. If this tombstoning were to fail due to disk issues (e.g.
out of space), Kudu would allow this and tablet would remain in
TABLET_DATA_COPYING both in-memory and on-disk. This would lead to a
FATAL error if another tablet copy were started for the same replica, as
the server would attempt to copy over a replica with data already marked
TABLET_DATA_COPYING.

This behavior arose from trying to balance two invariants:
- keep on-disk state consistent with in-memory state when possible
- when a tablet copy fails, leave it in as dead of a state as we can
  (i.e. TABLET_DATA_TOMBSTONED with no transitions in progress)

This patch updates the Abort() logic to lean towards the latter
invariant: if a tablet copy fails, at least its in-memory state will be
set as TABLET_DATA_TOMBSTONED. This may not be true on-disk, but that's
okay because either 1) the tablet server will eventually overwrite it
via another tablet copy (at which time its data must _not_ be in the
TABLET_DATA_COPYING state), or 2) the server will be restarted and the
tablet will be tombstoned upon seeing a non-TABLET_DATA_READY state
anyway.

We use the tablet copy internal state machine to determine whether to do
anything during Abort(). This wasn't always right before, since the
state machine didn't accurately reflect when cleanup was necessary (e.g.
the copy would be set to kStarted only at the end of Start(), so if
Abort() were called due to a failure in Start(), no cleanup was done).
This patch updates the state machine to account for this by introducing a
new state kStarting that indicates that there exists state to clean up
even if Start() has not completed, and by moving the setting of
kFinished such that cleanup is done even if Finish() fails.

A test is added to tablet_copy-itest that tests failures to copy,
ensuring that the tablet is left in such an state that further copies
are possible without crashing. The test uses EIO injection to fail the
copies, but the logic is the same as if full disks were used instead.
Another is added to tablet_copy_client-test that tests getting into the
various states in the updated tablet copy state machine.

Change-Id: I0459d484a3064aa2de392328e2910c9f6741be04
Reviewed-on: http://gerrit.cloudera.org:8080/9452
Reviewed-by: Mike Percy <mpe...@apache.org>
Tested-by: Andrew Wong <aw...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/03d1fcb1
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/03d1fcb1
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/03d1fcb1

Branch: refs/heads/master
Commit: 03d1fcb1467e4e29ead191d875a53e490f1d1372
Parents: 745ebab
Author: Andrew Wong <aw...@cloudera.com>
Authored: Fri Feb 23 14:58:58 2018 -0800
Committer: Andrew Wong <aw...@cloudera.com>
Committed: Wed Apr 11 02:46:31 2018 +0000

----------------------------------------------------------------------
 src/kudu/integration-tests/tablet_copy-itest.cc |  92 ++++++++++++++++
 src/kudu/tablet/tablet_metadata.cc              |  15 +--
 src/kudu/tablet/tablet_metadata.h               |  16 +--
 src/kudu/tserver/tablet_copy_client-test.cc     | 104 +++++++++++++++++--
 src/kudu/tserver/tablet_copy_client.cc          |  60 ++++++++---
 src/kudu/tserver/tablet_copy_client.h           |  27 ++++-
 6 files changed, 272 insertions(+), 42 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/03d1fcb1/src/kudu/integration-tests/tablet_copy-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/tablet_copy-itest.cc 
b/src/kudu/integration-tests/tablet_copy-itest.cc
index e52bead..e8fc265 100644
--- a/src/kudu/integration-tests/tablet_copy-itest.cc
+++ b/src/kudu/integration-tests/tablet_copy-itest.cc
@@ -95,6 +95,7 @@ DEFINE_int32(test_delete_leader_num_writer_threads, 1,
 using kudu::client::KuduSchema;
 using kudu::client::KuduSchemaFromSchema;
 using kudu::client::KuduTableCreator;
+using kudu::cluster::ExternalMiniClusterOptions;
 using kudu::cluster::ExternalTabletServer;
 using kudu::consensus::COMMITTED_OPID;
 using kudu::consensus::ConsensusMetadataManager;
@@ -379,6 +380,97 @@ TEST_F(TabletCopyITest, TestListTabletsDuringTabletCopy) {
   NO_FATALS(cluster_->AssertNoCrashes());
 }
 
+// Regression test for KUDU-2293.
+TEST_F(TabletCopyITest, TestCopyAfterFailedCopy) {
+  MonoDelta kTimeout = MonoDelta::FromSeconds(30);
+  const int kTsIndex = 1; // Pick an arbitrary tserver to observe.
+  // Use 2 data dirs so we can fail a directory without crashing the server.
+  ExternalMiniClusterOptions cluster_opts;
+  cluster_opts.num_data_dirs = 2;
+  cluster_opts.num_tablet_servers = 3;
+  // We're going to tombstone replicas manually, so prevent the master from
+  // tombstoning evicted followers.
+  
cluster_opts.extra_master_flags.emplace_back("--master_tombstone_evicted_tablet_replicas=false");
+  NO_FATALS(StartClusterWithOpts(std::move(cluster_opts)));
+
+  // Populate a tablet with some data.
+  TestWorkload workload(cluster_.get());
+  workload.Setup();
+  workload.Start();
+  while (workload.rows_inserted() < 1000) {
+    SleepFor(MonoDelta::FromMilliseconds(10));
+  }
+
+  TServerDetails* ts = ts_map_[cluster_->tablet_server(kTsIndex)->uuid()];
+  vector<ListTabletsResponsePB::StatusAndSchemaPB> tablets;
+  ASSERT_OK(WaitForNumTabletsOnTS(ts, 1, kTimeout, &tablets));
+  string tablet_id = tablets[0].tablet_status().tablet_id();
+
+  // Ensure all the servers agree before we proceed.
+  workload.StopAndJoin();
+  ASSERT_OK(WaitForServersToAgree(kTimeout, ts_map_, tablet_id,
+                                  workload.batches_completed()));
+
+  int leader_index;
+  int follower_index;
+  TServerDetails* leader_ts;
+  TServerDetails* follower_ts;
+
+  // Get the leader and follower tablet servers.
+  ASSERT_OK(FindTabletLeader(ts_map_, tablet_id, kTimeout, &leader_ts));
+  leader_index = cluster_->tablet_server_index_by_uuid(leader_ts->uuid());
+  follower_index = (leader_index + 1) % cluster_->num_tablet_servers();
+  ExternalTabletServer* follower = cluster_->tablet_server(follower_index);
+  leader_ts = ts_map_[cluster_->tablet_server(leader_index)->uuid()];
+  follower_ts = ts_map_[follower->uuid()];
+
+  // Tombstone the follower.
+  ASSERT_OK(DeleteTabletWithRetries(follower_ts, tablet_id,
+                                    TabletDataState::TABLET_DATA_TOMBSTONED,
+                                    kTimeout));
+  HostPort leader_addr;
+  ASSERT_OK(HostPortFromPB(leader_ts->registration.rpc_addresses(0), 
&leader_addr));
+
+  // Inject failures to the metadata and trigger the tablet copy. This will
+  // cause the copy to fail.
+  ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server(follower_index),
+      "env_inject_eio_globs", 
JoinPathSegments(cluster_->WalRootForTS(follower_index),
+                                               "tablet-meta/**")));
+  ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server(follower_index),
+      "env_inject_eio", "0.10"));
+  Status s;
+  ASSERT_EVENTUALLY([&] {
+    s = itest::StartTabletCopy(follower_ts, tablet_id, leader_ts->uuid(),
+                               leader_addr, 
std::numeric_limits<int64_t>::max(), kTimeout);
+    ASSERT_STR_CONTAINS(s.ToString(), "INJECTED FAILURE");
+  });
+
+  // Trigger a copy again. This should fail. The previous copy might still be
+  // cleaning up so we might get some "already in progress" errors; eventually
+  // a copy will successfully start and fail with the expected error.
+  ASSERT_EVENTUALLY([&] {
+    s = itest::StartTabletCopy(follower_ts, tablet_id, leader_ts->uuid(),
+                               leader_addr, 
std::numeric_limits<int64_t>::max(), kTimeout);
+    ASSERT_STR_CONTAINS(s.ToString(), "INJECTED FAILURE");
+  });
+
+  // Regression condition for KUDU-2293: the above second attempt at a copy
+  // should not crash the destination server.
+  for (int i = 0; i < cluster_->num_tablet_servers(); i++) {
+    const auto tserver = cluster_->tablet_server(i);
+    ASSERT_TRUE(tserver->IsProcessAlive())
+        << Substitute("Tablet server $0 ($1) has crashed...", i, 
tserver->uuid());
+  }
+
+  // Finally, trigger a successful copy.
+  ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server(follower_index),
+      "env_inject_eio", "0.0"));
+  ASSERT_EVENTUALLY([&] {
+    ASSERT_OK(itest::StartTabletCopy(follower_ts, tablet_id, leader_ts->uuid(),
+                                    leader_addr, 
std::numeric_limits<int64_t>::max(), kTimeout));
+  });
+}
+
 // Start tablet copy session and delete the tablet in the middle.
 // It should actually be possible to complete copying in such a case, because
 // when a Tablet Copy session is started on the "source" server, all of

http://git-wip-us.apache.org/repos/asf/kudu/blob/03d1fcb1/src/kudu/tablet/tablet_metadata.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/tablet_metadata.cc 
b/src/kudu/tablet/tablet_metadata.cc
index 004a5ed..c8c04a7 100644
--- a/src/kudu/tablet/tablet_metadata.cc
+++ b/src/kudu/tablet/tablet_metadata.cc
@@ -215,22 +215,15 @@ Status TabletMetadata::DeleteTabletData(TabletDataState 
delete_type,
     }
   }
 
-  // Keep a copy of the old data dir group in case of flush failure.
-  DataDirGroupPB pb;
-  bool old_group_exists = 
fs_manager_->dd_manager()->GetDataDirGroupPB(tablet_id_, &pb).ok();
-
-  // Remove the tablet's data dir group tracked by the DataDirManager.
+  // Unregister the tablet's data dir group in memory (it is stored on disk in
+  // the superblock). Even if we fail to flush below, the expectation is that
+  // we will no longer be writing to the tablet, and therefore don't need its
+  // data dir group.
   fs_manager_->dd_manager()->DeleteDataDirGroup(tablet_id_);
-  auto revert_group_cleanup = MakeScopedCleanup([&]() {
-    if (old_group_exists) {
-      fs_manager_->dd_manager()->LoadDataDirGroupFromPB(tablet_id_, pb);
-    }
-  });
 
   // Flushing will sync the new tablet_data_state_ to disk and will now also
   // delete all the data.
   RETURN_NOT_OK(Flush());
-  revert_group_cleanup.cancel();
 
   // Re-sync to disk one more time.
   // This call will typically re-sync with an empty orphaned blocks list

http://git-wip-us.apache.org/repos/asf/kudu/blob/03d1fcb1/src/kudu/tablet/tablet_metadata.h
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/tablet_metadata.h 
b/src/kudu/tablet/tablet_metadata.h
index e063946..107ec9f 100644
--- a/src/kudu/tablet/tablet_metadata.h
+++ b/src/kudu/tablet/tablet_metadata.h
@@ -14,8 +14,7 @@
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.
-#ifndef KUDU_TABLET_TABLET_METADATA_H
-#define KUDU_TABLET_TABLET_METADATA_H
+#pragma once
 
 #include <atomic>
 #include <cstdint>
@@ -198,6 +197,9 @@ class TabletMetadata : public 
RefCountedThreadSafe<TabletMetadata> {
   // partially-tombstoned tablets during crash recovery.
   //
   // Returns only once all data has been removed.
+  //
+  // Note: this will always update the in-memory state, but upon failure,
+  // may not update the on-disk state.
   Status DeleteTabletData(TabletDataState delete_type,
                           const boost::optional<consensus::OpId>& 
last_logged_opid);
 
@@ -237,9 +239,12 @@ class TabletMetadata : public 
RefCountedThreadSafe<TabletMetadata> {
   // Sets *super_block to the serialized form of the current metadata.
   Status ToSuperBlock(TabletSuperBlockPB* super_block) const;
 
-  // Fully replace a superblock (used for bootstrap).
+  // Replace the superblock (used for bootstrap) both in memory and on disk.
   Status ReplaceSuperBlock(const TabletSuperBlockPB &pb);
 
+  // Update the in-memory state of metadata to that of the given superblock PB.
+  Status LoadFromSuperBlock(const TabletSuperBlockPB& superblock);
+
   int64_t on_disk_size() const {
     return on_disk_size_.load(std::memory_order_relaxed);
   }
@@ -286,9 +291,6 @@ class TabletMetadata : public 
RefCountedThreadSafe<TabletMetadata> {
   // Updates the cached on-disk size of the tablet superblock.
   Status UpdateOnDiskSize();
 
-  // Update state of metadata to that of the given superblock PB.
-  Status LoadFromSuperBlock(const TabletSuperBlockPB& superblock);
-
   Status ReadSuperBlock(TabletSuperBlockPB *pb);
 
   // Fully replace superblock.
@@ -392,5 +394,3 @@ class TabletMetadata : public 
RefCountedThreadSafe<TabletMetadata> {
 
 } // namespace tablet
 } // namespace kudu
-
-#endif /* KUDU_TABLET_TABLET_METADATA_H */

http://git-wip-us.apache.org/repos/asf/kudu/blob/03d1fcb1/src/kudu/tserver/tablet_copy_client-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/tablet_copy_client-test.cc 
b/src/kudu/tserver/tablet_copy_client-test.cc
index de5d623..8c880ce 100644
--- a/src/kudu/tserver/tablet_copy_client-test.cc
+++ b/src/kudu/tserver/tablet_copy_client-test.cc
@@ -25,6 +25,7 @@
 #include <thread>
 #include <vector>
 
+#include <gflags/gflags.h>
 #include <gflags/gflags_declare.h>
 #include <glog/logging.h>
 #include <glog/stl_logging.h>
@@ -43,10 +44,10 @@
 #include "kudu/fs/data_dirs.h"
 #include "kudu/fs/fs_manager.h"
 #include "kudu/gutil/casts.h"
-#include "kudu/gutil/gscoped_ptr.h"
 #include "kudu/gutil/port.h"
 #include "kudu/gutil/ref_counted.h"
 #include "kudu/gutil/strings/fastmem.h"
+#include "kudu/gutil/strings/join.h"
 #include "kudu/gutil/strings/substitute.h"
 #include "kudu/rpc/messenger.h"
 #include "kudu/tablet/metadata.pb.h"
@@ -61,6 +62,7 @@
 #include "kudu/util/metrics.h"
 #include "kudu/util/monotime.h"
 #include "kudu/util/net/net_util.h"
+#include "kudu/util/path_util.h"
 #include "kudu/util/slice.h"
 #include "kudu/util/status.h"
 #include "kudu/util/test_macros.h"
@@ -70,7 +72,9 @@ using std::string;
 using std::thread;
 using std::vector;
 
+DECLARE_double(env_inject_eio);
 DECLARE_string(block_manager);
+DECLARE_string(env_inject_eio_globs);
 
 METRIC_DECLARE_counter(block_manager_total_disk_sync);
 
@@ -106,7 +110,11 @@ class TabletCopyClientTest : public TabletCopyTest {
     fs_manager_.reset(new FsManager(Env::Default(), opts));
     ASSERT_OK(fs_manager_->CreateInitialFileSystemLayout());
     ASSERT_OK(fs_manager_->Open());
+    ASSERT_OK(ResetTabletCopyClient());
+  }
 
+  // Sets up a new tablet copy client.
+  Status ResetTabletCopyClient() {
     scoped_refptr<ConsensusMetadataManager> cmeta_manager(
         new ConsensusMetadataManager(fs_manager_.get()));
 
@@ -119,13 +127,17 @@ class TabletCopyClientTest : public TabletCopyTest {
                                        nullptr /* no metrics */));
     RaftPeerPB* cstate_leader;
     ConsensusStatePB cstate;
-    ASSERT_OK(tablet_replica_->consensus()->ConsensusState(&cstate));
-    ASSERT_OK(GetRaftConfigLeader(&cstate, &cstate_leader));
+    RETURN_NOT_OK(tablet_replica_->consensus()->ConsensusState(&cstate));
+    RETURN_NOT_OK(GetRaftConfigLeader(&cstate, &cstate_leader));
     leader_ = *cstate_leader;
+    return Status::OK();
+  }
 
+  // Starts the tablet copy.
+  Status StartCopy() {
     HostPort host_port;
-    ASSERT_OK(HostPortFromPB(leader_.last_known_addr(), &host_port));
-    ASSERT_OK(client_->Start(host_port, &meta_));
+    RETURN_NOT_OK(HostPortFromPB(leader_.last_known_addr(), &host_port));
+    return client_->Start(host_port, &meta_);
   }
 
  protected:
@@ -133,9 +145,9 @@ class TabletCopyClientTest : public TabletCopyTest {
 
   MetricRegistry metric_registry_;
   scoped_refptr<MetricEntity> metric_entity_;
-  gscoped_ptr<FsManager> fs_manager_;
+  unique_ptr<FsManager> fs_manager_;
   shared_ptr<rpc::Messenger> messenger_;
-  gscoped_ptr<TabletCopyClient> client_;
+  unique_ptr<TabletCopyClient> client_;
   scoped_refptr<TabletMetadata> meta_;
   RaftPeerPB leader_;
 };
@@ -167,20 +179,89 @@ Status TabletCopyClientTest::CompareFileContents(const 
string& path1, const stri
   return Status::OK();
 }
 
+// Test a tablet copy going through the various states in the copy state
+// machine.
+TEST_F(TabletCopyClientTest, TestLifeCycle) {
+  // Target fault injection for the tablet metadata directories, but do not
+  // start injecting failures just yet.
+  const vector<string> meta_dirs = {
+      JoinPathSegments(client_->fs_manager_->GetConsensusMetadataDir(), "**"),
+      JoinPathSegments(client_->fs_manager_->GetTabletMetadataDir(), "**") };
+  FLAGS_env_inject_eio_globs = JoinStrings(meta_dirs, ",");
+
+  ASSERT_EQ(TabletCopyClient::State::kInitialized, client_->state_);
+  Status s;
+  // If we're creating a brand new tablet, failing to start the copy will yield
+  // no changes in-memory. It should be as if the copy hadn't started.
+  {
+    google::FlagSaver fs;
+    FLAGS_env_inject_eio = 1.0;
+    s = StartCopy();
+    ASSERT_TRUE(s.IsIOError()) << s.ToString();
+    ASSERT_STR_CONTAINS(s.ToString(), "Failed to write tablet metadata");
+    ASSERT_EQ(TabletCopyClient::State::kInitialized, client_->state_);
+    ASSERT_FALSE(meta_);
+  }
+
+  // Now let's try replacing a tablet. Set a metadata that we can replace.
+  ASSERT_OK(ResetTabletCopyClient());
+  ASSERT_OK(StartCopy());
+  ASSERT_EQ(TabletCopyClient::State::kStarted, client_->state_);
+  ASSERT_OK(client_->Finish());
+  ASSERT_EQ(TabletCopyClient::State::kFinished, client_->state_);
+
+  // Since we're going to replace the tablet, we need to tombstone the existing
+  // metadata first.
+  meta_->set_tablet_data_state(tablet::TABLET_DATA_TOMBSTONED);
+  ASSERT_OK(ResetTabletCopyClient());
+  ASSERT_OK(client_->SetTabletToReplace(meta_, 0));
+
+  // If we're replacing a tablet, failing to start will yield changes
+  // in-memory, and it is thus necessary to recognize the copy is in the
+  // process of starting.
+  {
+    google::FlagSaver fs;
+    FLAGS_env_inject_eio = 1.0;
+    s = StartCopy();
+    ASSERT_TRUE(s.IsIOError()) << s.ToString();
+    ASSERT_STR_CONTAINS(s.ToString(), "Could not replace superblock");
+    ASSERT_EQ(TabletCopyClient::State::kStarting, client_->state_);
+  }
+
+  // Make sure we are still in the appropriate state if we fail to finish.
+  ASSERT_OK(ResetTabletCopyClient());
+  client_->SetTabletToReplace(meta_, 0);
+  ASSERT_OK(StartCopy());
+  FLAGS_env_inject_eio = 1.0;
+  s = client_->Finish();
+  ASSERT_TRUE(s.IsIOError());
+  ASSERT_EQ(TabletCopyClient::State::kStarted, client_->state_);
+
+  // Closing out the copy should leave the copy client in its terminal state,
+  // even upon failure.
+  s = client_->Abort();
+  ASSERT_TRUE(s.IsIOError());
+  ASSERT_EQ(TabletCopyClient::State::kFinished, client_->state_);
+  ASSERT_EQ(tablet::TABLET_DATA_TOMBSTONED, meta_->tablet_data_state());
+}
+
 // Implementation test that no blocks exist in the new superblock before 
fetching.
 TEST_F(TabletCopyClientTest, TestNoBlocksAtStart) {
+  ASSERT_OK(StartCopy());
   ASSERT_GT(ListBlocks(*client_->remote_superblock_).size(), 0);
   ASSERT_EQ(0, ListBlocks(*client_->superblock_).size());
 }
 
 // Basic begin / end tablet copy session.
 TEST_F(TabletCopyClientTest, TestBeginEndSession) {
+  ASSERT_OK(StartCopy());
   ASSERT_OK(client_->FetchAll(nullptr /* no listener */));
   ASSERT_OK(client_->Finish());
 }
 
 // Basic data block download unit test.
 TEST_F(TabletCopyClientTest, TestDownloadBlock) {
+  ASSERT_OK(StartCopy());
   BlockId block_id = FirstColumnBlockId(*client_->remote_superblock_);
   Slice slice;
   faststring scratch;
@@ -201,6 +282,7 @@ TEST_F(TabletCopyClientTest, TestDownloadBlock) {
 
 // Basic WAL segment download unit test.
 TEST_F(TabletCopyClientTest, TestDownloadWalSegment) {
+  ASSERT_OK(StartCopy());
   ASSERT_OK(env_util::CreateDirIfMissing(
       env_, fs_manager_->GetTabletWalDir(GetTabletId())));
 
@@ -222,6 +304,7 @@ TEST_F(TabletCopyClientTest, TestDownloadWalSegment) {
 
 // Ensure that we detect data corruption at the per-transfer level.
 TEST_F(TabletCopyClientTest, TestVerifyData) {
+  ASSERT_OK(StartCopy());
   string good = "This is a known good string";
   string bad = "This is a known bad! string";
   const int kGoodOffset = 0;
@@ -257,6 +340,7 @@ TEST_F(TabletCopyClientTest, TestVerifyData) {
 }
 
 TEST_F(TabletCopyClientTest, TestDownloadAllBlocks) {
+  ASSERT_OK(StartCopy());
   // Download and commit all the blocks.
   ASSERT_OK(client_->DownloadBlocks());
   ASSERT_OK(client_->transaction_->CommitCreatedBlocks());
@@ -290,6 +374,7 @@ TEST_F(TabletCopyClientTest, TestDownloadAllBlocks) {
 // Test that failing a disk outside fo the tablet copy client will eventually
 // stop the copy client and cause it to fail.
 TEST_F(TabletCopyClientTest, TestFailedDiskStopsClient) {
+  ASSERT_OK(StartCopy());
   DataDirManager* dd_manager = fs_manager_->dd_manager();
 
   // Repeatedly fetch files for the client.
@@ -334,6 +419,11 @@ struct AbortTestParams {
 class TabletCopyClientAbortTest : public TabletCopyClientTest,
                                   public ::testing::WithParamInterface<
                                       tuple<DownloadBlocks, DeleteTrigger>> {
+ public:
+  virtual void SetUp() override {
+    TabletCopyClientTest::SetUp();
+    ASSERT_OK(StartCopy());
+  }
  protected:
   // Create the specified number of blocks with junk data for testing purposes.
   void CreateTestBlocks(int num_blocks);

http://git-wip-us.apache.org/repos/asf/kudu/blob/03d1fcb1/src/kudu/tserver/tablet_copy_client.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/tablet_copy_client.cc 
b/src/kudu/tserver/tablet_copy_client.cc
index ef88b0e..cb4ca7f 100644
--- a/src/kudu/tserver/tablet_copy_client.cc
+++ b/src/kudu/tserver/tablet_copy_client.cc
@@ -64,6 +64,8 @@
 #include "kudu/util/pb_util.h"
 #include "kudu/util/random.h"
 #include "kudu/util/random_util.h"
+#include "kudu/util/scoped_cleanup.h"
+#include "kudu/util/status.h"
 
 DEFINE_int32(tablet_copy_begin_session_timeout_ms, 3000,
              "Tablet server RPC client timeout for BeginTabletCopySession 
calls. "
@@ -318,8 +320,11 @@ Status TabletCopyClient::Start(const HostPort& 
copy_source_addr,
       *superblock_->mutable_tombstone_last_logged_opid() = *last_logged_opid;
     }
 
-    // Remove any existing orphaned blocks and WALs from the tablet, and
-    // set the data state to 'COPYING'.
+    // We are about to persist things to disk. Update the tablet copy state
+    // machine so we know there is state to clean up in case we fail.
+    state_ = kStarting;
+    // Set the data state to 'COPYING' and remove any existing orphaned blocks
+    // and WALs from the tablet.
     RETURN_NOT_OK_PREPEND(
         TSTabletManager::DeleteTabletData(meta_, cmeta_manager_,
                                           tablet::TABLET_DATA_COPYING,
@@ -349,6 +354,9 @@ Status TabletCopyClient::Start(const HostPort& 
copy_source_addr,
                                             superblock_->tablet_data_state(),
                                             
superblock_->tombstone_last_logged_opid(),
                                             &meta_));
+    // We have begun persisting things to disk. Update the tablet copy state
+    // machine so we know there is state to clean up in case we fail.
+    state_ = kStarting;
   }
   CHECK_OK(fs_manager_->dd_manager()->GetDataDirGroupPB(
       tablet_id_, superblock_->mutable_data_dir_group()));
@@ -389,15 +397,26 @@ Status TabletCopyClient::Finish() {
   //     so the fsync() operations could be cheaper.
   //  3) Downloaded blocks should be made durable before replacing superblock.
   RETURN_NOT_OK(transaction_->CommitCreatedBlocks());
-  state_ = kFinished;
 
   // Replace tablet metadata superblock. This will set the tablet metadata 
state
   // to TABLET_DATA_READY, since we checked above that the response
   // superblock is in a valid state to bootstrap from.
   LOG_WITH_PREFIX(INFO) << "Tablet Copy complete. Replacing tablet 
superblock.";
   SetStatusMessage("Replacing tablet superblock");
-  superblock_->set_tablet_data_state(tablet::TABLET_DATA_READY);
+
+  boost::optional<OpId> last_logged_opid = 
superblock_->tombstone_last_logged_opid();
+  auto revert_activate_superblock = MakeScopedCleanup([&] {
+    // If we fail below, revert the updated state so further calls to Abort()
+    // can clean up appropriately.
+    if (last_logged_opid) {
+      *superblock_->mutable_tombstone_last_logged_opid() = *last_logged_opid;
+    }
+    superblock_->set_tablet_data_state(TabletDataState::TABLET_DATA_COPYING);
+  });
+
   superblock_->clear_tombstone_last_logged_opid();
+  superblock_->set_tablet_data_state(tablet::TABLET_DATA_READY);
+
   RETURN_NOT_OK(meta_->ReplaceSuperBlock(*superblock_));
 
   if (FLAGS_tablet_copy_save_downloaded_metadata) {
@@ -408,22 +427,39 @@ Status TabletCopyClient::Finish() {
                           "Unable to make copy of tablet metadata");
   }
 
+  // Now that we've finished everything, complete.
+  revert_activate_superblock.cancel();
+  state_ = kFinished;
   return Status::OK();
 }
 
 Status TabletCopyClient::Abort() {
-  if (state_ != kStarted) {
+  // If we have not begun doing anything or have already finished, there is
+  // nothing left to do.
+  if (state_ == kInitialized || state_ == kFinished) {
     return Status::OK();
   }
-  state_ = kFinished;
-  CHECK(meta_);
 
-  // Write the in-progress superblock to disk so that when we delete the tablet
-  // data all the partial blocks we have persisted will be deleted.
   DCHECK_EQ(tablet::TABLET_DATA_COPYING, superblock_->tablet_data_state());
-  RETURN_NOT_OK(meta_->ReplaceSuperBlock(*superblock_));
-
-  // Delete all of the tablet data, including blocks and WALs.
+  // Ensure upon exiting that the copy is finished and that the tablet is left
+  // tombstoned. This is guaranteed because we always call DeleteTabletData(),
+  // which leaves the metadata in this state, even on failure.
+  SCOPED_CLEANUP({
+    DCHECK_EQ(tablet::TABLET_DATA_TOMBSTONED, meta_->tablet_data_state());
+    state_ = kFinished;
+  });
+
+  // Load the in-progress superblock in-memory so that when we delete the
+  // tablet, all the partial blocks we persisted will be deleted.
+  //
+  // Note: We warn instead of returning early here upon failure because even if
+  // the superblock protobuf was somehow corrupted, we still want to attempt to
+  // delete the tablet's data dir group, WAL segments, etc.
+  WARN_NOT_OK(meta_->LoadFromSuperBlock(*superblock_),
+      "Failed to load the new superblock");
+
+  // Finally, tombstone the tablet metadata and try deleting all of the tablet
+  // data on disk, including blocks and WALs.
   RETURN_NOT_OK_PREPEND(
       TSTabletManager::DeleteTabletData(meta_, cmeta_manager_,
                                         tablet::TABLET_DATA_TOMBSTONED,

http://git-wip-us.apache.org/repos/asf/kudu/blob/03d1fcb1/src/kudu/tserver/tablet_copy_client.h
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/tablet_copy_client.h 
b/src/kudu/tserver/tablet_copy_client.h
index 018c6db..c2adf6a 100644
--- a/src/kudu/tserver/tablet_copy_client.h
+++ b/src/kudu/tserver/tablet_copy_client.h
@@ -14,12 +14,11 @@
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.
-#ifndef KUDU_TSERVER_TABLET_COPY_CLIENT_H
-#define KUDU_TSERVER_TABLET_COPY_CLIENT_H
+#pragma once
 
 #include <cstdint>
-#include <string>
 #include <memory>
+#include <string>
 #include <vector>
 
 #include <gtest/gtest_prod.h>
@@ -132,14 +131,35 @@ class TabletCopyClient {
   FRIEND_TEST(TabletCopyClientTest, TestNoBlocksAtStart);
   FRIEND_TEST(TabletCopyClientTest, TestBeginEndSession);
   FRIEND_TEST(TabletCopyClientTest, TestDownloadBlock);
+  FRIEND_TEST(TabletCopyClientTest, TestLifeCycle);
   FRIEND_TEST(TabletCopyClientTest, TestVerifyData);
   FRIEND_TEST(TabletCopyClientTest, TestDownloadWalSegment);
   FRIEND_TEST(TabletCopyClientTest, TestDownloadAllBlocks);
   FRIEND_TEST(TabletCopyClientAbortTest, TestAbort);
 
+  // State machine that guides the progression of a single tablet copy.
+  // A tablet copy will go through the states:
+  //
+  //   kInitialized --> kStarting --> kStarted --> kFinished
+  //        |               |             |          ^^^
+  //        |               |             +----------+||
+  //        |               +-------------------------+|
+  //        +------------------------------------------+
   enum State {
+    // The copy has yet to do anything and no changes have been made on disk.
     kInitialized,
+
+    // The copy has begun updating metadata on disk, but has not begun
+    // receiving WAL segments or blocks from the tablet copy source. Being in
+    // this state or beyond implies that 'meta_' is non-null.
+    kStarting,
+
+    // The metadata has been updated on disk to indicate the start of a new
+    // copy. Blocks, metadata, and WAL segments may be received from the tablet
+    // copy source in this state.
     kStarted,
+
+    // The copy is finished and needs no cleanup.
     kFinished,
   };
 
@@ -261,4 +281,3 @@ class TabletCopyClient {
 
 } // namespace tserver
 } // namespace kudu
-#endif /* KUDU_TSERVER_TABLET_COPY_CLIENT_H */

Reply via email to