Repository: kudu Updated Branches: refs/heads/master 745ebab2f -> 03d1fcb14
KUDU-2293 fix cleanup after failed copies Before, when a tablet server failed to tablet copy, Kudu would perform a best-effort cleanup of the partially-copied replica and leave the tablet tombstoned. If this tombstoning were to fail due to disk issues (e.g. out of space), Kudu would allow this and tablet would remain in TABLET_DATA_COPYING both in-memory and on-disk. This would lead to a FATAL error if another tablet copy were started for the same replica, as the server would attempt to copy over a replica with data already marked TABLET_DATA_COPYING. This behavior arose from trying to balance two invariants: - keep on-disk state consistent with in-memory state when possible - when a tablet copy fails, leave it in as dead of a state as we can (i.e. TABLET_DATA_TOMBSTONED with no transitions in progress) This patch updates the Abort() logic to lean towards the latter invariant: if a tablet copy fails, at least its in-memory state will be set as TABLET_DATA_TOMBSTONED. This may not be true on-disk, but that's okay because either 1) the tablet server will eventually overwrite it via another tablet copy (at which time its data must _not_ be in the TABLET_DATA_COPYING state), or 2) the server will be restarted and the tablet will be tombstoned upon seeing a non-TABLET_DATA_READY state anyway. We use the tablet copy internal state machine to determine whether to do anything during Abort(). This wasn't always right before, since the state machine didn't accurately reflect when cleanup was necessary (e.g. the copy would be set to kStarted only at the end of Start(), so if Abort() were called due to a failure in Start(), no cleanup was done). This patch updates the state machine to account for this by introducing a new state kStarting that indicates that there exists state to clean up even if Start() has not completed, and by moving the setting of kFinished such that cleanup is done even if Finish() fails. A test is added to tablet_copy-itest that tests failures to copy, ensuring that the tablet is left in such an state that further copies are possible without crashing. The test uses EIO injection to fail the copies, but the logic is the same as if full disks were used instead. Another is added to tablet_copy_client-test that tests getting into the various states in the updated tablet copy state machine. Change-Id: I0459d484a3064aa2de392328e2910c9f6741be04 Reviewed-on: http://gerrit.cloudera.org:8080/9452 Reviewed-by: Mike Percy <mpe...@apache.org> Tested-by: Andrew Wong <aw...@cloudera.com> Project: http://git-wip-us.apache.org/repos/asf/kudu/repo Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/03d1fcb1 Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/03d1fcb1 Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/03d1fcb1 Branch: refs/heads/master Commit: 03d1fcb1467e4e29ead191d875a53e490f1d1372 Parents: 745ebab Author: Andrew Wong <aw...@cloudera.com> Authored: Fri Feb 23 14:58:58 2018 -0800 Committer: Andrew Wong <aw...@cloudera.com> Committed: Wed Apr 11 02:46:31 2018 +0000 ---------------------------------------------------------------------- src/kudu/integration-tests/tablet_copy-itest.cc | 92 ++++++++++++++++ src/kudu/tablet/tablet_metadata.cc | 15 +-- src/kudu/tablet/tablet_metadata.h | 16 +-- src/kudu/tserver/tablet_copy_client-test.cc | 104 +++++++++++++++++-- src/kudu/tserver/tablet_copy_client.cc | 60 ++++++++--- src/kudu/tserver/tablet_copy_client.h | 27 ++++- 6 files changed, 272 insertions(+), 42 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kudu/blob/03d1fcb1/src/kudu/integration-tests/tablet_copy-itest.cc ---------------------------------------------------------------------- diff --git a/src/kudu/integration-tests/tablet_copy-itest.cc b/src/kudu/integration-tests/tablet_copy-itest.cc index e52bead..e8fc265 100644 --- a/src/kudu/integration-tests/tablet_copy-itest.cc +++ b/src/kudu/integration-tests/tablet_copy-itest.cc @@ -95,6 +95,7 @@ DEFINE_int32(test_delete_leader_num_writer_threads, 1, using kudu::client::KuduSchema; using kudu::client::KuduSchemaFromSchema; using kudu::client::KuduTableCreator; +using kudu::cluster::ExternalMiniClusterOptions; using kudu::cluster::ExternalTabletServer; using kudu::consensus::COMMITTED_OPID; using kudu::consensus::ConsensusMetadataManager; @@ -379,6 +380,97 @@ TEST_F(TabletCopyITest, TestListTabletsDuringTabletCopy) { NO_FATALS(cluster_->AssertNoCrashes()); } +// Regression test for KUDU-2293. +TEST_F(TabletCopyITest, TestCopyAfterFailedCopy) { + MonoDelta kTimeout = MonoDelta::FromSeconds(30); + const int kTsIndex = 1; // Pick an arbitrary tserver to observe. + // Use 2 data dirs so we can fail a directory without crashing the server. + ExternalMiniClusterOptions cluster_opts; + cluster_opts.num_data_dirs = 2; + cluster_opts.num_tablet_servers = 3; + // We're going to tombstone replicas manually, so prevent the master from + // tombstoning evicted followers. + cluster_opts.extra_master_flags.emplace_back("--master_tombstone_evicted_tablet_replicas=false"); + NO_FATALS(StartClusterWithOpts(std::move(cluster_opts))); + + // Populate a tablet with some data. + TestWorkload workload(cluster_.get()); + workload.Setup(); + workload.Start(); + while (workload.rows_inserted() < 1000) { + SleepFor(MonoDelta::FromMilliseconds(10)); + } + + TServerDetails* ts = ts_map_[cluster_->tablet_server(kTsIndex)->uuid()]; + vector<ListTabletsResponsePB::StatusAndSchemaPB> tablets; + ASSERT_OK(WaitForNumTabletsOnTS(ts, 1, kTimeout, &tablets)); + string tablet_id = tablets[0].tablet_status().tablet_id(); + + // Ensure all the servers agree before we proceed. + workload.StopAndJoin(); + ASSERT_OK(WaitForServersToAgree(kTimeout, ts_map_, tablet_id, + workload.batches_completed())); + + int leader_index; + int follower_index; + TServerDetails* leader_ts; + TServerDetails* follower_ts; + + // Get the leader and follower tablet servers. + ASSERT_OK(FindTabletLeader(ts_map_, tablet_id, kTimeout, &leader_ts)); + leader_index = cluster_->tablet_server_index_by_uuid(leader_ts->uuid()); + follower_index = (leader_index + 1) % cluster_->num_tablet_servers(); + ExternalTabletServer* follower = cluster_->tablet_server(follower_index); + leader_ts = ts_map_[cluster_->tablet_server(leader_index)->uuid()]; + follower_ts = ts_map_[follower->uuid()]; + + // Tombstone the follower. + ASSERT_OK(DeleteTabletWithRetries(follower_ts, tablet_id, + TabletDataState::TABLET_DATA_TOMBSTONED, + kTimeout)); + HostPort leader_addr; + ASSERT_OK(HostPortFromPB(leader_ts->registration.rpc_addresses(0), &leader_addr)); + + // Inject failures to the metadata and trigger the tablet copy. This will + // cause the copy to fail. + ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server(follower_index), + "env_inject_eio_globs", JoinPathSegments(cluster_->WalRootForTS(follower_index), + "tablet-meta/**"))); + ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server(follower_index), + "env_inject_eio", "0.10")); + Status s; + ASSERT_EVENTUALLY([&] { + s = itest::StartTabletCopy(follower_ts, tablet_id, leader_ts->uuid(), + leader_addr, std::numeric_limits<int64_t>::max(), kTimeout); + ASSERT_STR_CONTAINS(s.ToString(), "INJECTED FAILURE"); + }); + + // Trigger a copy again. This should fail. The previous copy might still be + // cleaning up so we might get some "already in progress" errors; eventually + // a copy will successfully start and fail with the expected error. + ASSERT_EVENTUALLY([&] { + s = itest::StartTabletCopy(follower_ts, tablet_id, leader_ts->uuid(), + leader_addr, std::numeric_limits<int64_t>::max(), kTimeout); + ASSERT_STR_CONTAINS(s.ToString(), "INJECTED FAILURE"); + }); + + // Regression condition for KUDU-2293: the above second attempt at a copy + // should not crash the destination server. + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + const auto tserver = cluster_->tablet_server(i); + ASSERT_TRUE(tserver->IsProcessAlive()) + << Substitute("Tablet server $0 ($1) has crashed...", i, tserver->uuid()); + } + + // Finally, trigger a successful copy. + ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server(follower_index), + "env_inject_eio", "0.0")); + ASSERT_EVENTUALLY([&] { + ASSERT_OK(itest::StartTabletCopy(follower_ts, tablet_id, leader_ts->uuid(), + leader_addr, std::numeric_limits<int64_t>::max(), kTimeout)); + }); +} + // Start tablet copy session and delete the tablet in the middle. // It should actually be possible to complete copying in such a case, because // when a Tablet Copy session is started on the "source" server, all of http://git-wip-us.apache.org/repos/asf/kudu/blob/03d1fcb1/src/kudu/tablet/tablet_metadata.cc ---------------------------------------------------------------------- diff --git a/src/kudu/tablet/tablet_metadata.cc b/src/kudu/tablet/tablet_metadata.cc index 004a5ed..c8c04a7 100644 --- a/src/kudu/tablet/tablet_metadata.cc +++ b/src/kudu/tablet/tablet_metadata.cc @@ -215,22 +215,15 @@ Status TabletMetadata::DeleteTabletData(TabletDataState delete_type, } } - // Keep a copy of the old data dir group in case of flush failure. - DataDirGroupPB pb; - bool old_group_exists = fs_manager_->dd_manager()->GetDataDirGroupPB(tablet_id_, &pb).ok(); - - // Remove the tablet's data dir group tracked by the DataDirManager. + // Unregister the tablet's data dir group in memory (it is stored on disk in + // the superblock). Even if we fail to flush below, the expectation is that + // we will no longer be writing to the tablet, and therefore don't need its + // data dir group. fs_manager_->dd_manager()->DeleteDataDirGroup(tablet_id_); - auto revert_group_cleanup = MakeScopedCleanup([&]() { - if (old_group_exists) { - fs_manager_->dd_manager()->LoadDataDirGroupFromPB(tablet_id_, pb); - } - }); // Flushing will sync the new tablet_data_state_ to disk and will now also // delete all the data. RETURN_NOT_OK(Flush()); - revert_group_cleanup.cancel(); // Re-sync to disk one more time. // This call will typically re-sync with an empty orphaned blocks list http://git-wip-us.apache.org/repos/asf/kudu/blob/03d1fcb1/src/kudu/tablet/tablet_metadata.h ---------------------------------------------------------------------- diff --git a/src/kudu/tablet/tablet_metadata.h b/src/kudu/tablet/tablet_metadata.h index e063946..107ec9f 100644 --- a/src/kudu/tablet/tablet_metadata.h +++ b/src/kudu/tablet/tablet_metadata.h @@ -14,8 +14,7 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. -#ifndef KUDU_TABLET_TABLET_METADATA_H -#define KUDU_TABLET_TABLET_METADATA_H +#pragma once #include <atomic> #include <cstdint> @@ -198,6 +197,9 @@ class TabletMetadata : public RefCountedThreadSafe<TabletMetadata> { // partially-tombstoned tablets during crash recovery. // // Returns only once all data has been removed. + // + // Note: this will always update the in-memory state, but upon failure, + // may not update the on-disk state. Status DeleteTabletData(TabletDataState delete_type, const boost::optional<consensus::OpId>& last_logged_opid); @@ -237,9 +239,12 @@ class TabletMetadata : public RefCountedThreadSafe<TabletMetadata> { // Sets *super_block to the serialized form of the current metadata. Status ToSuperBlock(TabletSuperBlockPB* super_block) const; - // Fully replace a superblock (used for bootstrap). + // Replace the superblock (used for bootstrap) both in memory and on disk. Status ReplaceSuperBlock(const TabletSuperBlockPB &pb); + // Update the in-memory state of metadata to that of the given superblock PB. + Status LoadFromSuperBlock(const TabletSuperBlockPB& superblock); + int64_t on_disk_size() const { return on_disk_size_.load(std::memory_order_relaxed); } @@ -286,9 +291,6 @@ class TabletMetadata : public RefCountedThreadSafe<TabletMetadata> { // Updates the cached on-disk size of the tablet superblock. Status UpdateOnDiskSize(); - // Update state of metadata to that of the given superblock PB. - Status LoadFromSuperBlock(const TabletSuperBlockPB& superblock); - Status ReadSuperBlock(TabletSuperBlockPB *pb); // Fully replace superblock. @@ -392,5 +394,3 @@ class TabletMetadata : public RefCountedThreadSafe<TabletMetadata> { } // namespace tablet } // namespace kudu - -#endif /* KUDU_TABLET_TABLET_METADATA_H */ http://git-wip-us.apache.org/repos/asf/kudu/blob/03d1fcb1/src/kudu/tserver/tablet_copy_client-test.cc ---------------------------------------------------------------------- diff --git a/src/kudu/tserver/tablet_copy_client-test.cc b/src/kudu/tserver/tablet_copy_client-test.cc index de5d623..8c880ce 100644 --- a/src/kudu/tserver/tablet_copy_client-test.cc +++ b/src/kudu/tserver/tablet_copy_client-test.cc @@ -25,6 +25,7 @@ #include <thread> #include <vector> +#include <gflags/gflags.h> #include <gflags/gflags_declare.h> #include <glog/logging.h> #include <glog/stl_logging.h> @@ -43,10 +44,10 @@ #include "kudu/fs/data_dirs.h" #include "kudu/fs/fs_manager.h" #include "kudu/gutil/casts.h" -#include "kudu/gutil/gscoped_ptr.h" #include "kudu/gutil/port.h" #include "kudu/gutil/ref_counted.h" #include "kudu/gutil/strings/fastmem.h" +#include "kudu/gutil/strings/join.h" #include "kudu/gutil/strings/substitute.h" #include "kudu/rpc/messenger.h" #include "kudu/tablet/metadata.pb.h" @@ -61,6 +62,7 @@ #include "kudu/util/metrics.h" #include "kudu/util/monotime.h" #include "kudu/util/net/net_util.h" +#include "kudu/util/path_util.h" #include "kudu/util/slice.h" #include "kudu/util/status.h" #include "kudu/util/test_macros.h" @@ -70,7 +72,9 @@ using std::string; using std::thread; using std::vector; +DECLARE_double(env_inject_eio); DECLARE_string(block_manager); +DECLARE_string(env_inject_eio_globs); METRIC_DECLARE_counter(block_manager_total_disk_sync); @@ -106,7 +110,11 @@ class TabletCopyClientTest : public TabletCopyTest { fs_manager_.reset(new FsManager(Env::Default(), opts)); ASSERT_OK(fs_manager_->CreateInitialFileSystemLayout()); ASSERT_OK(fs_manager_->Open()); + ASSERT_OK(ResetTabletCopyClient()); + } + // Sets up a new tablet copy client. + Status ResetTabletCopyClient() { scoped_refptr<ConsensusMetadataManager> cmeta_manager( new ConsensusMetadataManager(fs_manager_.get())); @@ -119,13 +127,17 @@ class TabletCopyClientTest : public TabletCopyTest { nullptr /* no metrics */)); RaftPeerPB* cstate_leader; ConsensusStatePB cstate; - ASSERT_OK(tablet_replica_->consensus()->ConsensusState(&cstate)); - ASSERT_OK(GetRaftConfigLeader(&cstate, &cstate_leader)); + RETURN_NOT_OK(tablet_replica_->consensus()->ConsensusState(&cstate)); + RETURN_NOT_OK(GetRaftConfigLeader(&cstate, &cstate_leader)); leader_ = *cstate_leader; + return Status::OK(); + } + // Starts the tablet copy. + Status StartCopy() { HostPort host_port; - ASSERT_OK(HostPortFromPB(leader_.last_known_addr(), &host_port)); - ASSERT_OK(client_->Start(host_port, &meta_)); + RETURN_NOT_OK(HostPortFromPB(leader_.last_known_addr(), &host_port)); + return client_->Start(host_port, &meta_); } protected: @@ -133,9 +145,9 @@ class TabletCopyClientTest : public TabletCopyTest { MetricRegistry metric_registry_; scoped_refptr<MetricEntity> metric_entity_; - gscoped_ptr<FsManager> fs_manager_; + unique_ptr<FsManager> fs_manager_; shared_ptr<rpc::Messenger> messenger_; - gscoped_ptr<TabletCopyClient> client_; + unique_ptr<TabletCopyClient> client_; scoped_refptr<TabletMetadata> meta_; RaftPeerPB leader_; }; @@ -167,20 +179,89 @@ Status TabletCopyClientTest::CompareFileContents(const string& path1, const stri return Status::OK(); } +// Test a tablet copy going through the various states in the copy state +// machine. +TEST_F(TabletCopyClientTest, TestLifeCycle) { + // Target fault injection for the tablet metadata directories, but do not + // start injecting failures just yet. + const vector<string> meta_dirs = { + JoinPathSegments(client_->fs_manager_->GetConsensusMetadataDir(), "**"), + JoinPathSegments(client_->fs_manager_->GetTabletMetadataDir(), "**") }; + FLAGS_env_inject_eio_globs = JoinStrings(meta_dirs, ","); + + ASSERT_EQ(TabletCopyClient::State::kInitialized, client_->state_); + Status s; + // If we're creating a brand new tablet, failing to start the copy will yield + // no changes in-memory. It should be as if the copy hadn't started. + { + google::FlagSaver fs; + FLAGS_env_inject_eio = 1.0; + s = StartCopy(); + ASSERT_TRUE(s.IsIOError()) << s.ToString(); + ASSERT_STR_CONTAINS(s.ToString(), "Failed to write tablet metadata"); + ASSERT_EQ(TabletCopyClient::State::kInitialized, client_->state_); + ASSERT_FALSE(meta_); + } + + // Now let's try replacing a tablet. Set a metadata that we can replace. + ASSERT_OK(ResetTabletCopyClient()); + ASSERT_OK(StartCopy()); + ASSERT_EQ(TabletCopyClient::State::kStarted, client_->state_); + ASSERT_OK(client_->Finish()); + ASSERT_EQ(TabletCopyClient::State::kFinished, client_->state_); + + // Since we're going to replace the tablet, we need to tombstone the existing + // metadata first. + meta_->set_tablet_data_state(tablet::TABLET_DATA_TOMBSTONED); + ASSERT_OK(ResetTabletCopyClient()); + ASSERT_OK(client_->SetTabletToReplace(meta_, 0)); + + // If we're replacing a tablet, failing to start will yield changes + // in-memory, and it is thus necessary to recognize the copy is in the + // process of starting. + { + google::FlagSaver fs; + FLAGS_env_inject_eio = 1.0; + s = StartCopy(); + ASSERT_TRUE(s.IsIOError()) << s.ToString(); + ASSERT_STR_CONTAINS(s.ToString(), "Could not replace superblock"); + ASSERT_EQ(TabletCopyClient::State::kStarting, client_->state_); + } + + // Make sure we are still in the appropriate state if we fail to finish. + ASSERT_OK(ResetTabletCopyClient()); + client_->SetTabletToReplace(meta_, 0); + ASSERT_OK(StartCopy()); + FLAGS_env_inject_eio = 1.0; + s = client_->Finish(); + ASSERT_TRUE(s.IsIOError()); + ASSERT_EQ(TabletCopyClient::State::kStarted, client_->state_); + + // Closing out the copy should leave the copy client in its terminal state, + // even upon failure. + s = client_->Abort(); + ASSERT_TRUE(s.IsIOError()); + ASSERT_EQ(TabletCopyClient::State::kFinished, client_->state_); + ASSERT_EQ(tablet::TABLET_DATA_TOMBSTONED, meta_->tablet_data_state()); +} + // Implementation test that no blocks exist in the new superblock before fetching. TEST_F(TabletCopyClientTest, TestNoBlocksAtStart) { + ASSERT_OK(StartCopy()); ASSERT_GT(ListBlocks(*client_->remote_superblock_).size(), 0); ASSERT_EQ(0, ListBlocks(*client_->superblock_).size()); } // Basic begin / end tablet copy session. TEST_F(TabletCopyClientTest, TestBeginEndSession) { + ASSERT_OK(StartCopy()); ASSERT_OK(client_->FetchAll(nullptr /* no listener */)); ASSERT_OK(client_->Finish()); } // Basic data block download unit test. TEST_F(TabletCopyClientTest, TestDownloadBlock) { + ASSERT_OK(StartCopy()); BlockId block_id = FirstColumnBlockId(*client_->remote_superblock_); Slice slice; faststring scratch; @@ -201,6 +282,7 @@ TEST_F(TabletCopyClientTest, TestDownloadBlock) { // Basic WAL segment download unit test. TEST_F(TabletCopyClientTest, TestDownloadWalSegment) { + ASSERT_OK(StartCopy()); ASSERT_OK(env_util::CreateDirIfMissing( env_, fs_manager_->GetTabletWalDir(GetTabletId()))); @@ -222,6 +304,7 @@ TEST_F(TabletCopyClientTest, TestDownloadWalSegment) { // Ensure that we detect data corruption at the per-transfer level. TEST_F(TabletCopyClientTest, TestVerifyData) { + ASSERT_OK(StartCopy()); string good = "This is a known good string"; string bad = "This is a known bad! string"; const int kGoodOffset = 0; @@ -257,6 +340,7 @@ TEST_F(TabletCopyClientTest, TestVerifyData) { } TEST_F(TabletCopyClientTest, TestDownloadAllBlocks) { + ASSERT_OK(StartCopy()); // Download and commit all the blocks. ASSERT_OK(client_->DownloadBlocks()); ASSERT_OK(client_->transaction_->CommitCreatedBlocks()); @@ -290,6 +374,7 @@ TEST_F(TabletCopyClientTest, TestDownloadAllBlocks) { // Test that failing a disk outside fo the tablet copy client will eventually // stop the copy client and cause it to fail. TEST_F(TabletCopyClientTest, TestFailedDiskStopsClient) { + ASSERT_OK(StartCopy()); DataDirManager* dd_manager = fs_manager_->dd_manager(); // Repeatedly fetch files for the client. @@ -334,6 +419,11 @@ struct AbortTestParams { class TabletCopyClientAbortTest : public TabletCopyClientTest, public ::testing::WithParamInterface< tuple<DownloadBlocks, DeleteTrigger>> { + public: + virtual void SetUp() override { + TabletCopyClientTest::SetUp(); + ASSERT_OK(StartCopy()); + } protected: // Create the specified number of blocks with junk data for testing purposes. void CreateTestBlocks(int num_blocks); http://git-wip-us.apache.org/repos/asf/kudu/blob/03d1fcb1/src/kudu/tserver/tablet_copy_client.cc ---------------------------------------------------------------------- diff --git a/src/kudu/tserver/tablet_copy_client.cc b/src/kudu/tserver/tablet_copy_client.cc index ef88b0e..cb4ca7f 100644 --- a/src/kudu/tserver/tablet_copy_client.cc +++ b/src/kudu/tserver/tablet_copy_client.cc @@ -64,6 +64,8 @@ #include "kudu/util/pb_util.h" #include "kudu/util/random.h" #include "kudu/util/random_util.h" +#include "kudu/util/scoped_cleanup.h" +#include "kudu/util/status.h" DEFINE_int32(tablet_copy_begin_session_timeout_ms, 3000, "Tablet server RPC client timeout for BeginTabletCopySession calls. " @@ -318,8 +320,11 @@ Status TabletCopyClient::Start(const HostPort& copy_source_addr, *superblock_->mutable_tombstone_last_logged_opid() = *last_logged_opid; } - // Remove any existing orphaned blocks and WALs from the tablet, and - // set the data state to 'COPYING'. + // We are about to persist things to disk. Update the tablet copy state + // machine so we know there is state to clean up in case we fail. + state_ = kStarting; + // Set the data state to 'COPYING' and remove any existing orphaned blocks + // and WALs from the tablet. RETURN_NOT_OK_PREPEND( TSTabletManager::DeleteTabletData(meta_, cmeta_manager_, tablet::TABLET_DATA_COPYING, @@ -349,6 +354,9 @@ Status TabletCopyClient::Start(const HostPort& copy_source_addr, superblock_->tablet_data_state(), superblock_->tombstone_last_logged_opid(), &meta_)); + // We have begun persisting things to disk. Update the tablet copy state + // machine so we know there is state to clean up in case we fail. + state_ = kStarting; } CHECK_OK(fs_manager_->dd_manager()->GetDataDirGroupPB( tablet_id_, superblock_->mutable_data_dir_group())); @@ -389,15 +397,26 @@ Status TabletCopyClient::Finish() { // so the fsync() operations could be cheaper. // 3) Downloaded blocks should be made durable before replacing superblock. RETURN_NOT_OK(transaction_->CommitCreatedBlocks()); - state_ = kFinished; // Replace tablet metadata superblock. This will set the tablet metadata state // to TABLET_DATA_READY, since we checked above that the response // superblock is in a valid state to bootstrap from. LOG_WITH_PREFIX(INFO) << "Tablet Copy complete. Replacing tablet superblock."; SetStatusMessage("Replacing tablet superblock"); - superblock_->set_tablet_data_state(tablet::TABLET_DATA_READY); + + boost::optional<OpId> last_logged_opid = superblock_->tombstone_last_logged_opid(); + auto revert_activate_superblock = MakeScopedCleanup([&] { + // If we fail below, revert the updated state so further calls to Abort() + // can clean up appropriately. + if (last_logged_opid) { + *superblock_->mutable_tombstone_last_logged_opid() = *last_logged_opid; + } + superblock_->set_tablet_data_state(TabletDataState::TABLET_DATA_COPYING); + }); + superblock_->clear_tombstone_last_logged_opid(); + superblock_->set_tablet_data_state(tablet::TABLET_DATA_READY); + RETURN_NOT_OK(meta_->ReplaceSuperBlock(*superblock_)); if (FLAGS_tablet_copy_save_downloaded_metadata) { @@ -408,22 +427,39 @@ Status TabletCopyClient::Finish() { "Unable to make copy of tablet metadata"); } + // Now that we've finished everything, complete. + revert_activate_superblock.cancel(); + state_ = kFinished; return Status::OK(); } Status TabletCopyClient::Abort() { - if (state_ != kStarted) { + // If we have not begun doing anything or have already finished, there is + // nothing left to do. + if (state_ == kInitialized || state_ == kFinished) { return Status::OK(); } - state_ = kFinished; - CHECK(meta_); - // Write the in-progress superblock to disk so that when we delete the tablet - // data all the partial blocks we have persisted will be deleted. DCHECK_EQ(tablet::TABLET_DATA_COPYING, superblock_->tablet_data_state()); - RETURN_NOT_OK(meta_->ReplaceSuperBlock(*superblock_)); - - // Delete all of the tablet data, including blocks and WALs. + // Ensure upon exiting that the copy is finished and that the tablet is left + // tombstoned. This is guaranteed because we always call DeleteTabletData(), + // which leaves the metadata in this state, even on failure. + SCOPED_CLEANUP({ + DCHECK_EQ(tablet::TABLET_DATA_TOMBSTONED, meta_->tablet_data_state()); + state_ = kFinished; + }); + + // Load the in-progress superblock in-memory so that when we delete the + // tablet, all the partial blocks we persisted will be deleted. + // + // Note: We warn instead of returning early here upon failure because even if + // the superblock protobuf was somehow corrupted, we still want to attempt to + // delete the tablet's data dir group, WAL segments, etc. + WARN_NOT_OK(meta_->LoadFromSuperBlock(*superblock_), + "Failed to load the new superblock"); + + // Finally, tombstone the tablet metadata and try deleting all of the tablet + // data on disk, including blocks and WALs. RETURN_NOT_OK_PREPEND( TSTabletManager::DeleteTabletData(meta_, cmeta_manager_, tablet::TABLET_DATA_TOMBSTONED, http://git-wip-us.apache.org/repos/asf/kudu/blob/03d1fcb1/src/kudu/tserver/tablet_copy_client.h ---------------------------------------------------------------------- diff --git a/src/kudu/tserver/tablet_copy_client.h b/src/kudu/tserver/tablet_copy_client.h index 018c6db..c2adf6a 100644 --- a/src/kudu/tserver/tablet_copy_client.h +++ b/src/kudu/tserver/tablet_copy_client.h @@ -14,12 +14,11 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. -#ifndef KUDU_TSERVER_TABLET_COPY_CLIENT_H -#define KUDU_TSERVER_TABLET_COPY_CLIENT_H +#pragma once #include <cstdint> -#include <string> #include <memory> +#include <string> #include <vector> #include <gtest/gtest_prod.h> @@ -132,14 +131,35 @@ class TabletCopyClient { FRIEND_TEST(TabletCopyClientTest, TestNoBlocksAtStart); FRIEND_TEST(TabletCopyClientTest, TestBeginEndSession); FRIEND_TEST(TabletCopyClientTest, TestDownloadBlock); + FRIEND_TEST(TabletCopyClientTest, TestLifeCycle); FRIEND_TEST(TabletCopyClientTest, TestVerifyData); FRIEND_TEST(TabletCopyClientTest, TestDownloadWalSegment); FRIEND_TEST(TabletCopyClientTest, TestDownloadAllBlocks); FRIEND_TEST(TabletCopyClientAbortTest, TestAbort); + // State machine that guides the progression of a single tablet copy. + // A tablet copy will go through the states: + // + // kInitialized --> kStarting --> kStarted --> kFinished + // | | | ^^^ + // | | +----------+|| + // | +-------------------------+| + // +------------------------------------------+ enum State { + // The copy has yet to do anything and no changes have been made on disk. kInitialized, + + // The copy has begun updating metadata on disk, but has not begun + // receiving WAL segments or blocks from the tablet copy source. Being in + // this state or beyond implies that 'meta_' is non-null. + kStarting, + + // The metadata has been updated on disk to indicate the start of a new + // copy. Blocks, metadata, and WAL segments may be received from the tablet + // copy source in this state. kStarted, + + // The copy is finished and needs no cleanup. kFinished, }; @@ -261,4 +281,3 @@ class TabletCopyClient { } // namespace tserver } // namespace kudu -#endif /* KUDU_TSERVER_TABLET_COPY_CLIENT_H */