This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch branch-1.13.x
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/branch-1.13.x by this push:
     new bd24791  [client] make metacache reset safe
bd24791 is described below

commit bd247910306e0065f04026d97fe37cfb04e4fb70
Author: Alexey Serbin <ale...@apache.org>
AuthorDate: Tue Dec 8 20:33:21 2020 -0800

    [client] make metacache reset safe
    
    I noticed that the newly added TxnManagerTest.BeginManyTransactions
    test scenario started failing with ASAN heap-use-after-free warnings.
    
    After looking a that, it turned out that the original code was assuming
    the cache wouldn't be ever reset before calling the MetaCache's
    destructor.  However, changelist 232474a51 introduced a new method
    MetaCache::ClearCache() and since then the method is being called upon
    altering a table if the partitioning scheme has been updated.
    
    This patch resolves the issue by introducing so-called tablet server
    registry that's never reset indeed, where entries in the tablet server
    cache are just references to the entries in the registry (they are
    raw pointers, actually).
    
    The newly added test scenario was reliably producing AddressSanitizer's
    heap-use-after-free warnings every time I ran it using ASAN build.
    Below is a snapshot of the relevant traces captured when running the
    new test scenario without the changes in the client metacache.
    
      AddressSanitizer: heap-use-after-free on address 0x608000129e20 at pc 
0x00000078bd54 bp 0x7fa731d0b240 sp 0x7fa731d0b238
      READ of size 4 at 0x608000129e20 thread T149 (rpc reactor-146)
          #0 0x78bd53 in base::subtle::NoBarrier_Load(int const volatile*) 
src/kudu/gutil/atomicops-internals-x86.h:200:10
          #1 0x7fa79520e227 in base::SpinLock::SpinLoop(long, int*) 
src/kudu/gutil/spinlock.cc:86:10
          #2 0x7fa79520e38b in base::SpinLock::SlowLock() 
src/kudu/gutil/spinlock.cc:104:25
          #3 0x7fa7a099aab0 in std::unique_lock<kudu::simple_spinlock>::lock() 
../../../include/c++/8/bits/std_mutex.h:267:17
          #4 0x7fa7a0991e3e in 
std::unique_lock<kudu::simple_spinlock>::unique_lock(kudu::simple_spinlock&) 
../../../include/c++/8/bits/std_mutex.h:197:2
          #5 0x7fa7a0abfda1 in 
kudu::client::internal::RemoteTabletServer::InitProxy(kudu::client::KuduClient*,
 std::function<void (kudu::Status const&)> const&) 
src/kudu/client/meta_cache.cc:145:39
          #6 0x7fa7a0ac60f5 in 
kudu::client::internal::MetaCacheServerPicker::PickLeader(std::function<void 
(kudu::Status const&, kudu::client::internal::RemoteTabletServer*)> const&, 
kudu::MonoTime const&) src/kudu/client/meta_cache.cc:524:11
          #7 0x7fa7a09b2dcf in 
kudu::rpc::RetriableRpc<kudu::client::internal::RemoteTabletServer, 
kudu::tserver::WriteRequestPB, kudu::tserver::WriteResponsePB>::SendRpc() 
src/kudu/rpc/retriable_rpc.h:163:19
          #8 0x7fa7a09ac6cc in 
kudu::client::internal::Batcher::FlushBuffer(kudu::client::internal::RemoteTablet*,
 std::vector<kudu::client::internal::InFlightOp*, 
std::allocator<kudu::client::internal::InFlightOp*> > const&) 
src/kudu/client/batcher.cc:911:8
          #9 0x7fa7a09a9e38 in 
kudu::client::internal::Batcher::FlushBuffersIfReady() 
src/kudu/client/batcher.cc:884:5
          #10 0x7fa7a09abd2d in 
kudu::client::internal::Batcher::TabletLookupFinished(kudu::client::internal::InFlightOp*,
 kudu::Status const&) src/kudu/client/batcher.cc:851:3
          #11 0x7fa7a0acec66 in 
kudu::client::internal::LookupRpc::SendRpcCb(kudu::Status const&) 
src/kudu/client/meta_cache.cc:923:3
          #12 0x7fa7a0ab8800 in 
kudu::client::internal::AsyncLeaderMasterRpc<kudu::master::GetTableLocationsRequestPB,
 
kudu::master::GetTableLocationsResponsePB>::SendRpc()::'lambda'()::operator()() 
const src/kudu/client/master_proxy_rpc.cc:130:26
    
      0x608000129e20 is located 0 bytes inside of 96-byte region 
[0x608000129e20,0x608000129e80)
      freed by thread T163 here:
          #0 0x65c650 in operator delete(void*) 
thirdparty/src/llvm-9.0.0.src/projects/compiler-rt/lib/asan/asan_new_delete.cc:160:399
          #1 0x7fa7a0aec859 in void 
STLDeleteContainerPairSecondPointers<std::__detail::_Node_iterator<std::pair<std::string
 const, kudu::client::internal::RemoteTabletServer*>, false, true> 
>(std::__detail::_Node_iterator<std::pair<std::string const, 
kudu::client::internal::RemoteTabletServer*>, false, true>, 
std::__detail::_Node_iterator<std::pair<std::string const, 
kudu::client::internal::RemoteTabletServer*>, false, true>) 
src/kudu/gutil/stl_util.h:199:5
          #2 0x7fa7a0ad96d1 in void 
STLDeleteValues<std::unordered_map<std::string, 
kudu::client::internal::RemoteTabletServer*, std::hash<std::string>, 
std::equal_to<std::string>, std::allocator<std::pair<std::string const, 
kudu::client::internal::RemoteTabletServer*> > > 
>(std::unordered_map<std::string, kudu::client::internal::RemoteTabletServer*, 
std::hash<std::string>, std::equal_to<std::string>, 
std::allocator<std::pair<std::string const, 
kudu::client::internal::RemoteTabletServer*> [...]
          #3 0x7fa7a0ad4188 in kudu::client::internal::MetaCache::ClearCache() 
src/kudu/client/meta_cache.cc:1257:3
    
      previously allocated by thread T149 (rpc reactor-146) here:
          #0 0x65bc98 in operator new(unsigned long) 
thirdparty/src/llvm-9.0.0.src/projects/compiler-rt/lib/asan/asan_new_delete.cc:99:386
          #1 0x7fa7a0ac7bcb in 
kudu::client::internal::MetaCache::UpdateTabletServerUnlocked(kudu::master::TSInfoPB
 const&) src/kudu/client/meta_cache.cc:596:48
          #2 0x7fa7a0ad0802 in 
kudu::client::internal::MetaCache::ProcessGetTableLocationsResponse(kudu::client::KuduTable
 const*, std::string const&, bool, kudu::master::GetTableLocationsResponsePB 
const&, kudu::client::internal::MetaCacheEntry*, int) 
src/kudu/client/meta_cache.cc:1030:7
          #3 0x7fa7a0acf9c0 in 
kudu::client::internal::MetaCache::ProcessLookupResponse(kudu::client::internal::LookupRpc
 const&, kudu::client::internal::MetaCacheEntry*, int) 
src/kudu/client/meta_cache.cc:941:10
          #4 0x7fa7a0ace64e in 
kudu::client::internal::LookupRpc::SendRpcCb(kudu::Status const&) 
src/kudu/client/meta_cache.cc:911:31
          #5 0x7fa7a0ab8800 in 
kudu::client::internal::AsyncLeaderMasterRpc<kudu::master::GetTableLocationsRequestPB,
 
kudu::master::GetTableLocationsResponsePB>::SendRpc()::'lambda'()::operator()() 
const src/kudu/client/master_proxy_rpc.cc:130:26
          #6 0x7fa79b2c1620 in kudu::rpc::OutboundCall::CallCallback() 
src/kudu/rpc/outbound_call.cc:274:5
          #7 0x7fa79b2c1af0 in 
kudu::rpc::OutboundCall::SetResponse(std::unique_ptr<kudu::rpc::CallResponse, 
std::default_delete<kudu::rpc::CallResponse> >) 
src/kudu/rpc/outbound_call.cc:306:5
          #8 0x7fa79b26ef5e in 
kudu::rpc::Connection::HandleCallResponse(std::unique_ptr<kudu::rpc::InboundTransfer,
 std::default_delete<kudu::rpc::InboundTransfer> >) 
src/kudu/rpc/connection.cc:735:14
          #9 0x7fa79b26e0d6 in kudu::rpc::Connection::ReadHandler(ev::io&, int) 
src/kudu/rpc/connection.cc:673:7
    
      SUMMARY: AddressSanitizer: heap-use-after-free 
src/kudu/gutil/atomicops-internals-x86.h:200:10 in 
base::subtle::NoBarrier_Load(int const volatile*)
      Shadow bytes around the buggy address:
        0x0c108001d370: fa fa fa fa 00 00 00 00 00 00 00 00 00 00 00 fa
        0x0c108001d380: fa fa fa fa fd fd fd fd fd fd fd fd fd fd fd fd
        0x0c108001d390: fa fa fa fa 00 00 00 00 00 00 00 00 00 00 00 fa
        0x0c108001d3a0: fa fa fa fa fd fd fd fd fd fd fd fd fd fd fd fd
        0x0c108001d3b0: fa fa fa fa 00 00 00 00 00 00 00 00 00 00 00 fa
      =>0x0c108001d3c0: fa fa fa fa[fd]fd fd fd fd fd fd fd fd fd fd fd
        0x0c108001d3d0: fa fa fa fa 00 00 00 00 00 00 00 00 00 00 00 fa
        0x0c108001d3e0: fa fa fa fa fd fd fd fd fd fd fd fd fd fd fd fd
        0x0c108001d3f0: fa fa fa fa 00 00 00 00 00 00 00 00 00 00 00 fa
        0x0c108001d400: fa fa fa fa 00 00 00 00 00 00 00 00 00 00 00 fa
        0x0c108001d410: fa fa fa fa 00 00 00 00 00 00 00 00 00 00 00 fa
    
    Change-Id: I03ec9318526fbfc2da9b068eb3bbd9cd996efbca
    Reviewed-on: http://gerrit.cloudera.org:8080/16839
    Tested-by: Alexey Serbin <aser...@cloudera.com>
    Reviewed-by: Andrew Wong <aw...@cloudera.com>
    (cherry picked from commit ae45cd15c2703e3cfaf62aabb20ba79b961b3135)
      Conflicts:
        src/kudu/client/client-test.cc
        src/kudu/client/client.h
        src/kudu/client/meta_cache.cc
        src/kudu/client/meta_cache.h
    Reviewed-on: http://gerrit.cloudera.org:8080/16862
    Tested-by: Kudu Jenkins
---
 src/kudu/client/client-test.cc | 32 +++++++++++++++++++++++++++++++-
 src/kudu/client/client.h       |  1 +
 src/kudu/client/meta_cache.cc  | 27 ++++++++++++++++++---------
 src/kudu/client/meta_cache.h   | 22 +++++++++++++++++-----
 4 files changed, 67 insertions(+), 15 deletions(-)

diff --git a/src/kudu/client/client-test.cc b/src/kudu/client/client-test.cc
index b9cf55f..57b340a 100644
--- a/src/kudu/client/client-test.cc
+++ b/src/kudu/client/client-test.cc
@@ -116,6 +116,7 @@
 #include "kudu/util/random_util.h"
 #include "kudu/util/semaphore.h"
 #include "kudu/util/slice.h"
+#include "kudu/util/scoped_cleanup.h"
 #include "kudu/util/status.h"
 #include "kudu/util/stopwatch.h"
 #include "kudu/util/test_macros.h"
@@ -2077,11 +2078,40 @@ TEST_F(ClientTest, TestMetaCacheExpiry) {
   ASSERT_FALSE(meta_cache->LookupEntryByKeyFastPath(client_table_.get(), "", 
&entry));
 
   // Force a lookup and ensure the entry is refreshed.
-  CHECK_NOTNULL(MetaCacheLookup(client_table_.get(), "").get());
+  ASSERT_NE(nullptr, MetaCacheLookup(client_table_.get(), ""));
+  ASSERT_TRUE(entry.stale());
   ASSERT_TRUE(meta_cache->LookupEntryByKeyFastPath(client_table_.get(), "", 
&entry));
   ASSERT_FALSE(entry.stale());
 }
 
+// This test scenario verifies that reset of the metacache is safe while 
working
+// with the client. The scenario calls MetaCache::ClearCache() in a rather
+// synthetic way, but the natural control flow does something similar to that
+// when alterting a table by adding a new range partition (see
+// KuduTableAlterer::Alter() for details).
+TEST_F(ClientTest, ClearCacheAndConcurrentWorkload) {
+  CountDownLatch latch(1);
+  thread cache_cleaner([&]() {
+    const auto sleep_interval = MonoDelta::FromMilliseconds(3);
+    while (!latch.WaitFor(sleep_interval)) {
+      client_->data_->meta_cache_->ClearCache();
+    }
+  });
+  auto thread_joiner = MakeScopedCleanup([&] {
+    latch.CountDown();
+    cache_cleaner.join();
+  });
+
+  constexpr const int kLowIdx = 0;
+  constexpr const int kHighIdx = 1000;
+  const auto runUntil = MonoTime::Now() + MonoDelta::FromSeconds(1);
+  while (MonoTime::Now() < runUntil) {
+    NO_FATALS(InsertTestRows(client_table_.get(), kHighIdx, kLowIdx));
+    NO_FATALS(UpdateTestRows(client_table_.get(), kLowIdx, kHighIdx));
+    NO_FATALS(DeleteTestRows(client_table_.get(), kLowIdx, kHighIdx));
+  }
+}
+
 TEST_F(ClientTest, TestGetTabletServerBlacklist) {
   shared_ptr<KuduTable> table;
   NO_FATALS(CreateTable("blacklist",
diff --git a/src/kudu/client/client.h b/src/kudu/client/client.h
index cd5df70..80f08d1 100644
--- a/src/kudu/client/client.h
+++ b/src/kudu/client/client.h
@@ -629,6 +629,7 @@ class KUDU_EXPORT KuduClient : public 
sp::enable_shared_from_this<KuduClient> {
   friend class tools::RemoteKsckCluster;
 
   FRIEND_TEST(kudu::ClientStressTest, TestUniqueClientIds);
+  FRIEND_TEST(ClientTest, ClearCacheAndConcurrentWorkload);
   FRIEND_TEST(ClientTest, TestCacheAuthzTokens);
   FRIEND_TEST(ClientTest, TestGetSecurityInfoFromMaster);
   FRIEND_TEST(ClientTest, TestGetTabletServerBlacklist);
diff --git a/src/kudu/client/meta_cache.cc b/src/kudu/client/meta_cache.cc
index 5e3d110..5a680e4 100644
--- a/src/kudu/client/meta_cache.cc
+++ b/src/kudu/client/meta_cache.cc
@@ -41,7 +41,6 @@
 #include "kudu/gutil/basictypes.h"
 #include "kudu/gutil/map-util.h"
 #include "kudu/gutil/port.h"
-#include "kudu/gutil/stl_util.h"
 #include "kudu/gutil/strings/substitute.h"
 #include "kudu/master/master.pb.h"
 #include "kudu/master/master.proxy.h"
@@ -567,20 +566,30 @@ MetaCache::MetaCache(KuduClient* client,
       replica_visibility_(replica_visibility) {
 }
 
-MetaCache::~MetaCache() {
-  STLDeleteValues(&ts_cache_);
-}
-
 void MetaCache::UpdateTabletServer(const TSInfoPB& pb) {
   DCHECK(lock_.is_write_locked());
-  RemoteTabletServer* ts = FindPtrOrNull(ts_cache_, pb.permanent_uuid());
+  const auto& ts_uuid = pb.permanent_uuid();
+  auto* ts = FindPtrOrNull(ts_cache_, ts_uuid);
   if (ts) {
     ts->Update(pb);
     return;
   }
 
-  VLOG(1) << "Client caching new TabletServer " << pb.permanent_uuid();
-  InsertOrDie(&ts_cache_, pb.permanent_uuid(), new RemoteTabletServer(pb));
+  // First check whether the information about the tablet server is already
+  // present in the registry.
+  ts = FindPointeeOrNull(ts_registry_, ts_uuid);
+  if (ts) {
+    // If the tablet server is already registered, update the existing entry.
+    ts->Update(pb);
+  } else {
+    // If the tablet server isn't registered, add a new entry.
+    unique_ptr<RemoteTabletServer> entry(new RemoteTabletServer(pb));
+    ts = entry.get();
+    EmplaceOrDie(&ts_registry_, ts_uuid, std::move(entry));
+  }
+  // Now add the entry into the cache.
+  VLOG(1) << Substitute("client caching new TabletServer $0", ts_uuid);
+  InsertOrDie(&ts_cache_, ts_uuid, ts);
 }
 
 
@@ -1056,7 +1065,7 @@ void MetaCache::ClearNonCoveredRangeEntries(const 
std::string& table_id) {
 void MetaCache::ClearCache() {
   VLOG(3) << "Clearing cache";
   std::lock_guard<percpu_rwlock> l(lock_);
-  STLDeleteValues(&ts_cache_);
+  ts_cache_.clear();
   tablets_by_id_.clear();
   tablets_by_table_and_key_.clear();
 }
diff --git a/src/kudu/client/meta_cache.h b/src/kudu/client/meta_cache.h
index 9daa8e5..33eec99 100644
--- a/src/kudu/client/meta_cache.h
+++ b/src/kudu/client/meta_cache.h
@@ -146,6 +146,8 @@ struct RemoteReplica {
   bool failed;
 };
 
+typedef std::unordered_map<std::string, std::unique_ptr<RemoteTabletServer>>
+    TabletServerRegistry;
 typedef std::unordered_map<std::string, RemoteTabletServer*> TabletServerMap;
 
 // A ServerPicker for tablets servers, backed by the MetaCache.
@@ -394,7 +396,7 @@ class MetaCache : public RefCountedThreadSafe<MetaCache> {
  public:
   // The passed 'client' object must remain valid as long as MetaCache is 
alive.
   MetaCache(KuduClient* client, ReplicaController::Visibility 
replica_visibility);
-  ~MetaCache();
+  ~MetaCache() = default;
 
   // Determines what type of operation a MetaCache lookup is being done for.
   enum class LookupType {
@@ -493,11 +495,21 @@ class MetaCache : public RefCountedThreadSafe<MetaCache> {
 
   percpu_rwlock lock_;
 
-  // Cache of Tablet Server locations: TS UUID -> RemoteTabletServer*.
+  // Registry of all tablet servers as a map of tablet server's
+  // UUID -> std::unique_ptr<RemoteTabletServer>.
+  //
+  // Given that the set of tablet servers in a cluster is bounded by physical
+  // machines and every tablet server has its unique identifier, we never 
remove
+  // entries from this map until the MetaCache is destructed. Note that the
+  // ClearCache() method doesn't touch this registry, but updates ts_cache_ map
+  // below which contains raw pointers to the elements in this registry.
+  // So, there is no need to use shared_ptr and alike for the entries.
   //
-  // Given that the set of tablet servers is bounded by physical machines, we 
never
-  // evict entries from this map until the MetaCache is destructed. So, no 
need to use
-  // shared_ptr, etc.
+  // Protected by lock_.
+  TabletServerRegistry ts_registry_;
+
+  // Cache of Tablet Server locations: TS UUID -> RemoteTabletServer*.
+  // The cache can be cleared by the ClearCache() method.
   //
   // Protected by lock_.
   TabletServerMap ts_cache_;

Reply via email to