This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch branch-1.17.x
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/branch-1.17.x by this push:
     new 6e729fd76 KUDU-3562 fix integer overflow in available space metrics
6e729fd76 is described below

commit 6e729fd762bed42fe78e49b20a757ffd0f9b73aa
Author: Alexey Serbin <ale...@apache.org>
AuthorDate: Sun Mar 31 12:48:27 2024 -0700

    KUDU-3562 fix integer overflow in available space metrics
    
    This patch addresses KUDU-3562.  The problem was in implicit casting
    of 64-bit integers down to 32-bit ones when introducing a helper
    variable in CalculateAvailableSpace().  A new test scenario is added
    to cover the fixed issue and catch regressions in future, if any.
    
    In addition, this patch corrects the type of the related metrics in
    disk_failure-itest.cc.  It also clarifies on the description and tags
    of the --fs_{data_dirs,wal_dir}_available_space_cache_seconds flags.
    
    Change-Id: I974aea822626e4648886388c0de3741ac459f2ec
    Reviewed-on: http://gerrit.cloudera.org:8080/21227
    Reviewed-by: Abhishek Chennaka <achenn...@cloudera.com>
    Tested-by: Abhishek Chennaka <achenn...@cloudera.com>
    Reviewed-by: Mahesh Reddy <mre...@cloudera.com>
    (cherry picked from commit e54c80cf9ba129b943eed6af3266899753b1b4ac)
    Reviewed-on: http://gerrit.cloudera.org:8080/21228
---
 src/kudu/fs/data_dirs.cc                           |  8 +-
 src/kudu/integration-tests/disk_failure-itest.cc   |  8 +-
 .../integration-tests/disk_reservation-itest.cc    | 92 +++++++++++++++++++++-
 src/kudu/server/server_base.cc                     | 14 ++--
 4 files changed, 108 insertions(+), 14 deletions(-)

diff --git a/src/kudu/fs/data_dirs.cc b/src/kudu/fs/data_dirs.cc
index b5dc54c00..9c705fea4 100644
--- a/src/kudu/fs/data_dirs.cc
+++ b/src/kudu/fs/data_dirs.cc
@@ -77,18 +77,22 @@ TAG_FLAG(fs_data_dirs_reserved_bytes, runtime);
 TAG_FLAG(fs_data_dirs_reserved_bytes, evolving);
 
 DEFINE_int32(fs_data_dirs_available_space_cache_seconds, 10,
-             "Number of seconds we cache the available disk space in the block 
manager.");
+             "TTL for the cached metric of the available disk space "
+             "in the data directories, in seconds");
 DEFINE_validator(fs_data_dirs_available_space_cache_seconds,
                  [](const char* /*n*/, int32_t v) { return v >= 0; });
 TAG_FLAG(fs_data_dirs_available_space_cache_seconds, advanced);
 TAG_FLAG(fs_data_dirs_available_space_cache_seconds, evolving);
+TAG_FLAG(fs_data_dirs_available_space_cache_seconds, runtime);
 
 DEFINE_int32(fs_wal_dir_available_space_cache_seconds, 10,
-             "Number of seconds we cache the available disk space the WAL 
directory.");
+             "TTL for the cached metric of the available disk space "
+             "in the WAL directories, in seconds");
 DEFINE_validator(fs_wal_dir_available_space_cache_seconds,
                  [](const char* /*n*/, int32_t v) { return v >= 0; });
 TAG_FLAG(fs_wal_dir_available_space_cache_seconds, advanced);
 TAG_FLAG(fs_wal_dir_available_space_cache_seconds, evolving);
+TAG_FLAG(fs_wal_dir_available_space_cache_seconds, runtime);
 
 DEFINE_bool(fs_lock_data_dirs, true,
             "Lock the data directories to prevent concurrent usage. "
diff --git a/src/kudu/integration-tests/disk_failure-itest.cc 
b/src/kudu/integration-tests/disk_failure-itest.cc
index 3cbcddc62..2f20f7592 100644
--- a/src/kudu/integration-tests/disk_failure-itest.cc
+++ b/src/kudu/integration-tests/disk_failure-itest.cc
@@ -57,8 +57,8 @@ METRIC_DECLARE_gauge_int32(num_raft_leaders);
 METRIC_DECLARE_gauge_size(num_rowsets_on_disk);
 METRIC_DECLARE_gauge_uint64(data_dirs_failed);
 METRIC_DECLARE_gauge_uint32(tablets_num_failed);
-METRIC_DECLARE_gauge_uint64(wal_dir_space_available_bytes);
-METRIC_DECLARE_gauge_uint64(data_dirs_space_available_bytes);
+METRIC_DECLARE_gauge_int64(wal_dir_space_available_bytes);
+METRIC_DECLARE_gauge_int64(data_dirs_space_available_bytes);
 
 using kudu::client::sp::shared_ptr;
 using kudu::client::KuduClient;
@@ -321,7 +321,7 @@ TEST_P(TabletServerDiskErrorITest, TestFailOnBootstrap) {
   // Wait for the cluster to return to a healthy state.
   ClusterVerifier v(cluster_.get());
   NO_FATALS(v.CheckCluster());
-};
+}
 
 TEST_P(TabletServerDiskErrorITest, TestSpaceAvailableMetrics) {
   // Get the wal_dir_space_available_bytes, data_dirs_space_available_bytes 
and make sure
@@ -367,7 +367,7 @@ TEST_P(TabletServerDiskErrorITest, 
TestSpaceAvailableMetrics) {
   ASSERT_OK(get_metrics(&wal_dir_space, &data_dir_space));
   ASSERT_NE(wal_dir_space, -1);
   ASSERT_EQ(data_dir_space, -1);
-};
+}
 
 TEST_P(TabletServerDiskErrorITest, TestFailDuringScanWorkload) {
   // Make one server to be more likely to host leader replicas: its Raft
diff --git a/src/kudu/integration-tests/disk_reservation-itest.cc 
b/src/kudu/integration-tests/disk_reservation-itest.cc
index 7bc29f050..519a2dfa7 100644
--- a/src/kudu/integration-tests/disk_reservation-itest.cc
+++ b/src/kudu/integration-tests/disk_reservation-itest.cc
@@ -33,21 +33,24 @@
 #include "kudu/mini-cluster/external_mini_cluster.h"
 #include "kudu/util/metrics.h"
 #include "kudu/util/monotime.h"
+#include "kudu/util/net/net_util.h"
 #include "kudu/util/status.h"
 #include "kudu/util/test_macros.h"
 #include "kudu/util/test_util.h"
 
+using kudu::cluster::ExternalMiniClusterOptions;
+using kudu::itest::GetInt64Metric;
 using std::string;
 using std::vector;
 using strings::Substitute;
 
 METRIC_DECLARE_entity(server);
+METRIC_DECLARE_gauge_int64(wal_dir_space_available_bytes);
+METRIC_DECLARE_gauge_int64(data_dirs_space_available_bytes);
 METRIC_DECLARE_gauge_uint64(data_dirs_full);
 
 namespace kudu {
 
-using cluster::ExternalMiniClusterOptions;
-
 class DiskReservationITest : public ExternalMiniClusterITestBase {
 };
 
@@ -161,4 +164,89 @@ TEST_F(DiskReservationITest, TestWalWriteToFullDiskAborts) 
{
   workload.StopAndJoin();
 }
 
+// Make sure the metrics on the available space in the WAL and the data
+// directories behave consistently with and without space reservations.
+TEST_F(DiskReservationITest, AvailableSpaceMetrics) {
+  constexpr const char* const kMetricValue = "value";
+
+  // To speed up the test, do not cache the metrics on the available disk 
space.
+  const vector<string> ts_flags = {
+    "--fs_data_dirs_available_space_cache_seconds=0",
+    "--fs_wal_dir_available_space_cache_seconds=0",
+  };
+  NO_FATALS(StartCluster(ts_flags, {}, 1));
+
+  auto* ts = cluster_->tablet_server(0);
+  DCHECK_NE(nullptr, ts);
+  const auto& addr = ts->bound_http_hostport();
+
+  auto space_getter_data_dirs = [&](int64_t* available_bytes) {
+    return GetInt64Metric(addr,
+                          &METRIC_ENTITY_server,
+                          nullptr,
+                          &METRIC_data_dirs_space_available_bytes,
+                          kMetricValue,
+                          available_bytes);
+  };
+  auto space_getter_wal_dir = [&](int64_t* available_bytes) {
+    return GetInt64Metric(addr,
+                          &METRIC_ENTITY_server,
+                          nullptr,
+                          &METRIC_wal_dir_space_available_bytes,
+                          kMetricValue,
+                          available_bytes);
+  };
+
+  constexpr const char* const kFlagDataReserved = 
"fs_data_dirs_reserved_bytes";
+  constexpr const char* const kFlagWalReserved = "fs_wal_dir_reserved_bytes";
+
+  // Make sure metrics capture non-negative numbers if the space reservations
+  // use their default settings: 1% of the available disk space for the WAL
+  // and the data directories.
+  {
+    int64_t data_dirs_space_bytes = -1;
+    ASSERT_OK(space_getter_data_dirs(&data_dirs_space_bytes));
+    ASSERT_GE(data_dirs_space_bytes, 0);
+
+    int64_t wal_dir_space_bytes = -1;
+    ASSERT_OK(space_getter_wal_dir(&wal_dir_space_bytes));
+    ASSERT_GE(wal_dir_space_bytes, 0);
+  }
+
+  // Set space reservation to 0 bytes.
+  ASSERT_OK(cluster_->SetFlag(ts, kFlagDataReserved, "0"));
+  ASSERT_OK(cluster_->SetFlag(ts, kFlagWalReserved, "0"));
+
+  int64_t data_dirs_space_bytes_no_reserve = -1;
+  ASSERT_OK(space_getter_data_dirs(&data_dirs_space_bytes_no_reserve));
+  ASSERT_GE(data_dirs_space_bytes_no_reserve, 0);
+
+  int64_t wal_dir_space_bytes_no_reserve = -1;
+  ASSERT_OK(space_getter_wal_dir(&wal_dir_space_bytes_no_reserve));
+  ASSERT_GE(wal_dir_space_bytes_no_reserve, 0);
+
+  // Set space reservation to 1027 GiB: this is to test for integer overflow
+  // in the related code. Essentially, it would be enough to set the 
reservation
+  // to anything beyond 2 GiB. However, to make this test scenario stable upon
+  // concurrent activity at the test node, it's necessary to have a wide margin
+  // for the comparison between the available space metrics for data/WAL
+  // directories with&without space reservation.
+  constexpr const char* const k1027GiB = "1102732853248";
+  ASSERT_OK(cluster_->SetFlag(ts, kFlagDataReserved, k1027GiB));
+  ASSERT_OK(cluster_->SetFlag(ts, kFlagWalReserved, k1027GiB));
+
+  int64_t data_dirs_space_bytes_reserve = -1;
+  ASSERT_OK(space_getter_data_dirs(&data_dirs_space_bytes_reserve));
+  ASSERT_GE(data_dirs_space_bytes_reserve, 0);
+
+  int64_t wal_dir_space_bytes_reserve = -1;
+  ASSERT_OK(space_getter_wal_dir(&wal_dir_space_bytes_reserve));
+  ASSERT_GE(wal_dir_space_bytes_reserve, 0);
+
+  // The available space without reservation should not be less than
+  // the available space with reservation.
+  ASSERT_GE(data_dirs_space_bytes_no_reserve, data_dirs_space_bytes_reserve);
+  ASSERT_GE(wal_dir_space_bytes_no_reserve, wal_dir_space_bytes_reserve);
+}
+
 } // namespace kudu
diff --git a/src/kudu/server/server_base.cc b/src/kudu/server/server_base.cc
index 1dcae38dd..bfc1c85a4 100644
--- a/src/kudu/server/server_base.cc
+++ b/src/kudu/server/server_base.cc
@@ -582,15 +582,17 @@ shared_ptr<MemTracker> CreateMemTrackerForServer() {
 
 // Calculates the free space on the given WAL/data directory's disk. Returns 
-1 in case of disk
 // failure.
-inline int64_t CalculateAvailableSpace(const ServerBaseOptions& options, const 
string& dir,
-                                       int64_t flag_reserved_bytes, SpaceInfo* 
space_info) {
-  if (!options.env->GetSpaceInfo(dir, space_info).ok()) {
+int64_t CalculateAvailableSpace(const ServerBaseOptions& options,
+                                const string& dir,
+                                int64_t flag_reserved_bytes,
+                                SpaceInfo* space_info) {
+  if (PREDICT_FALSE(!options.env->GetSpaceInfo(dir, space_info).ok())) {
     return -1;
   }
-  bool should_reserve_one_percent = flag_reserved_bytes == -1;
-  int reserved_bytes = should_reserve_one_percent ?
+  const bool should_reserve_one_percent = flag_reserved_bytes == -1;
+  int64_t reserved_bytes = should_reserve_one_percent ?
       space_info->capacity_bytes / 100 : flag_reserved_bytes;
-  return std::max(static_cast<int64_t>(0), space_info->free_bytes - 
reserved_bytes);
+  return std::max<int64_t>(0, space_info->free_bytes - reserved_bytes);
 }
 
 int64_t GetFileCacheCapacity(Env* env) {

Reply via email to