This is an automated email from the ASF dual-hosted git repository.

wangdan pushed a commit to branch migrate-metrics-dev
in repository https://gitbox.apache.org/repos/asf/incubator-pegasus.git

commit 24e734cbf00ac48acaa8a8c96f895ab85e8e26f2
Author: Dan Wang <[email protected]>
AuthorDate: Thu Apr 6 11:41:34 2023 +0800

    feat(new_metrics): add disk-level metric entity and migrate disk-level metrics for fs_manager (#1427)
    
    https://github.com/apache/incubator-pegasus/issues/1425
    
    In perf counters, all metrics of `fs_manager` are server-level: for example,
    the total capacity and the available capacity of all disks that hold
    Pegasus data.
    
    However, the capacity and the available capacity of each individual disk
    are sometimes more important: running out of space on a single disk leads
    to serious problems. Therefore, after being migrated to the new framework,
    the server-level metrics of perf counters become disk-level, including the
    capacity and the available capacity of a disk. Another disk-level metric --
    the available percentage of each disk used by a replica server -- can be
    computed by dividing the available capacity by the total capacity.
    
    Whenever server-level metrics are needed, just aggregate over the disk-level
    ones. For example, to compute the other 2 server-level metrics -- the
    minimal/maximal available percentage among all disks used by a replica
    server on a node -- just apply the min/max operators over the disk-level
    ones in Prometheus.
    
    To implement disk-level metrics, a disk-level metric entity is also added.
---
 src/common/fs_manager.cpp                 | 93 +++++++++++++++++++------------
 src/common/fs_manager.h                   | 58 +++++++++++--------
 src/common/test/CMakeLists.txt            |  1 +
 src/replica/test/replica_disk_test_base.h |  1 -
 src/utils/metrics.h                       |  7 +++
 5 files changed, 99 insertions(+), 61 deletions(-)

diff --git a/src/common/fs_manager.cpp b/src/common/fs_manager.cpp
index a33d20e54..50000205e 100644
--- a/src/common/fs_manager.cpp
+++ b/src/common/fs_manager.cpp
@@ -43,7 +43,6 @@
 #include "common/replication_enums.h"
 #include "fmt/core.h"
 #include "fmt/ostream.h"
-#include "perf_counter/perf_counter.h"
 #include "runtime/api_layer1.h"
 #include "runtime/rpc/rpc_address.h"
 #include "utils/fail_point.h"
@@ -52,6 +51,18 @@
 #include "utils/ports.h"
 #include "utils/string_view.h"
 
+METRIC_DEFINE_entity(disk);
+
+METRIC_DEFINE_gauge_int64(disk,
+                          disk_capacity_total_mb,
+                          dsn::metric_unit::kMegaBytes,
+                          "The total disk capacity");
+
+METRIC_DEFINE_gauge_int64(disk,
+                          disk_capacity_avail_mb,
+                          dsn::metric_unit::kMegaBytes,
+                          "The available disk capacity");
+
 namespace dsn {
 namespace replication {
 
@@ -69,6 +80,34 @@ DSN_DEFINE_bool(replication,
                 true,
                 "true means ignore broken data disk when initialize");
 
+namespace {
+
+metric_entity_ptr instantiate_disk_metric_entity(const std::string &tag,
+                                                 const std::string &data_dir)
+{
+    auto entity_id = fmt::format("disk_{}", tag);
+
+    return METRIC_ENTITY_disk.instantiate(entity_id, {{"tag", tag}, 
{"data_dir", data_dir}});
+}
+
+} // anonymous namespace
+
+disk_capacity_metrics::disk_capacity_metrics(const std::string &tag, const 
std::string &data_dir)
+    : _disk_metric_entity(instantiate_disk_metric_entity(tag, data_dir)),
+      METRIC_VAR_INIT_disk(disk_capacity_total_mb),
+      METRIC_VAR_INIT_disk(disk_capacity_avail_mb)
+{
+}
+
+const metric_entity_ptr &disk_capacity_metrics::disk_metric_entity() const
+{
+    CHECK_NOTNULL(_disk_metric_entity,
+                  "disk metric entity should has been instantiated: "
+                  "uninitialized entity cannot be used to instantiate "
+                  "metric");
+    return _disk_metric_entity;
+}
+
 uint64_t dir_node::replicas_count() const
 {
     uint64_t sum = 0;
@@ -127,6 +166,9 @@ void dir_node::update_disk_stat()
     disk_available_ratio = static_cast<int>(
         disk_capacity_mb == 0 ? 0 : std::round(disk_available_mb * 100.0 / 
disk_capacity_mb));
 
+    METRIC_CALL_SET_METHOD(disk_capacity, disk_capacity_total_mb, 
disk_capacity_mb);
+    METRIC_CALL_SET_METHOD(disk_capacity, disk_capacity_avail_mb, 
disk_available_mb);
+
     auto old_status = status;
     auto new_status = disk_available_ratio < 
FLAGS_disk_min_available_space_ratio
                           ? disk_status::SPACE_INSUFFICIENT
@@ -143,30 +185,6 @@ void dir_node::update_disk_stat()
              enum_to_string(status));
 }
 
-fs_manager::fs_manager()
-{
-    _counter_total_capacity_mb.init_app_counter("eon.replica_stub",
-                                                "disk.capacity.total(MB)",
-                                                COUNTER_TYPE_NUMBER,
-                                                "total disk capacity in MB");
-    _counter_total_available_mb.init_app_counter("eon.replica_stub",
-                                                 "disk.available.total(MB)",
-                                                 COUNTER_TYPE_NUMBER,
-                                                 "total disk available in MB");
-    _counter_total_available_ratio.init_app_counter("eon.replica_stub",
-                                                    
"disk.available.total.ratio",
-                                                    COUNTER_TYPE_NUMBER,
-                                                    "total disk available 
ratio");
-    _counter_min_available_ratio.init_app_counter("eon.replica_stub",
-                                                  "disk.available.min.ratio",
-                                                  COUNTER_TYPE_NUMBER,
-                                                  "minimal disk available 
ratio in all disks");
-    _counter_max_available_ratio.init_app_counter("eon.replica_stub",
-                                                  "disk.available.max.ratio",
-                                                  COUNTER_TYPE_NUMBER,
-                                                  "maximal disk available 
ratio in all disks");
-}
-
 dir_node *fs_manager::get_dir_node(const std::string &subdir) const
 {
     std::string norm_subdir;
@@ -343,16 +361,22 @@ bool fs_manager::for_each_dir_node(const 
std::function<bool(const dir_node &)> &
 
 void fs_manager::update_disk_stat()
 {
+    _total_capacity_mb = 0;
+    _total_available_mb = 0;
+    int total_available_ratio = 0;
+    int min_available_ratio = 100;
+    int max_available_ratio = 0;
+
     zauto_write_lock l(_lock);
-    reset_disk_stat();
+
     for (auto &dn : _dir_nodes) {
         dn->update_disk_stat();
         _total_capacity_mb += dn->disk_capacity_mb;
         _total_available_mb += dn->disk_available_mb;
-        _min_available_ratio = std::min(dn->disk_available_ratio, 
_min_available_ratio);
-        _max_available_ratio = std::max(dn->disk_available_ratio, 
_max_available_ratio);
+        min_available_ratio = std::min(dn->disk_available_ratio, 
min_available_ratio);
+        max_available_ratio = std::max(dn->disk_available_ratio, 
max_available_ratio);
     }
-    _total_available_ratio = static_cast<int>(
+    total_available_ratio = static_cast<int>(
         _total_capacity_mb == 0 ? 0 : std::round(_total_available_mb * 100.0 / 
_total_capacity_mb));
 
     LOG_INFO("update disk space succeed: disk_count = {}, total_capacity_mb = 
{}, "
@@ -361,14 +385,9 @@ void fs_manager::update_disk_stat()
              _dir_nodes.size(),
              _total_capacity_mb,
              _total_available_mb,
-             _total_available_ratio,
-             _min_available_ratio,
-             _max_available_ratio);
-    _counter_total_capacity_mb->set(_total_capacity_mb);
-    _counter_total_available_mb->set(_total_available_mb);
-    _counter_total_available_ratio->set(_total_available_ratio);
-    _counter_min_available_ratio->set(_min_available_ratio);
-    _counter_max_available_ratio->set(_max_available_ratio);
+             total_available_ratio,
+             min_available_ratio,
+             max_available_ratio);
 }
 
 void fs_manager::add_new_dir_node(const std::string &data_dir, const 
std::string &tag)
diff --git a/src/common/fs_manager.h b/src/common/fs_manager.h
index be19d79b6..a63ce36b0 100644
--- a/src/common/fs_manager.h
+++ b/src/common/fs_manager.h
@@ -28,9 +28,12 @@
 
 #include "common/replication_other_types.h"
 #include "metadata_types.h"
-#include "perf_counter/perf_counter_wrapper.h"
+#include "utils/autoref_ptr.h"
+#include "utils/error_code.h"
 #include "utils/flags.h"
 #include "utils/string_view.h"
+#include "utils/metrics.h"
+#include "utils/ports.h"
 #include "utils/zlocks.h"
 
 namespace dsn {
@@ -40,6 +43,25 @@ namespace replication {
 
 DSN_DECLARE_int32(disk_min_available_space_ratio);
 
+class disk_capacity_metrics
+{
+public:
+    disk_capacity_metrics(const std::string &tag, const std::string &data_dir);
+    ~disk_capacity_metrics() = default;
+
+    const metric_entity_ptr &disk_metric_entity() const;
+
+    METRIC_DEFINE_SET_METHOD(disk_capacity_total_mb, int64_t)
+    METRIC_DEFINE_SET_METHOD(disk_capacity_avail_mb, int64_t)
+
+private:
+    const metric_entity_ptr _disk_metric_entity;
+    METRIC_VAR_DECLARE_gauge_int64(disk_capacity_total_mb);
+    METRIC_VAR_DECLARE_gauge_int64(disk_capacity_avail_mb);
+
+    DISALLOW_COPY_AND_ASSIGN(disk_capacity_metrics);
+};
+
 struct dir_node
 {
 public:
@@ -53,6 +75,9 @@ public:
     std::map<app_id, std::set<gpid>> holding_primary_replicas;
     std::map<app_id, std::set<gpid>> holding_secondary_replicas;
 
+private:
+    disk_capacity_metrics disk_capacity;
+
 public:
     dir_node(const std::string &tag_,
              const std::string &dir_,
@@ -65,7 +90,8 @@ public:
           disk_capacity_mb(disk_capacity_mb_),
           disk_available_mb(disk_available_mb_),
           disk_available_ratio(disk_available_ratio_),
-          status(status_)
+          status(status_),
+          disk_capacity(tag_, dir_)
     {
     }
     // All functions are not thread-safe. However, they are only used in 
fs_manager
@@ -83,7 +109,8 @@ public:
 class fs_manager
 {
 public:
-    fs_manager();
+    fs_manager() = default;
+    ~fs_manager() = default;
 
     // Should be called before open/load any replicas.
     // NOTE: 'data_dirs' and 'data_dir_tags' must have the same size and in 
the same order.
@@ -127,35 +154,20 @@ public:
     bool is_dir_node_available(const std::string &data_dir, const std::string 
&tag) const;
 
 private:
-    void reset_disk_stat()
-    {
-        _total_capacity_mb = 0;
-        _total_available_mb = 0;
-        _total_available_ratio = 0;
-        _min_available_ratio = 100;
-        _max_available_ratio = 0;
-    }
-
     dir_node *get_dir_node(const std::string &subdir) const;
 
-    // when visit the tag/storage of the _dir_nodes map, there's no need to 
protect by the lock.
-    // but when visit the holding_replicas, you must take care.
+    // TODO(wangdan): _dir_nodes should be protected by lock since 
add_new_disk are supported:
+    // it might be updated arbitrarily at any time.
+    //
+    // Especially when visiting the holding_replicas, you must take care.
     mutable zrwlock_nr _lock; // [ lock
+
     int64_t _total_capacity_mb = 0;
     int64_t _total_available_mb = 0;
-    int _total_available_ratio = 0;
-    int _min_available_ratio = 100;
-    int _max_available_ratio = 0;
 
     std::vector<std::shared_ptr<dir_node>> _dir_nodes;
     // ] end of lock
 
-    perf_counter_wrapper _counter_total_capacity_mb;
-    perf_counter_wrapper _counter_total_available_mb;
-    perf_counter_wrapper _counter_total_available_ratio;
-    perf_counter_wrapper _counter_min_available_ratio;
-    perf_counter_wrapper _counter_max_available_ratio;
-
     friend class replica_test;
     friend class replica_stub;
     friend class mock_replica_stub;
diff --git a/src/common/test/CMakeLists.txt b/src/common/test/CMakeLists.txt
index 78d94000c..74a9cdf3e 100644
--- a/src/common/test/CMakeLists.txt
+++ b/src/common/test/CMakeLists.txt
@@ -27,6 +27,7 @@ set(MY_PROJ_NAME dsn_replication_common_test)
 set(MY_SRC_SEARCH_MODE "GLOB")
 
 set(MY_PROJ_LIBS
+        dsn_http
         dsn_replication_common
         dsn_runtime
         gtest
diff --git a/src/replica/test/replica_disk_test_base.h 
b/src/replica/test/replica_disk_test_base.h
index 08e7ffc39..e20883cd7 100644
--- a/src/replica/test/replica_disk_test_base.h
+++ b/src/replica/test/replica_disk_test_base.h
@@ -64,7 +64,6 @@ public:
         generate_mock_app_info();
 
         stub->_fs_manager._dir_nodes.clear();
-        stub->_fs_manager.reset_disk_stat();
         generate_mock_dir_nodes(dir_nodes_count);
         generate_mock_empty_dir_node(empty_dir_nodes_count);
 
diff --git a/src/utils/metrics.h b/src/utils/metrics.h
index e69268006..a230aa1f9 100644
--- a/src/utils/metrics.h
+++ b/src/utils/metrics.h
@@ -165,6 +165,7 @@ class error_code;
     _##name(METRIC_##name.instantiate(entity##_metric_entity(), ##__VA_ARGS__))
 #define METRIC_VAR_INIT_replica(name, ...) METRIC_VAR_INIT(name, replica, 
##__VA_ARGS__)
 #define METRIC_VAR_INIT_server(name, ...) METRIC_VAR_INIT(name, server, 
##__VA_ARGS__)
+#define METRIC_VAR_INIT_disk(name, ...) METRIC_VAR_INIT(name, disk, 
##__VA_ARGS__)
 
 // Perform increment-related operations on metrics including gauge and counter.
 #define METRIC_VAR_INCREMENT_BY(name, x)                                       
                    \
@@ -194,6 +195,11 @@ class error_code;
 
 #define METRIC_VAR_AUTO_LATENCY_DURATION_NS(name) 
__##name##_auto_latency.duration_ns()
 
+#define METRIC_DEFINE_SET_METHOD(name, value_type)                             
                    \
+    void set_##name(value_type value) { METRIC_VAR_SET(name, value); }
+
+#define METRIC_CALL_SET_METHOD(obj, name, value) obj.set_##name(value)
+
 namespace dsn {
 class metric;                  // IWYU pragma: keep
 class metric_entity_prototype; // IWYU pragma: keep
@@ -614,6 +620,7 @@ enum class metric_unit : size_t
     kBytes,
     kMegaBytes,
     kCapacityUnits,
+    kPercent,
     kRequests,
     kSeeks,
     kPointLookups,


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to