This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new 9d5468d1987 [branch-2.1](memory) BE memory info compatible with 
CgroupV2 (#39799)
9d5468d1987 is described below

commit 9d5468d1987051fc8c078738986ab2eccfe4dbb8
Author: Xinyi Zou <[email protected]>
AuthorDate: Fri Aug 23 02:03:00 2024 +0800

    [branch-2.1](memory) BE memory info compatible with CgroupV2 (#39799)
    
    pick #39256
---
 be/src/common/cgroup_memory_ctl.cpp          | 196 +++++++++++++++++++++++
 be/src/common/cgroup_memory_ctl.h            |  45 ++++++
 be/src/common/status.h                       |   4 +-
 be/src/util/cgroup_util.cpp                  | 228 ++++++++++++++-------------
 be/src/util/cgroup_util.h                    |  82 +++++++---
 be/src/util/mem_info.cpp                     |  61 ++-----
 be/test/util/cgroup_util_test.cpp            |  49 +++++-
 be/test/util/test_data/memory.limit_in_bytes |   1 +
 be/test/util/test_data/memory.stat           |  36 +++++
 9 files changed, 512 insertions(+), 190 deletions(-)

diff --git a/be/src/common/cgroup_memory_ctl.cpp 
b/be/src/common/cgroup_memory_ctl.cpp
new file mode 100644
index 00000000000..a29432bdb4e
--- /dev/null
+++ b/be/src/common/cgroup_memory_ctl.cpp
@@ -0,0 +1,196 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// This file is copied from
+// 
https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/CgroupsMemoryUsageObserver.cpp
+// and modified by Doris
+
+#include "common/cgroup_memory_ctl.h"
+
+#include <cstdint>
+#include <filesystem>
+#include <fstream>
+#include <limits>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+
+#include "common/status.h"
+#include "util/cgroup_util.h"
+
+namespace doris {
+
+// Is the memory controller of cgroups v2 enabled for the cgroup of this process?
+// `*ret` is always written: it stays false on every error path (cgroups v2 disabled,
+// "cgroup.controllers" cannot be opened, non-Linux build) and is only set from the file content.
+Status cgroupsv2_memory_controller_enabled(bool* ret) {
+    *ret = false;
+#if defined(OS_LINUX)
+    if (!CGroupUtil::cgroupsv2_enable()) {
+        return Status::CgroupError("cgroupsv2_enable is false");
+    }
+    // According to https://docs.kernel.org/admin-guide/cgroup-v2.html, file "cgroup.controllers" defines which controllers are available
+    // for the current + child cgroups. The set of available controllers can be restricted from level to level using file
+    // "cgroup.subtree_control". It is therefore sufficient to check the bottom-most nested "cgroup.controllers" file.
+    std::string cgroup = CGroupUtil::cgroupv2_of_process();
+    auto cgroup_dir = cgroup.empty() ? default_cgroups_mount : (default_cgroups_mount / cgroup);
+    std::ifstream controllers_file(cgroup_dir / "cgroup.controllers");
+    if (!controllers_file.is_open()) {
+        return Status::CgroupError("open cgroup.controllers failed");
+    }
+    std::string controllers;
+    std::getline(controllers_file, controllers);
+    *ret = controllers.find("memory") != std::string::npos;
+    return Status::OK();
+#else
+    return Status::CgroupError("cgroupsv2 only support Linux");
+#endif
+}
+
+// Reads the memory limit and usage of a cgroup v1 memory controller directory.
+struct CgroupsV1Reader : CGroupMemoryCtl::ICgroupsReader {
+    explicit CgroupsV1Reader(std::filesystem::path mount_file_dir)
+            : _mount_file_dir(std::move(mount_file_dir)) {}
+
+    // The limit is the single integer stored in "memory.limit_in_bytes".
+    Status read_memory_limit(int64_t* value) override {
+        RETURN_IF_ERROR(CGroupUtil::read_int_line_from_cgroup_file(
+                (_mount_file_dir / "memory.limit_in_bytes"), value));
+        return Status::OK();
+    }
+
+    // The usage is the "rss" entry of "memory.stat".
+    // Returns CgroupError (and leaves `*value` untouched) when that entry is absent.
+    Status read_memory_usage(int64_t* value) override {
+        std::unordered_map<std::string, int64_t> metrics_map;
+        CGroupUtil::read_int_metric_from_cgroup_file((_mount_file_dir / "memory.stat"),
+                                                     metrics_map);
+        // read_int_metric_from_cgroup_file() has no way to report failure, so a missing key is the
+        // only sign that the file was unreadable or malformed. Do not report that as 0 bytes used.
+        const auto rss = metrics_map.find("rss");
+        if (rss == metrics_map.end()) {
+            return Status::CgroupError("CgroupsV1Reader read_memory_usage cannot find rss in {}",
+                                       (_mount_file_dir / "memory.stat").string());
+        }
+        *value = rss->second;
+        return Status::OK();
+    }
+
+private:
+    // Directory that contains the cgroup v1 memory controller files.
+    std::filesystem::path _mount_file_dir;
+};
+
+// Reads the memory limit and usage from a cgroup v2 directory.
+struct CgroupsV2Reader : CGroupMemoryCtl::ICgroupsReader {
+    explicit CgroupsV2Reader(std::filesystem::path mount_file_dir)
+            : _mount_file_dir(std::move(mount_file_dir)) {}
+
+    // The limit is the content of "memory.max": either an integer or the literal "max", which the
+    // kernel uses for "no limit" and which is reported here as INT64_MAX.
+    Status read_memory_limit(int64_t* value) override {
+        const std::filesystem::path max_file_path = _mount_file_dir / "memory.max";
+        std::ifstream max_file(max_file_path);
+        std::string first_line;
+        // Only the "max" keyword is handled here; every other outcome (open failure, numbers,
+        // garbage) is left to the shared parser so error reporting stays in one place.
+        if (max_file.is_open() && std::getline(max_file, first_line) && first_line == "max") {
+            *value = std::numeric_limits<int64_t>::max();
+            return Status::OK();
+        }
+        RETURN_IF_ERROR(CGroupUtil::read_int_line_from_cgroup_file(max_file_path, value));
+        return Status::OK();
+    }
+
+    // The usage is "memory.current" minus the "inactive_file" entry of "memory.stat".
+    // Returns CgroupError when the subtraction would be negative.
+    Status read_memory_usage(int64_t* value) override {
+        // memory.current contains a single number
+        // the reason why inactive_file is subtracted from it is described here: https://github.com/ClickHouse/ClickHouse/issues/64652#issuecomment-2149630667
+        RETURN_IF_ERROR(CGroupUtil::read_int_line_from_cgroup_file(
+                (_mount_file_dir / "memory.current"), value));
+        std::unordered_map<std::string, int64_t> metrics_map;
+        CGroupUtil::read_int_metric_from_cgroup_file((_mount_file_dir / "memory.stat"),
+                                                     metrics_map);
+        // operator[] yields 0 when the entry is absent, in which case nothing is subtracted.
+        const int64_t inactive_file = metrics_map["inactive_file"];
+        if (*value < inactive_file) {
+            return Status::CgroupError("CgroupsV2Reader read_memory_usage negative memory usage");
+        }
+        *value -= inactive_file;
+        return Status::OK();
+    }
+
+private:
+    // Directory that contains memory.max, memory.current and memory.stat.
+    std::filesystem::path _mount_file_dir;
+};
+
+// Locates the directory holding the memory cgroup files of this process.
+// cgroups v2 is preferred: it is chosen when v2 and its memory controller are enabled and
+// memory.stat, memory.current and memory.max all resolve to the same directory.
+// Otherwise the cgroups v1 "memory" subsystem path is used. An empty path (tagged V1) means
+// that neither lookup succeeded.
+std::pair<std::string, CGroupUtil::CgroupsVersion> get_cgroups_path() {
+    bool controller_enabled = false;
+    const Status controller_st = cgroupsv2_memory_controller_enabled(&controller_enabled);
+    const bool try_v2 = CGroupUtil::cgroupsv2_enable() && controller_st.ok() && controller_enabled;
+    if (try_v2) {
+        const auto stat_dir = CGroupUtil::get_cgroupsv2_path("memory.stat");
+        const auto current_dir = CGroupUtil::get_cgroupsv2_path("memory.current");
+        const auto max_dir = CGroupUtil::get_cgroupsv2_path("memory.max");
+        const bool all_found =
+                stat_dir.has_value() && current_dir.has_value() && max_dir.has_value();
+        if (all_found && stat_dir == current_dir && current_dir == max_dir) {
+            return {*stat_dir, CGroupUtil::CgroupsVersion::V2};
+        }
+    }
+
+    std::string v1_dir;
+    if (CGroupUtil::find_abs_cgroupv1_path("memory", &v1_dir).ok()) {
+        return {v1_dir, CGroupUtil::CgroupsVersion::V1};
+    }
+
+    return {"", CGroupUtil::CgroupsVersion::V1};
+}
+
+// Creates the reader that matches the cgroup version found by get_cgroups_path().
+// Returns CgroupError describing the v1/v2 detection state when no memory cgroup directory was
+// found; `reader` is not modified in that case.
+Status get_cgroups_reader(std::shared_ptr<CGroupMemoryCtl::ICgroupsReader>& reader) {
+    const auto [cgroup_path, version] = get_cgroups_path();
+    if (cgroup_path.empty()) {
+        // Initialized so that the message below never formats an indeterminate value, whatever
+        // the callee does with its out-parameter on its error paths.
+        bool enable_controller = false;
+        auto st = cgroupsv2_memory_controller_enabled(&enable_controller);
+        // The controller flag and its lookup status are printed together under the
+        // cgroupsv2_memory_controller_enabled label.
+        return Status::CgroupError(
+                "Cannot find cgroups v1 or v2 current memory file, cgroupsv2_enable: {}, "
+                "cgroupsv2_memory_controller_enabled: {},{}, cgroupsv1_enable: {}",
+                CGroupUtil::cgroupsv2_enable(), enable_controller, st.to_string(),
+                CGroupUtil::cgroupsv1_enable());
+    }
+
+    if (version == CGroupUtil::CgroupsVersion::V2) {
+        reader = std::make_shared<CgroupsV2Reader>(cgroup_path);
+    } else {
+        reader = std::make_shared<CgroupsV1Reader>(cgroup_path);
+    }
+    return Status::OK();
+}
+
+// Picks the reader for the detected cgroup version and stores the memory limit in `*bytes`.
+// Any failure from the reader lookup or from the read itself is returned unchanged.
+Status CGroupMemoryCtl::find_cgroup_mem_limit(int64_t* bytes) {
+    std::shared_ptr<CGroupMemoryCtl::ICgroupsReader> cgroups_reader;
+    if (Status lookup_st = get_cgroups_reader(cgroups_reader); !lookup_st.ok()) {
+        return lookup_st;
+    }
+    if (Status read_st = cgroups_reader->read_memory_limit(bytes); !read_st.ok()) {
+        return read_st;
+    }
+    return Status::OK();
+}
+
+// Picks the reader for the detected cgroup version and stores the memory usage in `*bytes`.
+// Any failure from the reader lookup or from the read itself is returned unchanged.
+Status CGroupMemoryCtl::find_cgroup_mem_usage(int64_t* bytes) {
+    std::shared_ptr<CGroupMemoryCtl::ICgroupsReader> cgroups_reader;
+    if (Status lookup_st = get_cgroups_reader(cgroups_reader); !lookup_st.ok()) {
+        return lookup_st;
+    }
+    if (Status read_st = cgroups_reader->read_memory_usage(bytes); !read_st.ok()) {
+        return read_st;
+    }
+    return Status::OK();
+}
+
+// Returns a human-readable summary of the detected memory cgroup: its path, its version and the
+// current limit/usage (or the error text of whichever read failed). When no memory cgroup
+// directory is found, the v1/v2 detection state is returned instead.
+std::string CGroupMemoryCtl::debug_string() {
+    const auto [cgroup_path, version] = get_cgroups_path();
+    if (cgroup_path.empty()) {
+        // Initialized so that the message below never formats an indeterminate value, whatever
+        // the callee does with its out-parameter on its error paths.
+        bool enable_controller = false;
+        auto st = cgroupsv2_memory_controller_enabled(&enable_controller);
+        // The controller flag and its lookup status are printed together under the
+        // cgroupsv2_memory_controller_enabled label.
+        return fmt::format(
+                "Cannot find cgroups v1 or v2 current memory file, cgroupsv2_enable: {}, "
+                "cgroupsv2_memory_controller_enabled: {},{}, cgroupsv1_enable: {}",
+                CGroupUtil::cgroupsv2_enable(), enable_controller, st.to_string(),
+                CGroupUtil::cgroupsv1_enable());
+    }
+
+    // Both values are only formatted when their read succeeded; zero-initialized for safety.
+    int64_t mem_limit = 0;
+    auto mem_limit_st = find_cgroup_mem_limit(&mem_limit);
+
+    int64_t mem_usage = 0;
+    auto mem_usage_st = find_cgroup_mem_usage(&mem_usage);
+
+    return fmt::format(
+            "Process CGroup Memory Info (cgroups path: {}, cgroup version: {}): memory limit: "
+            "{}, "
+            "memory usage: {}",
+            cgroup_path, (version == CGroupUtil::CgroupsVersion::V1) ? "v1" : "v2",
+            mem_limit_st.ok() ? std::to_string(mem_limit) : mem_limit_st.to_string(),
+            mem_usage_st.ok() ? std::to_string(mem_usage) : mem_usage_st.to_string());
+}
+
+} // namespace doris
diff --git a/be/src/common/cgroup_memory_ctl.h 
b/be/src/common/cgroup_memory_ctl.h
new file mode 100644
index 00000000000..83f33e03cda
--- /dev/null
+++ b/be/src/common/cgroup_memory_ctl.h
@@ -0,0 +1,45 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+
+#include "common/status.h"
+
+namespace doris {
+
+// Static helpers that report the memory limit and memory usage of the cgroup the current process
+// belongs to, for both cgroups v1 and cgroups v2.
+class CGroupMemoryCtl {
+public:
+    // Inherited by cgroup v1 and v2
+    struct ICgroupsReader {
+        virtual ~ICgroupsReader() = default;
+
+        // Stores the memory limit of the cgroup, in bytes, in `*value`.
+        virtual Status read_memory_limit(int64_t* value) = 0;
+
+        // Stores the memory usage of the cgroup, in bytes, in `*value`.
+        virtual Status read_memory_usage(int64_t* value) = 0;
+    };
+
+    // Determines the CGroup memory limit from the current processes' cgroup.
+    // If the limit is more than INT64_MAX, INT64_MAX is returned (since that is
+    // effectively unlimited anyway). Does not take into account memory limits
+    // set on any ancestor CGroups.
+    static Status find_cgroup_mem_limit(int64_t* bytes);
+
+    // Determines the CGroup memory usage from the current processes' cgroup.
+    // cgroups v1: the "rss" entry of memory.stat.
+    // cgroups v2: memory.current minus the "inactive_file" entry of memory.stat.
+    // https://serverfault.com/questions/902009/the-memory-usage-reported-in-cgroup-differs-from-the-free-command
+    static Status find_cgroup_mem_usage(int64_t* bytes);
+
+    // Returns a human-readable string with information about CGroups.
+    static std::string debug_string();
+};
+} // namespace doris
diff --git a/be/src/common/status.h b/be/src/common/status.h
index 94988cbdae3..8847bb7c087 100644
--- a/be/src/common/status.h
+++ b/be/src/common/status.h
@@ -281,7 +281,8 @@ namespace ErrorCode {
     E(INVERTED_INDEX_ANALYZER_ERROR, -6011, false);          \
     E(KEY_NOT_FOUND, -7000, false);                          \
     E(KEY_ALREADY_EXISTS, -7001, false);                     \
-    E(ENTRY_NOT_FOUND, -7002, false);
+    E(ENTRY_NOT_FOUND, -7002, false);                        \
+    E(CGROUP_ERROR, -7411, false);
 
 // Define constexpr int error_code_name = error_code_value
 #define M(NAME, ERRORCODE, ENABLESTACKTRACE) constexpr int NAME = ERRORCODE;
@@ -469,6 +470,7 @@ public:
     ERROR_CTOR(NotAuthorized, NOT_AUTHORIZED)
     ERROR_CTOR(HttpError, HTTP_ERROR)
     ERROR_CTOR(NeedSendAgain, NEED_SEND_AGAIN)
+    ERROR_CTOR(CgroupError, CGROUP_ERROR)
 #undef ERROR_CTOR
 
     template <int code>
diff --git a/be/src/util/cgroup_util.cpp b/be/src/util/cgroup_util.cpp
index 9ad78696a6f..8109e38559f 100644
--- a/be/src/util/cgroup_util.cpp
+++ b/be/src/util/cgroup_util.cpp
@@ -18,10 +18,7 @@
 #include "util/cgroup_util.h"
 
 #include <algorithm>
-#include <cfloat>
 #include <fstream>
-#include <iomanip>
-#include <memory>
 #include <utility>
 #include <vector>
 
@@ -40,14 +37,33 @@ using std::pair;
 
 namespace doris {
 
-Status CGroupUtil::find_global_cgroup(const string& subsystem, string* path) {
+// Treats the presence of /proc/cgroups as the signal that cgroups v1 is enabled.
+// A failing existence check counts as "not enabled".
+bool CGroupUtil::cgroupsv1_enable() {
+    bool found = true;
+    const Status check_st = io::global_local_filesystem()->exists("/proc/cgroups", &found);
+    if (!check_st.ok()) {
+        return false;
+    }
+    return found;
+}
+
+// Treats the presence of "cgroup.controllers" directly under default_cgroups_mount as the signal
+// that cgroups v2 is enabled. A failing existence check counts as "not enabled", and builds
+// without OS_LINUX always report false.
+bool CGroupUtil::cgroupsv2_enable() {
+#if defined(OS_LINUX)
+    // This file exists iff the host has cgroups v2 enabled.
+    const auto probe_path = default_cgroups_mount / "cgroup.controllers";
+    bool found = true;
+    const Status check_st = io::global_local_filesystem()->exists(probe_path, &found);
+    if (!check_st.ok()) {
+        return false;
+    }
+    return found;
+#else
+    return false;
+#endif
+}
+
+Status CGroupUtil::find_global_cgroupv1(const string& subsystem, string* path) 
{
     std::ifstream proc_cgroups("/proc/self/cgroup", std::ios::in);
     string line;
     while (true) {
         if (proc_cgroups.fail()) {
-            return Status::IOError("Error reading /proc/self/cgroup: {}", 
get_str_err_msg());
+            return Status::CgroupError("Error reading /proc/self/cgroup: {}", 
get_str_err_msg());
         } else if (proc_cgroups.peek() == std::ifstream::traits_type::eof()) {
-            return Status::NotFound("Could not find subsystem {} in 
/proc/self/cgroup", subsystem);
+            return Status::CgroupError("Could not find subsystem {} in 
/proc/self/cgroup",
+                                       subsystem);
         }
         // The line format looks like this:
         // 4:memory:/user.slice
@@ -82,32 +98,15 @@ static Status unescape_path(const string& escaped, string* 
unescaped) {
     return Status::OK();
 }
 
-static Status read_cgroup_value(const string& limit_file_path, int64_t* val) {
-    std::ifstream limit_file(limit_file_path, std::ios::in);
-    string line;
-    getline(limit_file, line);
-    if (limit_file.fail() || limit_file.bad()) {
-        return Status::IOError("Error reading {}: {}", limit_file_path, 
get_str_err_msg());
-    }
-    StringParser::ParseResult pr;
-    // Parse into an int64_t If it overflows, returning the max value of 
int64_t is ok because that
-    // is effectively unlimited.
-    *val = StringParser::string_to_int<int64_t>(line.c_str(), line.size(), 
&pr);
-    if ((pr != StringParser::PARSE_SUCCESS && pr != 
StringParser::PARSE_OVERFLOW)) {
-        return Status::InvalidArgument("Failed to parse {} as int64: '{}'", 
limit_file_path, line);
-    }
-    return Status::OK();
-}
-
-Status CGroupUtil::find_cgroup_mounts(const string& subsystem, pair<string, 
string>* result) {
+Status CGroupUtil::find_cgroupv1_mounts(const string& subsystem, pair<string, 
string>* result) {
     std::ifstream mountinfo("/proc/self/mountinfo", std::ios::in);
     string line;
     while (true) {
         if (mountinfo.fail() || mountinfo.bad()) {
-            return Status::IOError("Error reading /proc/self/mountinfo: {}", 
get_str_err_msg());
+            return Status::CgroupError("Error reading /proc/self/mountinfo: 
{}", get_str_err_msg());
         } else if (mountinfo.eof()) {
-            return Status::NotFound("Could not find subsystem {} in 
/proc/self/mountinfo",
-                                    subsystem);
+            return Status::CgroupError("Could not find subsystem {} in 
/proc/self/mountinfo",
+                                       subsystem);
         }
         // The relevant lines look like below (see proc manpage for full 
documentation). The
         // first example is running outside of a container, the second example 
is running
@@ -118,14 +117,18 @@ Status CGroupUtil::find_cgroup_mounts(const string& 
subsystem, pair<string, stri
         // 275 271 0:28 /docker/f23eee6f88c2ba99fcce /sys/fs/cgroup/memory
         //    ro,nosuid,nodev,noexec,relatime master:15 - cgroup cgroup 
rw,memory
         getline(mountinfo, line);
-        if (!mountinfo.good()) continue;
+        if (!mountinfo.good()) {
+            continue;
+        }
         std::vector<string> fields = Split(line, " ", SkipWhitespace());
         if (fields.size() < 7) {
             return Status::InvalidArgument(
                     "Could not parse line from /proc/self/mountinfo - had {} > 
7 tokens: '{}'",
                     fields.size(), line);
         }
-        if (fields[fields.size() - 3] != "cgroup") continue;
+        if (fields[fields.size() - 3] != "cgroup") {
+            continue;
+        }
         // This is a cgroup mount. Check if it's the mount we're looking for.
         std::vector<string> cgroup_opts = Split(fields[fields.size() - 1], 
",", SkipWhitespace());
         auto it = std::find(cgroup_opts.begin(), cgroup_opts.end(), subsystem);
@@ -138,16 +141,21 @@ Status CGroupUtil::find_cgroup_mounts(const string& 
subsystem, pair<string, stri
         RETURN_IF_ERROR(unescape_path(fields[3], &system_path));
         // Strip trailing "/" so that both returned paths match in whether 
they have a
         // trailing "/".
-        if (system_path[system_path.size() - 1] == '/') system_path.pop_back();
+        if (system_path[system_path.size() - 1] == '/') {
+            system_path.pop_back();
+        }
         *result = {mount_path, system_path};
         return Status::OK();
     }
 }
 
-Status CGroupUtil::find_abs_cgroup_path(const string& subsystem, string* path) 
{
-    RETURN_IF_ERROR(find_global_cgroup(subsystem, path));
+Status CGroupUtil::find_abs_cgroupv1_path(const string& subsystem, string* 
path) {
+    if (!cgroupsv1_enable()) {
+        return Status::InvalidArgument("cgroup is not enabled!");
+    }
+    RETURN_IF_ERROR(find_global_cgroupv1(subsystem, path));
     pair<string, string> paths;
-    RETURN_IF_ERROR(find_cgroup_mounts(subsystem, &paths));
+    RETURN_IF_ERROR(find_cgroupv1_mounts(subsystem, &paths));
     const string& mount_path = paths.first;
     const string& system_path = paths.second;
     if (path->compare(0, system_path.size(), system_path) != 0) {
@@ -158,98 +166,98 @@ Status CGroupUtil::find_abs_cgroup_path(const string& 
subsystem, string* path) {
     return Status::OK();
 }
 
-Status CGroupUtil::find_cgroup_mem_limit(int64_t* bytes) {
-    if (!enable()) {
-        return Status::InvalidArgument("cgroup is not enabled!");
+std::string CGroupUtil::cgroupv2_of_process() {
+#if defined(OS_LINUX)
+    if (!cgroupsv2_enable()) {
+        return "";
+    }
+    // All PIDs assigned to a cgroup are in /sys/fs/cgroup/{cgroup_name}/cgroup.procs
+    // A simpler way to get the membership is:
+    std::ifstream cgroup_name_file("/proc/self/cgroup");
+    if (!cgroup_name_file.is_open()) {
+        return "";
+    }
+    // With cgroups v2, there will be a *single* line with prefix "0::/"
+    // (see https://docs.kernel.org/admin-guide/cgroup-v2.html)
+    std::string cgroup;
+    std::getline(cgroup_name_file, cgroup);
+    static const std::string v2_prefix = "0::/";
+    if (!cgroup.starts_with(v2_prefix)) {
+        return "";
     }
-    string cgroup_path;
-    RETURN_IF_ERROR(find_abs_cgroup_path("memory", &cgroup_path));
-    string limit_file_path = cgroup_path + "/memory.limit_in_bytes";
-    return read_cgroup_value(limit_file_path, bytes);
+    cgroup = cgroup.substr(v2_prefix.length());
+    return cgroup;
+#else
+    return "";
+#endif
 }
 
-Status CGroupUtil::find_cgroup_mem_usage(int64_t* bytes) {
-    if (!enable()) {
-        return Status::InvalidArgument("cgroup is not enabled!");
+std::optional<std::string> CGroupUtil::get_cgroupsv2_path(const std::string& 
subsystem) {
+    if (!CGroupUtil::cgroupsv2_enable()) {
+        return {};
     }
-    string cgroup_path;
-    RETURN_IF_ERROR(find_abs_cgroup_path("memory", &cgroup_path));
-    string usage_file_path = cgroup_path + "/memory.usage_in_bytes";
-    return read_cgroup_value(usage_file_path, bytes);
-}
 
-Status CGroupUtil::find_cgroup_mem_info(std::string* file_path) {
-    if (!enable()) {
-        return Status::InvalidArgument("cgroup is not enabled!");
+    std::string cgroup = CGroupUtil::cgroupv2_of_process();
+    auto current_cgroup = cgroup.empty() ? default_cgroups_mount : 
(default_cgroups_mount / cgroup);
+
+    // Return the bottom-most nested current memory file. If there is no such 
file at the current
+    // level, try again at the parent level as memory settings are inherited.
+    while (current_cgroup != default_cgroups_mount.parent_path()) {
+        if (std::filesystem::exists(current_cgroup / subsystem)) {
+            return {current_cgroup};
+        }
+        current_cgroup = current_cgroup.parent_path();
     }
-    string cgroup_path;
-    RETURN_IF_ERROR(find_abs_cgroup_path("memory", &cgroup_path));
-    *file_path = cgroup_path + "/memory.stat";
-    return Status::OK();
+    return {};
 }
 
-Status CGroupUtil::find_cgroup_cpu_limit(float* cpu_count) {
-    if (!enable()) {
-        return Status::InvalidArgument("cgroup is not enabled!");
-    }
-    int64_t quota;
-    int64_t period;
-    string cgroup_path;
-    if (!find_abs_cgroup_path("cpu", &cgroup_path).ok()) {
-        RETURN_IF_ERROR(find_abs_cgroup_path("cpuacct", &cgroup_path));
-    }
-    string cfs_quota_filename = cgroup_path + "/cpu.cfs_quota_us";
-    RETURN_IF_ERROR(read_cgroup_value(cfs_quota_filename, &quota));
-    if (quota <= 0) {
-        *cpu_count = -1;
-        return Status::OK();
-    }
-    string cfs_period_filename = cgroup_path + "/cpu.cfs_period_us";
-    RETURN_IF_ERROR(read_cgroup_value(cfs_period_filename, &period));
-    if (quota <= period) {
-        return Status::InvalidArgument("quota <= period");
+Status CGroupUtil::read_int_line_from_cgroup_file(const std::filesystem::path& 
file_path,
+                                                  int64_t* val) {
+    std::ifstream file_stream(file_path, std::ios::in);
+    string line;
+    getline(file_stream, line);
+    if (file_stream.fail() || file_stream.bad()) {
+        return Status::CgroupError("Error reading {}: {}", file_path.string(), 
get_str_err_msg());
     }
-    *cpu_count = float(quota) / float(period);
-    if (*cpu_count >= FLT_MAX) {
-        return Status::InvalidArgument("unknown");
+    StringParser::ParseResult pr;
+    // Parse into an int64_t If it overflows, returning the max value of 
int64_t is ok because that
+    // is effectively unlimited.
+    *val = StringParser::string_to_int<int64_t>(line.c_str(), line.size(), 
&pr);
+    if ((pr != StringParser::PARSE_SUCCESS && pr != 
StringParser::PARSE_OVERFLOW)) {
+        return Status::InvalidArgument("Failed to parse {} as int64: '{}'", 
file_path.string(),
+                                       line);
     }
     return Status::OK();
 }
 
-std::string CGroupUtil::debug_string() {
-    if (!enable()) {
-        return std::string("cgroup is not enabled!");
-    }
-    string mem_limit_str;
-    int64_t mem_limit;
-    Status status = find_cgroup_mem_limit(&mem_limit);
-    if (status.ok()) {
-        mem_limit_str = strings::Substitute("$0", mem_limit);
-    } else {
-        mem_limit_str = status.to_string();
-    }
-    string cpu_limit_str;
-    float cpu_limit;
-    status = find_cgroup_cpu_limit(&cpu_limit);
-    if (status.ok()) {
-        if (cpu_limit > 0) {
-            std::stringstream stream;
-            stream << std::fixed << std::setprecision(1) << cpu_limit;
-            cpu_limit_str = stream.str();
-        } else {
-            cpu_limit_str = "unlimited";
+void CGroupUtil::read_int_metric_from_cgroup_file(
+        const std::filesystem::path& file_path,
+        std::unordered_map<std::string, int64_t>& metrics_map) {
+    std::ifstream cgroup_file(file_path, std::ios::in);
+    std::string line;
+    while (cgroup_file.good() && !cgroup_file.eof()) {
+        getline(cgroup_file, line);
+        std::vector<std::string> fields = strings::Split(line, " ", 
strings::SkipWhitespace());
+        if (fields.size() < 2) {
+            continue;
         }
-    } else {
-        cpu_limit_str = status.to_string();
-    }
-    return strings::Substitute("Process CGroup Info: memory.limit_in_bytes=$0, 
cpu cfs limits: $1",
-                               mem_limit_str, cpu_limit_str);
-}
+        std::string key = fields[0].substr(0, fields[0].size());
 
-bool CGroupUtil::enable() {
-    bool exists = true;
-    Status st = io::global_local_filesystem()->exists("/proc/cgroups", 
&exists);
-    return st.ok() && exists;
+        StringParser::ParseResult result;
+        auto value =
+                StringParser::string_to_int<int64_t>(fields[1].data(), 
fields[1].size(), &result);
+
+        if (result == StringParser::PARSE_SUCCESS) {
+            if (fields.size() == 2) {
+                metrics_map[key] = value;
+            } else if (fields[2] == "kB") {
+                metrics_map[key] = value * 1024L;
+            }
+        }
+    }
+    if (cgroup_file.is_open()) {
+        cgroup_file.close();
+    }
 }
 
 } // namespace doris
diff --git a/be/src/util/cgroup_util.h b/be/src/util/cgroup_util.h
index 2152720ccdd..cf922ba5063 100644
--- a/be/src/util/cgroup_util.h
+++ b/be/src/util/cgroup_util.h
@@ -18,50 +18,88 @@
 #pragma once
 
 #include <cstdint>
+#include <filesystem>
+#include <optional>
 #include <string>
 #include <utility>
 
 #include "common/status.h"
 namespace doris {
-class CGroupUtil {
-public:
-    // Determines the CGroup memory limit from the current processes' cgroup.
-    // If the limit is more than INT64_MAX, INT64_MAX is returned (since that 
is
-    // effectively unlimited anyway). Does not take into account memory limits
-    // set on any ancestor CGroups.
-    static Status find_cgroup_mem_limit(int64_t* bytes);
-
-    // memory.usage_in_bytes ~= free.used + free.(buff/cache) - (buff)
-    // 
https://serverfault.com/questions/902009/the-memory-usage-reported-in-cgroup-differs-from-the-free-command
-    static Status find_cgroup_mem_usage(int64_t* bytes);
-    static Status find_cgroup_mem_info(std::string* file_path);
 
-    // Determines the CGroup cpu cores limit from the current processes' 
cgroup.
-    static Status find_cgroup_cpu_limit(float* cpu_count);
+#if defined(OS_LINUX)
+// I think it is possible to mount the cgroups hierarchy somewhere else (e.g. 
when in containers).
+// /sys/fs/cgroup was still symlinked to the actual mount in the cases that I 
have seen.
+static inline const std::filesystem::path default_cgroups_mount = 
"/sys/fs/cgroup";
+#endif
 
-    // Returns a human-readable string with information about CGroups.
-    static std::string debug_string();
+/* Cgroup debugging steps
+ * CgroupV1:
+ *  sudo cgcreate -t username:username -g memory:test
+ *  sudo sh -c "echo 6000M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes"
+ *  // process started by the current terminal will join Cgroup test
+ *  sudo sh -c "echo $$ >> /sys/fs/cgroup/memory/test/cgroup.procs"
+ *
+ * CgroupV2:
+ *  sudo mkdir /sys/fs/cgroup/test
+ *  sudo sh -c "echo 3000M > /sys/fs/cgroup/test/memory.max"
+ *  // process started by the current terminal will join Cgroup test
+ *  sudo sh -c "echo $$ >> /sys/fs/cgroup/test/cgroup.procs"
+ *  or
+ *  // only memory allocated after joining the Cgroup is counted in 
`memory.current`.
+ *  sudo sh -c "echo pid > /sys/fs/cgroup/test/cgroup.procs"
+*/
+class CGroupUtil {
+public:
+    enum class CgroupsVersion : uint8_t { V1, V2 };
 
     // detect if cgroup is enabled
-    static bool enable();
+    static bool cgroupsv1_enable();
+    static bool cgroupsv2_enable();
 
-private:
     // return the global cgroup path of subsystem like 12:memory:/user.slice 
-> user.slice
-    static Status find_global_cgroup(const std::string& subsystem, 
std::string* path);
+    static Status find_global_cgroupv1(const std::string& subsystem, 
std::string* path);
 
     // Returns the absolute path to the CGroup from inside the container.
     // E.g. if this process belongs to
     // /sys/fs/cgroup/memory/kubepods/burstable/pod-<long unique id>, which is 
mounted at
     // /sys/fs/cgroup/memory inside the container, this function returns
     // "/sys/fs/cgroup/memory".
-    static Status find_abs_cgroup_path(const std::string& subsystem, 
std::string* path);
+    static Status find_abs_cgroupv1_path(const std::string& subsystem, 
std::string* path);
 
     // Figures out the mapping of the cgroup root from the container's point 
of view to
     // the full path relative to the system-wide cgroups outside of the 
container.
     // E.g. /sys/fs/cgroup/memory/kubepods/burstable/pod-<long unique id> may 
be mounted at
     // /sys/fs/cgroup/memory inside the container. In that case this function 
would return
     // ("/sys/fs/cgroup/memory", "kubepods/burstable/pod-<long unique id>").
-    static Status find_cgroup_mounts(const std::string& subsystem,
-                                     std::pair<std::string, std::string>* 
result);
+    static Status find_cgroupv1_mounts(const std::string& subsystem,
+                                       std::pair<std::string, std::string>* 
result);
+
+    // Which cgroup does the process belong to?
+    // Returns an empty string if the cgroup cannot be determined.
+    // Also returns an empty string when cgroupsv2_enable() is false.
+    static std::string cgroupv2_of_process();
+
+    // Caveats:
+    // - All of the logic in this file assumes that the current process is the 
only process in the
+    //   containing cgroup (or more precisely: the only process with 
significant memory consumption).
+    //   If this is not the case, then other processes' memory consumption may affect the internal
+    //   memory tracker ...
+    // - Cgroups v1 and v2 allow nested cgroup hierarchies. As v1 is 
deprecated for over half a
+    //   decade and will go away at some point, hierarchical detection is only 
implemented for v2.
+    // - I did not test what happens if a host has v1 and v2 simultaneously 
enabled. I believe such
+    //   systems existed only for a short transition period.
+    static std::optional<std::string> get_cgroupsv2_path(const std::string& 
subsystem);
+
+    // Cgroup file with only one line of numbers.
+    static Status read_int_line_from_cgroup_file(const std::filesystem::path& 
file_path,
+                                                 int64_t* val);
+
+    // Multi-line Cgroup files, format is
+    //   kernel 5
+    //   rss 15
+    //   [...]
+    static void read_int_metric_from_cgroup_file(
+            const std::filesystem::path& file_path,
+            std::unordered_map<std::string, int64_t>& metrics_map);
 };
 } // namespace doris
diff --git a/be/src/util/mem_info.cpp b/be/src/util/mem_info.cpp
index d0703c985ea..8be1db5cb85 100644
--- a/be/src/util/mem_info.cpp
+++ b/be/src/util/mem_info.cpp
@@ -20,6 +20,8 @@
 
 #include "mem_info.h"
 
+#include "gutil/strings/split.h"
+
 #ifdef __APPLE__
 #include <sys/sysctl.h>
 #endif
@@ -34,11 +36,10 @@
 #include <boost/algorithm/string/trim.hpp>
 #include <fstream>
 #include <unordered_map>
-#include <vector>
 
+#include "common/cgroup_memory_ctl.h"
 #include "common/config.h"
 #include "common/status.h"
-#include "gutil/strings/split.h"
 #include "runtime/memory/global_memory_arbitrator.h"
 #include "util/cgroup_util.h"
 #include "util/parse_util.h"
@@ -75,7 +76,6 @@ std::atomic<int64_t> MemInfo::_s_virtual_memory_used = 0;
 
 int64_t MemInfo::_s_cgroup_mem_limit = std::numeric_limits<int64_t>::max();
 int64_t MemInfo::_s_cgroup_mem_usage = std::numeric_limits<int64_t>::min();
-static std::unordered_map<std::string, int64_t> _s_cgroup_mem_info_bytes;
 bool MemInfo::_s_cgroup_mem_refresh_state = false;
 int64_t MemInfo::_s_cgroup_mem_refresh_wait_times = 0;
 
@@ -179,7 +179,7 @@ void MemInfo::refresh_proc_meminfo() {
         if (result == StringParser::PARSE_SUCCESS) {
             if (fields.size() == 2) {
                 _mem_info_bytes[key] = mem_value;
-            } else if (fields[2].compare("kB") == 0) {
+            } else if (fields[2] == "kB") {
                 _mem_info_bytes[key] = mem_value * 1024L;
             }
         }
@@ -194,65 +194,28 @@ void MemInfo::refresh_proc_meminfo() {
         int64_t cgroup_mem_usage = -1;
         std::string cgroup_mem_info_file_path;
         _s_cgroup_mem_refresh_state = true;
-        Status status = CGroupUtil::find_cgroup_mem_limit(&cgroup_mem_limit);
-        if (!status.ok() || cgroup_mem_limit <= 0) {
-            _s_cgroup_mem_refresh_state = false;
-        }
-        status = CGroupUtil::find_cgroup_mem_usage(&cgroup_mem_usage);
-        if (!status.ok() || cgroup_mem_usage <= 0) {
+        Status status = 
CGroupMemoryCtl::find_cgroup_mem_limit(&cgroup_mem_limit);
+        if (!status.ok()) {
             _s_cgroup_mem_refresh_state = false;
         }
-        status = CGroupUtil::find_cgroup_mem_info(&cgroup_mem_info_file_path);
-        if (status.ok()) {
-            std::ifstream cgroup_meminfo(cgroup_mem_info_file_path, 
std::ios::in);
-            std::string line;
-
-            while (cgroup_meminfo.good() && !cgroup_meminfo.eof()) {
-                getline(cgroup_meminfo, line);
-                std::vector<std::string> fields =
-                        strings::Split(line, " ", strings::SkipWhitespace());
-                if (fields.size() < 2) {
-                    continue;
-                }
-                std::string key = fields[0].substr(0, fields[0].size());
-
-                StringParser::ParseResult result;
-                auto mem_value = 
StringParser::string_to_int<int64_t>(fields[1].data(),
-                                                                      
fields[1].size(), &result);
-
-                if (result == StringParser::PARSE_SUCCESS) {
-                    if (fields.size() == 2) {
-                        _s_cgroup_mem_info_bytes[key] = mem_value;
-                    } else if (fields[2] == "kB") {
-                        _s_cgroup_mem_info_bytes[key] = mem_value * 1024L;
-                    }
-                }
-            }
-            if (cgroup_meminfo.is_open()) {
-                cgroup_meminfo.close();
-            }
-        } else {
+        status = CGroupMemoryCtl::find_cgroup_mem_usage(&cgroup_mem_usage);
+        if (!status.ok()) {
             _s_cgroup_mem_refresh_state = false;
         }
 
         if (_s_cgroup_mem_refresh_state) {
             _s_cgroup_mem_limit = cgroup_mem_limit;
-            // 
https://serverfault.com/questions/902009/the-memory-usage-reported-in-cgroup-differs-from-the-free-command
-            // memory.usage_in_bytes ~= free.used + free.(buff/cache) - (buff)
-            // so, memory.usage_in_bytes - memory.meminfo["Cached"]
-            _s_cgroup_mem_usage = cgroup_mem_usage - 
_s_cgroup_mem_info_bytes["cache"];
+            _s_cgroup_mem_usage = cgroup_mem_usage;
             // wait 10s, 100 * 100ms, avoid too frequently.
             _s_cgroup_mem_refresh_wait_times = -100;
             LOG(INFO) << "Refresh cgroup memory win, refresh again after 10s, 
cgroup mem limit: "
-                      << _s_cgroup_mem_limit << ", cgroup mem usage: " << 
_s_cgroup_mem_usage
-                      << ", cgroup mem info cached: " << 
_s_cgroup_mem_info_bytes["cache"];
+                      << _s_cgroup_mem_limit << ", cgroup mem usage: " << 
_s_cgroup_mem_usage;
         } else {
             // find cgroup failed, wait 300s, 3000 * 100ms.
             _s_cgroup_mem_refresh_wait_times = -3000;
             LOG(INFO)
                     << "Refresh cgroup memory failed, refresh again after 
300s, cgroup mem limit: "
-                    << _s_cgroup_mem_limit << ", cgroup mem usage: " << 
_s_cgroup_mem_usage
-                    << ", cgroup mem info cached: " << 
_s_cgroup_mem_info_bytes["cache"];
+                    << _s_cgroup_mem_limit << ", cgroup mem usage: " << 
_s_cgroup_mem_usage;
         }
     } else {
         if (config::enable_use_cgroup_memory_info) {
@@ -435,7 +398,7 @@ std::string MemInfo::debug_string() {
     stream << "Physical Memory: " << PrettyPrinter::print(_s_physical_mem, 
TUnit::BYTES)
            << std::endl;
     stream << "Memory Limt: " << PrettyPrinter::print(_s_mem_limit, 
TUnit::BYTES) << std::endl;
-    stream << "CGroup Info: " << doris::CGroupUtil::debug_string() << 
std::endl;
+    stream << "CGroup Info: " << doris::CGroupMemoryCtl::debug_string() << 
std::endl;
     return stream.str();
 }
 
diff --git a/be/test/util/cgroup_util_test.cpp 
b/be/test/util/cgroup_util_test.cpp
index 1553e5f8163..92102120327 100644
--- a/be/test/util/cgroup_util_test.cpp
+++ b/be/test/util/cgroup_util_test.cpp
@@ -23,23 +23,56 @@
 
 #include <fstream>
 
+#include "common/cgroup_memory_ctl.h"
 #include "gtest/gtest_pred_impl.h"
+#include "testutil/test_util.h"
+#include "util/cgroup_util.h"
 
 namespace doris {
 
 class CGroupUtilTest : public ::testing::Test {
 protected:
     CGroupUtilTest() {}
-    virtual ~CGroupUtilTest() {}
+    ~CGroupUtilTest() override = default;
 };
+
+TEST_F(CGroupUtilTest, ReadMetrics) {
+    std::string dir_path = GetCurrentRunningDir();
+    std::string memory_limit_in_bytes_path(dir_path);
+    memory_limit_in_bytes_path += "/util/test_data/memory.limit_in_bytes";
+    int64_t value;
+    auto st = 
CGroupUtil::read_int_line_from_cgroup_file(memory_limit_in_bytes_path, &value);
+    EXPECT_TRUE(st.ok());
+    EXPECT_EQ(6291456000, value);
+
+    std::string memory_stat_path(dir_path);
+    memory_stat_path += "/util/test_data/memory.stat";
+    std::unordered_map<std::string, int64_t> metrics_map;
+    CGroupUtil::read_int_metric_from_cgroup_file(memory_stat_path, 
metrics_map);
+    EXPECT_EQ(5443584, metrics_map["inactive_file"]);
+    EXPECT_EQ(0, metrics_map["rss"]);
+
+    std::string error_memory_limit_in_bytes_path(dir_path);
+    error_memory_limit_in_bytes_path += 
"/util/test_data/Zzz/memory.limit_in_bytes";
+    int64_t error_value;
+    auto st2 = 
CGroupUtil::read_int_line_from_cgroup_file(error_memory_limit_in_bytes_path,
+                                                          &error_value);
+    EXPECT_FALSE(st2.ok());
+
+    std::string error_memory_stat_path(dir_path);
+    error_memory_stat_path += "/util/test_data/Zzz/memory.stat";
+    std::unordered_map<std::string, int64_t> error_metrics_map;
+    CGroupUtil::read_int_metric_from_cgroup_file(error_memory_stat_path, 
error_metrics_map);
+    EXPECT_TRUE(error_metrics_map.empty());
+}
+
 TEST_F(CGroupUtilTest, memlimit) {
-    int64_t bytes;
-    float cpu_counts;
-    CGroupUtil cgroup_util;
-    LOG(INFO) << cgroup_util.debug_string();
-    Status status1 = cgroup_util.find_cgroup_mem_limit(&bytes);
-    Status status2 = cgroup_util.find_cgroup_cpu_limit(&cpu_counts);
-    if (cgroup_util.enable()) {
+    LOG(INFO) << CGroupMemoryCtl::debug_string();
+    int64_t mem_limit;
+    int64_t mem_usage;
+    auto status1 = CGroupMemoryCtl::find_cgroup_mem_limit(&mem_limit);
+    auto status2 = CGroupMemoryCtl::find_cgroup_mem_usage(&mem_usage);
+    if (CGroupUtil::cgroupsv1_enable() || CGroupUtil::cgroupsv2_enable()) {
         std::ifstream file("/proc/self/cgroup");
         if (file.peek() == std::ifstream::traits_type::eof()) {
             EXPECT_FALSE(status1.ok());
diff --git a/be/test/util/test_data/memory.limit_in_bytes 
b/be/test/util/test_data/memory.limit_in_bytes
new file mode 100644
index 00000000000..2081025feea
--- /dev/null
+++ b/be/test/util/test_data/memory.limit_in_bytes
@@ -0,0 +1 @@
+6291456000
diff --git a/be/test/util/test_data/memory.stat 
b/be/test/util/test_data/memory.stat
new file mode 100644
index 00000000000..858c8433005
--- /dev/null
+++ b/be/test/util/test_data/memory.stat
@@ -0,0 +1,36 @@
+cache 19202048
+rss 0
+rss_huge 0
+shmem 0
+mapped_file 5271552
+dirty 135168
+writeback 0
+swap 0
+pgpgin 14061432
+pgpgout 14058118
+pgfault 17403111
+pgmajfault 15180
+inactive_anon 0
+active_anon 2494464
+inactive_file 5443584
+active_file 10158080
+unevictable 0
+hierarchical_memory_limit 6291456000
+hierarchical_memsw_limit 9223372036854771712
+total_cache 19202048
+total_rss 0
+total_rss_huge 0
+total_shmem 0
+total_mapped_file 5271552
+total_dirty 135168
+total_writeback 0
+total_swap 0
+total_pgpgin 14061432
+total_pgpgout 14058118
+total_pgfault 17403111
+total_pgmajfault 15180
+total_inactive_anon 0
+total_active_anon 2494464
+total_inactive_file 5443584
+total_active_file 10158080
+total_unevictable 0


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]


Reply via email to