This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-4.1 by this push:
     new 4c627a11290 branch-4.1: [improvement](cgroup) inactive_file should be 
treated as available memory to avoid query be cancelled #64347 (#64411)
4c627a11290 is described below

commit 4c627a1129053fb028d9ecdadd90d832bed765c8
Author: github-actions[bot] 
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Tue Jun 16 14:23:26 2026 +0800

    branch-4.1: [improvement](cgroup) inactive_file should be treated as 
available memory to avoid query be cancelled #64347 (#64411)
    
    Cherry-picked from #64347
    
    Co-authored-by: yiguolei <[email protected]>
---
 be/src/common/cgroup_memory_ctl.cpp                | 34 ++++++++++++++++++----
 be/src/util/cgroup_util.cpp                        |  7 +++++
 be/src/util/mem_info.cpp                           | 16 +++++-----
 .../meta_service_rate_limit_helper.cpp             |  2 +-
 4 files changed, 44 insertions(+), 15 deletions(-)

diff --git a/be/src/common/cgroup_memory_ctl.cpp 
b/be/src/common/cgroup_memory_ctl.cpp
index dddcbd50338..9a70a615dcc 100644
--- a/be/src/common/cgroup_memory_ctl.cpp
+++ b/be/src/common/cgroup_memory_ctl.cpp
@@ -93,6 +93,8 @@ struct CgroupsV2Reader : CGroupMemoryCtl::ICgroupsReader {
             return Status::CgroupError("Error reading {}: {}", 
file_path.string(),
                                        get_str_err_msg());
         }
+        // This means no limit, for example, all process in linux will belong 
to a cgroup, and
+        // the default value of the memory limit in memory.max file is  "max", 
which means no limit.
         if (line == "max") {
             *value = std::numeric_limits<int64_t>::max();
             return Status::OK();
@@ -107,15 +109,37 @@ struct CgroupsV2Reader : CGroupMemoryCtl::ICgroupsReader {
         std::unordered_map<std::string, int64_t> metrics_map;
         CGroupUtil::read_int_metric_from_cgroup_file((_mount_file_dir / 
"memory.stat"),
                                                      metrics_map);
-        if (*value < metrics_map["inactive_file"]) {
-            return Status::CgroupError("CgroupsV2Reader read_memory_usage 
negative memory usage");
+        int64_t inactive_file =
+                metrics_map.contains("inactive_file") ? 
metrics_map["inactive_file"] : 0;
+        int64_t active_file = metrics_map.contains("active_file") ? 
metrics_map["active_file"] : 0;
+        int64_t slab_reclaimable =
+                metrics_map.contains("slab_reclaimable") ? 
metrics_map["slab_reclaimable"] : 0;
+        if (inactive_file < 0 || active_file < 0 || slab_reclaimable < 0) {
+            // In this scenario, not return error, ignore it and print log.
+            LOG(WARNING) << "CgroupsV2Reader read_memory_usage missing 
expected metrics in "
+                            "memory.stat, inactive_file: "
+                         << inactive_file << ", active_file: " << active_file
+                         << ", slab_reclaimable: " << slab_reclaimable;
+            return Status::OK();
+        }
+
+        const int64_t reclaimable_usage = inactive_file + active_file + 
slab_reclaimable;
+        if (*value < reclaimable_usage) {
+            LOG(WARNING)
+                    << "CgroupsV2Reader read_memory_usage negative memory 
usage, not - reclaimable "
+                       "usage any more, just return memory.current: "
+                    << *value << ", inactive_file: " << inactive_file
+                    << ", active_file: " << active_file
+                    << ", slab_reclaimable: " << slab_reclaimable;
+            // In this case, do not return an error, just ignore the negative 
usage and continue.
+            // If return error, the upper system will use os available memory 
instead of cgroup available memory, which may cause OOM more easily.
+            return Status::OK();
         }
-        // the reason why we subtract inactive_file described here:
+        // The reclaimable file cache described here should not be counted as 
used memory:
         // 
https://github.com/ClickHouse/ClickHouse/issues/64652#issuecomment-2149630667
-        *value -= metrics_map["inactive_file"];
         // Part of "slab" that might be reclaimed, such as dentries and inodes.
         // https://arthurchiao.art/blog/cgroupv2-zh/
-        *value -= metrics_map["slab_reclaimable"];
+        *value -= reclaimable_usage;
         return Status::OK();
     }
 
diff --git a/be/src/util/cgroup_util.cpp b/be/src/util/cgroup_util.cpp
index cfda0a4fb77..cc52a60ed75 100644
--- a/be/src/util/cgroup_util.cpp
+++ b/be/src/util/cgroup_util.cpp
@@ -178,6 +178,9 @@ std::string CGroupUtil::cgroupv2_of_process() {
     }
     // With cgroups v2, there will be a *single* line with prefix "0::/"
     // (see https://docs.kernel.org/admin-guide/cgroup-v2.html)
+    // such as 0::/user.slice/user-1005.slice/session-213906.scope this is the 
cgroup name
+    // it should be combined with the default cgroup mount point to get the 
full path to the cgroup, e.g.
+    // /sys/fs/cgroup/user.slice/user-1005.slice/session-213906.scope
     std::string cgroup;
     std::getline(cgroup_name_file, cgroup);
     static const std::string v2_prefix = "0::/";
@@ -198,6 +201,7 @@ std::optional<std::string> 
CGroupUtil::get_cgroupsv2_path(const std::string& sub
     }
 
     std::string cgroup = CGroupUtil::cgroupv2_of_process();
+    //    /sys/fs/cgroup/user.slice/user-1005.slice/session-213906.scope
     auto current_cgroup = cgroup.empty() ? default_cgroups_mount : 
(default_cgroups_mount / cgroup);
 
     // Return the bottom-most nested current memory file. If there is no such 
file at the current
@@ -259,6 +263,9 @@ void CGroupUtil::read_int_metric_from_cgroup_file(
                 metrics_map[key] = value;
             } else if (fields[2] == "kB") {
                 metrics_map[key] = value * 1024L;
+            } else {
+                LOG(WARNING) << "Unknown unit in cgroup file " << 
file_path.string()
+                             << ", line: " << line;
             }
         }
     }
diff --git a/be/src/util/mem_info.cpp b/be/src/util/mem_info.cpp
index 4b48d40d2d9..ebd6f4a5442 100644
--- a/be/src/util/mem_info.cpp
+++ b/be/src/util/mem_info.cpp
@@ -94,7 +94,7 @@ void MemInfo::refresh_proc_meminfo() {
     if (meminfo.is_open()) {
         meminfo.close();
     }
-
+    _s_cgroup_mem_refresh_state = false;
     // refresh cgroup memory
     if (config::enable_use_cgroup_memory_info) {
         if (_s_cgroup_mem_refresh_wait_times >= 0) {
@@ -119,12 +119,13 @@ void MemInfo::refresh_proc_meminfo() {
 
         // cgroup mem limit is refreshed every 10 seconds,
         // cgroup mem usage is refreshed together with memInfo every time, 
which is very frequent.
+        // If _s_cgroup_mem_limit == max, it means get cgroup mem limit failed 
OR the cgroup has no memory limit for example
+        // there is just "max" in memory.max file.
         if (_s_cgroup_mem_limit != std::numeric_limits<int64_t>::max()) {
             int64_t cgroup_mem_usage;
             auto status = 
CGroupMemoryCtl::find_cgroup_mem_usage(&cgroup_mem_usage);
             if (!status.ok()) {
                 _s_cgroup_mem_usage = std::numeric_limits<int64_t>::min();
-                _s_cgroup_mem_refresh_state = false;
                 LOG_EVERY_N(WARNING, 500)
                         << "Refresh cgroup memory usage failed, cgroup mem 
limit: "
                         << _s_cgroup_mem_limit << ", " << status;
@@ -132,17 +133,14 @@ void MemInfo::refresh_proc_meminfo() {
                 _s_cgroup_mem_usage = cgroup_mem_usage;
                 _s_cgroup_mem_refresh_state = true;
             }
-        } else {
-            _s_cgroup_mem_refresh_state = false;
         }
-    } else {
-        _s_cgroup_mem_refresh_state = false;
     }
 
     // 1. calculate physical_mem
     int64_t physical_mem = -1;
-
-    physical_mem = _mem_info_bytes["MemTotal"];
+    if (_mem_info_bytes.find("MemTotal") != _mem_info_bytes.end()) {
+        physical_mem = _mem_info_bytes["MemTotal"];
+    }
     if (_s_cgroup_mem_refresh_state) {
         // In theory, always cgroup_mem_limit < physical_mem
         if (physical_mem < 0) {
@@ -200,7 +198,7 @@ void MemInfo::refresh_proc_meminfo() {
         // Process `MemAvailable = MemFree - LowWaterMark + (PageCache - 
min(PageCache / 2, LowWaterMark))`,
         // from `MemAvailable` in `/proc/meminfo`, calculated by OS.
         // CgroupV2 `MemAvailable = cgroup_mem_limit - cgroup_mem_usage`,
-        // `cgroup_mem_usage = memory.current - inactive_file - 
slab_reclaimable`, in fact,
+        // `cgroup_mem_usage = memory.current - inactive_file - active_file - 
slab_reclaimable`, in fact,
         // there seems to be some memory that can be reused in 
`cgroup_mem_usage`.
         if (mem_available < 0) {
             mem_available = _s_cgroup_mem_limit - _s_cgroup_mem_usage;
diff --git a/cloud/src/meta-service/meta_service_rate_limit_helper.cpp 
b/cloud/src/meta-service/meta_service_rate_limit_helper.cpp
index 0f8c750be6c..9af2aead05c 100644
--- a/cloud/src/meta-service/meta_service_rate_limit_helper.cpp
+++ b/cloud/src/meta-service/meta_service_rate_limit_helper.cpp
@@ -360,7 +360,7 @@ std::optional<CgroupMemoryInfo> get_cgroup_memory_info() {
             }
             auto metrics = read_metrics_map(*dir / "memory.stat");
             int64_t adjusted_usage = *usage;
-            adjusted_usage -= metrics["inactive_file"];
+            adjusted_usage -= metrics["inactive_file"] + 
metrics["active_file"];
             adjusted_usage -= metrics["slab_reclaimable"];
             adjusted_usage = std::max<int64_t>(0, adjusted_usage);
             return CgroupMemoryInfo {limit_bytes, adjusted_usage};


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to