This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 92f800ac443 [improvement](cgroup) inactive_file should be treated as 
available memory to avoid query be cancelled (#64347)
92f800ac443 is described below

commit 92f800ac4434a64d693b150e349bed05f87b5524
Author: yiguolei <[email protected]>
AuthorDate: Thu Jun 11 15:55:30 2026 +0800

    [improvement](cgroup) inactive_file should be treated as available memory 
to avoid query be cancelled (#64347)
    
    ### What problem does this PR solve?
    
    Issue Number: close #xxx
    
    Related PR: #xxx
    
    Problem Summary:
    
    Sometimes we may see this errors in cgroup or k8s environment. Allocator
    sys memory check failed: Cannot alloc:5343, ...,process memory used
    85.41 GB exceed limit 108.00 GB or sys available memory 5.88 GB less
    than low water mark 6.00 GB.
    The mem_limit term is false (85.41 < 108). The 5343-byte allocation is
    rejected only by sys available memory 5.88 GB < low water mark 6.00 GB.
    5.88 GiB available implies cgroup_mem_usage of about 114 GiB, roughly 29
    GiB above process memory used (85.41 GiB); that gap is unmapped read
    page cache. The kernel reclaims clean page cache before OOM, so the
    memory is available, but Doris cannot reclaim it and the rejection
    repeats on later allocations. (low water mark 6.00 GB is the default:
    min(120 - 108, 120 * 5%) = 6.)
    
    Before this PR, cgroup_mem_usage = memory.current - inactive_file -
    slab_reclaimable. So some active files page cache is not treated as
    recycleable memory. So cgroup_mem_usage is a bit larger than RSS.
    
    ### Release note
    
    None
    
    ### Check List (For Author)
    
    - Test <!-- At least one of them must be included. -->
        - [ ] Regression test
        - [ ] Unit Test
        - [ ] Manual test (add detailed scripts or steps below)
        - [ ] No need to test or manual test. Explain why:
    - [ ] This is a refactor/code format and no logic has been changed.
            - [ ] Previous test can cover this change.
            - [ ] No code files have been changed.
            - [ ] Other reason <!-- Add your reason?  -->
    
    - Behavior changed:
        - [ ] No.
        - [ ] Yes. <!-- Explain the behavior change -->
    
    - Does this need documentation?
        - [ ] No.
    - [ ] Yes. <!-- Add document PR link here. eg:
    https://github.com/apache/doris-website/pull/1214 -->
    
    ### Check List (For Reviewer who merge this PR)
    
    - [ ] Confirm the release note
    - [ ] Confirm test cases
    - [ ] Confirm document
    - [ ] Add branch pick label <!-- Add branch pick label that this PR
    should merge into -->
---
 be/src/common/cgroup_memory_ctl.cpp                | 34 ++++++++++++++++++----
 be/src/util/cgroup_util.cpp                        |  7 +++++
 be/src/util/mem_info.cpp                           | 16 +++++-----
 .../meta_service_rate_limit_helper.cpp             |  2 +-
 4 files changed, 44 insertions(+), 15 deletions(-)

diff --git a/be/src/common/cgroup_memory_ctl.cpp 
b/be/src/common/cgroup_memory_ctl.cpp
index dddcbd50338..9a70a615dcc 100644
--- a/be/src/common/cgroup_memory_ctl.cpp
+++ b/be/src/common/cgroup_memory_ctl.cpp
@@ -93,6 +93,8 @@ struct CgroupsV2Reader : CGroupMemoryCtl::ICgroupsReader {
             return Status::CgroupError("Error reading {}: {}", 
file_path.string(),
                                        get_str_err_msg());
         }
+        // This means no limit, for example, all process in linux will belong 
to a cgroup, and
+        // the default value of the memory limit in memory.max file is  "max", 
which means no limit.
         if (line == "max") {
             *value = std::numeric_limits<int64_t>::max();
             return Status::OK();
@@ -107,15 +109,37 @@ struct CgroupsV2Reader : CGroupMemoryCtl::ICgroupsReader {
         std::unordered_map<std::string, int64_t> metrics_map;
         CGroupUtil::read_int_metric_from_cgroup_file((_mount_file_dir / 
"memory.stat"),
                                                      metrics_map);
-        if (*value < metrics_map["inactive_file"]) {
-            return Status::CgroupError("CgroupsV2Reader read_memory_usage 
negative memory usage");
+        int64_t inactive_file =
+                metrics_map.contains("inactive_file") ? 
metrics_map["inactive_file"] : 0;
+        int64_t active_file = metrics_map.contains("active_file") ? 
metrics_map["active_file"] : 0;
+        int64_t slab_reclaimable =
+                metrics_map.contains("slab_reclaimable") ? 
metrics_map["slab_reclaimable"] : 0;
+        if (inactive_file < 0 || active_file < 0 || slab_reclaimable < 0) {
+            // In this scenario, not return error, ignore it and print log.
+            LOG(WARNING) << "CgroupsV2Reader read_memory_usage missing 
expected metrics in "
+                            "memory.stat, inactive_file: "
+                         << inactive_file << ", active_file: " << active_file
+                         << ", slab_reclaimable: " << slab_reclaimable;
+            return Status::OK();
+        }
+
+        const int64_t reclaimable_usage = inactive_file + active_file + 
slab_reclaimable;
+        if (*value < reclaimable_usage) {
+            LOG(WARNING)
+                    << "CgroupsV2Reader read_memory_usage negative memory 
usage, not - reclaimable "
+                       "usage any more, just return memory.current: "
+                    << *value << ", inactive_file: " << inactive_file
+                    << ", active_file: " << active_file
+                    << ", slab_reclaimable: " << slab_reclaimable;
+            // In this case, do not return an error, just ignore the negative 
usage and continue.
+            // If return error, the upper system will use os available memory 
instead of cgroup available memory, which may cause OOM more easily.
+            return Status::OK();
         }
-        // the reason why we subtract inactive_file described here:
+        // The reclaimable file cache described here should not be counted as 
used memory:
         // 
https://github.com/ClickHouse/ClickHouse/issues/64652#issuecomment-2149630667
-        *value -= metrics_map["inactive_file"];
         // Part of "slab" that might be reclaimed, such as dentries and inodes.
         // https://arthurchiao.art/blog/cgroupv2-zh/
-        *value -= metrics_map["slab_reclaimable"];
+        *value -= reclaimable_usage;
         return Status::OK();
     }
 
diff --git a/be/src/util/cgroup_util.cpp b/be/src/util/cgroup_util.cpp
index cfda0a4fb77..cc52a60ed75 100644
--- a/be/src/util/cgroup_util.cpp
+++ b/be/src/util/cgroup_util.cpp
@@ -178,6 +178,9 @@ std::string CGroupUtil::cgroupv2_of_process() {
     }
     // With cgroups v2, there will be a *single* line with prefix "0::/"
     // (see https://docs.kernel.org/admin-guide/cgroup-v2.html)
+    // such as 0::/user.slice/user-1005.slice/session-213906.scope this is the 
cgroup name
+    // it should be combined with the default cgroup mount point to get the 
full path to the cgroup, e.g.
+    // /sys/fs/cgroup/user.slice/user-1005.slice/session-213906.scope
     std::string cgroup;
     std::getline(cgroup_name_file, cgroup);
     static const std::string v2_prefix = "0::/";
@@ -198,6 +201,7 @@ std::optional<std::string> 
CGroupUtil::get_cgroupsv2_path(const std::string& sub
     }
 
     std::string cgroup = CGroupUtil::cgroupv2_of_process();
+    //    /sys/fs/cgroup/user.slice/user-1005.slice/session-213906.scope
     auto current_cgroup = cgroup.empty() ? default_cgroups_mount : 
(default_cgroups_mount / cgroup);
 
     // Return the bottom-most nested current memory file. If there is no such 
file at the current
@@ -259,6 +263,9 @@ void CGroupUtil::read_int_metric_from_cgroup_file(
                 metrics_map[key] = value;
             } else if (fields[2] == "kB") {
                 metrics_map[key] = value * 1024L;
+            } else {
+                LOG(WARNING) << "Unknown unit in cgroup file " << 
file_path.string()
+                             << ", line: " << line;
             }
         }
     }
diff --git a/be/src/util/mem_info.cpp b/be/src/util/mem_info.cpp
index 4b48d40d2d9..ebd6f4a5442 100644
--- a/be/src/util/mem_info.cpp
+++ b/be/src/util/mem_info.cpp
@@ -94,7 +94,7 @@ void MemInfo::refresh_proc_meminfo() {
     if (meminfo.is_open()) {
         meminfo.close();
     }
-
+    _s_cgroup_mem_refresh_state = false;
     // refresh cgroup memory
     if (config::enable_use_cgroup_memory_info) {
         if (_s_cgroup_mem_refresh_wait_times >= 0) {
@@ -119,12 +119,13 @@ void MemInfo::refresh_proc_meminfo() {
 
         // cgroup mem limit is refreshed every 10 seconds,
         // cgroup mem usage is refreshed together with memInfo every time, 
which is very frequent.
+        // If _s_cgroup_mem_limit == max, it means get cgroup mem limit failed 
OR the cgroup has no memory limit for example
+        // there is just "max" in memory.max file.
         if (_s_cgroup_mem_limit != std::numeric_limits<int64_t>::max()) {
             int64_t cgroup_mem_usage;
             auto status = 
CGroupMemoryCtl::find_cgroup_mem_usage(&cgroup_mem_usage);
             if (!status.ok()) {
                 _s_cgroup_mem_usage = std::numeric_limits<int64_t>::min();
-                _s_cgroup_mem_refresh_state = false;
                 LOG_EVERY_N(WARNING, 500)
                         << "Refresh cgroup memory usage failed, cgroup mem 
limit: "
                         << _s_cgroup_mem_limit << ", " << status;
@@ -132,17 +133,14 @@ void MemInfo::refresh_proc_meminfo() {
                 _s_cgroup_mem_usage = cgroup_mem_usage;
                 _s_cgroup_mem_refresh_state = true;
             }
-        } else {
-            _s_cgroup_mem_refresh_state = false;
         }
-    } else {
-        _s_cgroup_mem_refresh_state = false;
     }
 
     // 1. calculate physical_mem
     int64_t physical_mem = -1;
-
-    physical_mem = _mem_info_bytes["MemTotal"];
+    if (_mem_info_bytes.find("MemTotal") != _mem_info_bytes.end()) {
+        physical_mem = _mem_info_bytes["MemTotal"];
+    }
     if (_s_cgroup_mem_refresh_state) {
         // In theory, always cgroup_mem_limit < physical_mem
         if (physical_mem < 0) {
@@ -200,7 +198,7 @@ void MemInfo::refresh_proc_meminfo() {
         // Process `MemAvailable = MemFree - LowWaterMark + (PageCache - 
min(PageCache / 2, LowWaterMark))`,
         // from `MemAvailable` in `/proc/meminfo`, calculated by OS.
         // CgroupV2 `MemAvailable = cgroup_mem_limit - cgroup_mem_usage`,
-        // `cgroup_mem_usage = memory.current - inactive_file - 
slab_reclaimable`, in fact,
+        // `cgroup_mem_usage = memory.current - inactive_file - active_file - 
slab_reclaimable`, in fact,
         // there seems to be some memory that can be reused in 
`cgroup_mem_usage`.
         if (mem_available < 0) {
             mem_available = _s_cgroup_mem_limit - _s_cgroup_mem_usage;
diff --git a/cloud/src/meta-service/meta_service_rate_limit_helper.cpp 
b/cloud/src/meta-service/meta_service_rate_limit_helper.cpp
index 0f8c750be6c..9af2aead05c 100644
--- a/cloud/src/meta-service/meta_service_rate_limit_helper.cpp
+++ b/cloud/src/meta-service/meta_service_rate_limit_helper.cpp
@@ -360,7 +360,7 @@ std::optional<CgroupMemoryInfo> get_cgroup_memory_info() {
             }
             auto metrics = read_metrics_map(*dir / "memory.stat");
             int64_t adjusted_usage = *usage;
-            adjusted_usage -= metrics["inactive_file"];
+            adjusted_usage -= metrics["inactive_file"] + 
metrics["active_file"];
             adjusted_usage -= metrics["slab_reclaimable"];
             adjusted_usage = std::max<int64_t>(0, adjusted_usage);
             return CgroupMemoryInfo {limit_bytes, adjusted_usage};


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to