This is an automated email from the ASF dual-hosted git repository.

maxyang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git

commit caeee2140794295b00c06c0a83a1d0cc40acdcee
Author: zhangyue <[email protected]>
AuthorDate: Sun Aug 31 07:02:33 2025 +0000

    Fix gpcheckresgroupv2impl to use dynamic cgroup parent from GUC parameter
    
    This commit fixes issues introduced in
    "Add guc: gp_resource_group_cgroup_parent (#16738)" where the
    gp_resource_group_cgroup_parent GUC parameter was added but the
    gpcheckresgroupv2impl script still used hardcoded "gpdb" paths.
    
    Changes:
    - Implement get_cgroup_parent() method to dynamically retrieve the
      gp_resource_group_cgroup_parent value from database
    - Replace all hardcoded "gpdb" paths with dynamic cgroup parent value
    - Improve error handling in cgroup.c with more descriptive error messages
    - Fix test configuration order: set gp_resource_group_cgroup_parent before
      enabling gp_resource_manager=group-v2 to avoid validation failures
    
    This ensures the cgroup validation script works correctly with custom
    cgroup parent directories configured via the GUC parameter, making the
    resource group feature more flexible for different deployment scenarios.
---
 gpMgmt/bin/gpcheckresgroupv2impl                   | 59 +++++++++++++++++-----
 src/backend/utils/resgroup/cgroup.c                | 12 +++++
 src/include/utils/cgroup.h                         |  9 ++++
 .../resgroup/resgroup_auxiliary_tools_v2.out       | 19 ++++++-
 .../sql/resgroup/resgroup_auxiliary_tools_v2.sql   | 12 ++++-
 5 files changed, 95 insertions(+), 16 deletions(-)

diff --git a/gpMgmt/bin/gpcheckresgroupv2impl b/gpMgmt/bin/gpcheckresgroupv2impl
index 4e15d048eb1..b71d2562628 100755
--- a/gpMgmt/bin/gpcheckresgroupv2impl
+++ b/gpMgmt/bin/gpcheckresgroupv2impl
@@ -2,8 +2,17 @@
 # Copyright (c) 2017, VMware, Inc. or its affiliates.
 
 import os
+import sys
 from functools import reduce
 
+# Add the gppylib path to sys.path to import database connection modules
+try:
+    from gppylib.db import dbconn
+    from pg import DatabaseError
+except ImportError as err:
+    sys.exit('Cannot import modules. Please check that you have sourced '
+             'cloudberry-env.sh. Detail: ' + str(err))
+
 
 class ValidationException(Exception):
     def __init__(self, message):
@@ -29,6 +38,7 @@ class CgroupValidationVersionTwo(CgroupValidation):
     def __init__(self):
         self.mount_point = self.detect_cgroup_mount_point()
         self.tab = {"r": os.R_OK, "w": os.W_OK, "x": os.X_OK, "f": os.F_OK}
+        self.cgroup_parent = self.get_cgroup_parent()
 
     def validate_all(self):
         """
@@ -43,23 +53,46 @@ class CgroupValidationVersionTwo(CgroupValidation):
 
         self.validate_permission("cgroup.procs", "rw")
 
-        self.validate_permission("gpdb/", "rwx")
-        self.validate_permission("gpdb/cgroup.procs", "rw")
+        self.validate_permission(self.cgroup_parent + "/", "rwx")
+        self.validate_permission(self.cgroup_parent + "/cgroup.procs", "rw")
+
+        self.validate_permission(self.cgroup_parent + "/cpu.max", "rw")
+        self.validate_permission(self.cgroup_parent + "/cpu.weight", "rw")
+        self.validate_permission(self.cgroup_parent + "/cpu.weight.nice", "rw")
+        self.validate_permission(self.cgroup_parent + "/cpu.stat", "r")
 
-        self.validate_permission("gpdb/cpu.max", "rw")
-        self.validate_permission("gpdb/cpu.weight", "rw")
-        self.validate_permission("gpdb/cpu.weight.nice", "rw")
-        self.validate_permission("gpdb/cpu.stat", "r")
+        self.validate_permission(self.cgroup_parent + "/cpuset.cpus", "rw")
+        self.validate_permission(self.cgroup_parent + 
"/cpuset.cpus.partition", "rw")
+        self.validate_permission(self.cgroup_parent + "/cpuset.mems", "rw")
+        self.validate_permission(self.cgroup_parent + 
"/cpuset.cpus.effective", "r")
+        self.validate_permission(self.cgroup_parent + 
"/cpuset.mems.effective", "r")
 
-        self.validate_permission("gpdb/cpuset.cpus", "rw")
-        self.validate_permission("gpdb/cpuset.cpus.partition", "rw")
-        self.validate_permission("gpdb/cpuset.mems", "rw")
-        self.validate_permission("gpdb/cpuset.cpus.effective", "r")
-        self.validate_permission("gpdb/cpuset.mems.effective", "r")
+        self.validate_permission(self.cgroup_parent + "/memory.current", "r")
 
-        self.validate_permission("gpdb/memory.current", "r")
+        self.validate_permission(self.cgroup_parent + "/io.max", "rw")
 
-        self.validate_permission("gpdb/io.max", "rw")
+    def get_cgroup_parent(self):
+        """
+        Get the cgroup parent directory from the database GUC parameter
+        gp_resource_group_cgroup_parent. If unable to connect to database
+        or retrieve the parameter, report error using die function.
+        """
+        try:
+            dburl = dbconn.DbURL()
+
+            with dbconn.connect(dburl, utility=True) as conn:
+                # Query the GUC parameter value
+                sql = "SHOW gp_resource_group_cgroup_parent"
+                cursor = dbconn.query(conn, sql)
+                result = cursor.fetchone()
+
+                if result and result[0]:
+                    return result[0]
+                else:
+                    self.die("failed to retrieve 
gp_resource_group_cgroup_parent parameter from database")
+ 
+        except Exception as e:
+            self.die("failed to retrieve gp_resource_group_cgroup_parent 
parameter: {}".format(str(e)))
 
     def die(self, msg):
         raise ValidationException("cgroup is not properly configured: 
{}".format(msg))
diff --git a/src/backend/utils/resgroup/cgroup.c 
b/src/backend/utils/resgroup/cgroup.c
index 7c783fbbf6d..983d247dc96 100644
--- a/src/backend/utils/resgroup/cgroup.c
+++ b/src/backend/utils/resgroup/cgroup.c
@@ -665,6 +665,12 @@ permListCheck(const PermList *permlist, Oid group, bool 
report)
                                                                        prop[0] 
? "file" : "directory",
                                                                        path);
                        }
+                       else
+                       {
+                               CGROUP_CONFIG_WARNING("invalid %s name '%s': 
%m",
+                                                                         
prop[0] ? "file" : "directory",
+                                                                         path);
+                       }
                        return false;
                }
 
@@ -678,6 +684,12 @@ permListCheck(const PermList *permlist, Oid group, bool 
report)
                                                                        prop[0] 
? "file" : "directory",
                                                                        path);
                        }
+                       else
+                       {
+                               CGROUP_CONFIG_WARNING("can't access %s '%s': 
%m",
+                                                                         
prop[0] ? "file" : "directory",
+                                                                         path);
+                       }
                        return false;
                }
        }
diff --git a/src/include/utils/cgroup.h b/src/include/utils/cgroup.h
index 39ef82aeb52..e43e1a88412 100644
--- a/src/include/utils/cgroup.h
+++ b/src/include/utils/cgroup.h
@@ -18,9 +18,18 @@
 #include "postgres.h"
 #include "nodes/pg_list.h"
 
+extern char *gp_resource_group_cgroup_parent;
+
 #define MAX_CGROUP_PATHLEN 256
 #define MAX_CGROUP_CONTENTLEN 1024  
 
+#define CGROUP_CONFIG_WARNING(msg, ...) \
+       ereport(WARNING, \
+                       (errmsg("cgroup is not properly configured: " msg, 
##__VA_ARGS__), \
+                        errhint("Please check whether the directory 
'/sys/fs/cgroup/%s' exists when gp_resource_manager = 'group-v2' and 
gp_resource_group_cgroup_parent = '%s'.", \
+                        gp_resource_group_cgroup_parent ? 
gp_resource_group_cgroup_parent : "", \
+                        gp_resource_group_cgroup_parent ? 
gp_resource_group_cgroup_parent : "")))
+
 #define CGROUP_ERROR(...) elog(ERROR, __VA_ARGS__)
 #define CGROUP_CONFIG_ERROR(...) \
        CGROUP_ERROR("cgroup is not properly configured: " __VA_ARGS__)
diff --git 
a/src/test/isolation2/expected/resgroup/resgroup_auxiliary_tools_v2.out 
b/src/test/isolation2/expected/resgroup/resgroup_auxiliary_tools_v2.out
index de159f4910e..473c2fff185 100644
--- a/src/test/isolation2/expected/resgroup/resgroup_auxiliary_tools_v2.out
+++ b/src/test/isolation2/expected/resgroup/resgroup_auxiliary_tools_v2.out
@@ -20,10 +20,20 @@ ERROR:  language "plpython3u" already exists
 -- end_ignore
 
 -- enable resource group and restart cluster.
+-- prerequisites:
+--     1. '/sys/fs/cgroup/gpdb' must exist,
+--        otherwise create it before run installcheck-resgroup-v2;
+--     2. 'gpconfig -c gp_resource_group_cgroup_parent -v "gpdb" && gpstop 
-rai'
+--        must run before 'gpconfig -c gp_resource_manager -v group-v2', 
because
+--        during the process of setting gp_resource_manager to group-v2, the
+--        system will check whether the directory
+--        '/sys/fs/cgroup/$gp_resource_group_cgroup_parent' exists.
 -- start_ignore
-! gpconfig -c gp_resource_manager -v group-v2;
+! gpconfig -c gp_resource_group_cgroup_parent -v "gpdb";
 
-! gpconfig -c gp_resource_group_cgroup_parent -v "gpdb"
+! gpstop -rai;
+
+! gpconfig -c gp_resource_manager -v group-v2;
 
 ! gpconfig -c max_connections -v 250 -m 25;
 
@@ -39,6 +49,11 @@ ERROR:  language "plpython3u" already exists
 ---------------------
  group-v2            
 (1 row)
+0: SHOW gp_resource_group_cgroup_parent;
+ gp_resource_group_cgroup_parent 
+---------------------------------
+ gpdb                            
+(1 row)
 
 -- resource queue statistics should not crash
 0: SELECT * FROM pg_resqueue_status;
diff --git a/src/test/isolation2/sql/resgroup/resgroup_auxiliary_tools_v2.sql 
b/src/test/isolation2/sql/resgroup/resgroup_auxiliary_tools_v2.sql
index e274acbb8f3..cf708526ad3 100644
--- a/src/test/isolation2/sql/resgroup/resgroup_auxiliary_tools_v2.sql
+++ b/src/test/isolation2/sql/resgroup/resgroup_auxiliary_tools_v2.sql
@@ -19,9 +19,18 @@ CREATE LANGUAGE plpython3u;
 -- end_ignore
 
 -- enable resource group and restart cluster.
+-- prerequisites:
+--     1. '/sys/fs/cgroup/gpdb' must exist,
+--        otherwise create it before run installcheck-resgroup-v2;
+--     2. 'gpconfig -c gp_resource_group_cgroup_parent -v "gpdb" && gpstop 
-rai'
+--        must run before 'gpconfig -c gp_resource_manager -v group-v2', 
because
+--        during the process of setting gp_resource_manager to group-v2, the
+--        system will check whether the directory
+--        '/sys/fs/cgroup/$gp_resource_group_cgroup_parent' exists.
 -- start_ignore
+! gpconfig -c gp_resource_group_cgroup_parent -v "gpdb";
+! gpstop -rai;
 ! gpconfig -c gp_resource_manager -v group-v2;
-! gpconfig -c gp_resource_group_cgroup_parent -v "gpdb"
 ! gpconfig -c max_connections -v 250 -m 25;
 ! gpconfig -c runaway_detector_activation_percent -v 100;
 ! gpstop -rai;
@@ -30,6 +39,7 @@ CREATE LANGUAGE plpython3u;
 -- after the restart we need a new connection to run the queries
 
 0: SHOW gp_resource_manager;
+0: SHOW gp_resource_group_cgroup_parent;
 
 -- resource queue statistics should not crash
 0: SELECT * FROM pg_resqueue_status;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to