On the worker node, check whether cgroups are mounted:

grep cgroup /proc/mounts

(normally they are mounted under /sys/fs/cgroup)
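
On a cgroups v1 node the output should look roughly like this (the exact
controllers and mount options will vary between distros):

tmpfs /sys/fs/cgroup tmpfs ro,nosuid,nodev,noexec,mode=755 0 0
cgroup /sys/fs/cgroup/memory cgroup rw,nosuid,nodev,noexec,relatime,memory 0 0
cgroup /sys/fs/cgroup/cpuset cgroup rw,nosuid,nodev,noexec,relatime,cpuset 0 0
cgroup /sys/fs/cgroup/devices cgroup rw,nosuid,nodev,noexec,relatime,devices 0 0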

Then check whether Slurm is setting up the cgroup hierarchy for the job:

find /sys/fs/cgroup | grep slurm

e.g.

[root@spartan-gpgpu164 ~]# find /sys/fs/cgroup/memory | grep slurm
/sys/fs/cgroup/memory/slurm
/sys/fs/cgroup/memory/slurm/uid_14633
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_0
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_0/memory.kmem.tcp.max_usage_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_0/memory.kmem.tcp.failcnt
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_0/memory.kmem.tcp.usage_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_0/memory.kmem.tcp.limit_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_0/memory.memsw.failcnt
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_0/memory.memsw.limit_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_0/memory.memsw.max_usage_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_0/memory.memsw.usage_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_0/memory.kmem.slabinfo
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_0/memory.kmem.max_usage_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_0/memory.kmem.failcnt
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_0/memory.kmem.usage_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_0/memory.kmem.limit_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_0/memory.numa_stat
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_0/memory.pressure_level
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_0/memory.oom_control
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_0/memory.move_charge_at_immigrate
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_0/memory.swappiness
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_0/memory.use_hierarchy
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_0/memory.force_empty
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_0/memory.stat
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_0/memory.failcnt
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_0/memory.soft_limit_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_0/memory.limit_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_0/memory.max_usage_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_0/memory.usage_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_0/cgroup.clone_children
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_0/cgroup.event_control
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_0/notify_on_release
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_0/cgroup.procs
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_0/tasks
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_batch
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_batch/memory.kmem.tcp.max_usage_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_batch/memory.kmem.tcp.failcnt
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_batch/memory.kmem.tcp.usage_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_batch/memory.kmem.tcp.limit_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_batch/memory.memsw.failcnt
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_batch/memory.memsw.limit_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_batch/memory.memsw.max_usage_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_batch/memory.memsw.usage_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_batch/memory.kmem.slabinfo
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_batch/memory.kmem.max_usage_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_batch/memory.kmem.failcnt
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_batch/memory.kmem.usage_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_batch/memory.kmem.limit_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_batch/memory.numa_stat
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_batch/memory.pressure_level
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_batch/memory.oom_control
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_batch/memory.move_charge_at_immigrate
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_batch/memory.swappiness
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_batch/memory.use_hierarchy
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_batch/memory.force_empty
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_batch/memory.stat
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_batch/memory.failcnt
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_batch/memory.soft_limit_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_batch/memory.limit_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_batch/memory.max_usage_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_batch/memory.usage_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_batch/cgroup.clone_children
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_batch/cgroup.event_control
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_batch/notify_on_release
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_batch/cgroup.procs
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_batch/tasks
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_extern
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_extern/memory.kmem.tcp.max_usage_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_extern/memory.kmem.tcp.failcnt
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_extern/memory.kmem.tcp.usage_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_extern/memory.kmem.tcp.limit_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_extern/memory.memsw.failcnt
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_extern/memory.memsw.limit_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_extern/memory.memsw.max_usage_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_extern/memory.memsw.usage_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_extern/memory.kmem.slabinfo
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_extern/memory.kmem.max_usage_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_extern/memory.kmem.failcnt
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_extern/memory.kmem.usage_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_extern/memory.kmem.limit_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_extern/memory.numa_stat
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_extern/memory.pressure_level
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_extern/memory.oom_control
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_extern/memory.move_charge_at_immigrate
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_extern/memory.swappiness
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_extern/memory.use_hierarchy
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_extern/memory.force_empty
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_extern/memory.stat
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_extern/memory.failcnt
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_extern/memory.soft_limit_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_extern/memory.limit_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_extern/memory.max_usage_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_extern/memory.usage_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_extern/cgroup.clone_children
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_extern/cgroup.event_control
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_extern/notify_on_release
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_extern/cgroup.procs
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/step_extern/tasks
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/memory.kmem.tcp.max_usage_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/memory.kmem.tcp.failcnt
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/memory.kmem.tcp.usage_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/memory.kmem.tcp.limit_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/memory.memsw.failcnt
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/memory.memsw.limit_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/memory.memsw.max_usage_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/memory.memsw.usage_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/memory.kmem.slabinfo
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/memory.kmem.max_usage_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/memory.kmem.failcnt
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/memory.kmem.usage_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/memory.kmem.limit_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/memory.numa_stat
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/memory.pressure_level
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/memory.oom_control
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/memory.move_charge_at_immigrate
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/memory.swappiness
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/memory.use_hierarchy
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/memory.force_empty
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/memory.stat
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/memory.failcnt
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/memory.soft_limit_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/memory.limit_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/memory.max_usage_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/memory.usage_in_bytes
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/cgroup.clone_children
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/cgroup.event_control
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/notify_on_release
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/cgroup.procs
/sys/fs/cgroup/memory/slurm/uid_14633/job_48210004/tasks

Note that the paths above are for cgroups v1. If you are using a more recent OS
(e.g. Ubuntu 22.04), the default is cgroups v2, so you have to use cgroups v2
instead.
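
A quick way to tell which version a node is actually running (worth verifying
rather than trusting my memory) is the filesystem type of /sys/fs/cgroup:

stat -fc %T /sys/fs/cgroup/
# cgroup2fs -> unified cgroups v2 hierarchy
# tmpfs     -> legacy cgroups v1 controllers mounted underneath

On v2 the per-controller directories above don't exist; everything lives in a
single hierarchy, and from memory Slurm 22.05 and later selects it with
CgroupPlugin=cgroup/v2 in cgroup.conf.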

Sean
________________________________
From: slurm-users <slurm-users-boun...@lists.schedmd.com> on behalf of Boris Yazlovitsky <boris...@gmail.com>
Sent: Friday, 23 June 2023 12:49
To: Slurm User Community List <slurm-users@lists.schedmd.com>
Subject: Re: [slurm-users] [EXT] --mem is not limiting the job's memory

it's still not constraining memory...

a memhog job continues to memhog:

boris@rod:~/scripts$ sacct --starttime=2023-05-01 \
    --format=jobid,user,start,elapsed,reqmem,maxrss,maxvmsize,nodelist,state,exit -j 199
JobID             User               Start    Elapsed     ReqMem     MaxRSS  MaxVMSize        NodeList      State ExitCode
------------ --------- ------------------- ---------- ---------- ---------- ---------- --------------- ---------- --------
199              boris 2023-06-23T02:42:30   00:01:21         1M                               milhouse  COMPLETED      0:0
199.batch              2023-06-23T02:42:30   00:01:21            104857988K 104858064K        milhouse  COMPLETED      0:0

One thing I noticed is that the machines I'm working on do not have libcgroup
and libcgroup-dev installed - but Slurm has its own cgroup implementation,
doesn't it? The slurmd processes do load the /usr/lib/slurm/*cgroup.so plugins.
I will try recompiling Slurm with the libcgroup packages present.
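
I'll also check from inside an allocation whether the memory cgroup actually gets
created with the right limit - something along these lines (the path assumes the
usual cgroups v1 layout, /sys/fs/cgroup/memory/slurm/uid_<uid>/job_<jobid>):

srun --mem=5G --pty bash -c \
  'cat /sys/fs/cgroup/memory/slurm/uid_$(id -u)/job_${SLURM_JOB_ID}/memory.limit_in_bytes'

If the limit is applied that should print roughly 5368709120; if the path does not
exist at all, then slurmd isn't creating the job cgroup in the first place.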

On Thu, Jun 22, 2023 at 6:04 PM Ozeryan, Vladimir <vladimir.ozer...@jhuapl.edu> wrote:

No worries,

No, we don’t have any OS level settings, only “allowed_devices.conf” which just 
has /dev/random, /dev/tty and stuff like that.



But I think this could be the culprit - check the man page for cgroup.conf:

AllowedRAMSpace=100



I would just leave these four:

CgroupAutomount=yes
ConstrainCores=yes
ConstrainDevices=yes
ConstrainRAMSpace=yes
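
It is also worth double-checking that slurm.conf actually hands jobs over to the
cgroup plugins and counts memory as a consumable resource - cgroup.conf on its own
won't constrain anything otherwise. Roughly something like (the exact select
plugin and parameters depend on your setup):

ProctrackType=proctrack/cgroup
TaskPlugin=task/cgroup
SelectType=select/cons_tres
SelectTypeParameters=CR_Core_Memory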



Vlad.



From: slurm-users <slurm-users-boun...@lists.schedmd.com> On Behalf Of Boris Yazlovitsky
Sent: Thursday, June 22, 2023 5:40 PM
To: Slurm User Community List <slurm-users@lists.schedmd.com>
Subject: Re: [slurm-users] [EXT] --mem is not limiting the job's memory



Thank you Vlad - it looks like we have the same 'yes' settings.

Do you remember whether you had to change any settings at the OS level or in the
kernel to make it work?



-b



On Thu, Jun 22, 2023 at 5:31 PM Ozeryan, Vladimir <vladimir.ozer...@jhuapl.edu> wrote:

Hello,



We have the following configured and it seems to be working ok.



CgroupAutomount=yes
ConstrainCores=yes
ConstrainDevices=yes
ConstrainRAMSpace=yes

Vlad.



From: slurm-users <slurm-users-boun...@lists.schedmd.com> On Behalf Of Boris Yazlovitsky
Sent: Thursday, June 22, 2023 4:50 PM
To: Slurm User Community List <slurm-users@lists.schedmd.com>
Subject: Re: [slurm-users] [EXT] --mem is not limiting the job's memory



Hello Vladimir, thank you for your response.



this is the cgroups.conf file:

CgroupAutomount=yes
ConstrainCores=yes
ConstrainDevices=yes
ConstrainRAMSpace=yes
ConstrainSwapSpace=yes
MaxRAMPercent=90
AllowedSwapSpace=0
AllowedRAMSpace=100
MemorySwappiness=0
MaxSwapPercent=0



/etc/default/grub:

GRUB_DEFAULT=0
GRUB_TIMEOUT_STYLE=hidden
GRUB_TIMEOUT=0
GRUB_DISTRIBUTOR=`lsb_release -i -s 2> /dev/null || echo Debian`
GRUB_CMDLINE_LINUX_DEFAULT=""
GRUB_CMDLINE_LINUX="net.ifnames=0 biosdevname=0 cgroup_enable=memory swapaccount=1"
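
I still need to confirm those flags are actually on the running kernel's command
line, i.e. after an update-grub and a reboot something like:

cat /proc/cmdline    # should include cgroup_enable=memory swapaccount=1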



what other cgroup settings need to be set?



&& thank you!

-b



On Thu, Jun 22, 2023 at 4:02 PM Ozeryan, Vladimir <vladimir.ozer...@jhuapl.edu> wrote:

--mem=5G should allocate 5G of memory per node.

Are your cgroups configured?
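
An easy way to see what the cluster is actually running with is something like:

scontrol show config | grep -i -E 'cgroup|taskplugin|selecttype'

which should show the TaskPlugin, ProctrackType, and select parameters in effect.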



From: slurm-users <slurm-users-boun...@lists.schedmd.com> On Behalf Of Boris Yazlovitsky
Sent: Thursday, June 22, 2023 3:28 PM
To: slurm-users@lists.schedmd.com
Subject: [EXT] [slurm-users] --mem is not limiting the job's memory



Running Slurm 22.03.02 on an Ubuntu 22.04 server.

Jobs submitted with --mem=5g are able to allocate an unlimited amount of memory.



How can I limit, at job submission time, how much memory a job can grab?



thanks, and best regards!
Boris

