Il 09/11/20 12:53, Diego Zuccato ha scritto:
> Seems my corrections actually work only for single-node jobs.
> In case of multi-node jobs, it only considers the memory used on one
> node, hence underestimates the real efficiency.
> Can someone more knowledgeable than me spot the error?

It seems I managed to have it account for the memory on all the nodes. See the attached file. The results seem quite meaningful and match the ones computed by hand.
-- Diego Zuccato DIFA - Dip. di Fisica e Astronomia Servizi Informatici Alma Mater Studiorum - Università di Bologna V.le Berti-Pichat 6/2 - 40127 Bologna - Italy tel.: +39 051 20 95786
#!/usr/bin/perl
#
# seff - print a CPU and memory efficiency report for a single Slurm job.
#
# Usage: seff [-h] [-v] [-d] <Jobid>
#
# The job's accounting record is fetched through Slurmdb, the CPU time and
# the per-step maximum memory usage are accumulated over all job steps, and
# the totals are compared against the requested cores and memory.
#
# Exit codes: 1 = usage/version shown, 2 = job not found,
#             3 = badly formatted array job id.

use warnings;
use strict qw/vars/;
use Getopt::Std;
use POSIX qw/pow/;
use Sys::Hostname;
use Slurmdb ':all';
use Slurm ':all';
#use Data::Dumper;

my $VERSION = "2.1";

# This script is roughly equivalent to:
# sacct -P -n -a --format JobID,User,Group,State,Cluster,AllocCPUS,REQMEM,TotalCPU,Elapsed,MaxRSS,ExitCode,NNodes,NTasks -j <job_id>

my %opts;
getopts('hvdf:', \%opts);

if (exists $opts{v}) {
    print "seff Version $VERSION\n";
    exit 1;
}

if (exists $opts{h} || scalar @ARGV != 1) {
    print "Usage: seff [Options] <Jobid>\n";
    print " Options:\n";
    print " -h Help menu\n";
    print " -v Version\n";
    print " -d Debug mode: display raw Slurm data\n";
    exit 1;
}

my $mydebug = 0;
if (exists $opts{d}) {
    $mydebug = 1;
}

my $jobid_arg = $ARGV[0];

my $db_conn = Slurmdb::connection_get();
my $slurm   = Slurm::new();

# Get cluster name from Slurm config file.
my $conf        = $slurm->load_ctl_conf();
my $clustername = $conf->{'cluster_name'};

my %job_cond = ();
$job_cond{without_usage_truncation} = 1;
$job_cond{cluster_list}             = [$clustername];
$job_cond{step_list}                = $jobid_arg;
$job_cond{usage_start}              = 0;
$job_cond{usage_end}                = 0;

# Get and test for a single job.
my $jobs = Slurmdb::jobs_get($db_conn, \%job_cond);
if (!defined $jobs || scalar @$jobs < 1) {
    print STDERR "Job not found.\n";
    exit 2;
}

my $job = $jobs->[0];
#print Dumper($job);

my $jobid = $job->{'jobid'};
my $user  = $job->{'user'};

# getgrgid() returns undef when the gid has no group entry on this host;
# fall back to the numeric gid so the report line stays meaningful.
my $group = getgrgid($job->{'gid'});
if (!defined $group) {
    $group = $job->{'gid'};
}

my $state = $slurm->job_state_string($job->{'state'});
$clustername = $job->{'cluster'};

my $ncpus = $job->{'req_cpus'};    #@@@ was alloc_cpus
# Check for missing number of cpus.
if ($ncpus == 0) {
    $ncpus = 1;
}

my $reqmem = $job->{'req_mem'};
my $nnodes = $job->{'alloc_nodes'};
# Check for missing number of nodes.
if ($nnodes == 0) {
    $nnodes = 1;
}

# req_mem carries the MEM_PER_CPU flag bit when the request was per core;
# otherwise it is a per-node request. Either way it is scaled by 1024 and
# multiplied out to a whole-job total.
my $pernode;
if ($reqmem & MEM_PER_CPU) {
    $reqmem  = ($reqmem & ~MEM_PER_CPU) * 1024 * $ncpus;
    $pernode = 0;
}
else {
    $reqmem  = $reqmem * 1024 * $nnodes;
    $pernode = 1;
}

my $walltime = $job->{'elapsed'};

# Only use hi-order byte for error code.
my $exit_status = $job->{'exitcode'} >> 8;

my $array_job_id = $job->{'array_job_id'};
my $array_jobid  = "";
if ($array_job_id != 0) {
    # Convert array_task_id to a signed long integer.
    my $array_task_id = unpack('l', pack('l', $job->{'array_task_id'}));
    if ($array_task_id == -2) {
        print STDERR "Badly formatted array jobid $array_job_id with task_id = -2\n";
        exit 3;
    }
    $array_jobid = "${array_job_id}_${array_task_id}";
}

my $tot_cpu_sec  = 0;
my $tot_cpu_usec = 0;
my $mem          = 0;
my $ntasks       = 0;

for my $step (@{ $job->{'steps'} || [] }) {
    $tot_cpu_sec  += $step->{'tot_cpu_sec'}  || 0;
    $tot_cpu_usec += $step->{'tot_cpu_usec'} || 0;

    # Memory comes from entry "2" of tres_usage_in_max (instead of
    # stats->rss_max), divided by 1024.
    # NOTE(review): presumably TRES id 2 is memory in bytes, so this yields
    # KB - confirm against the Slurm TRES table.
    my $lmem = tres_value($step->{'stats'}{'tres_usage_in_max'}, '2') / 1024;

    # Weight each step's per-task maximum by its task count so that
    # multi-node / multi-task jobs account for memory on every node.
    # NOTE(review): this sums over all steps of the job; confirm that is
    # the intended estimate when steps do not run concurrently.
    my $ltasks = $step->{'ntasks'} || 0;
    $ntasks += $ltasks;
    $mem    += $lmem * $ltasks;
}

# Round the accumulated microseconds to the nearest whole second.
my $cput = $tot_cpu_sec + int(($tot_cpu_usec / 1000000) + 0.5);

if ($mydebug) {
    print "Slurm data: JobID ArrayJobID User Group State Clustername Ncpus Nnodes Ntasks Reqmem PerNode Cput Walltime Mem ExitStatus\n";
    print "Slurm data: $jobid $array_jobid $user $group $state $clustername $ncpus $nnodes $ntasks $reqmem $pernode $cput $walltime $mem $exit_status\n\n";
}

print "Job ID: $jobid\n";
if (length $array_jobid) {
    print "Array Job ID: $array_jobid\n";
}
print "Cluster: $clustername\n";
print "User/Group: $user/$group\n";
if ($state eq "PENDING" || $state eq "RUNNING") {
    print "State: $state\n";
}
else {
    print "State: $state (exit code $exit_status)\n";
}

if ($ncpus == 1) {
    print "Cores: $ncpus\n";
}
else {
    print "Nodes: $nnodes\n";
    printf "Cores per node: %d\n", $ncpus / $nnodes;
}

if ($state ne "PENDING") {
    my $corewalltime = $walltime * $ncpus;
    my $cpu_eff;
    if ($corewalltime != 0) {
        $cpu_eff = $cput / $corewalltime * 100;
    }
    else {
        $cpu_eff = 0.0;
    }
    printf("CPU Utilized: %s\n", time2str($cput));
    printf("CPU Efficiency: %.2f%% of %s core-walltime\n",
        $cpu_eff, time2str($corewalltime));

    if ($ntasks == 1) {
        printf("Memory Utilized: %s\n", kbytes2str($mem));
    }
    else {
        printf("Memory Utilized: %s (estimated maximum)\n", kbytes2str($mem));
    }

    my $mem_eff;
    if ($reqmem != 0) {
        $mem_eff = $mem / $reqmem * 100;
    }
    else {
        $mem_eff = 0.0;
    }

    if ($ntasks == 1) {
        printf("Memory Efficiency: %.2f%% of %s\n",
            $mem_eff, kbytes2str($reqmem));
    }
    else {
        if ($pernode) {
            printf("Memory Efficiency: %.2f%% of %s (%s/node)\n",
                $mem_eff, kbytes2str($reqmem), kbytes2str($reqmem / $nnodes));
        }
        else {
            printf("Memory Efficiency: %.2f%% of %s (%s/core)\n",
                $mem_eff, kbytes2str($reqmem), kbytes2str($reqmem / $ncpus));
        }
    }

    if ($state eq "RUNNING") {
        print "WARNING: Efficiency statistics may be misleading for $state jobs.\n";
    }
}
else {
    print "Efficiency not available for jobs in the PENDING state.\n";
}

# Look up one entry in a TRES usage string of the form "id=value,id=value".
#   $tres - the raw string; may be undef or empty when a step has no stats
#   $id   - the id to look up (compared as a string)
# Returns the numeric value recorded for $id, or 0 when the string is
# missing, the id is absent, or its value is not a plain number.
sub tres_value {
    my ($tres, $id) = @_;
    return 0 unless defined $tres && length $tres;

    my $found = 0;
    for my $pair (split /,/, $tres) {
        my ($key, $value) = split /=/, $pair, 2;
        next unless defined $key && defined $value;
        next unless $key eq $id;
        next unless $value =~ /\A\d+(?:\.\d+)?\z/;
        $found = $value;    # last occurrence wins, like a hash assignment
    }
    return $found;
}

# Convert elapsed time to string.
#   $time - a number of seconds
# Returns "[D-]HH:MM:SS"; the day prefix is omitted when it would be zero.
sub time2str {
    my $time = shift;

    my $days = int($time / 86400);
    $time -= ($days * 86400);
    my $hours = int($time / 3600);
    $time -= ($hours * 3600);
    my $minutes = int($time / 60);
    my $seconds = $time % 60;

    my $day_prefix = $days < 1 ? '' : "$days-";
    return $day_prefix . sprintf("%02d:%02d:%02d", $hours, $minutes, $seconds);
}

# Convert memory to human-readable string.
#   $kbytes - an amount in units of 1024 bytes
# Returns the amount scaled to the largest fitting binary unit, formatted
# as "N.NN XB". Zero (and any non-positive input) is reported as "0.00 MB".
sub kbytes2str {
    my $kbytes = shift;
    if ($kbytes <= 0) {
        return sprintf("%.2f %sB", 0.0, 'M');
    }

    my $mul = 1024;
    my @pre = qw/ K M G T P E /;

    # Repeated division instead of int(log/log): it is exact at powers of
    # 1024 and keeps the prefix index inside @pre for values below 1024 KB,
    # which previously wrapped around to 'E'.
    my $scaled = $kbytes;
    my $exp    = 0;
    while ($scaled >= $mul && $exp < $#pre) {
        $scaled /= $mul;
        $exp++;
    }

    return sprintf("%.2f %sB", $scaled, $pre[$exp]);
}