Script 'mail_helper' called by obssrc Hello community, here is the log from the commit of package golang-github-vpenso-prometheus_slurm_exporter for openSUSE:Factory checked in at 2021-03-18 22:55:39 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/golang-github-vpenso-prometheus_slurm_exporter (Old) and /work/SRC/openSUSE:Factory/.golang-github-vpenso-prometheus_slurm_exporter.new.2401 (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "golang-github-vpenso-prometheus_slurm_exporter" Thu Mar 18 22:55:39 2021 rev:3 rq:879924 version:0.17 Changes: -------- --- /work/SRC/openSUSE:Factory/golang-github-vpenso-prometheus_slurm_exporter/golang-github-vpenso-prometheus_slurm_exporter.changes 2020-12-03 18:44:03.570263161 +0100 +++ /work/SRC/openSUSE:Factory/.golang-github-vpenso-prometheus_slurm_exporter.new.2401/golang-github-vpenso-prometheus_slurm_exporter.changes 2021-03-18 22:55:39.459577319 +0100 @@ -1,0 +2,8 @@ +Thu Mar 18 15:04:01 UTC 2021 - Ana Guerrero Lopez <aguerr...@suse.com> + +- New version 0.17 + * Export information about shares via sshare +- New version 0.16 + * Add support to provide information about GPU GREs usage + +------------------------------------------------------------------- Old: ---- golang-github-vpenso-prometheus_slurm_exporter-0.15.tar.gz New: ---- golang-github-vpenso-prometheus_slurm_exporter-0.17.tar.gz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ golang-github-vpenso-prometheus_slurm_exporter.spec ++++++ --- /var/tmp/diff_new_pack.Dhsyvs/_old 2021-03-18 22:55:39.935577833 +0100 +++ /var/tmp/diff_new_pack.Dhsyvs/_new 2021-03-18 22:55:39.939577838 +0100 @@ -1,7 +1,7 @@ # # spec file for package golang-github-vpenso-prometheus_slurm_exporter # -# Copyright (c) 2020 SUSE LLC +# Copyright (c) 2021 SUSE LLC # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed @@ -19,7 +19,7 @@ %{go_nostrip} Name: golang-github-vpenso-prometheus_slurm_exporter -Version: 0.15 +Version: 0.17 Release: 0 Summary: Prometheus exporter for Slurm metrics License: GPL-3.0-or-later ++++++ golang-github-vpenso-prometheus_slurm_exporter-0.15.tar.gz -> golang-github-vpenso-prometheus_slurm_exporter-0.17.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' 
old/prometheus-slurm-exporter-0.15/DEVELOPMENT.md new/prometheus-slurm-exporter-0.17/DEVELOPMENT.md --- old/prometheus-slurm-exporter-0.15/DEVELOPMENT.md 2020-10-14 14:45:06.000000000 +0200 +++ new/prometheus-slurm-exporter-0.17/DEVELOPMENT.md 2021-02-04 12:33:20.000000000 +0100 @@ -30,7 +30,7 @@ Build the exporter: ```bash -go build -o bin/prometheus-slurm-exporter {main,accounts,cpus,nodes,queue,scheduler,users}.go +go build -o bin/prometheus-slurm-exporter {main,accounts,cpus,gpus,partitions,nodes,queue,scheduler,sshare,users}.go ``` Run all tests included in `_test.go` files: @@ -43,7 +43,14 @@ ```bash bin/prometheus-slurm-exporter +``` + +If you wish to run the exporter on a different port, or the default port (8080) is already in use, run with the following argument: + +```bash +bin/prometheus-slurm-exporter --listen-address="0.0.0.0:<port>" ... + # query all metrics (default port) curl http://localhost:8080/metrics ``` @@ -56,4 +63,3 @@ * [Metric Types](https://prometheus.io/docs/concepts/metric_types/) * [Writing Exporters](https://prometheus.io/docs/instrumenting/writing_exporters/) * [Available Exporters](https://prometheus.io/docs/instrumenting/exporters/) - diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/prometheus-slurm-exporter-0.15/Makefile new/prometheus-slurm-exporter-0.17/Makefile --- old/prometheus-slurm-exporter-0.15/Makefile 2020-10-14 14:45:06.000000000 +0200 +++ new/prometheus-slurm-exporter-0.17/Makefile 2021-02-04 12:33:20.000000000 +0100 @@ -2,7 +2,7 @@ ifndef GOPATH GOPATH=$(shell pwd):/usr/share/gocode endif -GOFILES=accounts.go cpus.go main.go nodes.go partitions.go queue.go scheduler.go users.go +GOFILES=accounts.go cpus.go gpus.go main.go nodes.go partitions.go queue.go scheduler.go sshare.go users.go GOBIN=bin/$(PROJECT_NAME) build: diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/prometheus-slurm-exporter-0.15/README.md 
new/prometheus-slurm-exporter-0.17/README.md --- old/prometheus-slurm-exporter-0.15/README.md 2020-10-14 14:45:06.000000000 +0200 +++ new/prometheus-slurm-exporter-0.17/README.md 2021-02-04 12:33:20.000000000 +0100 @@ -11,9 +11,19 @@ * **Other**: CPUs which are unavailable for use at the moment. * **Total**: total number of CPUs. -- [Information extracted from the SLURM **sinfo** command](https://slurm.schedmd.com/sinfo.html) +- Information extracted from the SLURM [**sinfo**](https://slurm.schedmd.com/sinfo.html) command. - [Slurm CPU Management User and Administrator Guide](https://slurm.schedmd.com/cpu_management.html) +### State of the GPUs + +* **Allocated**: GPUs which have been allocated to a job. +* **Other**: GPUs which are unavailable for use at the moment. +* **Total**: total number of GPUs. +* **Utilization**: total GPU utilization on the cluster. + +- Information extracted from the SLURM [**sinfo**](https://slurm.schedmd.com/sinfo.html) and [**sacct**](https://slurm.schedmd.com/sacct.html) commands. +- [Slurm GRES scheduling](https://slurm.schedmd.com/gres.html) + ### State of the Nodes * **Allocated**: nodes which has been allocated to one or more jobs. @@ -29,7 +39,7 @@ * **Mixed**: nodes which have some of their CPUs ALLOCATED while others are IDLE. * **Resv**: these nodes are in an advanced reservation and not generally available. -[Information extracted from the SLURM **sinfo** command](https://slurm.schedmd.com/sinfo.html) +- Information extracted from the SLURM [**sinfo**](https://slurm.schedmd.com/sinfo.html) command. ### Status of the Jobs @@ -46,9 +56,14 @@ * **PREEMPTED**: Jobs terminated due to preemption. * **NODE_FAIL**: Jobs terminated due to failure of one or more allocated nodes. -[Information extracted from the SLURM **squeue** command](https://slurm.schedmd.com/squeue.html) +- Information extracted from the SLURM [**squeue**](https://slurm.schedmd.com/squeue.html) command. 
+ +### State of the Partitions -### Jobs information per Account and UserID +* Running/suspended Jobs per partitions, divided between Slurm accounts and users. +* CPUs total/allocated/idle per partition plus used CPU per user ID. + +### Jobs information per Account and User The following information about jobs are also extracted via [squeue](https://slurm.schedmd.com/squeue.html): @@ -57,7 +72,7 @@ ### Scheduler Information -* **Server Thread count**: The number of current active ``slurmctld`` threads. +* **Server Thread count**: The number of current active ``slurmctld`` threads. * **Queue size**: The length of the scheduler queue. * **DBD Agent queue size**: The length of the message queue for _SlurmDBD_. * **Last cycle**: Time in microseconds for last scheduling cycle. @@ -70,11 +85,11 @@ * **(Backfill) Total Backfilled Jobs** (since last stats cycle start): number of jobs started thanks to backfilling since last time stats where reset. * **(Backfill) Total backfilled heterogeneous Job components**: number of heterogeneous job components started thanks to backfilling since last Slurm start. -[Information extracted from the SLURM **sdiag** command](https://slurm.schedmd.com/sdiag.html) +- Information extracted from the SLURM [**sdiag**](https://slurm.schedmd.com/sdiag.html) command. *DBD Agent queue size*: it is particularly important to keep track of it, since an increasing number of messages counted with this parameter almost always indicates three issues: -* the _SlurmDBD_ daemon is down; +* the _SlurmDBD_ daemon is down; * the database is either down or unreachable; * the status of the Slurm accounting DB may be inconsistent (e.g. ``sreport`` missing data, weird utilization of the cluster, etc.). @@ -82,7 +97,7 @@ ## Installation * Read [DEVELOPMENT.md](DEVELOPMENT.md) in order to build the Prometheus Slurm Exporter. After a successful build copy the executable -`bin/prometheus-slurm-exporter` to a node with access to the Slurm command-line interface. 
+`bin/prometheus-slurm-exporter` to a node with access to the Slurm command-line interface. * A [Systemd Unit][sdu] file to run the executable as service is available in [lib/systemd/prometheus-slurm-exporter.service](lib/systemd/prometheus-slurm-exporter.service). @@ -99,7 +114,7 @@ # # SLURM resource manager: -# +# - job_name: 'my_slurm_exporter' scrape_interval: 30s @@ -146,5 +161,3 @@ This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/. - - diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/prometheus-slurm-exporter-0.15/gpus.go new/prometheus-slurm-exporter-0.17/gpus.go --- old/prometheus-slurm-exporter-0.15/gpus.go 1970-01-01 01:00:00.000000000 +0100 +++ new/prometheus-slurm-exporter-0.17/gpus.go 2021-02-04 12:33:20.000000000 +0100 @@ -0,0 +1,141 @@ +/* Copyright 2020 Joeri Hermans, Victor Penso, Matteo Dessalvi + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ + +package main + +import ( + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/common/log" + "io/ioutil" + "os/exec" + "strings" + "strconv" +) + +type GPUsMetrics struct { + alloc float64 + idle float64 + total float64 + utilization float64 +} + +func GPUsGetMetrics() *GPUsMetrics { + return ParseGPUsMetrics() +} + +func ParseAllocatedGPUs() float64 { + var num_gpus = 0.0 + + args := []string{"-a", "-X", "--format=Allocgres", "--state=RUNNING", "--noheader", "--parsable2"} + output := string(Execute("sacct", args)) + if len(output) > 0 { + for _, line := range strings.Split(output, "\n") { + if len(line) > 0 { + line = strings.Trim(line, "\"") + descriptor := strings.TrimPrefix(line, "gpu:") + job_gpus, _ := strconv.ParseFloat(descriptor, 64) + num_gpus += job_gpus + } + } + } + + return num_gpus +} + +func ParseTotalGPUs() float64 { + var num_gpus = 0.0 + + args := []string{"-h", "-o \"%n %G\""} + output := string(Execute("sinfo", args)) + if len(output) > 0 { + for _, line := range strings.Split(output, "\n") { + if len(line) > 0 { + line = strings.Trim(line, "\"") + descriptor := strings.Fields(line)[1] + descriptor = strings.TrimPrefix(descriptor, "gpu:") + descriptor = strings.Split(descriptor, "(")[0] + node_gpus, _ := strconv.ParseFloat(descriptor, 64) + num_gpus += node_gpus + } + } + } + + return num_gpus +} + +func ParseGPUsMetrics() *GPUsMetrics { + var gm GPUsMetrics + total_gpus := ParseTotalGPUs() + allocated_gpus := ParseAllocatedGPUs() + gm.alloc = allocated_gpus + gm.idle = total_gpus - allocated_gpus + gm.total = total_gpus + gm.utilization = allocated_gpus / total_gpus + return &gm +} + +// Execute the sinfo command and return its output +func Execute(command string, arguments []string) []byte { + cmd := exec.Command(command, arguments...) 
+ stdout, err := cmd.StdoutPipe() + if err != nil { + log.Fatal(err) + } + if err := cmd.Start(); err != nil { + log.Fatal(err) + } + out, _ := ioutil.ReadAll(stdout) + if err := cmd.Wait(); err != nil { + log.Fatal(err) + } + return out +} + +/* + * Implement the Prometheus Collector interface and feed the + * Slurm scheduler metrics into it. + * https://godoc.org/github.com/prometheus/client_golang/prometheus#Collector + */ + +func NewGPUsCollector() *GPUsCollector { + return &GPUsCollector{ + alloc: prometheus.NewDesc("slurm_gpus_alloc", "Allocated GPUs", nil, nil), + idle: prometheus.NewDesc("slurm_gpus_idle", "Idle GPUs", nil, nil), + total: prometheus.NewDesc("slurm_gpus_total", "Total GPUs", nil, nil), + utilization: prometheus.NewDesc("slurm_gpus_utilization", "Total GPU utilization", nil, nil), + } +} + +type GPUsCollector struct { + alloc *prometheus.Desc + idle *prometheus.Desc + total *prometheus.Desc + utilization *prometheus.Desc +} + +// Send all metric descriptions +func (cc *GPUsCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- cc.alloc + ch <- cc.idle + ch <- cc.total + ch <- cc.utilization +} +func (cc *GPUsCollector) Collect(ch chan<- prometheus.Metric) { + cm := GPUsGetMetrics() + ch <- prometheus.MustNewConstMetric(cc.alloc, prometheus.GaugeValue, cm.alloc) + ch <- prometheus.MustNewConstMetric(cc.idle, prometheus.GaugeValue, cm.idle) + ch <- prometheus.MustNewConstMetric(cc.total, prometheus.GaugeValue, cm.total) + ch <- prometheus.MustNewConstMetric(cc.utilization, prometheus.GaugeValue, cm.utilization) +} diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/prometheus-slurm-exporter-0.15/main.go new/prometheus-slurm-exporter-0.17/main.go --- old/prometheus-slurm-exporter-0.15/main.go 2020-10-14 14:45:06.000000000 +0200 +++ new/prometheus-slurm-exporter-0.17/main.go 2021-02-04 12:33:20.000000000 +0100 @@ -1,4 +1,4 @@ -/* Copyright 2017-2020 Victor Penso, Matteo Dessalvi +/* Copyright 2017-2020 
Victor Penso, Matteo Dessalvi, Joeri Hermans This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -25,13 +25,15 @@ func init() { // Metrics have to be registered to be exposed - prometheus.MustRegister(NewSchedulerCollector()) // from scheduler.go - prometheus.MustRegister(NewQueueCollector()) // from queue.go - prometheus.MustRegister(NewNodesCollector()) // from nodes.go - prometheus.MustRegister(NewCPUsCollector()) // from cpus.go prometheus.MustRegister(NewAccountsCollector()) // from accounts.go - prometheus.MustRegister(NewUsersCollector()) // from users.go + prometheus.MustRegister(NewCPUsCollector()) // from cpus.go + prometheus.MustRegister(NewGPUsCollector()) // from gpus.go + prometheus.MustRegister(NewNodesCollector()) // from nodes.go prometheus.MustRegister(NewPartitionsCollector()) // from partitions.go + prometheus.MustRegister(NewQueueCollector()) // from queue.go + prometheus.MustRegister(NewSchedulerCollector()) // from scheduler.go + prometheus.MustRegister(NewFairShareCollector()) // from sshare.go + prometheus.MustRegister(NewUsersCollector()) // from users.go } var listenAddress = flag.String( diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/prometheus-slurm-exporter-0.15/sshare.go new/prometheus-slurm-exporter-0.17/sshare.go --- old/prometheus-slurm-exporter-0.15/sshare.go 1970-01-01 01:00:00.000000000 +0100 +++ new/prometheus-slurm-exporter-0.17/sshare.go 2021-02-04 12:33:20.000000000 +0100 @@ -0,0 +1,86 @@ +/* Copyright 2021 Victor Penso + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. 
+ +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +package main + +import ( + "io/ioutil" + "os/exec" + "log" + "strings" + "strconv" + "github.com/prometheus/client_golang/prometheus" +) + +func FairShareData() []byte { + cmd := exec.Command( "sshare", "-n", "-P", "-o", "account,fairshare" ) + stdout, err := cmd.StdoutPipe() + if err != nil { + log.Fatal(err) + } + if err := cmd.Start(); err != nil { + log.Fatal(err) + } + out, _ := ioutil.ReadAll(stdout) + if err := cmd.Wait(); err != nil { + log.Fatal(err) + } + return out +} + +type FairShareMetrics struct { + fairshare float64 +} + +func ParseFairShareMetrics() map[string]*FairShareMetrics { + accounts := make(map[string]*FairShareMetrics) + lines := strings.Split(string(FairShareData()), "\n") + for _, line := range lines { + if ! 
strings.HasPrefix(line," ") { + if strings.Contains(line,"|") { + account := strings.Trim(strings.Split(line,"|")[0]," ") + _,key := accounts[account] + if !key { + accounts[account] = &FairShareMetrics{0} + } + fairshare,_ := strconv.ParseFloat(strings.Split(line,"|")[1],64) + accounts[account].fairshare = fairshare + } + } + } + return accounts +} + +type FairShareCollector struct { + fairshare *prometheus.Desc +} + +func NewFairShareCollector() *FairShareCollector { + labels := []string{"account"} + return &FairShareCollector{ + fairshare: prometheus.NewDesc("slurm_account_fairshare","FairShare for account" , labels,nil), + } +} + +func (fsc *FairShareCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- fsc.fairshare +} + +func (fsc *FairShareCollector) Collect(ch chan<- prometheus.Metric) { + fsm := ParseFairShareMetrics() + for f := range fsm { + ch <- prometheus.MustNewConstMetric(fsc.fairshare, prometheus.GaugeValue, fsm[f].fairshare, f) + } +} ++++++ vendor.tar.gz ++++++