This is an automated email from the ASF dual-hosted git repository. DImuthuUpe pushed a commit to branch slurm-integration-test in repository https://gitbox.apache.org/repos/asf/airavata-custos.git
commit 3eb24bbdd39af5017e8b7acb0d8ece20e477417d Author: DImuthuUpe <[email protected]> AuthorDate: Thu May 21 08:38:45 2026 -0400 Setting up a local slurm cluster and running tests against it --- .../operations/accounts_integration_test.go | 131 ++++++++++++++++++ dev-ops/local-slurm/Makefile | 33 +++++ dev-ops/local-slurm/compose.yaml | 150 +++++++++++++++++++++ dev-ops/local-slurm/docker/Dockerfile.base | 33 +++++ dev-ops/local-slurm/docker/Dockerfile.login | 11 ++ dev-ops/local-slurm/docker/Dockerfile.slurmctld | 8 ++ dev-ops/local-slurm/docker/Dockerfile.slurmd | 6 + dev-ops/local-slurm/docker/Dockerfile.slurmdbd | 7 + dev-ops/local-slurm/docker/Dockerfile.slurmrestd | 8 ++ .../local-slurm/scripts/bootstrap-accounting.sh | 30 +++++ dev-ops/local-slurm/scripts/entrypoint-ctld.sh | 19 +++ dev-ops/local-slurm/scripts/entrypoint-dbd.sh | 24 ++++ dev-ops/local-slurm/scripts/entrypoint-login.sh | 17 +++ dev-ops/local-slurm/scripts/entrypoint-restd.sh | 24 ++++ dev-ops/local-slurm/scripts/entrypoint-slurmd.sh | 24 ++++ dev-ops/local-slurm/scripts/init-keys.sh | 30 +++++ dev-ops/local-slurm/slurm/cgroup.conf | 1 + dev-ops/local-slurm/slurm/gres.conf | 2 + dev-ops/local-slurm/slurm/slurm.conf | 38 ++++++ dev-ops/local-slurm/slurm/slurmdbd.conf | 18 +++ 20 files changed, 614 insertions(+) diff --git a/connectors/SLURM/Association-Mapper/internal/operations/accounts_integration_test.go b/connectors/SLURM/Association-Mapper/internal/operations/accounts_integration_test.go new file mode 100644 index 000000000..fc86c33b3 --- /dev/null +++ b/connectors/SLURM/Association-Mapper/internal/operations/accounts_integration_test.go @@ -0,0 +1,131 @@ +package operations + +import ( + "os" + "testing" +) + +func crearteAndValidateAccount(t *testing.T, client *Client) { + + err := client.CreateAccount(Account{ + Name: "test_account", + Description: "Test account for integration testing", + Organization: "Test Organization", + }, "artisan") + + if err != nil { + t.Fatalf("Failed to create account: %v", err) + } + + accounts, err := client.ListAccounts() + if err != nil { + t.Fatalf("Failed to list accounts: %v", err) + } + + if len(accounts) == 0 { + t.Fatal("No accounts found after creation") + } + + for _, account := range accounts { + if account.Name == "test_account" { + t.Logf("Successfully created account: %+v\n", account) + return + } + } +} + +func isLocalSlurmConfigAvailable() bool { + if os.Getenv("TEST_SLURM_API") == "" || os.Getenv("TEST_SLURM_USER") == "" || os.Getenv("TEST_SLURM_TOKEN") == "" || os.Getenv("TEST_SLURM_API_VERSION") == "" { + return false + } + return true +} + +func TestAccountCreatiion_Integration(t *testing.T) { + + if !isLocalSlurmConfigAvailable() { + t.Skip("Skipping integration test for account creation because local SLURM config is not available") + } + + apiUrl := os.Getenv("TEST_SLURM_API") + user := os.Getenv("TEST_SLURM_USER") + token := os.Getenv("TEST_SLURM_TOKEN") + apiVersion := os.Getenv("TEST_SLURM_API_VERSION") + + client := New(apiUrl, user, token, apiVersion) + + client.DeleteAccount("test_account") // clean up before test in case it was left over from a previous failed test run + defer client.DeleteAccount("test_account") // clean up after test + crearteAndValidateAccount(t, client) +} + +func TestAccountDeletion_Integration(t *testing.T) { + + if !isLocalSlurmConfigAvailable() { + t.Skip("Skipping integration test for account deletion because local SLURM config is not available") + } + + apiUrl := os.Getenv("TEST_SLURM_API") + user := os.Getenv("TEST_SLURM_USER") + token := os.Getenv("TEST_SLURM_TOKEN") + apiVersion := os.Getenv("TEST_SLURM_API_VERSION") + + client := New(apiUrl, user, token, apiVersion) + + crearteAndValidateAccount(t, client) + + err := client.DeleteAccount("test_account") + if err != nil { + t.Fatalf("Failed to delete account: %v", err) + } + + accounts, err := client.ListAccounts() + if err != nil { + t.Fatalf("Failed to list accounts: %v", err) + } + + for _, account := range accounts { + if account.Name == "test_account" { + t.Fatalf("Account was not deleted: %+v\n", account) + } + } + + t.Logf("Successfully deleted account. Remaining accounts: %+v\n", accounts) +} + +func TestGetAccount_Integration(t *testing.T) { + + if !isLocalSlurmConfigAvailable() { + t.Skip("Skipping integration test for get account because local SLURM config is not available") + } + + apiUrl := os.Getenv("TEST_SLURM_API") + user := os.Getenv("TEST_SLURM_USER") + token := os.Getenv("TEST_SLURM_TOKEN") + apiVersion := os.Getenv("TEST_SLURM_API_VERSION") + + client := New(apiUrl, user, token, apiVersion) + + client.DeleteAccount("test_account") // clean up before test in case it was left over from a previous failed test run + defer client.DeleteAccount("test_account") // clean up after test + crearteAndValidateAccount(t, client) + + account, err := client.GetAccount("test_account") + if err != nil { + t.Fatalf("Failed to get account: %v", err) + } + + if account.Name != "test_account" { + t.Fatalf("Expected account name 'test_account', got '%s'", account.Name) + } + + if account.Description != "Test account for integration testing" { + t.Fatalf("Expected account description 'Test account for integration testing', got '%s'", account.Description) + } + + if account.Organization != "Test Organization" { + t.Fatalf("Expected account organization 'Test Organization', got '%s'", account.Organization) + } + + t.Logf("Successfully retrieved account: %+v\n", account) +} diff --git a/dev-ops/local-slurm/Makefile b/dev-ops/local-slurm/Makefile new file mode 100644 index 000000000..e7ac5f228 --- /dev/null +++ b/dev-ops/local-slurm/Makefile @@ -0,0 +1,33 @@ +# Makefile +SHELL := /bin/bash +.PHONY: base up down build cli test test-integration smoke lint keys logs + +base: + docker build -f docker/Dockerfile.base -t slurmrest/base:24.05 . + +up: base + docker compose up -d --build + +down: + docker compose down -v + +build: base + docker compose build + +smoke: + docker compose exec login sbatch --wrap 'hostname' -o /tmp/out.txt + sleep 5 + docker compose exec login sacct -n -o JobID,State --starttime now-5minutes | tail -n 5 + docker compose exec login cat /tmp/out.txt + +lint: + cd cli && go vet ./... + +keys: + docker compose up init-keys + +logs: + docker compose logs -f --tail=100 + +token: + docker compose exec login scontrol token \ No newline at end of file diff --git a/dev-ops/local-slurm/compose.yaml b/dev-ops/local-slurm/compose.yaml new file mode 100644 index 000000000..f86beb03b --- /dev/null +++ b/dev-ops/local-slurm/compose.yaml @@ -0,0 +1,150 @@ +# compose.yaml +name: slurmrest + +x-slurm-env: &slurm-env + MARIADB_PASSWORD: ${MARIADB_PASSWORD:-slurm} + +services: + init-keys: + image: slurmrest/base:24.05 + command: ["/usr/local/bin/init-keys.sh"] + volumes: + - munge-key:/etc/munge + - jwt-key:/keys + - ./scripts/init-keys.sh:/usr/local/bin/init-keys.sh:ro + restart: "no" + + mariadb: + image: mariadb:11 + environment: + MARIADB_ROOT_PASSWORD: ${MARIADB_ROOT_PASSWORD:-rootpass} + MARIADB_DATABASE: slurm_acct_db + MARIADB_USER: slurm + MARIADB_PASSWORD: ${MARIADB_PASSWORD:-slurm} + volumes: + - mariadb-data:/var/lib/mysql + healthcheck: + test: ["CMD", "mariadb-admin", "ping", "-uslurm", "-p${MARIADB_PASSWORD:-slurm}"] + interval: 5s + timeout: 3s + retries: 20 + + slurmdbd: + build: + context: . + dockerfile: docker/Dockerfile.slurmdbd + hostname: slurmdbd + environment: *slurm-env + depends_on: + init-keys: + condition: service_completed_successfully + mariadb: + condition: service_healthy + volumes: + - munge-key:/etc/munge:ro + - jwt-key:/keys:ro + - ./slurm:/etc/slurm.readonly:ro + healthcheck: + test: ["CMD-SHELL", "pgrep -x slurmdbd >/dev/null"] + interval: 5s + timeout: 3s + retries: 20 + + slurmctld: + build: + context: . + dockerfile: docker/Dockerfile.slurmctld + hostname: slurmctld + environment: + CLUSTER_NAME: ${CLUSTER_NAME:-artisan} + depends_on: + slurmdbd: + condition: service_healthy + volumes: + - munge-key:/etc/munge:ro + - jwt-key:/keys:ro + - ./slurm:/etc/slurm.readonly:ro + - slurmctld-state:/var/spool/slurm + healthcheck: + test: ["CMD-SHELL", "scontrol ping >/dev/null 2>&1"] + interval: 5s + timeout: 3s + retries: 30 + + c1: + build: + context: . + dockerfile: docker/Dockerfile.slurmd + hostname: c1 + environment: + SLURMD_NODENAME: c1 + depends_on: + slurmctld: + condition: service_healthy + volumes: + - munge-key:/etc/munge:ro + - jwt-key:/keys:ro + - ./slurm:/etc/slurm.readonly:ro + healthcheck: + test: ["CMD-SHELL", "pgrep -x slurmd >/dev/null"] + interval: 5s + timeout: 3s + retries: 20 + + c2: + build: + context: . + dockerfile: docker/Dockerfile.slurmd + hostname: c2 + environment: + SLURMD_NODENAME: c2 + depends_on: + slurmctld: + condition: service_healthy + volumes: + - munge-key:/etc/munge:ro + - jwt-key:/keys:ro + - ./slurm:/etc/slurm.readonly:ro + healthcheck: + test: ["CMD-SHELL", "pgrep -x slurmd >/dev/null"] + interval: 5s + timeout: 3s + retries: 20 + + login: + build: + context: . + dockerfile: docker/Dockerfile.login + hostname: login + ports: + - "${LOGIN_SSH_PORT:-2222}:22" + depends_on: + slurmctld: + condition: service_healthy + volumes: + - munge-key:/etc/munge:ro + - jwt-key:/keys:ro + - ./slurm:/etc/slurm.readonly:ro + + slurmrestd: + build: + context: . + dockerfile: docker/Dockerfile.slurmrestd + hostname: slurmrestd + ports: + - "${REST_PORT:-6820}:6820" + depends_on: + slurmctld: + condition: service_healthy + slurmdbd: + condition: service_healthy + volumes: + - munge-key:/etc/munge:ro + - jwt-key:/keys:ro + - ./slurm:/etc/slurm.readonly:ro + +volumes: + munge-key: + jwt-key: + mariadb-data: + slurmctld-state: diff --git a/dev-ops/local-slurm/docker/Dockerfile.base b/dev-ops/local-slurm/docker/Dockerfile.base new file mode 100644 index 000000000..f252d76d5 --- /dev/null +++ b/dev-ops/local-slurm/docker/Dockerfile.base @@ -0,0 +1,33 @@ +# docker/Dockerfile.base +FROM rockylinux:9 + +ARG SLURM_VERSION=24.05.5 + +RUN dnf -y install epel-release \ + && dnf -y install dnf-plugins-core \ + && dnf config-manager --set-enabled crb \ + && dnf -y install \ + munge munge-libs munge-devel \ + mariadb-connector-c mariadb-connector-c-devel \ + http-parser-devel json-c-devel libyaml-devel libjwt-devel \ + dbus-devel \ + pam-devel readline-devel perl perl-Switch \ + gcc make wget which procps-ng iproute bzip2 \ + openssh-server openssh-clients \ + python3 python3-pip \ + && dnf clean all + +RUN useradd -r -u 995 -g 0 -s /sbin/nologin slurm \ + && install -d -o slurm -g 0 -m 0755 /var/spool/slurm /var/log/slurm /var/run/slurm /etc/slurm + +RUN wget -q https://download.schedmd.com/slurm/slurm-${SLURM_VERSION}.tar.bz2 \ + && tar -xjf slurm-${SLURM_VERSION}.tar.bz2 \ + && cd slurm-${SLURM_VERSION} \ + && ./configure --prefix=/usr --sysconfdir=/etc/slurm \ + --enable-slurmrestd \ + && make -j"$(nproc)" && make install \ + && cd .. && rm -rf slurm-${SLURM_VERSION}* + +RUN ssh-keygen -A + +CMD ["/bin/bash"] diff --git a/dev-ops/local-slurm/docker/Dockerfile.login b/dev-ops/local-slurm/docker/Dockerfile.login new file mode 100644 index 000000000..204834631 --- /dev/null +++ b/dev-ops/local-slurm/docker/Dockerfile.login @@ -0,0 +1,11 @@ +FROM slurmrest/base:24.05 + +RUN echo "PermitRootLogin yes" >> /etc/ssh/sshd_config \ + && echo "PasswordAuthentication yes" >> /etc/ssh/sshd_config \ + && echo "root:rootpass" | chpasswd + +COPY scripts/entrypoint-login.sh /usr/local/bin/entrypoint-login.sh +RUN chmod +x /usr/local/bin/entrypoint-login.sh + +EXPOSE 22 +ENTRYPOINT ["/usr/local/bin/entrypoint-login.sh"] diff --git a/dev-ops/local-slurm/docker/Dockerfile.slurmctld b/dev-ops/local-slurm/docker/Dockerfile.slurmctld new file mode 100644 index 000000000..27aa426ee --- /dev/null +++ b/dev-ops/local-slurm/docker/Dockerfile.slurmctld @@ -0,0 +1,8 @@ +# docker/Dockerfile.slurmctld +FROM slurmrest/base:24.05 + +COPY scripts/entrypoint-ctld.sh /usr/local/bin/entrypoint-ctld.sh +COPY scripts/bootstrap-accounting.sh /usr/local/bin/bootstrap-accounting.sh +RUN chmod +x /usr/local/bin/entrypoint-ctld.sh /usr/local/bin/bootstrap-accounting.sh + +ENTRYPOINT ["/usr/local/bin/entrypoint-ctld.sh"] diff --git a/dev-ops/local-slurm/docker/Dockerfile.slurmd b/dev-ops/local-slurm/docker/Dockerfile.slurmd new file mode 100644 index 000000000..97ab1c857 --- /dev/null +++ b/dev-ops/local-slurm/docker/Dockerfile.slurmd @@ -0,0 +1,6 @@ +FROM slurmrest/base:24.05 + +COPY scripts/entrypoint-slurmd.sh /usr/local/bin/entrypoint-slurmd.sh +RUN chmod +x /usr/local/bin/entrypoint-slurmd.sh + +ENTRYPOINT ["/usr/local/bin/entrypoint-slurmd.sh"] diff --git a/dev-ops/local-slurm/docker/Dockerfile.slurmdbd b/dev-ops/local-slurm/docker/Dockerfile.slurmdbd new file mode 100644 index 000000000..9b3060658 --- /dev/null +++ b/dev-ops/local-slurm/docker/Dockerfile.slurmdbd @@ -0,0 +1,7 @@ +# docker/Dockerfile.slurmdbd +FROM slurmrest/base:24.05 + +COPY scripts/entrypoint-dbd.sh /usr/local/bin/entrypoint-dbd.sh +RUN chmod +x /usr/local/bin/entrypoint-dbd.sh + +ENTRYPOINT ["/usr/local/bin/entrypoint-dbd.sh"] diff --git a/dev-ops/local-slurm/docker/Dockerfile.slurmrestd b/dev-ops/local-slurm/docker/Dockerfile.slurmrestd new file mode 100644 index 000000000..70bf9915e --- /dev/null +++ b/dev-ops/local-slurm/docker/Dockerfile.slurmrestd @@ -0,0 +1,8 @@ +# docker/Dockerfile.slurmrestd +FROM slurmrest/base:24.05 + +COPY scripts/entrypoint-restd.sh /usr/local/bin/entrypoint-restd.sh +RUN chmod +x /usr/local/bin/entrypoint-restd.sh + +EXPOSE 6820 +ENTRYPOINT ["/usr/local/bin/entrypoint-restd.sh"] diff --git a/dev-ops/local-slurm/scripts/bootstrap-accounting.sh b/dev-ops/local-slurm/scripts/bootstrap-accounting.sh new file mode 100755 index 000000000..a8494393c --- /dev/null +++ b/dev-ops/local-slurm/scripts/bootstrap-accounting.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +# Idempotent: creates the demo cluster, root account, root admin user. +set -euo pipefail + +SENTINEL=/var/spool/slurm/ctld/.bootstrap-done + +if [[ -f "$SENTINEL" ]]; then + echo "[bootstrap] sentinel present, skipping" + exit 0 +fi + +# Wait until slurmdbd answers +until sacctmgr -i show cluster >/dev/null 2>&1; do + echo "[bootstrap] waiting for slurmdbd..." + sleep 2 +done + +CLUSTER="${CLUSTER_NAME:-artisan}" +if ! sacctmgr -in show cluster format=cluster | grep -qw "$CLUSTER"; then + sacctmgr -i add cluster "$CLUSTER" +fi +if ! sacctmgr -in show account format=account | grep -qw "root"; then + sacctmgr -i add account root Description="root account" Organization="$CLUSTER" +fi +if ! sacctmgr -in show user format=user | grep -qw "root"; then + sacctmgr -i add user root Account=root AdminLevel=Administrator +fi + +touch "$SENTINEL" +echo "[bootstrap] done" diff --git a/dev-ops/local-slurm/scripts/entrypoint-ctld.sh b/dev-ops/local-slurm/scripts/entrypoint-ctld.sh new file mode 100755 index 000000000..f4cec4ca4 --- /dev/null +++ b/dev-ops/local-slurm/scripts/entrypoint-ctld.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +set -euo pipefail + +install -m 0644 /etc/slurm.readonly/slurm.conf /etc/slurm/slurm.conf +install -m 0644 /etc/slurm.readonly/cgroup.conf /etc/slurm/cgroup.conf +ln -sf /keys/jwt.hs256.key /etc/slurm/jwt.hs256.key + +# Ensure StateSaveLocation exists (the slurmctld-state named volume is empty +# on first boot; /var/spool/slurm itself is created by the base image). +install -d -m 0755 -o slurm -g 0 /var/spool/slurm/ctld + +install -d -m 0755 -o munge -g munge /var/run/munge +install -d -m 0700 -o munge -g munge /var/log/munge /var/lib/munge +runuser -u munge -- /usr/sbin/munged --force + +# Bootstrap accounting in the background after slurmctld comes up +( sleep 5; /usr/local/bin/bootstrap-accounting.sh ) & + +exec slurmctld -D -vvv diff --git a/dev-ops/local-slurm/scripts/entrypoint-dbd.sh b/dev-ops/local-slurm/scripts/entrypoint-dbd.sh new file mode 100755 index 000000000..40a7becd8 --- /dev/null +++ b/dev-ops/local-slurm/scripts/entrypoint-dbd.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Copy slurmdbd.conf with the 0600 perms that slurmdbd requires +# (slurm user's primary group is root/gid 0 — there is no 'slurm' group) +install -m 0600 -o slurm -g 0 /etc/slurm.readonly/slurmdbd.conf /etc/slurm/slurmdbd.conf +ln -sf /keys/jwt.hs256.key /etc/slurm/jwt.hs256.key + +# Start munge. /var/run/munge must be world-readable (0755) so non-munge +# users (slurm) can open the munge socket; /var/log and /var/lib stay 0700. +install -d -m 0755 -o munge -g munge /var/run/munge +install -d -m 0700 -o munge -g munge /var/log/munge /var/lib/munge +runuser -u munge -- /usr/sbin/munged --force + +# Wait for MariaDB to accept TCP connections. The compose healthcheck on the +# mariadb service already gates startup via depends_on, but this adds a +# belt-and-suspenders TCP probe (base image has no mariadb client binary). +until (exec 3<>/dev/tcp/mariadb/3306) 2>/dev/null; do + echo "[slurmdbd] waiting for mariadb..." + sleep 2 +done +exec 3<&- 3>&- || true + +exec slurmdbd -D -vvv diff --git a/dev-ops/local-slurm/scripts/entrypoint-login.sh b/dev-ops/local-slurm/scripts/entrypoint-login.sh new file mode 100755 index 000000000..4f77e0ca5 --- /dev/null +++ b/dev-ops/local-slurm/scripts/entrypoint-login.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +set -euo pipefail + +install -m 0644 /etc/slurm.readonly/slurm.conf /etc/slurm/slurm.conf +ln -sf /keys/jwt.hs256.key /etc/slurm/jwt.hs256.key + +install -d -m 0755 -o munge -g munge /var/run/munge +install -d -m 0700 -o munge -g munge /var/log/munge /var/lib/munge +runuser -u munge -- /usr/sbin/munged --force + +# Ensure the default test user exists +id -u testuser >/dev/null 2>&1 || useradd -m -s /bin/bash testuser +id -u testuser2 >/dev/null 2>&1 || useradd -m -s /bin/bash testuser2 +id -u testuser3 >/dev/null 2>&1 || useradd -m -s /bin/bash testuser3 + +# Start sshd in the foreground +exec /usr/sbin/sshd -D -e diff --git a/dev-ops/local-slurm/scripts/entrypoint-restd.sh b/dev-ops/local-slurm/scripts/entrypoint-restd.sh new file mode 100755 index 000000000..2060025b7 --- /dev/null +++ b/dev-ops/local-slurm/scripts/entrypoint-restd.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +set -euo pipefail + +install -m 0644 /etc/slurm.readonly/slurm.conf /etc/slurm/slurm.conf +ln -sf /keys/jwt.hs256.key /etc/slurm/jwt.hs256.key + +install -d -m 0755 -o munge -g munge /var/run/munge +install -d -m 0700 -o munge -g munge /var/log/munge /var/lib/munge +runuser -u munge -- /usr/sbin/munged --force + +# slurmrestd must not run as root. +# SLURM_JWT=daemon makes slurmrestd trust its own internal JWT for daemon-to-daemon calls; +# external requests still require X-SLURM-USER-TOKEN. +# SLURMRESTD_SECURITY flags: +# disable_unshare_sysv/files: Docker denies CLONE_NEWIPC without CAP_SYS_ADMIN, +# which we don't want to grant; skip those hardening steps. +# disable_user_check: the base image's 'slurm' user is slurm:0 (no slurm group +# exists), matching how slurmdbd/slurmctld already run. slurmrestd's default +# check rejects root primary group; we opt out since the daemon itself is not +# running as uid 0. +exec runuser -u slurm -- env \ + SLURM_JWT=daemon \ + SLURMRESTD_SECURITY=disable_unshare_sysv,disable_unshare_files,disable_user_check \ + slurmrestd -f /etc/slurm/slurm.conf -a rest_auth/jwt 0.0.0.0:6820 -vvv diff --git a/dev-ops/local-slurm/scripts/entrypoint-slurmd.sh b/dev-ops/local-slurm/scripts/entrypoint-slurmd.sh new file mode 100755 index 000000000..15166d449 --- /dev/null +++ b/dev-ops/local-slurm/scripts/entrypoint-slurmd.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +set -euo pipefail + +install -m 0644 /etc/slurm.readonly/slurm.conf /etc/slurm/slurm.conf +install -m 0644 /etc/slurm.readonly/cgroup.conf /etc/slurm/cgroup.conf +install -m 0644 /etc/slurm.readonly/gres.conf /etc/slurm/gres.conf +ln -sf /keys/jwt.hs256.key /etc/slurm/jwt.hs256.key + +# Ensure SlurmdSpoolDir exists (slurm.conf sets it to /var/spool/slurm/d; +# /var/spool/slurm itself is created by the base image but the subdir is not). +install -d -m 0755 -o slurm -g 0 /var/spool/slurm/d + +# Create two fake GPU device files so slurmd can register Gres=gpu:2 against +# distinct File= entries. These are just /dev/null-style sinks — there are no +# real GPUs. gres.conf references /dev/nullgpu0 and /dev/nullgpu1. +for i in 0 1; do + [ -e "/dev/nullgpu${i}" ] || mknod -m 0666 "/dev/nullgpu${i}" c 1 3 +done + +install -d -m 0755 -o munge -g munge /var/run/munge +install -d -m 0700 -o munge -g munge /var/log/munge /var/lib/munge +runuser -u munge -- /usr/sbin/munged --force + +exec slurmd -D -N "${SLURMD_NODENAME:-$(hostname)}" -vvv diff --git a/dev-ops/local-slurm/scripts/init-keys.sh b/dev-ops/local-slurm/scripts/init-keys.sh new file mode 100755 index 000000000..5bfdac8c5 --- /dev/null +++ b/dev-ops/local-slurm/scripts/init-keys.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +# scripts/init-keys.sh — generate munge + JWT keys into shared volumes if missing. +set -euo pipefail + +MUNGE_KEY=/etc/munge/munge.key +# jwt-key volume is mounted at /keys; each service symlinks into /etc/slurm/ +JWT_KEY=/keys/jwt.hs256.key + +if [[ ! -s "$MUNGE_KEY" ]]; then + echo "[init-keys] generating $MUNGE_KEY" + install -d -m 0700 -o munge -g munge /etc/munge + dd if=/dev/urandom of="$MUNGE_KEY" bs=1 count=1024 status=none + chown munge:munge "$MUNGE_KEY" + chmod 0400 "$MUNGE_KEY" +else + echo "[init-keys] $MUNGE_KEY already present" +fi + +if [[ ! -s "$JWT_KEY" ]]; then + echo "[init-keys] generating $JWT_KEY" + install -d -m 0755 /keys + openssl rand -base64 32 | tr -d '\n' > "$JWT_KEY" + # slurm user's primary group is root (gid 0) in the base image; chown accordingly + chown slurm:0 "$JWT_KEY" + chmod 0400 "$JWT_KEY" +else + echo "[init-keys] $JWT_KEY already present" +fi + +echo "[init-keys] done" diff --git a/dev-ops/local-slurm/slurm/cgroup.conf b/dev-ops/local-slurm/slurm/cgroup.conf new file mode 100644 index 000000000..e59e9aeea --- /dev/null +++ b/dev-ops/local-slurm/slurm/cgroup.conf @@ -0,0 +1 @@ +CgroupPlugin=cgroup/v1 diff --git a/dev-ops/local-slurm/slurm/gres.conf b/dev-ops/local-slurm/slurm/gres.conf new file mode 100644 index 000000000..52e0c7e41 --- /dev/null +++ b/dev-ops/local-slurm/slurm/gres.conf @@ -0,0 +1,2 @@ +Name=gpu File=/dev/nullgpu0 +Name=gpu File=/dev/nullgpu1 diff --git a/dev-ops/local-slurm/slurm/slurm.conf b/dev-ops/local-slurm/slurm/slurm.conf new file mode 100644 index 000000000..49bd34e35 --- /dev/null +++ b/dev-ops/local-slurm/slurm/slurm.conf @@ -0,0 +1,38 @@ +# slurm/slurm.conf +ClusterName=artisan +SlurmctldHost=slurmctld + +AuthType=auth/munge +AuthAltTypes=auth/jwt +AuthAltParameters=jwt_key=/etc/slurm/jwt.hs256.key +CredType=cred/munge + +SlurmUser=slurm +SlurmctldPort=6817 +SlurmdPort=6818 +StateSaveLocation=/var/spool/slurm/ctld +SlurmdSpoolDir=/var/spool/slurm/d +SwitchType=switch/none +MpiDefault=none +ProctrackType=proctrack/linuxproc +TaskPlugin=task/none +ReturnToService=2 +SlurmdParameters=config_overrides + +SlurmctldLogFile=/var/log/slurm/slurmctld.log +SlurmdLogFile=/var/log/slurm/slurmd.log + +SelectType=select/cons_tres +SelectTypeParameters=CR_CPU_Memory +GresTypes=gpu + +AccountingStorageType=accounting_storage/slurmdbd +AccountingStorageHost=slurmdbd +AccountingStorageTRES=gres/gpu +AccountingStorageEnforce=associations,limits,qos,safe +AccountingStoreFlags=job_comment +JobAcctGatherType=jobacct_gather/linux +JobAcctGatherFrequency=30 + +NodeName=c[1-2] CPUs=4 RealMemory=8000 Gres=gpu:2 State=UNKNOWN +PartitionName=compute Nodes=c[1-2] Default=YES MaxTime=INFINITE State=UP diff --git a/dev-ops/local-slurm/slurm/slurmdbd.conf b/dev-ops/local-slurm/slurm/slurmdbd.conf new file mode 100644 index 000000000..f34ea5429 --- /dev/null +++ b/dev-ops/local-slurm/slurm/slurmdbd.conf @@ -0,0 +1,18 @@ +# slurm/slurmdbd.conf — file mode must be 0600 when mounted +AuthType=auth/munge +AuthAltTypes=auth/jwt +AuthAltParameters=jwt_key=/etc/slurm/jwt.hs256.key + +DbdHost=slurmdbd +DbdPort=6819 +SlurmUser=slurm + +StorageType=accounting_storage/mysql +StorageHost=mariadb +StoragePort=3306 +StorageUser=slurm +StoragePass=slurm +StorageLoc=slurm_acct_db + +LogFile=/var/log/slurm/slurmdbd.log +PidFile=/var/run/slurm/slurmdbd.pid
