Package: src:slurm-wlm
Version: 24.11.5-4

The original patch attachment in this bug accidentally included unrelated
changes in doc/man/man5/slurm.conf.5.

Please ignore the original attachment and use the attached corrected
patch instead.

The unintended hunks in the original attachment were:
- spank(7) -> spank(8) near the plugstack.conf text
- interface -> interace in the fm_url paragraph
- spank(7) -> spank(8) in the SEE ALSO section

These edits are not part of the periodic_check_interval change.

The attached corrected patch contains only the
periodic_check_interval feature and its associated documentation and test
changes.

Upstream reference:
https://github.com/SchedMD/slurm/pull/200

Thanks,
Dmitri
Description: add periodic_check_interval SlurmctldParameters option
Author: Dmitri Khokhlov <[email protected]>
Forwarded: https://support.schedmd.com/show_bug.cgi?id=25294
Last-Update: 2026-05-27

--- a/doc/html/power_save.shtml
+++ b/doc/html/power_save.shtml
@@ -296,6 +296,17 @@
         State=CLOUD nodes, the default is 90.</p>
       </dd>
 
+      <dt id="periodic_check_interval"><b>periodic_check_interval=#</b><a class=
+      "slurm_link" href="#periodic_check_interval"></a>
+      </dt>
+
+      <dd>
+        <p>How often slurmctld runs periodic background checks, including job
+        time-limit handling, reservation checks, and node timer checks. Lower
+        values can reduce the delay before jobs progress after nodes resume
+        and register. Default is 30 seconds.</p>
+      </dd>
+
       <dt id="power_save_interval"><b>power_save_interval=#</b><a class=
       "slurm_link" href="#power_save_interval"></a>
       </dt>
--- a/doc/man/man5/slurm.conf.5
+++ b/doc/man/man5/slurm.conf.5
@@ -4981,6 +4981,14 @@
 .IP

 .TP
+\fBperiodic_check_interval\fR=\#
+How often slurmctld runs periodic background checks, including job time-limit
+handling, reservation checks, and node timer checks. Lower values can reduce
+the delay before jobs progress after nodes resume and register. Default is 30
+seconds.
+.IP
+
+.TP
 \fBpower_save_interval\fR
 How often the power_save thread looks to resume and suspend nodes. The
 power_save thread will do work sooner if there are node state changes. Default
--- a/src/slurmctld/controller.c
+++ b/src/slurmctld/controller.c
@@ -70,6 +70,7 @@
 #include "src/common/log.h"
 #include "src/common/macros.h"
 #include "src/common/pack.h"
+#include "src/common/parse_value.h"
 #include "src/common/port_mgr.h"
 #include "src/common/proc_args.h"
 #include "src/common/read_config.h"
@@ -559,6 +560,44 @@
        }
 }
 
+static void _close_acct_storage_conn(void)
+{
+       if (acct_db_conn)
+               acct_storage_g_close_connection(&acct_db_conn);
+
+       acct_storage_g_fini();
+       slurm_persist_conn_recv_server_fini();
+}
+
+extern uint16_t get_periodic_check_interval(void)
+{
+       static time_t config_update = (time_t) -1;
+       static uint16_t periodic_check_interval = PERIODIC_TIMEOUT;
+       char *tmp_ptr;
+       uint16_t tmp_interval = PERIODIC_TIMEOUT;
+
+       if (config_update == slurm_conf.last_update)
+               return periodic_check_interval;
+
+       if ((tmp_ptr = conf_get_opt_str(slurm_conf.slurmctld_params,
+                                       "periodic_check_interval="))) {
+               if (s_p_handle_uint16(&tmp_interval,
+                                     "periodic_check_interval",
+                                     tmp_ptr) ||
+                   !tmp_interval || (tmp_interval == INFINITE16)) {
+                       error("SlurmctldParameters option periodic_check_interval=%s "
+                             "is invalid, using default %u",
+                             tmp_ptr, PERIODIC_TIMEOUT);
+                       tmp_interval = PERIODIC_TIMEOUT;
+               }
+               xfree(tmp_ptr);
+       }
+
+       periodic_check_interval = tmp_interval;
+       config_update = slurm_conf.last_update;
+
+       return periodic_check_interval;
+}
 /* main - slurmctld main function, start various threads and process RPCs */
 int main(int argc, char **argv)
 {
@@ -2492,7 +2531,8 @@
 
                validate_all_reservations(true);
 
-               if (difftime(now, last_timelimit_time) >= PERIODIC_TIMEOUT) {
+               if (difftime(now, last_timelimit_time) >=
+                   get_periodic_check_interval()) {
                        lock_slurmctld(job_write_lock);
                        now = time(NULL);
                        last_timelimit_time = now;
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -9306,7 +9306,8 @@
                }
 
                /* Give srun command warning message about pending timeout */
-               if (job_ptr->end_time <= (now + PERIODIC_TIMEOUT * 2))
+               if (job_ptr->end_time <=
+                   (now + get_periodic_check_interval() * 2))
                        srun_timeout (job_ptr);
 
                /*
@@ -18997,7 +18998,8 @@
            !(job_ptr->warn_flags & WARN_SENT) &&
            (ignore_time ||
             (job_ptr->warn_time &&
-             ((job_ptr->warn_time + PERIODIC_TIMEOUT + time(NULL)) >=
+             ((job_ptr->warn_time + get_periodic_check_interval() +
+               time(NULL)) >=
               job_ptr->end_time)))) {
                /*
                 * If --signal B option was not specified,
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -2149,6 +2149,7 @@
  * resume_after - Resume a down|drain node after resume_after time.
  */
 extern void check_node_timers(void);
+extern uint16_t get_periodic_check_interval(void);
 
 /*
  * Send warning signal to job before end time.
--- a/testsuite/python/tests/test_141_1.py
+++ b/testsuite/python/tests/test_141_1.py
@@ -12,6 +12,7 @@
 suspend_time = 10
 suspend_timeout = 10
 resume_timeout = 10
+periodic_check_interval = 2
 
 
 @pytest.fixture(scope="module", autouse=True)
@@ -37,6 +38,9 @@
     # Mark nodes as IDLE, regardless of current state, when suspending nodes with
     # SuspendProgram so that nodes will be eligible to be resumed at a later time
     atf.require_config_parameter_includes("SlurmctldParameters", "idle_on_node_suspend")
+    atf.require_config_parameter_includes(
+        "SlurmctldParameters", f"periodic_check_interval={periodic_check_interval}"
+    )
 
     # Register the cloud node in slurm.conf
     atf.require_config_parameter(
@@ -106,6 +110,40 @@
 
 
 # Tests
+def test_periodic_check_interval():
+    """Test periodic_check_interval advances a CONFIGURING job after node registration."""
+    job_id = atf.submit_job_sbatch("-p cloud1 --wrap 'srun sleep 10'", fatal=True)
+    atf.wait_for_node_state(f"{node_prefix}1", "ALLOCATED", timeout=5, fatal=True)
+    atf.wait_for_node_state(f"{node_prefix}1", "POWERING_UP", fatal=True)
+    assert "CONFIGURING" == atf.get_job_parameter(
+        job_id, "JobState", default="NOT_FOUND", quiet=True
+    ), "Submitted job should be in CONFIGURING state while its ALLOCATED cloud node is POWERING_UP"
+
+    # TODO: Wait 2 seconds to avoid race condition between slurmd and slurmctld
+    #       Remove once bug 16459 is fixed.
+    time.sleep(2)
+
+    atf.run_command(
+        f"{atf.properties['slurm-sbin-dir']}/slurmd -b -N {node_prefix}1 --conf 'feature=f1'",
+        fatal=True,
+        user="root",
+    )
+
+    atf.wait_for_node_state(
+        f"{node_prefix}1",
+        "POWERING_UP",
+        reverse=True,
+        timeout=resume_timeout + 5,
+        fatal=True,
+    )
+
+    assert atf.wait_for_job_state(
+        job_id,
+        "RUNNING",
+        timeout=periodic_check_interval + 5,
+    )
+
+
 # Test state cycle of cloud nodes: POWERED_DOWN, POWERING_UP, IDLE,
 # POWERING_DOWN, POWERED_DOWN
 def test_cloud_state_cycle():

Reply via email to