Package: src:slurm-wlm Version: 24.11.5-4 The original patch attachment in this bug accidentally included unrelated changes in doc/man/man5/slurm.conf.5.
Please ignore the original attachment and use the attached corrected patch instead. The unintended hunks in the original attachment were: - spank(7) -> spank(8) near the plugstack.conf text - interface -> interace in the fm_url paragraph - spank(7) -> spank(8) in the SEE ALSO section These edits are not part of the periodic_check_interval change. The attached corrected patch contains only the periodic_check_interval feature and its associated documentation and test changes. Upstream reference: https://github.com/SchedMD/slurm/pull/200 Thanks, Dmitri
Description: add periodic_check_interval SlurmctldParameters option Author: Dmitri Khokhlov <[email protected]> Forwarded: https://support.schedmd.com/show_bug.cgi?id=25294 Last-Update: 2026-05-27 --- a/doc/html/power_save.shtml +++ b/doc/html/power_save.shtml @@ -296,6 +296,17 @@ State=CLOUD nodes, the default is 90.</p> </dd> + <dt id="periodic_check_interval"><b>periodic_check_interval=#</b><a class= + "slurm_link" href="#periodic_check_interval"></a> + </dt> + + <dd> + <p>How often slurmctld runs periodic background checks, including job + time-limit handling, reservation checks, and node timer checks. Lower + values can reduce the delay before jobs progress after nodes resume + and register. Default is 30 seconds.</p> + </dd> + <dt id="power_save_interval"><b>power_save_interval=#</b><a class= "slurm_link" href="#power_save_interval"></a> </dt> --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -4981,6 +4981,14 @@ .IP .TP +\fBperiodic_check_interval\fR=# +How often slurmctld runs periodic background checks, including job time-limit +handling, reservation checks, and node timer checks. Lower values can reduce +the delay before jobs progress after nodes resume and register. Default is 30 +seconds. +.IP + +.TP \fBpower_save_interval\fR How often the power_save thread looks to resume and suspend nodes. The power_save thread will do work sooner if there are node state changes. 
Default --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -70,6 +70,7 @@ #include "src/common/log.h" #include "src/common/macros.h" #include "src/common/pack.h" +#include "src/common/parse_value.h" #include "src/common/port_mgr.h" #include "src/common/proc_args.h" #include "src/common/read_config.h" @@ -559,6 +560,44 @@ } } +static void _close_acct_storage_conn(void) +{ + if (acct_db_conn) + acct_storage_g_close_connection(&acct_db_conn); + + acct_storage_g_fini(); + slurm_persist_conn_recv_server_fini(); +} + +extern uint16_t get_periodic_check_interval(void) +{ + static time_t config_update = (time_t) -1; + static uint16_t periodic_check_interval = PERIODIC_TIMEOUT; + char *tmp_ptr; + uint16_t tmp_interval = PERIODIC_TIMEOUT; + + if (config_update == slurm_conf.last_update) + return periodic_check_interval; + + if ((tmp_ptr = conf_get_opt_str(slurm_conf.slurmctld_params, + "periodic_check_interval="))) { + if (s_p_handle_uint16(&tmp_interval, + "periodic_check_interval", + tmp_ptr) || + !tmp_interval || (tmp_interval == INFINITE16)) { + error("SlurmctldParameters option periodic_check_interval=%s " + "is invalid, using default %u", + tmp_ptr, PERIODIC_TIMEOUT); + tmp_interval = PERIODIC_TIMEOUT; + } + xfree(tmp_ptr); + } + + periodic_check_interval = tmp_interval; + config_update = slurm_conf.last_update; + + return periodic_check_interval; +} /* main - slurmctld main function, start various threads and process RPCs */ int main(int argc, char **argv) { @@ -2492,7 +2531,8 @@ validate_all_reservations(true); - if (difftime(now, last_timelimit_time) >= PERIODIC_TIMEOUT) { + if (difftime(now, last_timelimit_time) >= + get_periodic_check_interval()) { lock_slurmctld(job_write_lock); now = time(NULL); last_timelimit_time = now; --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -9306,7 +9306,8 @@ } /* Give srun command warning message about pending timeout */ - if (job_ptr->end_time <= (now + PERIODIC_TIMEOUT * 2)) + if (job_ptr->end_time 
<= + (now + get_periodic_check_interval() * 2)) srun_timeout (job_ptr); /* @@ -18997,7 +18998,8 @@ !(job_ptr->warn_flags & WARN_SENT) && (ignore_time || (job_ptr->warn_time && - ((job_ptr->warn_time + PERIODIC_TIMEOUT + time(NULL)) >= + ((job_ptr->warn_time + get_periodic_check_interval() + + time(NULL)) >= job_ptr->end_time)))) { /* * If --signal B option was not specified, --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -2149,6 +2149,7 @@ * resume_after - Resume a down|drain node after resume_after time. */ extern void check_node_timers(void); +extern uint16_t get_periodic_check_interval(void); /* * Send warning signal to job before end time. --- a/testsuite/python/tests/test_141_1.py +++ b/testsuite/python/tests/test_141_1.py @@ -12,6 +12,7 @@ suspend_time = 10 suspend_timeout = 10 resume_timeout = 10 +periodic_check_interval = 2 @pytest.fixture(scope="module", autouse=True) @@ -37,6 +38,9 @@ # Mark nodes as IDLE, regardless of current state, when suspending nodes with # SuspendProgram so that nodes will be eligible to be resumed at a later time atf.require_config_parameter_includes("SlurmctldParameters", "idle_on_node_suspend") + atf.require_config_parameter_includes( + "SlurmctldParameters", f"periodic_check_interval={periodic_check_interval}" + ) # Register the cloud node in slurm.conf atf.require_config_parameter( @@ -106,6 +110,40 @@ # Tests +def test_periodic_check_interval(): + """Test periodic_check_interval advances a CONFIGURING job after node registration.""" + job_id = atf.submit_job_sbatch("-p cloud1 --wrap 'srun sleep 10'", fatal=True) + atf.wait_for_node_state(f"{node_prefix}1", "ALLOCATED", timeout=5, fatal=True) + atf.wait_for_node_state(f"{node_prefix}1", "POWERING_UP", fatal=True) + assert "CONFIGURING" == atf.get_job_parameter( + job_id, "JobState", default="NOT_FOUND", quiet=True + ), "Submitted job should be in CONFIGURING state while its ALLOCATED cloud node is POWERING_UP" + + # TODO: Wait 2 seconds to avoid race 
condition between slurmd and slurmctld + # Remove once bug 16459 is fixed. + time.sleep(2) + + atf.run_command( + f"{atf.properties['slurm-sbin-dir']}/slurmd -b -N {node_prefix}1 --conf 'feature=f1'", + fatal=True, + user="root", + ) + + atf.wait_for_node_state( + f"{node_prefix}1", + "POWERING_UP", + reverse=True, + timeout=resume_timeout + 5, + fatal=True, + ) + + assert atf.wait_for_job_state( + job_id, + "RUNNING", + timeout=periodic_check_interval + 5, + ) + + # Test state cycle of cloud nodes: POWERED_DOWN, POWERING_UP, IDLE, # POWERING_DOWN, POWERED_DOWN def test_cloud_state_cycle():

