Hi Paul,
This should achieve the results that you are looking for by adding a new
configuration parameter. The attached patch, including documentation
changes, is built against Slurm version 2.6. You will need to apply it
as a local patch for now; I plan to include it as part of the
version 14.03 release next month.
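Once the patch is applied and slurmctld rebuilt, the new option goes into
SchedulerParameters in slurm.conf. A minimal sketch (the depth of 50 is
only an illustration, not a recommended value):

    SchedulerParameters=partition_job_depth=50

It can be combined with the existing backfill option, e.g.
SchedulerParameters=partition_job_depth=50,bf_max_job_part=10, so that
both the main and backfill scheduling loops are bounded per partition.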
Another option might be to configure each partition as a separate
cluster and run a separate slurmctld daemon for each partition. That
would improve scalability, but it would mean more work for you and
could be confusing for the users.
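For concreteness, a rough sketch of that alternative, using hypothetical
host, node, and cluster names; each "cluster" would get its own
slurm.conf with a distinct ClusterName, control host/port, and state
directory:

    # slurm.conf for one per-partition cluster (illustrative values)
    ClusterName=parta
    ControlMachine=ctld-a
    SlurmctldPort=6817
    StateSaveLocation=/var/spool/slurmctld/parta
    PartitionName=parta Nodes=node[001-100] Default=YES State=UP

Users would then submit to a given cluster with sbatch -M parta (or
--clusters=parta), which requires the clusters to be registered in
slurmdbd.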
Moe Jette
SchedMD
On 2014-02-10 06:49, Paul Edmon wrote:
How difficult would it be to put a switch into SLURM where, instead of
considering the global priority chain, it would consider each partition
wholly independently with respect to both the backfill and the main
scheduling loop? In our environment we have many partitions. We also
have people submitting 1000s of jobs to those partitions, and the
partitions are at different priorities. Since SLURM (even in backfill)
runs down the priority chain, higher priority queues can impact
scheduling in lower priority queues even if those queues do not overlap
in terms of hardware. It would be better in our case if SLURM treated
each partition as a wholly independent scheduling run and did so for
all of them, in both the backfill and main loops.
I know there is the bf_max_job_part option for the backfill loop, but
it would be better to just have each partition be independent, as that
way you don't get any cross-talk. Can this be done? It would be
incredibly helpful for our environment.
-Paul Edmon-
diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5
index 3f118c5..dcbe390 100644
--- a/doc/man/man5/slurm.conf.5
+++ b/doc/man/man5/slurm.conf.5
@@ -1835,6 +1835,15 @@ after this number of job dependencies have been tested. The default value is
\fBmax_switch_wait=#\fR
Maximum number of seconds that a job can delay execution waiting for the
specified desired switch count. The default value is 300 seconds.
+.TP
+\fBpartition_job_depth=#\fR
+The default number of jobs to attempt scheduling (i.e. the queue depth)
+from each partition/queue in Slurm's main scheduling logic.
+The functionality is similar to that provided by the \fBbf_max_job_part\fR
+option for the backfill scheduling logic.
+The default value is 0 (no limit).
+Jobs excluded from attempted scheduling based upon their partition will
+not be counted against the \fBdefault_queue_depth\fR limit.
.RE
.TP
diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c
index 63c4b0e..88288c2 100644
--- a/src/slurmctld/job_scheduler.c
+++ b/src/slurmctld/job_scheduler.c
@@ -690,12 +690,14 @@ extern int schedule(uint32_t job_limit)
{
ListIterator job_iterator = NULL, part_iterator = NULL;
List job_queue = NULL;
- int error_code, failed_part_cnt = 0, job_cnt = 0, i;
+ int error_code, failed_part_cnt = 0, job_cnt = 0, i, j, part_cnt;
uint32_t job_depth = 0;
job_queue_rec_t *job_queue_rec;
struct job_record *job_ptr = NULL;
struct part_record *part_ptr, **failed_parts = NULL;
bitstr_t *save_avail_node_bitmap;
+ struct part_record **sched_part_ptr = NULL;
+ int *sched_part_jobs = NULL;
/* Locks: Read config, write job, write node, read partition */
slurmctld_lock_t job_write_lock =
{ READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK };
@@ -709,6 +711,7 @@ extern int schedule(uint32_t job_limit)
static bool fifo_sched = false;
static int sched_timeout = 0;
static int def_job_limit = 100;
+ static int max_jobs_per_part = 0;
time_t now = time(NULL), sched_start;
DEF_TIMERS;
@@ -748,6 +751,18 @@ extern int schedule(uint32_t job_limit)
def_job_limit = i;
}
}
+
+ if (sched_params &&
+ (tmp_ptr=strstr(sched_params, "partition_job_depth="))) {
+ /* 01234567890123456789 */
+ i = atoi(tmp_ptr + 20);
+ if (i < 0) {
+ error("ignoring SchedulerParameters: "
+ "partition_job_depth value of %d", i);
+ } else {
+ max_jobs_per_part = i;
+ }
+ }
xfree(sched_params);
sched_timeout = slurm_get_msg_timeout() / 2;
@@ -801,10 +816,24 @@ extern int schedule(uint32_t job_limit)
}
#endif
- failed_parts = xmalloc(sizeof(struct part_record *) *
- list_count(part_list));
+ part_cnt = list_count(part_list);
+ failed_parts = xmalloc(sizeof(struct part_record *) * part_cnt);
save_avail_node_bitmap = bit_copy(avail_node_bitmap);
+ if (max_jobs_per_part) {
+ ListIterator part_iterator;
+ sched_part_ptr = xmalloc(sizeof(struct part_record *) *
+ part_cnt);
+ sched_part_jobs = xmalloc(sizeof(int) * part_cnt);
+ part_iterator = list_iterator_create(part_list);
+ i = 0;
+ while ((part_ptr = (struct part_record *)
+ list_next(part_iterator))) {
+ sched_part_ptr[i++] = part_ptr;
+ }
+ list_iterator_destroy(part_iterator);
+ }
+
debug("sched: Running job scheduler");
/*
* If we are doing FIFO scheduling, use the job records right off the
@@ -875,6 +904,20 @@ next_part: part_ptr = (struct part_record *)
debug("sched: loop taking too long, breaking out");
break;
}
+
+ if (max_jobs_per_part) {
+ bool skip_job = false;
+ for (j = 0; j < part_cnt; j++) {
+ if (sched_part_ptr[j] != job_ptr->part_ptr)
+ continue;
+ if (sched_part_jobs[j]++ >=
+ max_jobs_per_part)
+ skip_job = true;
+ break;
+ }
+ if (skip_job)
+ continue;
+ }
if (job_depth++ > job_limit) {
debug3("sched: already tested %u jobs, breaking out",
job_depth);
@@ -1109,6 +1152,8 @@ next_part: part_ptr = (struct part_record *)
} else {
FREE_NULL_LIST(job_queue);
}
+ xfree(sched_part_ptr);
+ xfree(sched_part_jobs);
unlock_slurmctld(job_write_lock);
END_TIMER2("schedule");
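For reference, a typical way to apply the attached patch to a 2.6 source
tree, assuming it has been saved as partition_job_depth.patch (a file
name chosen here only for illustration):

    cd slurm-2.6
    patch -p1 < partition_job_depth.patch

followed by the usual rebuild and a restart of slurmctld.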