Hi,

I was wondering why --ntasks-per-socket is only available for CR_Socket but not
CR_Core. Looking at the _allocate_sockets and _allocate_cores routines in
src/plugins/select/cons_res/job_test.c, it seems that the two routines are very
similar (but with subtle differences), and perhaps unifying them may be desirable.

In the attached patch (against the slurm-2.6 branch), I tried to do exactly
that, i.e., use basically one code for _allocate_sockets and _allocate_cores.
As a consequence, --ntasks-per-socket is then also available for CR_Core. There
are two places (conditionals) where either original routine has more logic than
the other; I have put "/* only in _allocate_... */" comments at the respective
places and used the more elaborate version in my unified routine. Additionally,
the very last bit of code (determining what is returned) is more complicated in
CR_Core but I didn't understand why it's there; it's commented out and marked
by "* Return sequence in _allocate_cores". With these three places in the code,
I'm not sure whether they do the right thing in my unified routine (in all
cases).

So, I'm not proposing to apply the attached patch (yet); it is more a
suggestion to unify the allocation routines for CR_Socket and CR_Core into one
routine (giving --ntasks-per-socket also for CR_Core and reducing code
duplication) and a question to people who understand the routines better than
me and who can comment on the mentioned differences between _allocate_cores and
_allocate_sockets (and if unification is desirable).

Best regards,
Armin

>From 5474bcb948a9a99981cd1aac2b1a41d9b6c35cb1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Armin=20Gr=C3=B6=C3=9Flinger?=
 <[email protected]>
Date: Fri, 30 Aug 2013 12:57:35 +0200
Subject: [PATCH] Unify allocation code for CR_Socket and CR_Core

This makes --ntasks-per-socket also available for CR_Core.
---
 src/plugins/select/cons_res/job_test.c |  270 ++++++--------------------------
 1 file changed, 51 insertions(+), 219 deletions(-)

diff --git a/src/plugins/select/cons_res/job_test.c b/src/plugins/select/cons_res/job_test.c
index 806a0f4..110b44b 100644
--- a/src/plugins/select/cons_res/job_test.c
+++ b/src/plugins/select/cons_res/job_test.c
@@ -115,6 +115,10 @@ static int _eval_nodes_topo(struct job_record *job_ptr, bitstr_t *node_map,
 			uint32_t req_nodes, uint32_t cr_node_cnt,
 			uint16_t *cpu_cnt);
 
+static uint16_t _allocate_sc(struct job_record *job_ptr, bitstr_t *core_map,
+			      bitstr_t *part_core_map, const uint32_t node_i,
+			      bool entire_sockets_only);
+
 /* _allocate_sockets - Given the job requirements, determine which sockets
  *                     from the given node can be allocated (if any) to this
  *                     job. Returns the number of cpus that can be used by
@@ -129,6 +133,43 @@ static int _eval_nodes_topo(struct job_record *job_ptr, bitstr_t *node_map,
 uint16_t _allocate_sockets(struct job_record *job_ptr, bitstr_t *core_map,
 			   bitstr_t *part_core_map, const uint32_t node_i)
 {
+	return _allocate_sc(job_ptr, core_map, part_core_map, node_i, true);
+}
+
+/* _allocate_cores - Given the job requirements, determine which cores
+ *                   from the given node can be allocated (if any) to this
+ *                   job. Returns the number of cpus that can be used by
+ *                   this node AND a bitmap of the selected cores.
+ *
+ * IN job_ptr       - pointer to job requirements
+ * IN/OUT core_map  - bitmap of cores available for use/selected for use
+ * IN part_core_map - bitmap of cores already allocated from this partition
+ * IN node_i        - index of node to be evaluated
+ * IN cpu_type      - if true, allocate CPUs rather than cores
+ */
+uint16_t _allocate_cores(struct job_record *job_ptr, bitstr_t *core_map,
+			 bitstr_t *part_core_map, const uint32_t node_i,
+			 bool cpu_type)
+{
+	return _allocate_sc(job_ptr, core_map, part_core_map, node_i, false);
+}
+
+/* _allocate_sc - Given the job requirements, determine which cores/sockets
+ *                from the given node can be allocated (if any) to this
+ *                job. Returns the number of cpus that can be used by
+ *                this node AND a bitmap of the selected cores.
+ *
+ * IN job_ptr       - pointer to job requirements
+ * IN/OUT core_map  - bitmap of cores available for use/selected for use
+ * IN part_core_map - bitmap of cores already allocated from this partition
+ * IN node_i        - index of node to be evaluated
+ * IN entire_sockets_only - if true, allocate cores only on sockets that
+ *                        - have no other allocated cores.
+ */
+static uint16_t _allocate_sc(struct job_record *job_ptr, bitstr_t *core_map,
+			      bitstr_t *part_core_map, const uint32_t node_i,
+			      bool entire_sockets_only)
+{
 	uint16_t cpu_count = 0, cpu_cnt = 0;
 	uint16_t si, cps, avail_cpus = 0, num_tasks = 0;
 	uint32_t core_begin    = cr_get_coremap_offset(node_i);
@@ -231,9 +272,11 @@ uint16_t _allocate_sockets(struct job_record *job_ptr, bitstr_t *core_map,
 		if (part_core_map && bit_test(part_core_map, c))
 			used_cpu_array[i]++;
 	}
-	/* if a socket is already in use, it cannot be used by this job */
+
 	for (i = 0; i < sockets; i++) {
-		if (used_cores[i]) {
+		/* if a socket is already in use and entire_sockets_only is
+		 * enabled, it cannot be used by this job */
+		if (entire_sockets_only && used_cores[i]) {
 			free_core_count -= free_cores[i];
 			used_cores[i] += free_cores[i];
 			free_cores[i] = 0;
@@ -319,11 +362,12 @@ uint16_t _allocate_sockets(struct job_record *job_ptr, bitstr_t *core_map,
 	} else {
 		j = avail_cpus / cpus_per_task;
 		num_tasks = MIN(num_tasks, j);
-		if (job_ptr->details->ntasks_per_node)
+		if (job_ptr->details->ntasks_per_node)     /* only in _allocate_sockets */
 			avail_cpus = num_tasks * cpus_per_task;
 	}
 	if ((job_ptr->details->ntasks_per_node &&
-	     (num_tasks < job_ptr->details->ntasks_per_node)) ||
+	     (num_tasks < job_ptr->details->ntasks_per_node) &&
+	     (job_ptr->details->overcommit == 0)) ||        /* only in _allocate_cores */
 	    (job_ptr->details->pn_min_cpus &&
 	     (avail_cpus < job_ptr->details->pn_min_cpus))) {
 		/* insufficient resources on this node */
@@ -392,224 +436,12 @@ fini:
 	}
 	xfree(free_cores);
 	return cpu_count;
-}
-
-
-/* _allocate_cores - Given the job requirements, determine which cores
- *                   from the given node can be allocated (if any) to this
- *                   job. Returns the number of cpus that can be used by
- *                   this node AND a bitmap of the selected cores.
- *
- * IN job_ptr       - pointer to job requirements
- * IN/OUT core_map  - bitmap of cores available for use/selected for use
- * IN part_core_map - bitmap of cores already allocated from this partition
- * IN node_i        - index of node to be evaluated
- * IN cpu_type      - if true, allocate CPUs rather than cores
- */
-uint16_t _allocate_cores(struct job_record *job_ptr, bitstr_t *core_map,
-			 bitstr_t *part_core_map, const uint32_t node_i,
-			 bool cpu_type)
-{
-	uint16_t avail_cpus = 0, num_tasks = 0, total_cpus = 0;
-	uint32_t core_begin    = cr_get_coremap_offset(node_i);
-	uint32_t core_end      = cr_get_coremap_offset(node_i+1);
-	uint32_t c;
-	uint16_t cpus_per_task = job_ptr->details->cpus_per_task;
-	uint16_t *free_cores, free_core_count = 0;
-	uint16_t i, j, sockets    = select_node_record[node_i].sockets;
-	uint16_t cores_per_socket = select_node_record[node_i].cores;
-	uint16_t threads_per_core = select_node_record[node_i].vpus;
-	uint16_t min_cores = 1, min_sockets = 1;
-	uint16_t ntasks_per_core = 0xffff;
-	uint32_t free_cpu_count = 0, used_cpu_count = 0;
-
-	if (job_ptr->details && job_ptr->details->mc_ptr) {
-		multi_core_data_t *mc_ptr = job_ptr->details->mc_ptr;
-		if (mc_ptr->cores_per_socket != (uint16_t) NO_VAL) {
-			min_cores   = mc_ptr->cores_per_socket;
-		}
-		if (mc_ptr->sockets_per_node != (uint16_t) NO_VAL) {
-			min_sockets = mc_ptr->sockets_per_node;
-		}
-		if (mc_ptr->ntasks_per_core) {
-			ntasks_per_core = mc_ptr->ntasks_per_core;
-		}
-		if ((mc_ptr->threads_per_core != (uint16_t) NO_VAL) &&
-		    (mc_ptr->threads_per_core <  ntasks_per_core)) {
-			ntasks_per_core = mc_ptr->threads_per_core;
-		}
-	}
-
-	/* These are the job parameters that we must respect:
-	 *
-	 *   job_ptr->details->mc_ptr->cores_per_socket (cr_core|cr_socket)
-	 *	- number of cores per socket to allocate to this jobb
-	 *   job_ptr->details->mc_ptr->sockets_per_node (cr_core|cr_socket)
-	 *	- number of sockets per node to allocate to this job
-	 *   job_ptr->details->mc_ptr->ntasks_per_core (cr_core|cr_socket)
-	 *	- number of tasks to launch per core
-	 *   job_ptr->details->mc_ptr->ntasks_per_socket (cr_core|cr_socket)
-	 *	- number of tasks to launch per socket
-	 *
-	 *   job_ptr->details->ntasks_per_node (all cr_types)
-	 *	- total number of tasks to launch on this node
-	 *   job_ptr->details->cpus_per_task (all cr_types)
-	 *	- number of cpus to allocate per task
-	 *
-	 * These are the hardware constraints:
-	 *   cpus = sockets * cores_per_socket * threads_per_core
-	 *
-	 * These are the cores that are available for use: core_map
-	 *
-	 * NOTE: currently we only allocate at the socket level, the core
-	 *       level, or the cpu level. When hyperthreading is enabled
-	 *       in the BIOS, then there can be more than one thread/cpu
-	 *       per physical core.
-	 *
-	 * PROCEDURE:
-	 *
-	 * Step 1: Determine the current usage data: free_cores[] and
-	 *         free_core_count
-	 *
-	 * Step 2: check min_cores per socket and min_sockets per node
-	 *
-	 * Step 3: Compute task-related data: use ntasks_per_core,
-	 *         ntasks_per_node and cpus_per_task to determine
-	 *         the number of tasks that can run on this node
-	 *
-	 * Step 4: Mark the allocated resources in the job_cores bitmap
-	 *         and return "num_tasks" from Step 3.
-	 *
-	 *
-	 * Start by assuming that all "free" cores will be given to the
-	 * job. Check min_* to ensure that there's enough resources.
-	 * Reduce the core count to match max_* (if necessary). Also,
-	 * reduce the core count (if necessary) to match ntasks_per_core.
-	 * Note that we're not processing ntasks_per_socket, because the
-	 * srun manpage says that ntasks_per_socket is only valid for
-	 * CR_SOCKET.
-	 */
-
-	/* Step 1: create and compute core-count-per-socket
-	 * arrays and total core counts */
-	free_cores = xmalloc(sockets * sizeof(uint16_t));
-
-	for (c = core_begin; c < core_end; c++) {
-		i = (uint16_t) (c - core_begin) / cores_per_socket;
-		if (bit_test(core_map, c)) {
-			free_cores[i]++;
-			free_core_count++;
-			free_cpu_count += threads_per_core;
-		}
-		if (part_core_map && bit_test(part_core_map, c))
-			used_cpu_count += threads_per_core;
-
-	}
-	if ((job_ptr->part_ptr->max_cpus_per_node != INFINITE) &&
-	    (free_cpu_count + used_cpu_count >
-	     job_ptr->part_ptr->max_cpus_per_node)) {
-		int excess = free_cpu_count + used_cpu_count -
-			     job_ptr->part_ptr->max_cpus_per_node;
-		for (c = core_begin; c < core_end; c++) {
-			i = (uint16_t) (c - core_begin) / cores_per_socket;
-			if (free_cores[i] > 0) {
-				free_cores[i]--;
-				free_core_count--;
-				excess -= threads_per_core;
-				if (excess <= 0)
-					break;
-			}
-		}
-	}
-
-	/* Step 2: check min_cores per socket and min_sockets per node */
-	j = 0;
-	for (i = 0; i < sockets; i++) {
-		if (free_cores[i] < min_cores) {
-			/* cannot use this socket */
-			free_core_count -= free_cores[i];
-			free_cores[i] = 0;
-			continue;
-		}
-		/* count this socket as usable */
-		j++;
-	}
-	if (j < min_sockets) {
-		/* cannot use this node */
-		num_tasks = 0;
-		goto fini;
-	}
-
-	if (free_core_count < 1) {
-		/* no available resources on this node */
-		num_tasks = 0;
-		goto fini;
-	}
-
-	/* Step 3: Compute task-related data: use ntasks_per_core,
-	 *         ntasks_per_node and cpus_per_task to determine
-	 *         the number of tasks to run on this node
-	 *
-	 * Note: cpus_per_task and ntasks_per_core need to play nice
-	 *       2 tasks_per_core vs. 2 cpus_per_task
-	 */
-	threads_per_core = MIN(threads_per_core, ntasks_per_core);
-	num_tasks = avail_cpus = threads_per_core;
-
-	/* convert from PER_CORE to TOTAL_FOR_NODE */
-	avail_cpus *= free_core_count;
-	total_cpus = avail_cpus;
-	num_tasks  *= free_core_count;
-
-	/* If job requested exclusive rights to the node don't do the min here
-	 * since it will make it so we don't allocate the entire node */
-	if (job_ptr->details->ntasks_per_node && job_ptr->details->shared)
-		num_tasks = MIN(num_tasks, job_ptr->details->ntasks_per_node);
-
-	if (cpus_per_task < 2) {
-		avail_cpus = num_tasks;
-	} else {
-		j = avail_cpus / cpus_per_task;
-		num_tasks = MIN(num_tasks, j);
-	}
-
-	if ((job_ptr->details->ntasks_per_node &&
-	     (num_tasks < job_ptr->details->ntasks_per_node) &&
-	     (job_ptr->details->overcommit == 0)) ||
-	    (job_ptr->details->pn_min_cpus &&
-	     (avail_cpus < job_ptr->details->pn_min_cpus))) {
-		/* insufficient resources on this node */
-		num_tasks = 0;
-		goto fini;
-	}
-
-	/* Step 4 */
-	for (c = core_begin; c < core_end && avail_cpus > 0; c++) {
-		if (bit_test(core_map, c) == 0)
-			continue;
-		i = (uint16_t) (c - core_begin) / cores_per_socket;
-		if (free_cores[i] == 0)
-			bit_clear(core_map, c);
-		else {
-			free_cores[i]--;
-			if (avail_cpus >= threads_per_core)
-				avail_cpus -= threads_per_core;
-			else
-				avail_cpus = 0;
-		}
-	}
-
-	/* clear leftovers */
-	if (c < core_end)
-		bit_nclear(core_map, c, core_end-1);
-
-fini:
-	if (!num_tasks)
-		bit_nclear(core_map, core_begin, core_end-1);
-	xfree(free_cores);
+/*
+ * Return sequence in _allocate_cores:
 	if ((job_ptr->details->shared == 0) && num_tasks)
 		return total_cpus;
 	return num_tasks * cpus_per_task;
+*/
 }
 
 
-- 
1.7.9.5

Reply via email to