On 11/23/2017 03:34 PM, Christoph Hellwig wrote:
> FYI, the patch below changes both the irq and block mappings to
> always use the cpu possible map (should be split in two in due time).
>
> I think this is the right way forward. For every normal machine
> those two are the same, but for VMs with maxcpus above their normal
> count or some big iron that can grow more cpus it means we waste
> a few more resources for the not present but reserved cpus. It
> fixes the reported issue for me:
>
> diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
> index 9f8cffc8a701..3eb169f15842 100644
> --- a/block/blk-mq-cpumap.c
> +++ b/block/blk-mq-cpumap.c
> @@ -16,11 +16,6 @@
>  
>  static int cpu_to_queue_index(unsigned int nr_queues, const int cpu)
>  {
> -	/*
> -	 * Non present CPU will be mapped to queue index 0.
> -	 */
> -	if (!cpu_present(cpu))
> -		return 0;
>  	return cpu % nr_queues;
>  }
>  
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 11097477eeab..612ce1fb7c4e 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2114,16 +2114,11 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
>  		INIT_LIST_HEAD(&__ctx->rq_list);
>  		__ctx->queue = q;
>  
> -		/* If the cpu isn't present, the cpu is mapped to first hctx */
> -		if (!cpu_present(i))
> -			continue;
> -
> -		hctx = blk_mq_map_queue(q, i);
> -
>  		/*
>  		 * Set local node, IFF we have more than one hw queue. If
>  		 * not, we remain on the home node of the device
>  		 */
> +		hctx = blk_mq_map_queue(q, i);
>  		if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
>  			hctx->numa_node = local_memory_node(cpu_to_node(i));
>  	}
> @@ -2180,7 +2175,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
>  	 *
>  	 * If the cpu isn't present, the cpu is mapped to first hctx.
>  	 */
> -	for_each_present_cpu(i) {
> +	for_each_possible_cpu(i) {
>  		hctx_idx = q->mq_map[i];
>  		/* unmapped hw queue can be remapped after CPU topo changed */
>  		if (!set->tags[hctx_idx] &&
> diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
> index e12d35108225..a37a3b4b6342 100644
> --- a/kernel/irq/affinity.c
> +++ b/kernel/irq/affinity.c
> @@ -39,7 +39,7 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
>  	}
>  }
>  
> -static cpumask_var_t *alloc_node_to_present_cpumask(void)
> +static cpumask_var_t *alloc_node_to_possible_cpumask(void)
>  {
>  	cpumask_var_t *masks;
>  	int node;
> @@ -62,7 +62,7 @@ static cpumask_var_t *alloc_node_to_present_cpumask(void)
>  	return NULL;
>  }
>  
> -static void free_node_to_present_cpumask(cpumask_var_t *masks)
> +static void free_node_to_possible_cpumask(cpumask_var_t *masks)
>  {
>  	int node;
>  
> @@ -71,22 +71,22 @@ static void free_node_to_present_cpumask(cpumask_var_t *masks)
>  	kfree(masks);
>  }
>  
> -static void build_node_to_present_cpumask(cpumask_var_t *masks)
> +static void build_node_to_possible_cpumask(cpumask_var_t *masks)
>  {
>  	int cpu;
>  
> -	for_each_present_cpu(cpu)
> +	for_each_possible_cpu(cpu)
>  		cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
>  }
>  
> -static int get_nodes_in_cpumask(cpumask_var_t *node_to_present_cpumask,
> +static int get_nodes_in_cpumask(cpumask_var_t *node_to_possible_cpumask,
>  				const struct cpumask *mask, nodemask_t *nodemsk)
>  {
>  	int n, nodes = 0;
>  
>  	/* Calculate the number of nodes in the supplied affinity mask */
>  	for_each_node(n) {
> -		if (cpumask_intersects(mask, node_to_present_cpumask[n])) {
> +		if (cpumask_intersects(mask, node_to_possible_cpumask[n])) {
>  			node_set(n, *nodemsk);
>  			nodes++;
>  		}
> @@ -109,7 +109,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
>  	int last_affv = affv + affd->pre_vectors;
>  	nodemask_t nodemsk = NODE_MASK_NONE;
>  	struct cpumask *masks;
> -	cpumask_var_t nmsk, *node_to_present_cpumask;
> +	cpumask_var_t nmsk, *node_to_possible_cpumask;
>  
>  	/*
>  	 * If there aren't any vectors left after applying the pre/post
> @@ -125,8 +125,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
>  	if (!masks)
>  		goto out;
>  
> -	node_to_present_cpumask = alloc_node_to_present_cpumask();
> -	if (!node_to_present_cpumask)
> +	node_to_possible_cpumask = alloc_node_to_possible_cpumask();
> +	if (!node_to_possible_cpumask)
>  		goto out;
>  
>  	/* Fill out vectors at the beginning that don't need affinity */
> @@ -135,8 +135,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
>  
>  	/* Stabilize the cpumasks */
>  	get_online_cpus();
> -	build_node_to_present_cpumask(node_to_present_cpumask);
> -	nodes = get_nodes_in_cpumask(node_to_present_cpumask, cpu_present_mask,
> +	build_node_to_possible_cpumask(node_to_possible_cpumask);
> +	nodes = get_nodes_in_cpumask(node_to_possible_cpumask, cpu_possible_mask,
>  				     &nodemsk);
>  
>  	/*
> @@ -146,7 +146,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
>  	if (affv <= nodes) {
>  		for_each_node_mask(n, nodemsk) {
>  			cpumask_copy(masks + curvec,
> -				     node_to_present_cpumask[n]);
> +				     node_to_possible_cpumask[n]);
>  			if (++curvec == last_affv)
>  				break;
>  		}
> @@ -160,7 +160,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
>  		vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes;
>  
>  		/* Get the cpus on this node which are in the mask */
> -		cpumask_and(nmsk, cpu_present_mask, node_to_present_cpumask[n]);
> +		cpumask_and(nmsk, cpu_possible_mask, node_to_possible_cpumask[n]);
>  
>  		/* Calculate the number of cpus per vector */
>  		ncpus = cpumask_weight(nmsk);
> @@ -192,7 +192,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
>  	/* Fill out vectors at the end that don't need affinity */
>  	for (; curvec < nvecs; curvec++)
>  		cpumask_copy(masks + curvec, irq_default_affinity);
> -	free_node_to_present_cpumask(node_to_present_cpumask);
> +	free_node_to_possible_cpumask(node_to_possible_cpumask);
>  out:
>  	free_cpumask_var(nmsk);
>  	return masks;
> @@ -214,7 +214,7 @@ int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity
>  		return 0;
>  
>  	get_online_cpus();
> -	ret = min_t(int, cpumask_weight(cpu_present_mask), vecs) + resv;
> +	ret = min_t(int, cpumask_weight(cpu_possible_mask), vecs) + resv;
>  	put_online_cpus();
>  	return ret;
>  }
>
What will happen for the CPU hotplug case?
Wouldn't we route I/O to a disabled CPU with this patch?
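
To make the question concrete, here is a minimal user-space sketch (not part of
the patch; the helper merely mirrors the patched cpu_to_queue_index(), and the
CPU/queue counts are invented for illustration). With the possible-map based
mapping, every possible CPU, online or not, is assigned a regular hctx instead
of falling back to queue 0:

/*
 * Hypothetical illustration only: cpu_to_queue_index() below copies the
 * patched block-layer helper; nr_queues, nr_possible and nr_online are
 * made-up values standing in for a VM booted with maxcpus > online cpus.
 */
#include <stdio.h>

static int cpu_to_queue_index(unsigned int nr_queues, const int cpu)
{
	/* No cpu_present() check any more: every possible CPU gets a queue. */
	return cpu % nr_queues;
}

int main(void)
{
	const unsigned int nr_queues = 4;	/* assumed number of hw queues */
	const int nr_possible = 8;		/* e.g. maxcpus=8 in the VM */
	const int nr_online = 2;		/* only two CPUs currently online */
	int cpu;

	for (cpu = 0; cpu < nr_possible; cpu++)
		printf("cpu %d (%s) -> hctx %d\n", cpu,
		       cpu < nr_online ? "online" : "offline/possible",
		       cpu_to_queue_index(nr_queues, cpu));
	return 0;
}

Running this maps CPUs 2..7, which are possible but offline, to hctxs of their
own; the hotplug question is what happens to a queue whose mapped CPUs are all
offline at the time I/O is issued or completed.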
Cheers,

Hannes
-- 
Dr. Hannes Reinecke                Teamlead Storage & Networking
h...@suse.de                       +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)