On 11/23/2017 03:34 PM, Christoph Hellwig wrote:
> FYI, the patch below changes both the irq and block mappings to
> always use the cpu possible map (should be split in two in due time).
> 
> I think this is the right way forward.  For every normal machine
> the two maps are identical, but for VMs booted with maxcpus above
> their actual CPU count, or big iron that can hot-add more CPUs, it
> means we waste a few more resources on the reserved but not present
> CPUs.  It fixes the reported issue for me:
> 
> diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
> index 9f8cffc8a701..3eb169f15842 100644
> --- a/block/blk-mq-cpumap.c
> +++ b/block/blk-mq-cpumap.c
> @@ -16,11 +16,6 @@
>  
>  static int cpu_to_queue_index(unsigned int nr_queues, const int cpu)
>  {
> -     /*
> -      * Non present CPU will be mapped to queue index 0.
> -      */
> -     if (!cpu_present(cpu))
> -             return 0;
>       return cpu % nr_queues;
>  }
>  
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 11097477eeab..612ce1fb7c4e 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2114,16 +2114,11 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
>               INIT_LIST_HEAD(&__ctx->rq_list);
>               __ctx->queue = q;
>  
> -             /* If the cpu isn't present, the cpu is mapped to first hctx */
> -             if (!cpu_present(i))
> -                     continue;
> -
> -             hctx = blk_mq_map_queue(q, i);
> -
>               /*
>                * Set local node, IFF we have more than one hw queue. If
>                * not, we remain on the home node of the device
>                */
> +             hctx = blk_mq_map_queue(q, i);
>               if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
>                       hctx->numa_node = local_memory_node(cpu_to_node(i));
>       }
> @@ -2180,7 +2175,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
>        *
>        * If the cpu isn't present, the cpu is mapped to first hctx.
>        */
> -     for_each_present_cpu(i) {
> +     for_each_possible_cpu(i) {
>               hctx_idx = q->mq_map[i];
>               /* unmapped hw queue can be remapped after CPU topo changed */
>               if (!set->tags[hctx_idx] &&
> diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
> index e12d35108225..a37a3b4b6342 100644
> --- a/kernel/irq/affinity.c
> +++ b/kernel/irq/affinity.c
> @@ -39,7 +39,7 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
>       }
>  }
>  
> -static cpumask_var_t *alloc_node_to_present_cpumask(void)
> +static cpumask_var_t *alloc_node_to_possible_cpumask(void)
>  {
>       cpumask_var_t *masks;
>       int node;
> @@ -62,7 +62,7 @@ static cpumask_var_t *alloc_node_to_present_cpumask(void)
>       return NULL;
>  }
>  
> -static void free_node_to_present_cpumask(cpumask_var_t *masks)
> +static void free_node_to_possible_cpumask(cpumask_var_t *masks)
>  {
>       int node;
>  
> @@ -71,22 +71,22 @@ static void free_node_to_present_cpumask(cpumask_var_t *masks)
>       kfree(masks);
>  }
>  
> -static void build_node_to_present_cpumask(cpumask_var_t *masks)
> +static void build_node_to_possible_cpumask(cpumask_var_t *masks)
>  {
>       int cpu;
>  
> -     for_each_present_cpu(cpu)
> +     for_each_possible_cpu(cpu)
>               cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
>  }
>  
> -static int get_nodes_in_cpumask(cpumask_var_t *node_to_present_cpumask,
> +static int get_nodes_in_cpumask(cpumask_var_t *node_to_possible_cpumask,
>                               const struct cpumask *mask, nodemask_t *nodemsk)
>  {
>       int n, nodes = 0;
>  
>       /* Calculate the number of nodes in the supplied affinity mask */
>       for_each_node(n) {
> -             if (cpumask_intersects(mask, node_to_present_cpumask[n])) {
> +             if (cpumask_intersects(mask, node_to_possible_cpumask[n])) {
>                       node_set(n, *nodemsk);
>                       nodes++;
>               }
> @@ -109,7 +109,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
>       int last_affv = affv + affd->pre_vectors;
>       nodemask_t nodemsk = NODE_MASK_NONE;
>       struct cpumask *masks;
> -     cpumask_var_t nmsk, *node_to_present_cpumask;
> +     cpumask_var_t nmsk, *node_to_possible_cpumask;
>  
>       /*
>        * If there aren't any vectors left after applying the pre/post
> @@ -125,8 +125,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
>       if (!masks)
>               goto out;
>  
> -     node_to_present_cpumask = alloc_node_to_present_cpumask();
> -     if (!node_to_present_cpumask)
> +     node_to_possible_cpumask = alloc_node_to_possible_cpumask();
> +     if (!node_to_possible_cpumask)
>               goto out;
>  
>       /* Fill out vectors at the beginning that don't need affinity */
> @@ -135,8 +135,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
>  
>       /* Stabilize the cpumasks */
>       get_online_cpus();
> -     build_node_to_present_cpumask(node_to_present_cpumask);
> -     nodes = get_nodes_in_cpumask(node_to_present_cpumask, cpu_present_mask,
> +     build_node_to_possible_cpumask(node_to_possible_cpumask);
> +     nodes = get_nodes_in_cpumask(node_to_possible_cpumask, cpu_possible_mask,
>                                    &nodemsk);
>  
>       /*
> @@ -146,7 +146,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
>       if (affv <= nodes) {
>               for_each_node_mask(n, nodemsk) {
>                       cpumask_copy(masks + curvec,
> -                                  node_to_present_cpumask[n]);
> +                                  node_to_possible_cpumask[n]);
>                       if (++curvec == last_affv)
>                               break;
>               }
> @@ -160,7 +160,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
>               vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes;
>  
>               /* Get the cpus on this node which are in the mask */
> -             cpumask_and(nmsk, cpu_present_mask, node_to_present_cpumask[n]);
> +             cpumask_and(nmsk, cpu_possible_mask, node_to_possible_cpumask[n]);
>  
>               /* Calculate the number of cpus per vector */
>               ncpus = cpumask_weight(nmsk);
> @@ -192,7 +192,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
>       /* Fill out vectors at the end that don't need affinity */
>       for (; curvec < nvecs; curvec++)
>               cpumask_copy(masks + curvec, irq_default_affinity);
> -     free_node_to_present_cpumask(node_to_present_cpumask);
> +     free_node_to_possible_cpumask(node_to_possible_cpumask);
>  out:
>       free_cpumask_var(nmsk);
>       return masks;
> @@ -214,7 +214,7 @@ int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity
>               return 0;
>  
>       get_online_cpus();
> -     ret = min_t(int, cpumask_weight(cpu_present_mask), vecs) + resv;
> +     ret = min_t(int, cpumask_weight(cpu_possible_mask), vecs) + resv;
>       put_online_cpus();
>       return ret;
>  }
> 
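To make the effect of the cpu_to_queue_index() change above concrete, here is a
minimal userspace sketch (not kernel code); the queue and CPU counts are made up
for illustration, and cpu_present() below is just a stand-in for the kernel helper:

/*
 * Minimal userspace sketch of the mapping change, assuming 4 hw queues
 * and 8 possible CPUs of which only 2 are present.
 */
#include <stdio.h>
#include <stdbool.h>

#define NR_QUEUES	4
#define NR_POSSIBLE	8
#define NR_PRESENT	2

static bool cpu_present(int cpu)
{
	return cpu < NR_PRESENT;
}

/* old mapping: every non-present CPU collapses onto queue 0 */
static int old_cpu_to_queue_index(int cpu)
{
	if (!cpu_present(cpu))
		return 0;
	return cpu % NR_QUEUES;
}

/* new mapping: every possible CPU gets a round-robin queue index */
static int new_cpu_to_queue_index(int cpu)
{
	return cpu % NR_QUEUES;
}

int main(void)
{
	int cpu;

	for (cpu = 0; cpu < NR_POSSIBLE; cpu++)
		printf("cpu %d: old queue %d, new queue %d\n", cpu,
		       old_cpu_to_queue_index(cpu),
		       new_cpu_to_queue_index(cpu));
	return 0;
}

With these numbers the old code maps CPUs 2-7 all onto queue 0, while the new
code spreads every possible CPU round-robin across the four queues.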
What will happen in the CPU hotplug case?
Wouldn't we route I/O to a disabled CPU with this patch?

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                Teamlead Storage & Networking
h...@suse.de                                   +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)
