From: [email protected] <[email protected]> Sent: Wednesday, 
October 1, 2025 10:06 PM
> 
> When selecting an outgoing channel for I/O, storvsc tries to select a
> channel whose returning CPU is not the same as the issuing CPU. This
> worked well in the past, but it doesn't work well when Hyper-V exposes
> a large number of channels (up to the number of CPUs). Using a
> different CPU for the returning channel is not efficient on Hyper-V.
> 
> Change this behavior by preferring the channel with the same CPU as
> the current I/O issuing CPU whenever possible.
> 
> Tests have shown improvements in newer Hyper-V/Azure environments, and
> no regressions with older Hyper-V/Azure environments.
> 
> Tested-by: Raheel Abdul Faizy <[email protected]>
> Signed-off-by: Long Li <[email protected]>
> ---
>  drivers/scsi/storvsc_drv.c | 96 ++++++++++++++++++--------------------
>  1 file changed, 45 insertions(+), 51 deletions(-)
> 
> diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
> index d9e59204a9c3..092939791ea0 100644
> --- a/drivers/scsi/storvsc_drv.c
> +++ b/drivers/scsi/storvsc_drv.c
> @@ -1406,14 +1406,19 @@ static struct vmbus_channel *get_og_chn(struct storvsc_device *stor_device,
>       }
> 
>       /*
> -      * Our channel array is sparsley populated and we
> +      * Our channel array could be sparsely populated and we
>        * initiated I/O on a processor/hw-q that does not
>        * currently have a designated channel. Fix this.
>        * The strategy is simple:
> -      * I. Ensure NUMA locality
> -      * II. Distribute evenly (best effort)
> +      * I. Prefer the channel associated with the current CPU
> +      * II. Ensure NUMA locality
> +      * III. Distribute evenly (best effort)
>        */
> 
> +     /* Prefer the channel on the I/O issuing processor/hw-q */
> +     if (cpumask_test_cpu(q_num, &stor_device->alloced_cpus))
> +             return stor_device->stor_chns[q_num];
> +

Hmmm. When get_og_chn() is called, we know that
stor_device->stor_chns[q_num] is NULL, since storvsc_do_io() has
already handled the non-NULL case. And the checks are all done with
stor_device->lock held, so the stor_chns array can't change. Hence
the above code will return NULL, which will cause a NULL pointer
dereference when storvsc_do_io() sends out the VMBus packet.
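
For reference, the path that leads into get_og_chn() looks roughly
like this (condensed from storvsc_do_io(); not the exact driver code):

	outgoing_channel = READ_ONCE(stor_device->stor_chns[q_num]);
	if (outgoing_channel != NULL) {
		/* the non-NULL case is handled entirely here and
		 * never reaches get_og_chn()
		 */
		...
	} else {
		spin_lock_irqsave(&stor_device->lock, flags);
		/* re-check under the lock */
		outgoing_channel = stor_device->stor_chns[q_num];
		if (outgoing_channel == NULL) {
			/* stor_chns[q_num] is known NULL at this point */
			outgoing_channel = get_og_chn(stor_device, q_num);
		}
		spin_unlock_irqrestore(&stor_device->lock, flags);
	}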

My recollection is that get_og_chn() is called when there is no
channel that interrupts the current CPU (that's what it means
for stor_device->stor_chns[<current CPU>] to be NULL). So the
algorithm must pick a channel that interrupts some other CPU,
preferably a CPU in the current NUMA node. Adding code to prefer
the channel associated with the current CPU doesn't make sense in
get_og_chn(), as get_og_chn() is only called when it is already 
known that there is no such channel.
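
My understanding of the strategy in the remainder of get_og_chn() is
roughly the following (paraphrased sketch, not the actual code):

	node_mask = cpumask_of_node(cpu_to_node(q_num));

	/* count channels whose interrupt CPU is on this NUMA node */
	num_channels = 0;
	for_each_cpu(tgt_cpu, &stor_device->alloced_cpus) {
		if (cpumask_test_cpu(tgt_cpu, node_mask))
			num_channels++;
	}

	/* no node-local channel: fall back to the primary channel */
	if (num_channels == 0)
		return stor_device->device->channel;

	/*
	 * otherwise spread q_num across the node-local channels
	 * (best effort), cache the choice in stor_chns[q_num],
	 * and return it
	 */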

Or is there a case that I'm missing? Regardless, the above code
seems problematic because it would return NULL.

Michael

>       node_mask = cpumask_of_node(cpu_to_node(q_num));
> 
>       num_channels = 0;
> @@ -1469,59 +1474,48 @@ static int storvsc_do_io(struct hv_device *device,
>       /* See storvsc_change_target_cpu(). */
>       outgoing_channel = READ_ONCE(stor_device->stor_chns[q_num]);
>       if (outgoing_channel != NULL) {
> -             if (outgoing_channel->target_cpu == q_num) {
> -                     /*
> -                      * Ideally, we want to pick a different channel if
> -                      * available on the same NUMA node.
> -                      */
> -                     node_mask = cpumask_of_node(cpu_to_node(q_num));
> -                     for_each_cpu_wrap(tgt_cpu,
> -                              &stor_device->alloced_cpus, q_num + 1) {
> -                             if (!cpumask_test_cpu(tgt_cpu, node_mask))
> -                                     continue;
> -                             if (tgt_cpu == q_num)
> -                                     continue;
> -                             channel = READ_ONCE(
> -                                     stor_device->stor_chns[tgt_cpu]);
> -                             if (channel == NULL)
> -                                     continue;
> -                             if (hv_get_avail_to_write_percent(
> -                                                     &channel->outbound)
> -                                             > ring_avail_percent_lowater) {
> -                                     outgoing_channel = channel;
> -                                     goto found_channel;
> -                             }
> -                     }
> +             if (hv_get_avail_to_write_percent(&outgoing_channel->outbound)
> +                             > ring_avail_percent_lowater)
> +                     goto found_channel;
> 
> -                     /*
> -                      * All the other channels on the same NUMA node are
> -                      * busy. Try to use the channel on the current CPU
> -                      */
> -                     if (hv_get_avail_to_write_percent(
> -                                             &outgoing_channel->outbound)
> -                                     > ring_avail_percent_lowater)
> +             /*
> +              * Channel is busy, try to find a channel on the same NUMA node
> +              */
> +             node_mask = cpumask_of_node(cpu_to_node(q_num));
> +             for_each_cpu_wrap(tgt_cpu, &stor_device->alloced_cpus,
> +                               q_num + 1) {
> +                     if (!cpumask_test_cpu(tgt_cpu, node_mask))
> +                             continue;
> +                     channel = READ_ONCE(stor_device->stor_chns[tgt_cpu]);
> +                     if (!channel)
> +                             continue;
> +                     if (hv_get_avail_to_write_percent(&channel->outbound)
> +                                     > ring_avail_percent_lowater) {
> +                             outgoing_channel = channel;
>                               goto found_channel;
> +                     }
> +             }
> 
> -                     /*
> -                      * If we reach here, all the channels on the current
> -                      * NUMA node are busy. Try to find a channel in
> -                      * other NUMA nodes
> -                      */
> -                     for_each_cpu(tgt_cpu, &stor_device->alloced_cpus) {
> -                             if (cpumask_test_cpu(tgt_cpu, node_mask))
> -                                     continue;
> -                             channel = READ_ONCE(
> -                                     stor_device->stor_chns[tgt_cpu]);
> -                             if (channel == NULL)
> -                                     continue;
> -                             if (hv_get_avail_to_write_percent(
> -                                                     &channel->outbound)
> -                                             > ring_avail_percent_lowater) {
> -                                     outgoing_channel = channel;
> -                                     goto found_channel;
> -                             }
> +             /*
> +              * If we reach here, all the channels on the current
> +              * NUMA node are busy. Try to find a channel in
> +              * all NUMA nodes
> +              */
> +             for_each_cpu_wrap(tgt_cpu, &stor_device->alloced_cpus,
> +                               q_num + 1) {
> +                     channel = READ_ONCE(stor_device->stor_chns[tgt_cpu]);
> +                     if (!channel)
> +                             continue;
> +                     if (hv_get_avail_to_write_percent(&channel->outbound)
> +                                     > ring_avail_percent_lowater) {
> +                             outgoing_channel = channel;
> +                             goto found_channel;
>                       }
>               }
> +             /*
> +              * If we reach here, all the channels are busy. Use the
> +              * original channel found.
> +              */
>       } else {
>               spin_lock_irqsave(&stor_device->lock, flags);
>               outgoing_channel = stor_device->stor_chns[q_num];
> --
> 2.34.1
> 

