Re: [PATCH vhost v2 0/7] vdpa/mlx5: Optimze MKEY operations

2024-09-11 Thread Dragos Tatulea



On 11.09.24 10:02, Eugenio Perez Martin wrote:
> On Mon, Sep 9, 2024 at 11:30 AM Dragos Tatulea  wrote:
>>
>>
>>
>> On 30.08.24 12:58, Dragos Tatulea wrote:
>>> This series improves the time of .set_map() operations by parallelizing
>>> the MKEY creation and deletion for direct MKEYs. Looking at the top
>>> level MKEY creation/deletion functions, the following improvement can be
>>> seen:
>>>
>>> |---+-|
>>> | operation | improvement |
>>> |---+-|
>>> | create_user_mr()  | 3-5x|
>>> | destroy_user_mr() | 8x  |
>>> |---+-|
>>>
>>> The last part of the series introduces lazy MKEY deletion which
>>> postpones the MKEY deletion to a later point in a workqueue.
>>>
>>> As this series and the previous ones were targeting live migration,
>>> we can also observe improvements on this front:
>>>
>>> |---+--+--|
>>> | Stage | Downtime #1 (ms) | Downtime #2 (ms) |
>>> |---+--+--|
>>> | Baseline  | 3140 | 3630 |
>>> | Parallel MKEY ops | 1200 | 2000 |
>>> | Deferred deletion | 1014 | 1253 |
>>> |---+--+--|
>>>
>>> Test configuration: 256 GB VM, 32 CPUs x 2 threads per core, 4 x mlx5
>>> vDPA devices x 32 VQs (16 VQPs)
>>>
>>> This series must be applied on top of the parallel VQ suspend/resume
>>> series [0].
>>>
>>> [0] 
>>> https://lore.kernel.org/all/20240816090159.1967650-1-dtatu...@nvidia.com/
>>>
>>> ---
>>> v2:
>>> - Swapped flex array usage for plain zero length array in first patch.
>>> - Updated code to use Scope-Based Cleanup Helpers where appropriate
>>>   (only second patch).
>>> - Added macro define for MTT alignment in first patch.
>>> - Improved commit messages/comments based on review comments.
>>> - Removed extra newlines.
>> Gentle ping for the remaining patches in v2.
>>
> 
> Same here, this series is already in MST's branch:
> https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git/commit/?h=vhost&id=d424b079e243128383e88bee79f143ff30b4ec62
> 
Ack. Thanks!




Re: [PATCH mlx5-vhost v2 01/10] net/mlx5: Support throttled commands from async API

2024-09-11 Thread Dragos Tatulea



On 11.09.24 10:00, Eugenio Perez Martin wrote:
> On Mon, Sep 9, 2024 at 11:33 AM Dragos Tatulea  wrote:
>>
>>
>>
>> On 16.08.24 11:01, Dragos Tatulea wrote:
>>> Currently, commands that qualify as throttled can't be used via the
>>> async API. That's due to the fact that the throttle semaphore can sleep
>>> but the async API can't.
>>>
>>> This patch allows throttling in the async API by using the tentative
>>> variant of the semaphore and upon failure (semaphore at 0) returns EBUSY
>>> to signal to the caller that they need to wait for the completion of
>>> previously issued commands.
>>>
>>> Furthermore, make sure that the semaphore is released in the callback.
>>>
>>> Signed-off-by: Dragos Tatulea 
>>> Cc: Leon Romanovsky 
>>> Reviewed-by: Tariq Toukan 
>> Same reminder as in v1: Tariq is the maintainer for mlx5 so his review
>> also counts as Acked-by.
>>
> 
> Not sure if it was the case when you send the mail, but this series is
> already in the maintainer's branch:
> * https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git/
> * 
> https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git/commit/?h=vhost&id=691fd851e1bc8ec043798e1ab337305e6291cd6b
It wasn't. Thanks for the notice!

Thanks,
Dragos



Re: [PATCH mlx5-vhost v2 01/10] net/mlx5: Support throttled commands from async API

2024-09-09 Thread Dragos Tatulea



On 16.08.24 11:01, Dragos Tatulea wrote:
> Currently, commands that qualify as throttled can't be used via the
> async API. That's due to the fact that the throttle semaphore can sleep
> but the async API can't.
> 
> This patch allows throttling in the async API by using the tentative
> variant of the semaphore and upon failure (semaphore at 0) returns EBUSY
> to signal to the caller that they need to wait for the completion of
> previously issued commands.
> 
> Furthermore, make sure that the semaphore is released in the callback.
> 
> Signed-off-by: Dragos Tatulea 
> Cc: Leon Romanovsky 
> Reviewed-by: Tariq Toukan 
Same reminder as in v1: Tariq is the maintainer for mlx5 so his review
also counts as Acked-by.

Thanks,
Dragos

> ---
>  drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 21 ++-
>  1 file changed, 16 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c 
> b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
> index 20768ef2e9d2..f69c977c1569 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
> @@ -1882,10 +1882,12 @@ static int cmd_exec(struct mlx5_core_dev *dev, void 
> *in, int in_size, void *out,
>  
>   throttle_op = mlx5_cmd_is_throttle_opcode(opcode);
>   if (throttle_op) {
> - /* atomic context may not sleep */
> - if (callback)
> - return -EINVAL;
> - down(&dev->cmd.vars.throttle_sem);
> + if (callback) {
> + if (down_trylock(&dev->cmd.vars.throttle_sem))
> + return -EBUSY;
> + } else {
> + down(&dev->cmd.vars.throttle_sem);
> + }
>   }
>  
>   pages_queue = is_manage_pages(in);
> @@ -2091,10 +2093,19 @@ static void mlx5_cmd_exec_cb_handler(int status, void 
> *_work)
>  {
>   struct mlx5_async_work *work = _work;
>   struct mlx5_async_ctx *ctx;
> + struct mlx5_core_dev *dev;
> + u16 opcode;
>  
>   ctx = work->ctx;
> - status = cmd_status_err(ctx->dev, status, work->opcode, work->op_mod, 
> work->out);
> + dev = ctx->dev;
> + opcode = work->opcode;
> + status = cmd_status_err(dev, status, work->opcode, work->op_mod, 
> work->out);
>   work->user_callback(status, work);
> + /* Can't access "work" from this point on. It could have been freed in
> +  * the callback.
> +  */
> + if (mlx5_cmd_is_throttle_opcode(opcode))
> + up(&dev->cmd.vars.throttle_sem);
>   if (atomic_dec_and_test(&ctx->num_inflight))
>   complete(&ctx->inflight_done);
>  }




Re: [PATCH vhost v2 0/7] vdpa/mlx5: Optimze MKEY operations

2024-09-09 Thread Dragos Tatulea



On 30.08.24 12:58, Dragos Tatulea wrote:
> This series improves the time of .set_map() operations by parallelizing
> the MKEY creation and deletion for direct MKEYs. Looking at the top
> level MKEY creation/deletion functions, the following improvement can be
> seen:
> 
> |---+-|
> | operation | improvement |
> |---+-|
> | create_user_mr()  | 3-5x|
> | destroy_user_mr() | 8x  |
> |---+-|
> 
> The last part of the series introduces lazy MKEY deletion which
> postpones the MKEY deletion to a later point in a workqueue.
> 
> As this series and the previous ones were targeting live migration,
> we can also observe improvements on this front:
> 
> |---+--+--|
> | Stage | Downtime #1 (ms) | Downtime #2 (ms) |
> |---+--+--|
> | Baseline  | 3140 | 3630 |
> | Parallel MKEY ops | 1200 | 2000 |
> | Deferred deletion | 1014 | 1253 |
> |---+--+--|
> 
> Test configuration: 256 GB VM, 32 CPUs x 2 threads per core, 4 x mlx5
> vDPA devices x 32 VQs (16 VQPs)
> 
> This series must be applied on top of the parallel VQ suspend/resume
> series [0].
> 
> [0] https://lore.kernel.org/all/20240816090159.1967650-1-dtatu...@nvidia.com/
> 
> ---
> v2:
> - Swapped flex array usage for plain zero length array in first patch.
> - Updated code to use Scope-Based Cleanup Helpers where appropriate
>   (only second patch).
> - Added macro define for MTT alignment in first patch.
> - Improved commit messages/comments based on review comments.
> - Removed extra newlines.
Gentle ping for the remaining patches in v2.

Thanks,
Dragos



Re: [PATCH v2 2/2] vdpa: Remove ioctl VHOST_VDPA_SET_CONFIG per spec compliance

2024-09-04 Thread Dragos Tatulea



On 04.09.24 08:34, Jason Wang wrote:
> On Wed, Sep 4, 2024 at 1:59 PM Dragos Tatulea  wrote:
>>
>>
>>
>> On 04.09.24 05:38, Jason Wang wrote:
>>> On Wed, Sep 4, 2024 at 1:15 AM Carlos Bilbao
>>>  wrote:
>>>>
>>>> From: Carlos Bilbao 
>>>>
>>>> Remove invalid ioctl VHOST_VDPA_SET_CONFIG and all its implementations
>>>> with vdpa_config_ops->set_config(). This is needed per virtio spec
>>>> requirements; virtio-spec v3.1 Sec 5.1.4 states that "All of the device
>>>> configuration fields are read-only for the driver."
>>>>
>>>> Signed-off-by: Carlos Bilbao 
>>>
>>> Note that only the config space of the modern device is read only. So
>>> it should be fine to remove vp_vdpa which only works for modern
>>> devices.
>> Just out of curiosity: how will this work for devices that are not
>> v1.3 compliant but are v1.2 compliant?
> 
> Devices don't know the version of the spec, it works with features.
> For example, most devices mandate ACCESS_PLATFORM which implies a
> mandatory VERSION_1. So they are modern devices.
> 
And modern devices should not write to the device config space. This
was discouraged in v1.x until v1.3 which now prohibits it. Did I get
this right?

Thanks,
Dragos

>> Or is this true of all devices
>> except eni?
> 
> ENI depends on the virtio-pci legacy library, so we know it's a legacy
> device implementation which allows mac address setting via config
> space.
> 
> Thanks
> 
>>
>> Thanks,
>> Dragos
>>>
>>> And for eni, it is a legacy only device, so we should not move the
>>> set_config there.
>>>
>>> For the rest, we need the acks for those maintainers.
>>>
>>> Thanks
>>>
>>
> 




Re: [PATCH v2 2/2] vdpa: Remove ioctl VHOST_VDPA_SET_CONFIG per spec compliance

2024-09-03 Thread Dragos Tatulea



On 04.09.24 05:38, Jason Wang wrote:
> On Wed, Sep 4, 2024 at 1:15 AM Carlos Bilbao
>  wrote:
>>
>> From: Carlos Bilbao 
>>
>> Remove invalid ioctl VHOST_VDPA_SET_CONFIG and all its implementations
>> with vdpa_config_ops->set_config(). This is needed per virtio spec
>> requirements; virtio-spec v3.1 Sec 5.1.4 states that "All of the device
>> configuration fields are read-only for the driver."
>>
>> Signed-off-by: Carlos Bilbao 
> 
> Note that only the config space of the modern device is read only. So
> it should be fine to remove vp_vdpa which only works for modern
> devices.
Just out of curiosity: how will this work for devices that are not
v1.3 compliant but are v1.2 compliant? Or is this true of all devices
except eni?

Thanks,
Dragos
> 
> And for eni, it is a legacy only device, so we should not move the
> set_config there.
> 
> For the rest, we need the acks for those maintainers.
> 
> Thanks
> 




Re: [PATCH vhost v2 00/10] vdpa/mlx5: Parallelize device suspend/resume

2024-09-03 Thread Dragos Tatulea



On 03.09.24 10:10, Eugenio Perez Martin wrote:
> On Tue, Sep 3, 2024 at 9:48 AM Dragos Tatulea  wrote:
>>
>>
>>
>> On 03.09.24 09:40, Lei Yang wrote:
>>> On Mon, Sep 2, 2024 at 7:05 PM Dragos Tatulea  wrote:
>>>>
>>>> Hi Lei,
>>>>
>>>> On 02.09.24 12:03, Lei Yang wrote:
>>>>> Hi Dragos
>>>>>
>>>>> QE tested this series with mellanox nic, it failed with [1] when
>>>>> booting guest, and host dmesg also will print messages [2]. This bug
>>>>> can be reproduced boot guest with vhost-vdpa device.
>>>>>
>>>>> [1] qemu) qemu-kvm: vhost VQ 1 ring restore failed: -1: Operation not
>>>>> permitted (1)
>>>>> qemu-kvm: vhost VQ 0 ring restore failed: -1: Operation not permitted (1)
>>>>> qemu-kvm: unable to start vhost net: 5: falling back on userspace virtio
>>>>> qemu-kvm: vhost_set_features failed: Device or resource busy (16)
>>>>> qemu-kvm: unable to start vhost net: 16: falling back on userspace virtio
>>>>>
>>>>> [2] Host dmesg:
>>>>> [ 1406.187977] mlx5_core :0d:00.2:
>>>>> mlx5_vdpa_compat_reset:3267:(pid 8506): performing device reset
>>>>> [ 1406.189221] mlx5_core :0d:00.2:
>>>>> mlx5_vdpa_compat_reset:3267:(pid 8506): performing device reset
>>>>> [ 1406.190354] mlx5_core :0d:00.2:
>>>>> mlx5_vdpa_show_mr_leaks:573:(pid 8506) warning: mkey still alive after
>>>>> resource delete: mr: 0c5ccca2, mkey: 0x4000, refcount: 2
>>>>> [ 1471.538487] mlx5_core :0d:00.2: cb_timeout_handler:938:(pid
>>>>> 428): cmd[13]: MODIFY_GENERAL_OBJECT(0xa01) Async, timeout. Will cause
>>>>> a leak of a command resource
>>>>> [ 1471.539486] mlx5_core :0d:00.2: cb_timeout_handler:938:(pid
>>>>> 428): cmd[12]: MODIFY_GENERAL_OBJECT(0xa01) Async, timeout. Will cause
>>>>> a leak of a command resource
>>>>> [ 1471.540351] mlx5_core :0d:00.2: modify_virtqueues:1617:(pid
>>>>> 8511) error: modify vq 0 failed, state: 0 -> 0, err: 0
>>>>> [ 1471.541433] mlx5_core :0d:00.2: modify_virtqueues:1617:(pid
>>>>> 8511) error: modify vq 1 failed, state: 0 -> 0, err: -110
>>>>> [ 1471.542388] mlx5_core :0d:00.2: mlx5_vdpa_set_status:3203:(pid
>>>>> 8511) warning: failed to resume VQs
>>>>> [ 1471.549778] mlx5_core :0d:00.2:
>>>>> mlx5_vdpa_show_mr_leaks:573:(pid 8511) warning: mkey still alive after
>>>>> resource delete: mr: 0c5ccca2, mkey: 0x4000, refcount: 2
>>>>> [ 1512.929854] mlx5_core :0d:00.2:
>>>>> mlx5_vdpa_compat_reset:3267:(pid 8565): performing device reset
>>>>> [ 1513.100290] mlx5_core :0d:00.2:
>>>>> mlx5_vdpa_show_mr_leaks:573:(pid 8565) warning: mkey still alive after
>>>>> resource delete: mr: 0c5ccca2, mkey: 0x4000, refcount: 2
>>>>>
>>>
>>> Hi Dragos
>>>
>>>> Can you provide more details about the qemu version and the vdpa device
>>>> options used?
>>>>
>>>> Also, which FW version are you using? There is a relevant bug in FW
>>>> 22.41.1000 which was fixed in the latest FW (22.42.1000). Did you
>>>> encounter any FW syndromes in the host dmesg log?
>>>
>>> This problem has gone when I updated the firmware version to
>>> 22.42.1000, and I tested it with regression tests using mellanox nic,
>>> everything works well.
>>>
>>> Tested-by: Lei Yang 
>> Good to hear. Thanks for the quick reaction.
>>
> 
> Is it possible to add a check so it doesn't use the async fashion in old FW?
> 
Unfortunately not, it would have been there otherwise.

Note that this affects only FW version 22.41.1000. Older versions are not
affected because VQ resume is not supported.

Thanks,
Dragos



Re: [PATCH vhost v2 00/10] vdpa/mlx5: Parallelize device suspend/resume

2024-09-03 Thread Dragos Tatulea



On 03.09.24 09:40, Lei Yang wrote:
> On Mon, Sep 2, 2024 at 7:05 PM Dragos Tatulea  wrote:
>>
>> Hi Lei,
>>
>> On 02.09.24 12:03, Lei Yang wrote:
>>> Hi Dragos
>>>
>>> QE tested this series with mellanox nic, it failed with [1] when
>>> booting guest, and host dmesg also will print messages [2]. This bug
>>> can be reproduced boot guest with vhost-vdpa device.
>>>
>>> [1] qemu) qemu-kvm: vhost VQ 1 ring restore failed: -1: Operation not
>>> permitted (1)
>>> qemu-kvm: vhost VQ 0 ring restore failed: -1: Operation not permitted (1)
>>> qemu-kvm: unable to start vhost net: 5: falling back on userspace virtio
>>> qemu-kvm: vhost_set_features failed: Device or resource busy (16)
>>> qemu-kvm: unable to start vhost net: 16: falling back on userspace virtio
>>>
>>> [2] Host dmesg:
>>> [ 1406.187977] mlx5_core :0d:00.2:
>>> mlx5_vdpa_compat_reset:3267:(pid 8506): performing device reset
>>> [ 1406.189221] mlx5_core :0d:00.2:
>>> mlx5_vdpa_compat_reset:3267:(pid 8506): performing device reset
>>> [ 1406.190354] mlx5_core :0d:00.2:
>>> mlx5_vdpa_show_mr_leaks:573:(pid 8506) warning: mkey still alive after
>>> resource delete: mr: 0c5ccca2, mkey: 0x4000, refcount: 2
>>> [ 1471.538487] mlx5_core :0d:00.2: cb_timeout_handler:938:(pid
>>> 428): cmd[13]: MODIFY_GENERAL_OBJECT(0xa01) Async, timeout. Will cause
>>> a leak of a command resource
>>> [ 1471.539486] mlx5_core :0d:00.2: cb_timeout_handler:938:(pid
>>> 428): cmd[12]: MODIFY_GENERAL_OBJECT(0xa01) Async, timeout. Will cause
>>> a leak of a command resource
>>> [ 1471.540351] mlx5_core :0d:00.2: modify_virtqueues:1617:(pid
>>> 8511) error: modify vq 0 failed, state: 0 -> 0, err: 0
>>> [ 1471.541433] mlx5_core :0d:00.2: modify_virtqueues:1617:(pid
>>> 8511) error: modify vq 1 failed, state: 0 -> 0, err: -110
>>> [ 1471.542388] mlx5_core :0d:00.2: mlx5_vdpa_set_status:3203:(pid
>>> 8511) warning: failed to resume VQs
>>> [ 1471.549778] mlx5_core :0d:00.2:
>>> mlx5_vdpa_show_mr_leaks:573:(pid 8511) warning: mkey still alive after
>>> resource delete: mr: 0c5ccca2, mkey: 0x4000, refcount: 2
>>> [ 1512.929854] mlx5_core :0d:00.2:
>>> mlx5_vdpa_compat_reset:3267:(pid 8565): performing device reset
>>> [ 1513.100290] mlx5_core :0d:00.2:
>>> mlx5_vdpa_show_mr_leaks:573:(pid 8565) warning: mkey still alive after
>>> resource delete: mr: 0c5ccca2, mkey: 0x4000, refcount: 2
>>>
> 
> Hi Dragos
> 
>> Can you provide more details about the qemu version and the vdpa device
>> options used?
>>
>> Also, which FW version are you using? There is a relevant bug in FW
>> 22.41.1000 which was fixed in the latest FW (22.42.1000). Did you
>> encounter any FW syndromes in the host dmesg log?
> 
> This problem has gone when I updated the firmware version to
> 22.42.1000, and I tested it with regression tests using mellanox nic,
> everything works well.
> 
> Tested-by: Lei Yang 
Good to hear. Thanks for the quick reaction.

Thanks,
Dragos



Re: [PATCH] vdpa/mlx5: Use random MAC address when no nic vport MAC set

2024-08-30 Thread Dragos Tatulea
Hi Cindy,

On 30.08.24 15:52, Dragos Tatulea wrote:
> 
> 
> On 30.08.24 11:12, Cindy Lu wrote:
>> On Thu, 29 Aug 2024 at 18:00, Dragos Tatulea  wrote:
>>>
>>>
>>>
>>> On 29.08.24 11:05, Cindy Lu wrote:
>>>> On Wed, 28 Aug 2024 at 17:37, Dragos Tatulea  wrote:
>>>>>
>>>>>
>>>>>
>>>>> On 28.08.24 11:00, Cindy Lu wrote:
>>>>>> On Wed, 28 Aug 2024 at 09:51, Jason Wang  wrote:
>>>>>>>
>>>>>>> On Wed, Aug 28, 2024 at 12:03 AM Dragos Tatulea  
>>>>>>> wrote:
>>>>>>>>
>>>>>>>> When the vdpa device is configured without a specific MAC
>>>>>>>> address, the vport MAC address is used. However, this
>>>>>>>> address can be 0 which prevents the driver from properly
>>>>>>>> configuring the MPFS and breaks steering.
>>>>>>>>
>>>>>>>> The solution is to simply generate a random MAC address
>>>>>>>> when no MAC is set on the nic vport.
>>>>>>>>
>>>>>>>> Now it's possible to create a vdpa device without a
>>>>>>>> MAC address and run qemu with this device without needing
>>>>>>>> to configure an explicit MAC address.
>>>>>>>>
>>>>>>>> Signed-off-by: Dragos Tatulea 
>>>>>>>> Reviewed-by: Jiri Pirko 
>>>>>>>
>>>>>>> Acked-by: Jason Wang 
>>>>>>>
>>>>>>> (Adding Cindy for double checking if it has any side effect on Qemu 
>>>>>>> side)
>>>>>>>
>>>>>>> Thanks
>>>>>>>
>>>>>> But Now there is a bug in QEMU: if the hardware MAC address does not
>>>>>> match the one in the QEMU command line, it will cause traffic loss.
>>>>>>
>>>>> Why is this a new issue in qemu? qemu in it's current state won't work
>>>>> with a different mac address that the one that is set in HW anyway.
>>>>>
>>>> this is not a new bug. We are trying to fix it because it will cause
>>>> traffic lose without any warning.
>>>> in my fix , this setting (different mac in device and Qemu) will fail
>>>> to load the VM.
>>> Which is a good thing, right? Some feedback to the user that there is
>>> a misconfig. I got bitten by this so many times... Thank you for adding it.
>>>
>>>>
>>>>>> So, Just an FYI here: if your patch merged, it may cause traffic loss.
>>>>>> and now I'm working in the fix it in qemu, the link is
>>>>>> https://patchew.org/QEMU/20240716011349.821777-1-l...@redhat.com/
>>>>>> The idea of this fix is
>>>>>> There are will only two acceptable situations for qemu:
>>>>>> 1. The hardware MAC address is the same as the MAC address specified
>>>>>> in the QEMU command line, and both MAC addresses are not 0.
>>>>>> 2. The hardware MAC address is not 0, and the MAC address in the QEMU
>>>>>> command line is 0. In this situation, the hardware MAC address will
>>>>>> overwrite the QEMU command line address.
>>>>>>
>>>>> Why would this not work with this patch? This patch simply sets a MAC
>>>>> if the vport doesn't have one set. Which allows for more scenarios to
>>>>> work.
>>>>>
>>>> I do not mean your patch will not work, I just want to make some
>>>> clarify here.Your patch + my fix may cause the VM to fail to load in
>>>> some situations, and this is as expected.
>>>> Your patch is good to merge.
>>> Ack. Thank you for the clarification.
>>>
>>> Thanks,
>>> Dragos
>>>
>> Hi Dragos,
>>  I think we need to hold this patch. Because it may not be working
>> with upstream qemu.
>>
>> MLX will create a random MAC address for your patch. Additionally, if
>> there is no specific MAC in the QEMU command line, QEMU will also
>> generate a random MAC.
>> these two MAC are not the same. and this will cause traffic loss.
> Ahaa, it turns out that qemu 8.x and 9.x have different behaviour.
> 
> Initially I was testing this scenario (vdpa device created with no mac
> and no mac set in qemu cli) with qemu 8.x. There, qemu was not being
> able to set the qemu generated random mac addres because .set_config()
> is a nop in mlx5_vdpa.
> 
> Then I moved to qemu 9.x and saw that this scenario was working because
> now the CVQ was used instead to configure the mac on the device.
> 
> So this patch should definitely not be applied.
> 
> I was thinking if there are ways to fix this for 8.x. The only feasible
> way is to implement .set_config() in mlx5_vdpa for the mac
> configuration. But as you previousy said, this is discouraged.
> 
I just tested your referenced qemu fix from patchwork and I found that
for the case when a vdpa device doesn't have a mac address (mac address
0 and VIRTIO_NET_F_MAC not set) qemu will return an error. So with this
fix we'd be back to square one where the user always has to set a mac
somewhere.

Would it be possible to take this case into consideration with your
fix?

Thanks,
Dragos



Re: [PATCH] vdpa/mlx5: Use random MAC address when no nic vport MAC set

2024-08-30 Thread Dragos Tatulea



On 30.08.24 11:12, Cindy Lu wrote:
> On Thu, 29 Aug 2024 at 18:00, Dragos Tatulea  wrote:
>>
>>
>>
>> On 29.08.24 11:05, Cindy Lu wrote:
>>> On Wed, 28 Aug 2024 at 17:37, Dragos Tatulea  wrote:
>>>>
>>>>
>>>>
>>>> On 28.08.24 11:00, Cindy Lu wrote:
>>>>> On Wed, 28 Aug 2024 at 09:51, Jason Wang  wrote:
>>>>>>
>>>>>> On Wed, Aug 28, 2024 at 12:03 AM Dragos Tatulea  
>>>>>> wrote:
>>>>>>>
>>>>>>> When the vdpa device is configured without a specific MAC
>>>>>>> address, the vport MAC address is used. However, this
>>>>>>> address can be 0 which prevents the driver from properly
>>>>>>> configuring the MPFS and breaks steering.
>>>>>>>
>>>>>>> The solution is to simply generate a random MAC address
>>>>>>> when no MAC is set on the nic vport.
>>>>>>>
>>>>>>> Now it's possible to create a vdpa device without a
>>>>>>> MAC address and run qemu with this device without needing
>>>>>>> to configure an explicit MAC address.
>>>>>>>
>>>>>>> Signed-off-by: Dragos Tatulea 
>>>>>>> Reviewed-by: Jiri Pirko 
>>>>>>
>>>>>> Acked-by: Jason Wang 
>>>>>>
>>>>>> (Adding Cindy for double checking if it has any side effect on Qemu side)
>>>>>>
>>>>>> Thanks
>>>>>>
>>>>> But Now there is a bug in QEMU: if the hardware MAC address does not
>>>>> match the one in the QEMU command line, it will cause traffic loss.
>>>>>
>>>> Why is this a new issue in qemu? qemu in it's current state won't work
>>>> with a different mac address that the one that is set in HW anyway.
>>>>
>>> this is not a new bug. We are trying to fix it because it will cause
>>> traffic lose without any warning.
>>> in my fix , this setting (different mac in device and Qemu) will fail
>>> to load the VM.
>> Which is a good thing, right? Some feedback to the user that there is
>> a misconfig. I got bitten by this so many times... Thank you for adding it.
>>
>>>
>>>>> So, Just an FYI here: if your patch merged, it may cause traffic loss.
>>>>> and now I'm working in the fix it in qemu, the link is
>>>>> https://patchew.org/QEMU/20240716011349.821777-1-l...@redhat.com/
>>>>> The idea of this fix is
>>>>> There are will only two acceptable situations for qemu:
>>>>> 1. The hardware MAC address is the same as the MAC address specified
>>>>> in the QEMU command line, and both MAC addresses are not 0.
>>>>> 2. The hardware MAC address is not 0, and the MAC address in the QEMU
>>>>> command line is 0. In this situation, the hardware MAC address will
>>>>> overwrite the QEMU command line address.
>>>>>
>>>> Why would this not work with this patch? This patch simply sets a MAC
>>>> if the vport doesn't have one set. Which allows for more scenarios to
>>>> work.
>>>>
>>> I do not mean your patch will not work, I just want to make some
>>> clarify here.Your patch + my fix may cause the VM to fail to load in
>>> some situations, and this is as expected.
>>> Your patch is good to merge.
>> Ack. Thank you for the clarification.
>>
>> Thanks,
>> Dragos
>>
> Hi Dragos,
>  I think we need to hold this patch. Because it may not be working
> with upstream qemu.
> 
> MLX will create a random MAC address for your patch. Additionally, if
> there is no specific MAC in the QEMU command line, QEMU will also
> generate a random MAC.
> these two MAC are not the same. and this will cause traffic loss.
Ahaa, it turns out that qemu 8.x and 9.x have different behaviour.

Initially I was testing this scenario (vdpa device created with no mac
and no mac set in qemu cli) with qemu 8.x. There, qemu was not being
able to set the qemu generated random mac addres because .set_config()
is a nop in mlx5_vdpa.

Then I moved to qemu 9.x and saw that this scenario was working because
now the CVQ was used instead to configure the mac on the device.

So this patch should definitely not be applied.

I was thinking if there are ways to fix this for 8.x. The only feasible
way is to implement .set_config() in mlx5_vdpa for the mac
configuration. But as you previousy said, this is discouraged.

Thanks,
Dragos



Re: [PATCH vhost 1/7] vdpa/mlx5: Create direct MKEYs in parallel

2024-08-30 Thread Dragos Tatulea



On 29.08.24 17:15, Eugenio Perez Martin wrote:
> On Thu, Aug 29, 2024 at 3:54 PM Dragos Tatulea  wrote:
>>
>>
>>
>> On 29.08.24 15:10, Eugenio Perez Martin wrote:
>>> On Wed, Aug 21, 2024 at 1:41 PM Dragos Tatulea  wrote:
>>>>
>>>> Use the async interface to issue MTT MKEY creation.
>>>> Extra care is taken at the allocation of FW input commands
>>>> due to the MTT tables having variable sizes depending on
>>>> MR.
>>>>
>>>> The indirect MKEY is still created synchronously at the
>>>> end as the direct MKEYs need to be filled in.
>>>>
>>>> This makes create_user_mr() 3-5x faster, depending on
>>>> the size of the MR.
>>>>
>>>> Signed-off-by: Dragos Tatulea 
>>>> Reviewed-by: Cosmin Ratiu 
>>>> ---
>>>>  drivers/vdpa/mlx5/core/mr.c | 118 +---
>>>>  1 file changed, 96 insertions(+), 22 deletions(-)
>>>>
>>>> diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c
>>>> index 4758914ccf86..66e6a15f823f 100644
>>>> --- a/drivers/vdpa/mlx5/core/mr.c
>>>> +++ b/drivers/vdpa/mlx5/core/mr.c
>>>> @@ -49,17 +49,18 @@ static void populate_mtts(struct mlx5_vdpa_direct_mr 
>>>> *mr, __be64 *mtt)
>>>> }
>>>>  }
>>>>
>>>> -static int create_direct_mr(struct mlx5_vdpa_dev *mvdev, struct 
>>>> mlx5_vdpa_direct_mr *mr)
>>>> +struct mlx5_create_mkey_mem {
>>>> +   u8 out[MLX5_ST_SZ_BYTES(create_mkey_out)];
>>>> +   u8 in[MLX5_ST_SZ_BYTES(create_mkey_in)];
>>>> +   DECLARE_FLEX_ARRAY(__be64, mtt);
>>>
>>> I may be missing something obvious, but why do we need
>>> DECLARE_FLEX_ARRAY here? My understanding is that it is only needed in
>>> special cases like uapi headers and we can use "__be64 mtt[]" here.
>>>
>> checkpatch.pl was complaining about it because in my initial version I
>> used the "[0]" version of zero length based arrays.
>>
>> My impression was that DECLARE_FLEX_ARRAY is preferred option because it
>> triggers a compiler error if the zero lenth array is not at the end of
>> the struct. But on closer inspection I see that using the right C99
>> empty brackets notation is enough to trigger this error.
>> DECLARE_FLEX_ARRAY seems to be useful for the union case.
>>
>> I will change it in a v2.
>>
>>>> +};
>>>> +
>>>> +static void fill_create_direct_mr(struct mlx5_vdpa_dev *mvdev,
>>>> + struct mlx5_vdpa_direct_mr *mr,
>>>> + struct mlx5_create_mkey_mem *mem)
>>>>  {
>>>> -   int inlen;
>>>> +   void *in = &mem->in;
>>>> void *mkc;
>>>> -   void *in;
>>>> -   int err;
>>>> -
>>>> -   inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + 
>>>> roundup(MLX5_ST_SZ_BYTES(mtt) * mr->nsg, 16);
>>>> -   in = kvzalloc(inlen, GFP_KERNEL);
>>>> -   if (!in)
>>>> -   return -ENOMEM;
>>>>
>>>> MLX5_SET(create_mkey_in, in, uid, mvdev->res.uid);
>>>> mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
>>>> @@ -76,18 +77,25 @@ static int create_direct_mr(struct mlx5_vdpa_dev 
>>>> *mvdev, struct mlx5_vdpa_direct
>>>> MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
>>>>  get_octo_len(mr->end - mr->start, mr->log_size));
>>>> populate_mtts(mr, MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt));
>>>> -   err = mlx5_vdpa_create_mkey(mvdev, &mr->mr, in, inlen);
>>>> -   kvfree(in);
>>>> -   if (err) {
>>>> -   mlx5_vdpa_warn(mvdev, "Failed to create direct MR\n");
>>>> -   return err;
>>>> -   }
>>>>
>>>> -   return 0;
>>>> +   MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY);
>>>> +   MLX5_SET(create_mkey_in, in, uid, mvdev->res.uid);
>>>> +}
>>>> +
>>>> +static void create_direct_mr_end(struct mlx5_vdpa_dev *mvdev,
>>>> +struct mlx5_vdpa_direct_mr *mr,
>>>> +struct mlx5_create_mkey_mem *mem)
>>>

[PATCH vhost v2 5/7] vdpa/mlx5: Rename mr_mtx -> lock

2024-08-30 Thread Dragos Tatulea
Now that the mr resources have their own namespace in the
struct, give the lock a clearer name.

Signed-off-by: Dragos Tatulea 
Reviewed-by: Cosmin Ratiu 
Acked-by: Eugenio Pérez 
---
 drivers/vdpa/mlx5/core/mlx5_vdpa.h |  2 +-
 drivers/vdpa/mlx5/core/mr.c| 20 ++--
 drivers/vdpa/mlx5/core/resources.c |  6 +++---
 drivers/vdpa/mlx5/net/mlx5_vnet.c  |  4 ++--
 4 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h 
b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
index 5ae6deea2a8a..89b564cecddf 100644
--- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h
+++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
@@ -87,7 +87,7 @@ struct mlx5_vdpa_mr_resources {
struct mlx5_vdpa_mr *mr[MLX5_VDPA_NUM_AS];
unsigned int group2asid[MLX5_VDPA_NUMVQ_GROUPS];
struct list_head mr_list_head;
-   struct mutex mr_mtx;
+   struct mutex lock;
 };
 
 struct mlx5_vdpa_dev {
diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c
index 95087d7ae78a..e0412297bae5 100644
--- a/drivers/vdpa/mlx5/core/mr.c
+++ b/drivers/vdpa/mlx5/core/mr.c
@@ -666,9 +666,9 @@ static void _mlx5_vdpa_put_mr(struct mlx5_vdpa_dev *mvdev,
 void mlx5_vdpa_put_mr(struct mlx5_vdpa_dev *mvdev,
  struct mlx5_vdpa_mr *mr)
 {
-   mutex_lock(&mvdev->mres.mr_mtx);
+   mutex_lock(&mvdev->mres.lock);
_mlx5_vdpa_put_mr(mvdev, mr);
-   mutex_unlock(&mvdev->mres.mr_mtx);
+   mutex_unlock(&mvdev->mres.lock);
 }
 
 static void _mlx5_vdpa_get_mr(struct mlx5_vdpa_dev *mvdev,
@@ -683,9 +683,9 @@ static void _mlx5_vdpa_get_mr(struct mlx5_vdpa_dev *mvdev,
 void mlx5_vdpa_get_mr(struct mlx5_vdpa_dev *mvdev,
  struct mlx5_vdpa_mr *mr)
 {
-   mutex_lock(&mvdev->mres.mr_mtx);
+   mutex_lock(&mvdev->mres.lock);
_mlx5_vdpa_get_mr(mvdev, mr);
-   mutex_unlock(&mvdev->mres.mr_mtx);
+   mutex_unlock(&mvdev->mres.lock);
 }
 
 void mlx5_vdpa_update_mr(struct mlx5_vdpa_dev *mvdev,
@@ -694,19 +694,19 @@ void mlx5_vdpa_update_mr(struct mlx5_vdpa_dev *mvdev,
 {
struct mlx5_vdpa_mr *old_mr = mvdev->mres.mr[asid];
 
-   mutex_lock(&mvdev->mres.mr_mtx);
+   mutex_lock(&mvdev->mres.lock);
 
_mlx5_vdpa_put_mr(mvdev, old_mr);
mvdev->mres.mr[asid] = new_mr;
 
-   mutex_unlock(&mvdev->mres.mr_mtx);
+   mutex_unlock(&mvdev->mres.lock);
 }
 
 static void mlx5_vdpa_show_mr_leaks(struct mlx5_vdpa_dev *mvdev)
 {
struct mlx5_vdpa_mr *mr;
 
-   mutex_lock(&mvdev->mres.mr_mtx);
+   mutex_lock(&mvdev->mres.lock);
 
list_for_each_entry(mr, &mvdev->mres.mr_list_head, mr_list) {
 
@@ -715,7 +715,7 @@ static void mlx5_vdpa_show_mr_leaks(struct mlx5_vdpa_dev 
*mvdev)
   mr, mr->mkey, 
refcount_read(&mr->refcount));
}
 
-   mutex_unlock(&mvdev->mres.mr_mtx);
+   mutex_unlock(&mvdev->mres.lock);
 
 }
 
@@ -779,9 +779,9 @@ struct mlx5_vdpa_mr *mlx5_vdpa_create_mr(struct 
mlx5_vdpa_dev *mvdev,
if (!mr)
return ERR_PTR(-ENOMEM);
 
-   mutex_lock(&mvdev->mres.mr_mtx);
+   mutex_lock(&mvdev->mres.lock);
err = _mlx5_vdpa_create_mr(mvdev, mr, iotlb);
-   mutex_unlock(&mvdev->mres.mr_mtx);
+   mutex_unlock(&mvdev->mres.lock);
 
if (err)
goto out_err;
diff --git a/drivers/vdpa/mlx5/core/resources.c 
b/drivers/vdpa/mlx5/core/resources.c
index 3e3b3049cb08..fe2ca3458f6c 100644
--- a/drivers/vdpa/mlx5/core/resources.c
+++ b/drivers/vdpa/mlx5/core/resources.c
@@ -256,7 +256,7 @@ int mlx5_vdpa_alloc_resources(struct mlx5_vdpa_dev *mvdev)
mlx5_vdpa_warn(mvdev, "resources already allocated\n");
return -EINVAL;
}
-   mutex_init(&mvdev->mres.mr_mtx);
+   mutex_init(&mvdev->mres.lock);
res->uar = mlx5_get_uars_page(mdev);
if (IS_ERR(res->uar)) {
err = PTR_ERR(res->uar);
@@ -301,7 +301,7 @@ int mlx5_vdpa_alloc_resources(struct mlx5_vdpa_dev *mvdev)
 err_uctx:
mlx5_put_uars_page(mdev, res->uar);
 err_uars:
-   mutex_destroy(&mvdev->mres.mr_mtx);
+   mutex_destroy(&mvdev->mres.lock);
return err;
 }
 
@@ -318,7 +318,7 @@ void mlx5_vdpa_free_resources(struct mlx5_vdpa_dev *mvdev)
dealloc_pd(mvdev, res->pdn, res->uid);
destroy_uctx(mvdev, res->uid);
mlx5_put_uars_page(mvdev->mdev, res->uar);
-   mutex_destroy(&mvdev->mres.mr_mtx);
+   mutex_destroy(&mvdev->mres.lock);
res->valid = false;
 }
 
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 3e55a7f1afcd..8a51c492a62a 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vn

[PATCH vhost v2 7/7] vdpa/mlx5: Postpone MR deletion

2024-08-30 Thread Dragos Tatulea
Currently, when a new MR is set up, the old MR is deleted. MR deletion
is about 30-40% the time of MR creation. As deleting the old MR is not
important for the process of setting up the new MR, this operation
can be postponed.

This series adds a workqueue that does MR garbage collection at a later
point. If the MR lock is taken, the handler will back off and
reschedule. The exception during shutdown: then the handler must
not postpone the work.

Note that this is only a speculative optimization: if there is some
mapping operation that is triggered while the garbage collector handler
has the lock taken, this operation it will have to wait for the handler
to finish.

Signed-off-by: Dragos Tatulea 
Reviewed-by: Cosmin Ratiu 
---
 drivers/vdpa/mlx5/core/mlx5_vdpa.h | 10 ++
 drivers/vdpa/mlx5/core/mr.c| 55 --
 drivers/vdpa/mlx5/net/mlx5_vnet.c  |  4 +--
 3 files changed, 64 insertions(+), 5 deletions(-)

diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h 
b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
index c3e17bc888e8..2cedf7e2dbc4 100644
--- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h
+++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
@@ -86,8 +86,18 @@ enum {
 struct mlx5_vdpa_mr_resources {
struct mlx5_vdpa_mr *mr[MLX5_VDPA_NUM_AS];
unsigned int group2asid[MLX5_VDPA_NUMVQ_GROUPS];
+
+   /* Pre-deletion mr list */
struct list_head mr_list_head;
+
+   /* Deferred mr list */
+   struct list_head mr_gc_list_head;
+   struct workqueue_struct *wq_gc;
+   struct delayed_work gc_dwork_ent;
+
struct mutex lock;
+
+   atomic_t shutdown;
 };
 
 struct mlx5_vdpa_dev {
diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c
index 0bc99f159046..55755e97a946 100644
--- a/drivers/vdpa/mlx5/core/mr.c
+++ b/drivers/vdpa/mlx5/core/mr.c
@@ -653,14 +653,50 @@ static void _mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev 
*mvdev, struct mlx5_vdpa_
kfree(mr);
 }
 
+/* There can be multiple .set_map() operations in quick succession.
+ * This large delay is a simple way to prevent the MR cleanup from blocking
+ * .set_map() MR creation in this scenario.
+ */
+#define MLX5_VDPA_MR_GC_TRIGGER_MS 2000
+
+static void mlx5_vdpa_mr_gc_handler(struct work_struct *work)
+{
+   struct mlx5_vdpa_mr_resources *mres;
+   struct mlx5_vdpa_mr *mr, *tmp;
+   struct mlx5_vdpa_dev *mvdev;
+
+   mres = container_of(work, struct mlx5_vdpa_mr_resources, 
gc_dwork_ent.work);
+
+   if (atomic_read(&mres->shutdown)) {
+   mutex_lock(&mres->lock);
+   } else if (!mutex_trylock(&mres->lock)) {
+   queue_delayed_work(mres->wq_gc, &mres->gc_dwork_ent,
+  
msecs_to_jiffies(MLX5_VDPA_MR_GC_TRIGGER_MS));
+   return;
+   }
+
+   mvdev = container_of(mres, struct mlx5_vdpa_dev, mres);
+
+   list_for_each_entry_safe(mr, tmp, &mres->mr_gc_list_head, mr_list) {
+   _mlx5_vdpa_destroy_mr(mvdev, mr);
+   }
+
+   mutex_unlock(&mres->lock);
+}
+
 static void _mlx5_vdpa_put_mr(struct mlx5_vdpa_dev *mvdev,
  struct mlx5_vdpa_mr *mr)
 {
+   struct mlx5_vdpa_mr_resources *mres = &mvdev->mres;
+
if (!mr)
return;
 
-   if (refcount_dec_and_test(&mr->refcount))
-   _mlx5_vdpa_destroy_mr(mvdev, mr);
+   if (refcount_dec_and_test(&mr->refcount)) {
+   list_move_tail(&mr->mr_list, &mres->mr_gc_list_head);
+   queue_delayed_work(mres->wq_gc, &mres->gc_dwork_ent,
+  
msecs_to_jiffies(MLX5_VDPA_MR_GC_TRIGGER_MS));
+   }
 }
 
 void mlx5_vdpa_put_mr(struct mlx5_vdpa_dev *mvdev,
@@ -848,9 +884,17 @@ int mlx5_vdpa_init_mr_resources(struct mlx5_vdpa_dev 
*mvdev)
 {
struct mlx5_vdpa_mr_resources *mres = &mvdev->mres;
 
-   INIT_LIST_HEAD(&mres->mr_list_head);
+   mres->wq_gc = create_singlethread_workqueue("mlx5_vdpa_mr_gc");
+   if (!mres->wq_gc)
+   return -ENOMEM;
+
+   INIT_DELAYED_WORK(&mres->gc_dwork_ent, mlx5_vdpa_mr_gc_handler);
+
mutex_init(&mres->lock);
 
+   INIT_LIST_HEAD(&mres->mr_list_head);
+   INIT_LIST_HEAD(&mres->mr_gc_list_head);
+
return 0;
 }
 
@@ -858,5 +902,10 @@ void mlx5_vdpa_destroy_mr_resources(struct mlx5_vdpa_dev 
*mvdev)
 {
struct mlx5_vdpa_mr_resources *mres = &mvdev->mres;
 
+   atomic_set(&mres->shutdown, 1);
+
+   flush_delayed_work(&mres->gc_dwork_ent);
+   destroy_workqueue(mres->wq_gc);
+   mres->wq_gc = NULL;
mutex_destroy(&mres->lock);
 }
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index fc86e33e620a..9ccbe1c1ec15 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ 

[PATCH vhost v2 3/7] vdpa/mlx5: Rename function

2024-08-30 Thread Dragos Tatulea
A followup patch will use this name for something else.

Signed-off-by: Dragos Tatulea 
Reviewed-by: Cosmin Ratiu 
---
 drivers/vdpa/mlx5/core/mlx5_vdpa.h | 2 +-
 drivers/vdpa/mlx5/core/mr.c| 2 +-
 drivers/vdpa/mlx5/net/mlx5_vnet.c  | 8 
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h 
b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
index 24fa00afb24f..4d217d18239c 100644
--- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h
+++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
@@ -135,7 +135,7 @@ int mlx5_vdpa_create_mkey(struct mlx5_vdpa_dev *mvdev, u32 
*mkey, u32 *in,
 int mlx5_vdpa_destroy_mkey(struct mlx5_vdpa_dev *mvdev, u32 mkey);
 struct mlx5_vdpa_mr *mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev,
 struct vhost_iotlb *iotlb);
-void mlx5_vdpa_destroy_mr_resources(struct mlx5_vdpa_dev *mvdev);
+void mlx5_vdpa_clean_mrs(struct mlx5_vdpa_dev *mvdev);
 void mlx5_vdpa_get_mr(struct mlx5_vdpa_dev *mvdev,
  struct mlx5_vdpa_mr *mr);
 void mlx5_vdpa_put_mr(struct mlx5_vdpa_dev *mvdev,
diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c
index 64bcae2bae8a..50bb2cc95ea2 100644
--- a/drivers/vdpa/mlx5/core/mr.c
+++ b/drivers/vdpa/mlx5/core/mr.c
@@ -719,7 +719,7 @@ static void mlx5_vdpa_show_mr_leaks(struct mlx5_vdpa_dev 
*mvdev)
 
 }
 
-void mlx5_vdpa_destroy_mr_resources(struct mlx5_vdpa_dev *mvdev)
+void mlx5_vdpa_clean_mrs(struct mlx5_vdpa_dev *mvdev)
 {
for (int i = 0; i < MLX5_VDPA_NUM_AS; i++)
mlx5_vdpa_update_mr(mvdev, NULL, i);
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 822092eccb32..cf2b77ebc72b 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -3223,7 +3223,7 @@ static void mlx5_vdpa_set_status(struct vdpa_device 
*vdev, u8 status)
 err_driver:
unregister_link_notifier(ndev);
 err_setup:
-   mlx5_vdpa_destroy_mr_resources(&ndev->mvdev);
+   mlx5_vdpa_clean_mrs(&ndev->mvdev);
ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED;
 err_clear:
up_write(&ndev->reslock);
@@ -3275,7 +3275,7 @@ static int mlx5_vdpa_compat_reset(struct vdpa_device 
*vdev, u32 flags)
}
 
if (flags & VDPA_RESET_F_CLEAN_MAP)
-   mlx5_vdpa_destroy_mr_resources(&ndev->mvdev);
+   mlx5_vdpa_clean_mrs(&ndev->mvdev);
ndev->mvdev.status = 0;
ndev->mvdev.suspended = false;
ndev->cur_num_vqs = MLX5V_DEFAULT_VQ_COUNT;
@@ -3433,7 +3433,7 @@ static void mlx5_vdpa_free(struct vdpa_device *vdev)
ndev = to_mlx5_vdpa_ndev(mvdev);
 
free_fixed_resources(ndev);
-   mlx5_vdpa_destroy_mr_resources(mvdev);
+   mlx5_vdpa_clean_mrs(mvdev);
if (!is_zero_ether_addr(ndev->config.mac)) {
pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
mlx5_mpfs_del_mac(pfmdev, ndev->config.mac);
@@ -4008,7 +4008,7 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev 
*v_mdev, const char *name,
 err_res2:
free_fixed_resources(ndev);
 err_mr:
-   mlx5_vdpa_destroy_mr_resources(mvdev);
+   mlx5_vdpa_clean_mrs(mvdev);
 err_res:
mlx5_vdpa_free_resources(&ndev->mvdev);
 err_mpfs:
-- 
2.45.1




[PATCH vhost v2 6/7] vdpa/mlx5: Introduce init/destroy for MR resources

2024-08-30 Thread Dragos Tatulea
There's currently not a lot of action happening during
the init/destroy of MR resources. But more will be added
in the upcoming patches.

As the mr mutex lock init/destroy has been moved to these
new functions, the lifetime has now shifted away from
mlx5_vdpa_alloc_resources() / mlx5_vdpa_free_resources()
into these new functions. However, the lifetime at the
outer scope remains the same:
mlx5_vdpa_dev_add() / mlx5_vdpa_dev_free()

Signed-off-by: Dragos Tatulea 
Reviewed-by: Cosmin Ratiu 
---
 drivers/vdpa/mlx5/core/mlx5_vdpa.h |  2 ++
 drivers/vdpa/mlx5/core/mr.c| 17 +
 drivers/vdpa/mlx5/core/resources.c |  3 ---
 drivers/vdpa/mlx5/net/mlx5_vnet.c  |  9 +++--
 4 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h 
b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
index 89b564cecddf..c3e17bc888e8 100644
--- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h
+++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
@@ -138,6 +138,8 @@ int mlx5_vdpa_create_mkey(struct mlx5_vdpa_dev *mvdev, u32 
*mkey, u32 *in,
 int mlx5_vdpa_destroy_mkey(struct mlx5_vdpa_dev *mvdev, u32 mkey);
 struct mlx5_vdpa_mr *mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev,
 struct vhost_iotlb *iotlb);
+int mlx5_vdpa_init_mr_resources(struct mlx5_vdpa_dev *mvdev);
+void mlx5_vdpa_destroy_mr_resources(struct mlx5_vdpa_dev *mvdev);
 void mlx5_vdpa_clean_mrs(struct mlx5_vdpa_dev *mvdev);
 void mlx5_vdpa_get_mr(struct mlx5_vdpa_dev *mvdev,
  struct mlx5_vdpa_mr *mr);
diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c
index e0412297bae5..0bc99f159046 100644
--- a/drivers/vdpa/mlx5/core/mr.c
+++ b/drivers/vdpa/mlx5/core/mr.c
@@ -843,3 +843,20 @@ int mlx5_vdpa_reset_mr(struct mlx5_vdpa_dev *mvdev, 
unsigned int asid)
 
return 0;
 }
+
+int mlx5_vdpa_init_mr_resources(struct mlx5_vdpa_dev *mvdev)
+{
+   struct mlx5_vdpa_mr_resources *mres = &mvdev->mres;
+
+   INIT_LIST_HEAD(&mres->mr_list_head);
+   mutex_init(&mres->lock);
+
+   return 0;
+}
+
+void mlx5_vdpa_destroy_mr_resources(struct mlx5_vdpa_dev *mvdev)
+{
+   struct mlx5_vdpa_mr_resources *mres = &mvdev->mres;
+
+   mutex_destroy(&mres->lock);
+}
diff --git a/drivers/vdpa/mlx5/core/resources.c 
b/drivers/vdpa/mlx5/core/resources.c
index fe2ca3458f6c..aeae31d0cefa 100644
--- a/drivers/vdpa/mlx5/core/resources.c
+++ b/drivers/vdpa/mlx5/core/resources.c
@@ -256,7 +256,6 @@ int mlx5_vdpa_alloc_resources(struct mlx5_vdpa_dev *mvdev)
mlx5_vdpa_warn(mvdev, "resources already allocated\n");
return -EINVAL;
}
-   mutex_init(&mvdev->mres.lock);
res->uar = mlx5_get_uars_page(mdev);
if (IS_ERR(res->uar)) {
err = PTR_ERR(res->uar);
@@ -301,7 +300,6 @@ int mlx5_vdpa_alloc_resources(struct mlx5_vdpa_dev *mvdev)
 err_uctx:
mlx5_put_uars_page(mdev, res->uar);
 err_uars:
-   mutex_destroy(&mvdev->mres.lock);
return err;
 }
 
@@ -318,7 +316,6 @@ void mlx5_vdpa_free_resources(struct mlx5_vdpa_dev *mvdev)
dealloc_pd(mvdev, res->pdn, res->uid);
destroy_uctx(mvdev, res->uid);
mlx5_put_uars_page(mvdev->mdev, res->uar);
-   mutex_destroy(&mvdev->mres.lock);
res->valid = false;
 }
 
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 8a51c492a62a..fc86e33e620a 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -3434,6 +3434,7 @@ static void mlx5_vdpa_free(struct vdpa_device *vdev)
 
free_fixed_resources(ndev);
mlx5_vdpa_clean_mrs(mvdev);
+   mlx5_vdpa_destroy_mr_resources(&ndev->mvdev);
if (!is_zero_ether_addr(ndev->config.mac)) {
pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
mlx5_mpfs_del_mac(pfmdev, ndev->config.mac);
@@ -3962,12 +3963,14 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev 
*v_mdev, const char *name,
if (err)
goto err_mpfs;
 
-   INIT_LIST_HEAD(&mvdev->mres.mr_list_head);
+   err = mlx5_vdpa_init_mr_resources(mvdev);
+   if (err)
+   goto err_res;
 
if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
err = mlx5_vdpa_create_dma_mr(mvdev);
if (err)
-   goto err_res;
+   goto err_mr_res;
}
 
err = alloc_fixed_resources(ndev);
@@ -4009,6 +4012,8 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev 
*v_mdev, const char *name,
free_fixed_resources(ndev);
 err_mr:
mlx5_vdpa_clean_mrs(mvdev);
+err_mr_res:
+   mlx5_vdpa_destroy_mr_resources(mvdev);
 err_res:
mlx5_vdpa_free_resources(&ndev->mvdev);
 err_mpfs:
-- 
2.45.1




[PATCH vhost v2 4/7] vdpa/mlx5: Extract mr members in own resource struct

2024-08-30 Thread Dragos Tatulea
Group all mapping related resources into their own structure.

Upcoming patches will add more members in this new structure.

Signed-off-by: Dragos Tatulea 
Reviewed-by: Cosmin Ratiu 
Acked-by: Eugenio Pérez 
---
 drivers/vdpa/mlx5/core/mlx5_vdpa.h | 13 ++-
 drivers/vdpa/mlx5/core/mr.c| 30 -
 drivers/vdpa/mlx5/core/resources.c |  6 ++---
 drivers/vdpa/mlx5/net/mlx5_vnet.c  | 36 +++---
 4 files changed, 44 insertions(+), 41 deletions(-)

diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h 
b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
index 4d217d18239c..5ae6deea2a8a 100644
--- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h
+++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
@@ -83,10 +83,18 @@ enum {
MLX5_VDPA_NUM_AS = 2
 };
 
+struct mlx5_vdpa_mr_resources {
+   struct mlx5_vdpa_mr *mr[MLX5_VDPA_NUM_AS];
+   unsigned int group2asid[MLX5_VDPA_NUMVQ_GROUPS];
+   struct list_head mr_list_head;
+   struct mutex mr_mtx;
+};
+
 struct mlx5_vdpa_dev {
struct vdpa_device vdev;
struct mlx5_core_dev *mdev;
struct mlx5_vdpa_resources res;
+   struct mlx5_vdpa_mr_resources mres;
 
u64 mlx_features;
u64 actual_features;
@@ -95,13 +103,8 @@ struct mlx5_vdpa_dev {
u16 max_idx;
u32 generation;
 
-   struct mlx5_vdpa_mr *mr[MLX5_VDPA_NUM_AS];
-   struct list_head mr_list_head;
-   /* serialize mr access */
-   struct mutex mr_mtx;
struct mlx5_control_vq cvq;
struct workqueue_struct *wq;
-   unsigned int group2asid[MLX5_VDPA_NUMVQ_GROUPS];
bool suspended;
 
struct mlx5_async_ctx async_ctx;
diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c
index 50bb2cc95ea2..95087d7ae78a 100644
--- a/drivers/vdpa/mlx5/core/mr.c
+++ b/drivers/vdpa/mlx5/core/mr.c
@@ -666,9 +666,9 @@ static void _mlx5_vdpa_put_mr(struct mlx5_vdpa_dev *mvdev,
 void mlx5_vdpa_put_mr(struct mlx5_vdpa_dev *mvdev,
  struct mlx5_vdpa_mr *mr)
 {
-   mutex_lock(&mvdev->mr_mtx);
+   mutex_lock(&mvdev->mres.mr_mtx);
_mlx5_vdpa_put_mr(mvdev, mr);
-   mutex_unlock(&mvdev->mr_mtx);
+   mutex_unlock(&mvdev->mres.mr_mtx);
 }
 
 static void _mlx5_vdpa_get_mr(struct mlx5_vdpa_dev *mvdev,
@@ -683,39 +683,39 @@ static void _mlx5_vdpa_get_mr(struct mlx5_vdpa_dev *mvdev,
 void mlx5_vdpa_get_mr(struct mlx5_vdpa_dev *mvdev,
  struct mlx5_vdpa_mr *mr)
 {
-   mutex_lock(&mvdev->mr_mtx);
+   mutex_lock(&mvdev->mres.mr_mtx);
_mlx5_vdpa_get_mr(mvdev, mr);
-   mutex_unlock(&mvdev->mr_mtx);
+   mutex_unlock(&mvdev->mres.mr_mtx);
 }
 
 void mlx5_vdpa_update_mr(struct mlx5_vdpa_dev *mvdev,
 struct mlx5_vdpa_mr *new_mr,
 unsigned int asid)
 {
-   struct mlx5_vdpa_mr *old_mr = mvdev->mr[asid];
+   struct mlx5_vdpa_mr *old_mr = mvdev->mres.mr[asid];
 
-   mutex_lock(&mvdev->mr_mtx);
+   mutex_lock(&mvdev->mres.mr_mtx);
 
_mlx5_vdpa_put_mr(mvdev, old_mr);
-   mvdev->mr[asid] = new_mr;
+   mvdev->mres.mr[asid] = new_mr;
 
-   mutex_unlock(&mvdev->mr_mtx);
+   mutex_unlock(&mvdev->mres.mr_mtx);
 }
 
 static void mlx5_vdpa_show_mr_leaks(struct mlx5_vdpa_dev *mvdev)
 {
struct mlx5_vdpa_mr *mr;
 
-   mutex_lock(&mvdev->mr_mtx);
+   mutex_lock(&mvdev->mres.mr_mtx);
 
-   list_for_each_entry(mr, &mvdev->mr_list_head, mr_list) {
+   list_for_each_entry(mr, &mvdev->mres.mr_list_head, mr_list) {
 
mlx5_vdpa_warn(mvdev, "mkey still alive after resource delete: "
  "mr: %p, mkey: 0x%x, refcount: %u\n",
   mr, mr->mkey, 
refcount_read(&mr->refcount));
}
 
-   mutex_unlock(&mvdev->mr_mtx);
+   mutex_unlock(&mvdev->mres.mr_mtx);
 
 }
 
@@ -753,7 +753,7 @@ static int _mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev,
if (err)
goto err_iotlb;
 
-   list_add_tail(&mr->mr_list, &mvdev->mr_list_head);
+   list_add_tail(&mr->mr_list, &mvdev->mres.mr_list_head);
 
return 0;
 
@@ -779,9 +779,9 @@ struct mlx5_vdpa_mr *mlx5_vdpa_create_mr(struct 
mlx5_vdpa_dev *mvdev,
if (!mr)
return ERR_PTR(-ENOMEM);
 
-   mutex_lock(&mvdev->mr_mtx);
+   mutex_lock(&mvdev->mres.mr_mtx);
err = _mlx5_vdpa_create_mr(mvdev, mr, iotlb);
-   mutex_unlock(&mvdev->mr_mtx);
+   mutex_unlock(&mvdev->mres.mr_mtx);
 
if (err)
goto out_err;
@@ -801,7 +801,7 @@ int mlx5_vdpa_update_cvq_iotlb(struct mlx5_vdpa_dev *mvdev,
 {
int err;
 
-   if (mvdev->group2asid[MLX5_VDPA_CVQ_GROUP] != asid)
+   if (mvdev->mres.group2a

[PATCH vhost v2 2/7] vdpa/mlx5: Delete direct MKEYs in parallel

2024-08-30 Thread Dragos Tatulea
Use the async interface to issue MTT MKEY deletion.

This makes destroy_user_mr() on average 8x times faster.
This number is also dependent on the size of the MR being
deleted.

Signed-off-by: Dragos Tatulea 
Reviewed-by: Cosmin Ratiu 
Acked-by: Eugenio Pérez 
---
 drivers/vdpa/mlx5/core/mr.c | 64 +
 1 file changed, 64 insertions(+)

diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c
index e72fb11e353d..64bcae2bae8a 100644
--- a/drivers/vdpa/mlx5/core/mr.c
+++ b/drivers/vdpa/mlx5/core/mr.c
@@ -55,6 +55,11 @@ struct mlx5_create_mkey_mem {
__be64 mtt[];
 };
 
+struct mlx5_destroy_mkey_mem {
+   u8 out[MLX5_ST_SZ_BYTES(destroy_mkey_out)];
+   u8 in[MLX5_ST_SZ_BYTES(destroy_mkey_in)];
+};
+
 static void fill_create_direct_mr(struct mlx5_vdpa_dev *mvdev,
  struct mlx5_vdpa_direct_mr *mr,
  struct mlx5_create_mkey_mem *mem)
@@ -91,6 +96,17 @@ static void create_direct_mr_end(struct mlx5_vdpa_dev *mvdev,
mr->mr = mlx5_idx_to_mkey(mkey_index);
 }
 
+static void fill_destroy_direct_mr(struct mlx5_vdpa_dev *mvdev,
+  struct mlx5_vdpa_direct_mr *mr,
+  struct mlx5_destroy_mkey_mem *mem)
+{
+   void *in = &mem->in;
+
+   MLX5_SET(destroy_mkey_in, in, uid, mvdev->res.uid);
+   MLX5_SET(destroy_mkey_in, in, opcode, MLX5_CMD_OP_DESTROY_MKEY);
+   MLX5_SET(destroy_mkey_in, in, mkey_index, mlx5_mkey_to_idx(mr->mr));
+}
+
 static void destroy_direct_mr(struct mlx5_vdpa_dev *mvdev, struct 
mlx5_vdpa_direct_mr *mr)
 {
if (!mr->mr)
@@ -257,6 +273,53 @@ static int create_direct_keys(struct mlx5_vdpa_dev *mvdev, 
struct mlx5_vdpa_mr *
return err;
 }
 
+DEFINE_FREE(free_cmds, struct mlx5_vdpa_async_cmd *, kvfree(_T))
+DEFINE_FREE(free_cmd_mem, struct mlx5_destroy_mkey_mem *, kvfree(_T))
+
+static int destroy_direct_keys(struct mlx5_vdpa_dev *mvdev, struct 
mlx5_vdpa_mr *mr)
+{
+   struct mlx5_destroy_mkey_mem *cmd_mem __free(free_cmd_mem) = NULL;
+   struct mlx5_vdpa_async_cmd *cmds __free(free_cmds) = NULL;
+   struct mlx5_vdpa_direct_mr *dmr;
+   int err = 0;
+   int i = 0;
+
+   cmds = kvcalloc(mr->num_directs, sizeof(*cmds), GFP_KERNEL);
+   cmd_mem = kvcalloc(mr->num_directs, sizeof(*cmd_mem), GFP_KERNEL);
+   if (!cmds || !cmd_mem)
+   return -ENOMEM;
+
+   list_for_each_entry(dmr, &mr->head, list) {
+   cmds[i].out = cmd_mem[i].out;
+   cmds[i].outlen = sizeof(cmd_mem[i].out);
+   cmds[i].in = cmd_mem[i].in;
+   cmds[i].inlen = sizeof(cmd_mem[i].in);
+   fill_destroy_direct_mr(mvdev, dmr, &cmd_mem[i]);
+   i++;
+   }
+
+   err = mlx5_vdpa_exec_async_cmds(mvdev, cmds, mr->num_directs);
+   if (err) {
+
+   mlx5_vdpa_err(mvdev, "error issuing MTT mkey deletion for 
direct mrs: %d\n", err);
+   return err;
+   }
+
+   i = 0;
+   list_for_each_entry(dmr, &mr->head, list) {
+   struct mlx5_vdpa_async_cmd *cmd = &cmds[i++];
+
+   dmr->mr = 0;
+   if (cmd->err) {
+   err = err ? err : cmd->err;
+   mlx5_vdpa_err(mvdev, "error deleting MTT mkey [0x%llx, 
0x%llx]: %d\n",
+   dmr->start, dmr->end, cmd->err);
+   }
+   }
+
+   return err;
+}
+
 static int create_indirect_key(struct mlx5_vdpa_dev *mvdev, struct 
mlx5_vdpa_mr *mr)
 {
int inlen;
@@ -565,6 +628,7 @@ static void destroy_user_mr(struct mlx5_vdpa_dev *mvdev, 
struct mlx5_vdpa_mr *mr
struct mlx5_vdpa_direct_mr *n;
 
destroy_indirect_key(mvdev, mr);
+   destroy_direct_keys(mvdev, mr);
list_for_each_entry_safe_reverse(dmr, n, &mr->head, list) {
list_del_init(&dmr->list);
unmap_direct_mr(mvdev, dmr);
-- 
2.45.1




[PATCH vhost v2 1/7] vdpa/mlx5: Create direct MKEYs in parallel

2024-08-30 Thread Dragos Tatulea
Use the async interface to issue MTT MKEY creation.
Extra care is taken at the allocation of FW input commands
due to the MTT tables having variable sizes depending on
MR.

The indirect MKEY is still created synchronously at the
end as the direct MKEYs need to be filled in.

This makes create_user_mr() 3-5x faster, depending on
the size of the MR.

Signed-off-by: Dragos Tatulea 
Reviewed-by: Cosmin Ratiu 
---
 drivers/vdpa/mlx5/core/mr.c | 120 +---
 1 file changed, 98 insertions(+), 22 deletions(-)

diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c
index 4758914ccf86..e72fb11e353d 100644
--- a/drivers/vdpa/mlx5/core/mr.c
+++ b/drivers/vdpa/mlx5/core/mr.c
@@ -49,17 +49,18 @@ static void populate_mtts(struct mlx5_vdpa_direct_mr *mr, 
__be64 *mtt)
}
 }
 
-static int create_direct_mr(struct mlx5_vdpa_dev *mvdev, struct 
mlx5_vdpa_direct_mr *mr)
+struct mlx5_create_mkey_mem {
+   u8 out[MLX5_ST_SZ_BYTES(create_mkey_out)];
+   u8 in[MLX5_ST_SZ_BYTES(create_mkey_in)];
+   __be64 mtt[];
+};
+
+static void fill_create_direct_mr(struct mlx5_vdpa_dev *mvdev,
+ struct mlx5_vdpa_direct_mr *mr,
+ struct mlx5_create_mkey_mem *mem)
 {
-   int inlen;
+   void *in = &mem->in;
void *mkc;
-   void *in;
-   int err;
-
-   inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + 
roundup(MLX5_ST_SZ_BYTES(mtt) * mr->nsg, 16);
-   in = kvzalloc(inlen, GFP_KERNEL);
-   if (!in)
-   return -ENOMEM;
 
MLX5_SET(create_mkey_in, in, uid, mvdev->res.uid);
mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
@@ -76,18 +77,25 @@ static int create_direct_mr(struct mlx5_vdpa_dev *mvdev, 
struct mlx5_vdpa_direct
MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
 get_octo_len(mr->end - mr->start, mr->log_size));
populate_mtts(mr, MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt));
-   err = mlx5_vdpa_create_mkey(mvdev, &mr->mr, in, inlen);
-   kvfree(in);
-   if (err) {
-   mlx5_vdpa_warn(mvdev, "Failed to create direct MR\n");
-   return err;
-   }
 
-   return 0;
+   MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY);
+   MLX5_SET(create_mkey_in, in, uid, mvdev->res.uid);
+}
+
+static void create_direct_mr_end(struct mlx5_vdpa_dev *mvdev,
+struct mlx5_vdpa_direct_mr *mr,
+struct mlx5_create_mkey_mem *mem)
+{
+   u32 mkey_index = MLX5_GET(create_mkey_out, mem->out, mkey_index);
+
+   mr->mr = mlx5_idx_to_mkey(mkey_index);
 }
 
 static void destroy_direct_mr(struct mlx5_vdpa_dev *mvdev, struct 
mlx5_vdpa_direct_mr *mr)
 {
+   if (!mr->mr)
+   return;
+
mlx5_vdpa_destroy_mkey(mvdev, mr->mr);
 }
 
@@ -179,6 +187,76 @@ static int klm_byte_size(int nklms)
return 16 * ALIGN(nklms, 4);
 }
 
+#define MLX5_VDPA_MTT_ALIGN 16
+
+static int create_direct_keys(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr 
*mr)
+{
+   struct mlx5_vdpa_async_cmd *cmds;
+   struct mlx5_vdpa_direct_mr *dmr;
+   int err = 0;
+   int i = 0;
+
+   cmds = kvcalloc(mr->num_directs, sizeof(*cmds), GFP_KERNEL);
+   if (!cmds)
+   return -ENOMEM;
+
+   list_for_each_entry(dmr, &mr->head, list) {
+   struct mlx5_create_mkey_mem *cmd_mem;
+   int mttlen, mttcount;
+
+   mttlen = roundup(MLX5_ST_SZ_BYTES(mtt) * dmr->nsg, 
MLX5_VDPA_MTT_ALIGN);
+   mttcount = mttlen / sizeof(cmd_mem->mtt[0]);
+   cmd_mem = kvcalloc(1, struct_size(cmd_mem, mtt, mttcount), 
GFP_KERNEL);
+   if (!cmd_mem) {
+   err = -ENOMEM;
+   goto done;
+   }
+
+   cmds[i].out = cmd_mem->out;
+   cmds[i].outlen = sizeof(cmd_mem->out);
+   cmds[i].in = cmd_mem->in;
+   cmds[i].inlen = struct_size(cmd_mem, mtt, mttcount);
+
+   fill_create_direct_mr(mvdev, dmr, cmd_mem);
+
+   i++;
+   }
+
+   err = mlx5_vdpa_exec_async_cmds(mvdev, cmds, mr->num_directs);
+   if (err) {
+
+   mlx5_vdpa_err(mvdev, "error issuing MTT mkey creation for 
direct mrs: %d\n", err);
+   goto done;
+   }
+
+   i = 0;
+   list_for_each_entry(dmr, &mr->head, list) {
+   struct mlx5_vdpa_async_cmd *cmd = &cmds[i++];
+   struct mlx5_create_mkey_mem *cmd_mem;
+
+   cmd_mem = container_of(cmd->out, struct mlx5_create_mkey_mem, 
out);
+
+   if (!cmd->err) {
+   create_direct_mr_end(mvdev, dmr, cmd_mem);
+   } else {
+   err = err ? 

[PATCH vhost v2 0/7] vdpa/mlx5: Optimze MKEY operations

2024-08-30 Thread Dragos Tatulea
This series improves the time of .set_map() operations by parallelizing
the MKEY creation and deletion for direct MKEYs. Looking at the top
level MKEY creation/deletion functions, the following improvement can be
seen:

|---+-|
| operation | improvement |
|---+-|
| create_user_mr()  | 3-5x|
| destroy_user_mr() | 8x  |
|---+-|

The last part of the series introduces lazy MKEY deletion which
postpones the MKEY deletion to a later point in a workqueue.

As this series and the previous ones were targeting live migration,
we can also observe improvements on this front:

|---+--+--|
| Stage | Downtime #1 (ms) | Downtime #2 (ms) |
|---+--+--|
| Baseline  | 3140 | 3630 |
| Parallel MKEY ops | 1200 | 2000 |
| Deferred deletion | 1014 | 1253 |
|---+--+--|

Test configuration: 256 GB VM, 32 CPUs x 2 threads per core, 4 x mlx5
vDPA devices x 32 VQs (16 VQPs)

This series must be applied on top of the parallel VQ suspend/resume
series [0].

[0] https://lore.kernel.org/all/20240816090159.1967650-1-dtatu...@nvidia.com/

---
v2:
- Swapped flex array usage for plain zero length array in first patch.
- Updated code to use Scope-Based Cleanup Helpers where appropriate
  (only second patch).
- Added macro define for MTT alignment in first patch.
- Improved commit messages/comments based on review comments.
- Removed extra newlines.
---

Dragos Tatulea (7):
  vdpa/mlx5: Create direct MKEYs in parallel
  vdpa/mlx5: Delete direct MKEYs in parallel
  vdpa/mlx5: Rename function
  vdpa/mlx5: Extract mr members in own resource struct
  vdpa/mlx5: Rename mr_mtx -> lock
  vdpa/mlx5: Introduce init/destroy for MR resources
  vdpa/mlx5: Postpone MR deletion

 drivers/vdpa/mlx5/core/mlx5_vdpa.h |  25 ++-
 drivers/vdpa/mlx5/core/mr.c| 288 +
 drivers/vdpa/mlx5/core/resources.c |   3 -
 drivers/vdpa/mlx5/net/mlx5_vnet.c  |  53 +++---
 4 files changed, 296 insertions(+), 73 deletions(-)

-- 
2.45.1




Re: [PATCH] vdpa/mlx5: Use random MAC address when no nic vport MAC set

2024-08-30 Thread Dragos Tatulea


Hi Cindy,

On 30.08.24 11:29, Cindy Lu wrote:
> On Fri, 30 Aug 2024 at 03:03, Dragos Tatulea  wrote:
>>
>>
>>
>> On 29.08.24 12:00, Dragos Tatulea wrote:
>>>
>>>
>>> On 29.08.24 11:05, Cindy Lu wrote:
>>>> On Wed, 28 Aug 2024 at 17:37, Dragos Tatulea  wrote:
>>>>>
>>>>>
>>>>>
>>>>> On 28.08.24 11:00, Cindy Lu wrote:
>>>>>> On Wed, 28 Aug 2024 at 09:51, Jason Wang  wrote:
>>>>>>>
>>>>>>> On Wed, Aug 28, 2024 at 12:03 AM Dragos Tatulea  
>>>>>>> wrote:
>>>>>>>>
>>>>>>>> When the vdpa device is configured without a specific MAC
>>>>>>>> address, the vport MAC address is used. However, this
>>>>>>>> address can be 0 which prevents the driver from properly
>>>>>>>> configuring the MPFS and breaks steering.
>>>>>>>>
>>>>>>>> The solution is to simply generate a random MAC address
>>>>>>>> when no MAC is set on the nic vport.
>>>>>>>>
>>>>>>>> Now it's possible to create a vdpa device without a
>>>>>>>> MAC address and run qemu with this device without needing
>>>>>>>> to configure an explicit MAC address.
>>>>>>>>
>>>>>>>> Signed-off-by: Dragos Tatulea 
>>>>>>>> Reviewed-by: Jiri Pirko 
>>>>>>>
>>>>>>> Acked-by: Jason Wang 
>>>>>>>
>>>>>>> (Adding Cindy for double checking if it has any side effect on Qemu 
>>>>>>> side)
>>>>>>>
>>>>>>> Thanks
>>>>>>>
>>>>>> But Now there is a bug in QEMU: if the hardware MAC address does not
>>>>>> match the one in the QEMU command line, it will cause traffic loss.
>>>>>>
>>>>> Why is this a new issue in qemu? qemu in it's current state won't work
>>>>> with a different mac address that the one that is set in HW anyway.
>>>>>
>>>> this is not a new bug. We are trying to fix it because it will cause
>>>> traffic lose without any warning.
>>>> in my fix , this setting (different mac in device and Qemu) will fail
>>>> to load the VM.
>>> Which is a good thing, right? Some feedback to the user that there is
>>> a misconfig. I got bitten by this so many times... Thank you for adding it.
>>>
>>>>
>>>>>> So, Just an FYI here: if your patch merged, it may cause traffic loss.
>>>>>> and now I'm working in the fix it in qemu, the link is
>>>>>> https://patchew.org/QEMU/20240716011349.821777-1-l...@redhat.com/
>>>>>> The idea of this fix is
>>>>>> There are will only two acceptable situations for qemu:
>>>>>> 1. The hardware MAC address is the same as the MAC address specified
>>>>>> in the QEMU command line, and both MAC addresses are not 0.
>>>>>> 2. The hardware MAC address is not 0, and the MAC address in the QEMU
>>>>>> command line is 0. In this situation, the hardware MAC address will
>>>>>> overwrite the QEMU command line address.
>>>>>>
>>>>> Why would this not work with this patch? This patch simply sets a MAC
>>>>> if the vport doesn't have one set. Which allows for more scenarios to
>>>>> work.
>>>>>
>>>> I do not mean your patch will not work, I just want to make some
>>>> clarify here.Your patch + my fix may cause the VM to fail to load in
>>>> some situations, and this is as expected.
>>>> Your patch is good to merge.
>>> Ack. Thank you for the clarification.
>> (Side note)
>> While looking at another issue I discovered that it's possible to
>> configure a random MAC on the mlx5_vdpa device at VM boot time if
>> device MAC configuration is implemented during during .set_config(). So
>> I was able to boot up a VM with a random MAC address coming from qemu
>> and the traffic worked with this new MAC.
>>
>> So now I'm not sure if this is just by luck or if the .set_config()
>> op should be implemented for the MAC part in our device.
>>
>> Thanks,
>> Dragos
>>
> Hi Dragos,
> For qemu part, I think this is not set from set_config()?  it should
> be from the CVQ?
I see that .set_config() is called during boot time. CVQ commands are
happening only when the MAC is configured from within the VM.

> Usually, we don't recommend using the set_config() function because
> the configuration space should be read-only for modern devices.
> 
Ack

> Now there is a bug in this part of qemu, and we plan to remove the
> code to set_config() in virtio_net_device_realize(), here is the patch
> https://lore.kernel.org/all/cacgkmevcskfahpbqlammszdfn-qphg5zx+uqvrfx0hswybz...@mail.gmail.com/T/
> and this is still under review
> 
Thanks for the clarification. So if I understand correctly,
there will be no way for qemu to set a random MAC address for vdpa
devices going forward. Unless it is done through the CVQ from within
the VM.


Thanks,
Dragos



Re: [PATCH 2/2] vdpa: Add support to update speed/duplex in vDPA/mlx5_vnet

2024-08-29 Thread Dragos Tatulea
(resending as I accidentally replied only to Carlos)

On 29.08.24 18:16, Carlos Bilbao wrote:
> From: Carlos Bilbao 
> 
> Include support to update the vDPA configuration fields of speed and
> duplex (as needed by VHOST_VDPA_SET_CONFIG). This includes function
> mlx5_vdpa_set_config() as well as changes in vdpa.c to fill the initial
> values to UNKNOWN. Also add a warning message for when
> mlx5_vdpa_get_config() receives offset and length out of bounds.
> 
> Signed-off-by: Carlos Bilbao 
> ---
>  drivers/vdpa/mlx5/net/mlx5_vnet.c | 34 ++-
>  drivers/vdpa/vdpa.c   | 27 
>  include/uapi/linux/vdpa.h |  2 ++
>  3 files changed, 62 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
> b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> index c47009a8b472..a44bb2072eec 100644
> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> @@ -3221,12 +3221,44 @@ static void mlx5_vdpa_get_config(struct vdpa_device 
> *vdev, unsigned int offset,
>  
>   if (offset + len <= sizeof(struct virtio_net_config))
>   memcpy(buf, (u8 *)&ndev->config + offset, len);
> + else
> + mlx5_vdpa_warn(mvdev, "Offset and length out of bounds\n");
>  }
>  
>  static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int 
> offset, const void *buf,
>unsigned int len)
>  {
> - /* not supported */
> + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
> + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
> +
> + if (offset + len > sizeof(struct virtio_net_config)) {
> + mlx5_vdpa_warn(mvdev, "Offset and length out of bounds\n");
> + return;
> + }
> +
> + /*
> +  * Note that this will update the speed/duplex configuration fields
> +  * but the hardware support to actually perform this change does
> +  * not exist yet.
> +  */
> + switch (offset) {
> + case offsetof(struct virtio_net_config, speed):
> + if (len == sizeof(((struct virtio_net_config *) 0)->speed))
> + memcpy(&ndev->config.speed, buf, len);
> + else
> + mlx5_vdpa_warn(mvdev, "Invalid length for speed.\n");
> + break;
> +
> + case offsetof(struct virtio_net_config, duplex):
> + if (len == sizeof(((struct virtio_net_config *)0)->duplex))
> + memcpy(&ndev->config.duplex, buf, len);
> + else
> + mlx5_vdpa_warn(mvdev, "Invalid length for duplex.\n");
> + break;
> +
> + default:
> + mlx5_vdpa_warn(mvdev, "Configuration field not supported.\n");
This will trigger noise in dmesg because there is a MAC configuration here.
> + }
I would prefer that the .set_config remains a stub TBH. Setting the fields here 
is
misleading: the user might deduce that the configuration worked when they read 
the
values and see that they were updated.

Thanks,
dragos
>  }
>  
>  static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev)
> diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
> index 4dbd2e55a288..b920e4405f6d 100644
> --- a/drivers/vdpa/vdpa.c
> +++ b/drivers/vdpa/vdpa.c
> @@ -15,6 +15,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  static LIST_HEAD(mdev_head);
>  /* A global mutex that protects vdpa management device and device level 
> operations. */
> @@ -919,6 +920,22 @@ static int vdpa_dev_net_status_config_fill(struct 
> sk_buff *msg, u64 features,
>   return nla_put_u16(msg, VDPA_ATTR_DEV_NET_STATUS, val_u16);
>  }
>  
> +static int vdpa_dev_net_speed_config_fill(struct sk_buff *msg, u64 features,
> + struct virtio_net_config *config)
> +{
> + __le32 speed = cpu_to_le32(SPEED_UNKNOWN);
> +
> + return nla_put(msg, VDPA_ATTR_DEV_NET_CFG_SPEED, sizeof(speed), &speed);
> +}
> +
> +static int vdpa_dev_net_duplex_config_fill(struct sk_buff *msg, u64 features,
> + struct virtio_net_config *config)
> +{
> + u8 duplex = DUPLEX_UNKNOWN;
> +
> + return nla_put(msg, VDPA_ATTR_DEV_NET_CFG_DUPLEX, sizeof(duplex), 
> &duplex);
> +}
> +
>  static int vdpa_dev_net_config_fill(struct vdpa_device *vdev, struct sk_buff 
> *msg)
>  {
>   struct virtio_net_config config = {};
> @@ -940,6 +957,16 @@ static int vdpa_dev_net_config_fill(struct vdpa_device 
> *vdev, struct sk_buff *ms
>  
>   if (vdpa_dev_net_status_config_fill(msg, features_device, &config))
>   return -EMSGSIZE;
> + /*
> +  * mlx5_vdpa vDPA devicess currently do not support the
> +  * VIRTIO_NET_F_SPEED_DUPLEX feature, which reports speed and
> +  * duplex; hence these are set to UNKNOWN for now.
> +  */
> + if (vdpa_dev_net_speed_config_fill(msg, features_device, &config))
> + return -EMSGSIZE;
> +
> + if (vdpa_dev_net_duplex_config_fill(msg

Re: [PATCH] vdpa/mlx5: Use random MAC address when no nic vport MAC set

2024-08-29 Thread Dragos Tatulea



On 29.08.24 12:00, Dragos Tatulea wrote:
> 
> 
> On 29.08.24 11:05, Cindy Lu wrote:
>> On Wed, 28 Aug 2024 at 17:37, Dragos Tatulea  wrote:
>>>
>>>
>>>
>>> On 28.08.24 11:00, Cindy Lu wrote:
>>>> On Wed, 28 Aug 2024 at 09:51, Jason Wang  wrote:
>>>>>
>>>>> On Wed, Aug 28, 2024 at 12:03 AM Dragos Tatulea  
>>>>> wrote:
>>>>>>
>>>>>> When the vdpa device is configured without a specific MAC
>>>>>> address, the vport MAC address is used. However, this
>>>>>> address can be 0 which prevents the driver from properly
>>>>>> configuring the MPFS and breaks steering.
>>>>>>
>>>>>> The solution is to simply generate a random MAC address
>>>>>> when no MAC is set on the nic vport.
>>>>>>
>>>>>> Now it's possible to create a vdpa device without a
>>>>>> MAC address and run qemu with this device without needing
>>>>>> to configure an explicit MAC address.
>>>>>>
>>>>>> Signed-off-by: Dragos Tatulea 
>>>>>> Reviewed-by: Jiri Pirko 
>>>>>
>>>>> Acked-by: Jason Wang 
>>>>>
>>>>> (Adding Cindy for double checking if it has any side effect on Qemu side)
>>>>>
>>>>> Thanks
>>>>>
>>>> But Now there is a bug in QEMU: if the hardware MAC address does not
>>>> match the one in the QEMU command line, it will cause traffic loss.
>>>>
>>> Why is this a new issue in qemu? qemu in it's current state won't work
>>> with a different mac address that the one that is set in HW anyway.
>>>
>> this is not a new bug. We are trying to fix it because it will cause
>> traffic lose without any warning.
>> in my fix , this setting (different mac in device and Qemu) will fail
>> to load the VM.
> Which is a good thing, right? Some feedback to the user that there is
> a misconfig. I got bitten by this so many times... Thank you for adding it.
> 
>>
>>>> So, Just an FYI here: if your patch merged, it may cause traffic loss.
>>>> and now I'm working in the fix it in qemu, the link is
>>>> https://patchew.org/QEMU/20240716011349.821777-1-l...@redhat.com/
>>>> The idea of this fix is
>>>> There are will only two acceptable situations for qemu:
>>>> 1. The hardware MAC address is the same as the MAC address specified
>>>> in the QEMU command line, and both MAC addresses are not 0.
>>>> 2. The hardware MAC address is not 0, and the MAC address in the QEMU
>>>> command line is 0. In this situation, the hardware MAC address will
>>>> overwrite the QEMU command line address.
>>>>
>>> Why would this not work with this patch? This patch simply sets a MAC
>>> if the vport doesn't have one set. Which allows for more scenarios to
>>> work.
>>>
>> I do not mean your patch will not work, I just want to make some
>> clarify here.Your patch + my fix may cause the VM to fail to load in
>> some situations, and this is as expected.
>> Your patch is good to merge.
> Ack. Thank you for the clarification.
(Side note)
While looking at another issue I discovered that it's possible to
configure a random MAC on the mlx5_vdpa device at VM boot time if
device MAC configuration is implemented during during .set_config(). So
I was able to boot up a VM with a random MAC address coming from qemu
and the traffic worked with this new MAC.

So now I'm not sure if this is just by luck or if the .set_config()
op should be implemented for the MAC part in our device.

Thanks,
Dragos



Re: [PATCH 1/2] mlx5_vnet: Set speed and duplex of vDPA devices to UNKNOWN

2024-08-29 Thread Dragos Tatulea



On 29.08.24 18:16, Carlos Bilbao wrote:
> From: Carlos Bilbao 
> 
> Initialize the speed and duplex fields in virtio_net_config to UNKNOWN.
> This is needed because mlx5_vdpa vDPA devicess currently do not support the
> VIRTIO_NET_F_SPEED_DUPLEX feature which reports speed and duplex. Add
> needed helper cpu_to_mlx5vdpa32() to convert endianness of speed.
> 
> Signed-off-by: Carlos Bilbao 
Nit: prefix is vdpa/mlx5. Once that is fixed, for this patch:
Reviewed-by: Dragos Tatulea 

> ---
>  drivers/vdpa/mlx5/net/mlx5_vnet.c | 12 
>  1 file changed, 12 insertions(+)
> 
> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
> b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> index fa78e8288ebb..c47009a8b472 100644
> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> @@ -193,6 +193,11 @@ static __virtio16 cpu_to_mlx5vdpa16(struct mlx5_vdpa_dev 
> *mvdev, u16 val)
>   return __cpu_to_virtio16(mlx5_vdpa_is_little_endian(mvdev), val);
>  }
>  
> +static __virtio32 cpu_to_mlx5vdpa32(struct mlx5_vdpa_dev *mvdev, u32 val)
> +{
> + return __cpu_to_virtio32(mlx5_vdpa_is_little_endian(mvdev), val);
> +}
> +
>  static u16 ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev)
>  {
>   if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ)))
> @@ -3795,6 +3800,13 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev 
> *v_mdev, const char *name,
>   init_rwsem(&ndev->reslock);
>   config = &ndev->config;
>  
> + /*
> +  * mlx5_vdpa vDPA devices currently don't support reporting or
> +  * setting the speed or duplex.
> +  */
> + config->speed  = cpu_to_mlx5vdpa32(mvdev, SPEED_UNKNOWN);
> + config->duplex = DUPLEX_UNKNOWN;
> +
>   if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU)) {
>   err = config_func_mtu(mdev, add_config->net.mtu);
>   if (err)




Re: [PATCH vhost 6/7] vdpa/mlx5: Introduce init/destroy for MR resources

2024-08-29 Thread Dragos Tatulea



On 29.08.24 16:37, Eugenio Perez Martin wrote:
> On Wed, Aug 21, 2024 at 1:42 PM Dragos Tatulea  wrote:
>>
>> There's currently not a lot of action happening during
>> the init/destroy of MR resources. But more will be added
>> in the upcoming patches.
> 
> If the series doesn't receive new patches, it is just the next patch :).
> 
>>
>> Signed-off-by: Dragos Tatulea 
>> Reviewed-by: Cosmin Ratiu 
>> ---
>>  drivers/vdpa/mlx5/core/mlx5_vdpa.h |  2 ++
>>  drivers/vdpa/mlx5/core/mr.c| 17 +
>>  drivers/vdpa/mlx5/core/resources.c |  3 ---
>>  drivers/vdpa/mlx5/net/mlx5_vnet.c  | 10 --
>>  4 files changed, 27 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h 
>> b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
>> index 89b564cecddf..c3e17bc888e8 100644
>> --- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h
>> +++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
>> @@ -138,6 +138,8 @@ int mlx5_vdpa_create_mkey(struct mlx5_vdpa_dev *mvdev, 
>> u32 *mkey, u32 *in,
>>  int mlx5_vdpa_destroy_mkey(struct mlx5_vdpa_dev *mvdev, u32 mkey);
>>  struct mlx5_vdpa_mr *mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev,
>>  struct vhost_iotlb *iotlb);
>> +int mlx5_vdpa_init_mr_resources(struct mlx5_vdpa_dev *mvdev);
>> +void mlx5_vdpa_destroy_mr_resources(struct mlx5_vdpa_dev *mvdev);
>>  void mlx5_vdpa_clean_mrs(struct mlx5_vdpa_dev *mvdev);
>>  void mlx5_vdpa_get_mr(struct mlx5_vdpa_dev *mvdev,
>>   struct mlx5_vdpa_mr *mr);
>> diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c
>> index f20f2a8a701d..ec75f165f832 100644
>> --- a/drivers/vdpa/mlx5/core/mr.c
>> +++ b/drivers/vdpa/mlx5/core/mr.c
>> @@ -843,3 +843,20 @@ int mlx5_vdpa_reset_mr(struct mlx5_vdpa_dev *mvdev, 
>> unsigned int asid)
>>
>> return 0;
>>  }
>> +
>> +int mlx5_vdpa_init_mr_resources(struct mlx5_vdpa_dev *mvdev)
>> +{
>> +   struct mlx5_vdpa_mr_resources *mres = &mvdev->mres;
>> +
>> +   INIT_LIST_HEAD(&mres->mr_list_head);
>> +   mutex_init(&mres->lock);
>> +
>> +   return 0;
> 
> I'd leave this function return void here and remove the caller error
> control path.
> 
It is like this because the next patch adds an error path.

>> +}
>> +
>> +void mlx5_vdpa_destroy_mr_resources(struct mlx5_vdpa_dev *mvdev)
>> +{
>> +   struct mlx5_vdpa_mr_resources *mres = &mvdev->mres;
>> +
>> +   mutex_destroy(&mres->lock);
>> +}
>> diff --git a/drivers/vdpa/mlx5/core/resources.c 
>> b/drivers/vdpa/mlx5/core/resources.c
>> index fe2ca3458f6c..aeae31d0cefa 100644
>> --- a/drivers/vdpa/mlx5/core/resources.c
>> +++ b/drivers/vdpa/mlx5/core/resources.c
>> @@ -256,7 +256,6 @@ int mlx5_vdpa_alloc_resources(struct mlx5_vdpa_dev 
>> *mvdev)
>> mlx5_vdpa_warn(mvdev, "resources already allocated\n");
>> return -EINVAL;
>> }
>> -   mutex_init(&mvdev->mres.lock);
>> res->uar = mlx5_get_uars_page(mdev);
>> if (IS_ERR(res->uar)) {
>> err = PTR_ERR(res->uar);
>> @@ -301,7 +300,6 @@ int mlx5_vdpa_alloc_resources(struct mlx5_vdpa_dev 
>> *mvdev)
>>  err_uctx:
>> mlx5_put_uars_page(mdev, res->uar);
>>  err_uars:
>> -   mutex_destroy(&mvdev->mres.lock);
> 
> Maybe it is just me, but this patch is also moving the lock lifetime
> from mlx5_vdpa_alloc_resources / mlx5_vdpa_free_resources to
> mlx5_vdpa_dev_add / mlx5_vdpa_free. I guess it has a justification we
> can either clarify in the patch message or split in its own patch.
> 
Good point. Will do.

>> return err;
>>  }
>>
>> @@ -318,7 +316,6 @@ void mlx5_vdpa_free_resources(struct mlx5_vdpa_dev 
>> *mvdev)
>> dealloc_pd(mvdev, res->pdn, res->uid);
>> destroy_uctx(mvdev, res->uid);
>> mlx5_put_uars_page(mvdev->mdev, res->uar);
>> -   mutex_destroy(&mvdev->mres.lock);
>> res->valid = false;
>>  }
>>
>> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
>> b/drivers/vdpa/mlx5/net/mlx5_vnet.c
>> index 8a51c492a62a..1cadcb05a5c7 100644
>> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
>> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
>> @@ -3434,6 +3434,7 @@ static void mlx5_vdpa_free(struct vdpa_device *vdev)
>>
>> free_fixed_resources(ndev);
>> mlx5_vdpa_c

Re: [PATCH vhost 7/7] vdpa/mlx5: Postpone MR deletion

2024-08-29 Thread Dragos Tatulea



On 29.08.24 17:07, Eugenio Perez Martin wrote:
> On Wed, Aug 21, 2024 at 1:42 PM Dragos Tatulea  wrote:
>>
>> Currently, when a new MR is set up, the old MR is deleted. MR deletion
>> is about 30-40% the time of MR creation. As deleting the old MR is not
>> important for the process of setting up the new MR, this operation
>> can be postponed.
>>
>> This series adds a workqueue that does MR garbage collection at a later
>> point. If the MR lock is taken, the handler will back off and
>> reschedule. The exception during shutdown: then the handler must
>> not postpone the work.
>>
>> Note that this is only a speculative optimization: if there is some
>> mapping operation that is triggered while the garbage collector handler
>> has the lock taken, this operation it will have to wait for the handler
>> to finish.
>>
>> Signed-off-by: Dragos Tatulea 
>> Reviewed-by: Cosmin Ratiu 
>> ---
>>  drivers/vdpa/mlx5/core/mlx5_vdpa.h | 10 ++
>>  drivers/vdpa/mlx5/core/mr.c| 51 --
>>  drivers/vdpa/mlx5/net/mlx5_vnet.c  |  3 +-
>>  3 files changed, 60 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h 
>> b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
>> index c3e17bc888e8..2cedf7e2dbc4 100644
>> --- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h
>> +++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
>> @@ -86,8 +86,18 @@ enum {
>>  struct mlx5_vdpa_mr_resources {
>> struct mlx5_vdpa_mr *mr[MLX5_VDPA_NUM_AS];
>> unsigned int group2asid[MLX5_VDPA_NUMVQ_GROUPS];
>> +
>> +   /* Pre-deletion mr list */
>> struct list_head mr_list_head;
>> +
>> +   /* Deferred mr list */
>> +   struct list_head mr_gc_list_head;
>> +   struct workqueue_struct *wq_gc;
>> +   struct delayed_work gc_dwork_ent;
>> +
>> struct mutex lock;
>> +
>> +   atomic_t shutdown;
>>  };
>>
>>  struct mlx5_vdpa_dev {
>> diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c
>> index ec75f165f832..43fce6b39cf2 100644
>> --- a/drivers/vdpa/mlx5/core/mr.c
>> +++ b/drivers/vdpa/mlx5/core/mr.c
>> @@ -653,14 +653,46 @@ static void _mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev 
>> *mvdev, struct mlx5_vdpa_
>> kfree(mr);
>>  }
>>
>> +#define MLX5_VDPA_MR_GC_TRIGGER_MS 2000
>> +
>> +static void mlx5_vdpa_mr_gc_handler(struct work_struct *work)
>> +{
>> +   struct mlx5_vdpa_mr_resources *mres;
>> +   struct mlx5_vdpa_mr *mr, *tmp;
>> +   struct mlx5_vdpa_dev *mvdev;
>> +
>> +   mres = container_of(work, struct mlx5_vdpa_mr_resources, 
>> gc_dwork_ent.work);
>> +
>> +   if (atomic_read(&mres->shutdown)) {
>> +   mutex_lock(&mres->lock);
>> +   } else if (!mutex_trylock(&mres->lock)) {
> 
> Is the trylock worth it? My understanding is that mutex_lock will add
> the kthread to the waitqueue anyway if it is not able to acquire the
> lock.
> 
I want to believe it is :). I noticed during testing that this can
interfere with the case where there are several .set_map() operations
in quick succession. That's why the work is delayed by such a long
time.

It's not a perfect heuristic but I found that it's better than not
having it.

>> +   queue_delayed_work(mres->wq_gc, &mres->gc_dwork_ent,
>> +  
>> msecs_to_jiffies(MLX5_VDPA_MR_GC_TRIGGER_MS));
>> +   return;
>> +   }
>> +
>> +   mvdev = container_of(mres, struct mlx5_vdpa_dev, mres);
>> +
>> +   list_for_each_entry_safe(mr, tmp, &mres->mr_gc_list_head, mr_list) {
>> +   _mlx5_vdpa_destroy_mr(mvdev, mr);
>> +   }
>> +
>> +   mutex_unlock(&mres->lock);
>> +}
>> +
>>  static void _mlx5_vdpa_put_mr(struct mlx5_vdpa_dev *mvdev,
>>   struct mlx5_vdpa_mr *mr)
>>  {
>> +   struct mlx5_vdpa_mr_resources *mres = &mvdev->mres;
>> +
>> if (!mr)
>> return;
>>
>> -   if (refcount_dec_and_test(&mr->refcount))
>> -   _mlx5_vdpa_destroy_mr(mvdev, mr);
>> +   if (refcount_dec_and_test(&mr->refcount)) {
>> +   list_move_tail(&mr->mr_list, &mres->mr_gc_list_head);
>> +   queue_delayed_work(mres->wq_gc, &mres->gc_dwork_ent,
>> +  
>> msecs_to_jiffies(MLX5_VDPA

Re: [PATCH] vdpa: Set speed and duplex of mlx5_vnet to UNKNOWN

2024-08-29 Thread Dragos Tatulea



On 28.08.24 20:16, Carlos Bilbao wrote:
> From: Carlos Bilbao 
> 
> mlx5_vdpa vDPA devices currently don't support reporting or setting the
> speed and duplex and hence should be UNKNOWN instead of zero.
> 
> Signed-off-by: Carlos Bilbao 
> ---
>  drivers/vdpa/mlx5/net/mlx5_vnet.c |  7 +++
>  drivers/vdpa/vdpa.c   | 23 +++
>  include/uapi/linux/vdpa.h |  2 ++
>  3 files changed, 32 insertions(+)
> 
> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
> b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> index fa78e8288ebb..319f5c6121de 100644
> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> @@ -3795,6 +3795,13 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev 
> *v_mdev, const char *name,
>   init_rwsem(&ndev->reslock);
>   config = &ndev->config;
>  
> + /*
> +  * mlx5_vdpa vDPA devices currently don't support reporting or
> +  * setting the speed or duplex.
> +  */
> + config->speed  = SPEED_UNKNOWN;
> + config->duplex = DUPLEX_UNKNOWN;
> +
The values in virtio_net_config are little endian so you'll need to explicitly
convert them. As speed is a u32, you'll need to add a cpu_to_mlx5vdpa32() 
helper.

Thanks,
Dragos

>   if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU)) {
>   err = config_func_mtu(mdev, add_config->net.mtu);
>   if (err)
> diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
> index 4dbd2e55a288..abde23e0041d 100644
> --- a/drivers/vdpa/vdpa.c
> +++ b/drivers/vdpa/vdpa.c
> @@ -15,6 +15,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  static LIST_HEAD(mdev_head);
>  /* A global mutex that protects vdpa management device and device level 
> operations. */
> @@ -919,6 +920,22 @@ static int vdpa_dev_net_status_config_fill(struct 
> sk_buff *msg, u64 features,
>   return nla_put_u16(msg, VDPA_ATTR_DEV_NET_STATUS, val_u16);
>  }
>  
> +static int vdpa_dev_net_speed_config_fill(struct sk_buff *msg, u64 features,
> + struct virtio_net_config *config)
> +{
> + __le32 speed = cpu_to_le32(SPEED_UNKNOWN);
> +
> + return nla_put(msg, VDPA_ATTR_DEV_NET_CFG_SPEED, sizeof(speed), &speed);
> +}
> +
> +static int vdpa_dev_net_duplex_config_fill(struct sk_buff *msg, u64 features,
> + struct virtio_net_config *config)
> +{
> + u8 duplex = DUPLEX_UNKNOWN;
> +
> + return nla_put(msg, VDPA_ATTR_DEV_NET_CFG_DUPLEX, sizeof(duplex), 
> &duplex);
> +}
> +
>  static int vdpa_dev_net_config_fill(struct vdpa_device *vdev, struct sk_buff 
> *msg)
>  {
>   struct virtio_net_config config = {};
> @@ -941,6 +958,12 @@ static int vdpa_dev_net_config_fill(struct vdpa_device 
> *vdev, struct sk_buff *ms
>   if (vdpa_dev_net_status_config_fill(msg, features_device, &config))
>   return -EMSGSIZE;
>  
> + if (vdpa_dev_net_speed_config_fill(msg, features_device, &config))
> + return -EMSGSIZE;
> +
> + if (vdpa_dev_net_duplex_config_fill(msg, features_device, &config))
> + return -EMSGSIZE;
> +
>   return vdpa_dev_net_mq_config_fill(msg, features_device, &config);
>  }
>  
> diff --git a/include/uapi/linux/vdpa.h b/include/uapi/linux/vdpa.h
> index 842bf1201ac4..1c64ee0dd7b1 100644
> --- a/include/uapi/linux/vdpa.h
> +++ b/include/uapi/linux/vdpa.h
> @@ -43,6 +43,8 @@ enum vdpa_attr {
>   VDPA_ATTR_DEV_NET_STATUS,   /* u8 */
>   VDPA_ATTR_DEV_NET_CFG_MAX_VQP,  /* u16 */
>   VDPA_ATTR_DEV_NET_CFG_MTU,  /* u16 */
> + VDPA_ATTR_DEV_NET_CFG_SPEED,/* u32 */
> + VDPA_ATTR_DEV_NET_CFG_DUPLEX,   /* u8 */
>  
>   VDPA_ATTR_DEV_NEGOTIATED_FEATURES,  /* u64 */
>   VDPA_ATTR_DEV_MGMTDEV_MAX_VQS,  /* u32 */



Re: [PATCH] vdpa/mlx5: Use random MAC address when no nic vport MAC set

2024-08-29 Thread Dragos Tatulea



On 29.08.24 11:05, Cindy Lu wrote:
> On Wed, 28 Aug 2024 at 17:37, Dragos Tatulea  wrote:
>>
>>
>>
>> On 28.08.24 11:00, Cindy Lu wrote:
>>> On Wed, 28 Aug 2024 at 09:51, Jason Wang  wrote:
>>>>
>>>> On Wed, Aug 28, 2024 at 12:03 AM Dragos Tatulea  
>>>> wrote:
>>>>>
>>>>> When the vdpa device is configured without a specific MAC
>>>>> address, the vport MAC address is used. However, this
>>>>> address can be 0 which prevents the driver from properly
>>>>> configuring the MPFS and breaks steering.
>>>>>
>>>>> The solution is to simply generate a random MAC address
>>>>> when no MAC is set on the nic vport.
>>>>>
>>>>> Now it's possible to create a vdpa device without a
>>>>> MAC address and run qemu with this device without needing
>>>>> to configure an explicit MAC address.
>>>>>
>>>>> Signed-off-by: Dragos Tatulea 
>>>>> Reviewed-by: Jiri Pirko 
>>>>
>>>> Acked-by: Jason Wang 
>>>>
>>>> (Adding Cindy for double checking if it has any side effect on Qemu side)
>>>>
>>>> Thanks
>>>>
>>> But Now there is a bug in QEMU: if the hardware MAC address does not
>>> match the one in the QEMU command line, it will cause traffic loss.
>>>
>> Why is this a new issue in qemu? qemu in it's current state won't work
>> with a different mac address that the one that is set in HW anyway.
>>
> this is not a new bug. We are trying to fix it because it will cause
> traffic lose without any warning.
> in my fix , this setting (different mac in device and Qemu) will fail
> to load the VM.
Which is a good thing, right? Some feedback to the user that there is
a misconfig. I got bitten by this so many times... Thank you for adding it.

> 
>>> So, Just an FYI here: if your patch merged, it may cause traffic loss.
>>> and now I'm working in the fix it in qemu, the link is
>>> https://patchew.org/QEMU/20240716011349.821777-1-l...@redhat.com/
>>> The idea of this fix is
>>> There are will only two acceptable situations for qemu:
>>> 1. The hardware MAC address is the same as the MAC address specified
>>> in the QEMU command line, and both MAC addresses are not 0.
>>> 2. The hardware MAC address is not 0, and the MAC address in the QEMU
>>> command line is 0. In this situation, the hardware MAC address will
>>> overwrite the QEMU command line address.
>>>
>> Why would this not work with this patch? This patch simply sets a MAC
>> if the vport doesn't have one set. Which allows for more scenarios to
>> work.
>>
> I do not mean your patch will not work, I just want to make some
> clarify here.Your patch + my fix may cause the VM to fail to load in
> some situations, and this is as expected.
> Your patch is good to merge.
Ack. Thank you for the clarification.

Thanks,
Dragos




Re: [PATCH] vdpa/mlx5: Use random MAC address when no nic vport MAC set

2024-08-28 Thread Dragos Tatulea



On 28.08.24 11:00, Cindy Lu wrote:
> On Wed, 28 Aug 2024 at 09:51, Jason Wang  wrote:
>>
>> On Wed, Aug 28, 2024 at 12:03 AM Dragos Tatulea  wrote:
>>>
>>> When the vdpa device is configured without a specific MAC
>>> address, the vport MAC address is used. However, this
>>> address can be 0 which prevents the driver from properly
>>> configuring the MPFS and breaks steering.
>>>
>>> The solution is to simply generate a random MAC address
>>> when no MAC is set on the nic vport.
>>>
>>> Now it's possible to create a vdpa device without a
>>> MAC address and run qemu with this device without needing
>>> to configure an explicit MAC address.
>>>
>>> Signed-off-by: Dragos Tatulea 
>>> Reviewed-by: Jiri Pirko 
>>
>> Acked-by: Jason Wang 
>>
>> (Adding Cindy for double checking if it has any side effect on Qemu side)
>>
>> Thanks
>>
> But Now there is a bug in QEMU: if the hardware MAC address does not
> match the one in the QEMU command line, it will cause traffic loss.
> 
Why is this a new issue in qemu? qemu in it's current state won't work
with a different mac address that the one that is set in HW anyway.

> So, Just an FYI here: if your patch merged, it may cause traffic loss.
> and now I'm working in the fix it in qemu, the link is
> https://patchew.org/QEMU/20240716011349.821777-1-l...@redhat.com/
> The idea of this fix is
> There are will only two acceptable situations for qemu:
> 1. The hardware MAC address is the same as the MAC address specified
> in the QEMU command line, and both MAC addresses are not 0.
> 2. The hardware MAC address is not 0, and the MAC address in the QEMU
> command line is 0. In this situation, the hardware MAC address will
> overwrite the QEMU command line address.
> 
Why would this not work with this patch? This patch simply sets a MAC
if the vport doesn't have one set. Which allows for more scenarios to
work.

Thanks,
Dragos

> Thanks
> Cindy
> 
> 
>>> ---
>>>  drivers/vdpa/mlx5/net/mlx5_vnet.c | 3 +++
>>>  1 file changed, 3 insertions(+)
>>>
>>> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
>>> b/drivers/vdpa/mlx5/net/mlx5_vnet.c
>>> index fa78e8288ebb..1c26139d02fe 100644
>>> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
>>> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
>>> @@ -3824,6 +3824,9 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev 
>>> *v_mdev, const char *name,
>>> err = mlx5_query_nic_vport_mac_address(mdev, 0, 0, 
>>> config->mac);
>>> if (err)
>>> goto err_alloc;
>>> +
>>> +   if (is_zero_ether_addr(config->mac))
>>> +   eth_random_addr(config->mac);
>>> }
>>>
>>> if (!is_zero_ether_addr(config->mac)) {
>>> --
>>> 2.45.1
>>>
>>
> 



Re: [PATCH] vdpa/mlx5: Fix invalid mr resource destroy

2024-08-28 Thread Dragos Tatulea



On 28.08.24 08:22, Si-Wei Liu wrote:
> 
> 
> On 8/27/2024 9:08 AM, Dragos Tatulea wrote:
>> Certain error paths from mlx5_vdpa_dev_add() can end up releasing mr
>> resources which never got initialized in the first place.
>>
>> This patch adds the missing check in mlx5_vdpa_destroy_mr_resources()
>> to block releasing non-initialized mr resources.
>>
>> Reference trace:
>>
>>mlx5_core :08:00.2: mlx5_vdpa_dev_add:3274:(pid 2700) warning: No mac 
>> address provisioned?
>>BUG: kernel NULL pointer dereference, address: 
>>#PF: supervisor read access in kernel mode
>>#PF: error_code(0x) - not-present page
>>PGD 140216067 P4D 0
>>Oops:  [#1] PREEMPT SMP NOPTI
>>CPU: 8 PID: 2700 Comm: vdpa Kdump: loaded Not tainted 
>> 5.14.0-496.el9.x86_64 #1
>>Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
>> rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
>>RIP: 0010:vhost_iotlb_del_range+0xf/0xe0 [vhost_iotlb]
>>Code: [...]
>>RSP: 0018:ff1c823ac23077f0 EFLAGS: 00010246
>>RAX: c1a21a60 RBX: 899567a0 RCX: 
>>RDX:  RSI:  RDI: 
>>RBP: ff1bda1f7c21e800 R08:  R09: ff1c823ac2307670
>>R10: ff1c823ac2307668 R11: 8a9e7b68 R12: 
>>R13:  R14: ff1bda1f43e341a0 R15: ffea
>>FS:  7f56eba7c740() GS:ff1bda269f80() 
>> knlGS:
>>CS:  0010 DS:  ES:  CR0: 80050033
>>CR2:  CR3: 000104d90001 CR4: 00771ef0
>>DR0:  DR1:  DR2: 
>>DR3:  DR6: fffe0ff0 DR7: 0400
>>PKRU: 5554
>>Call Trace:
>>
>> ? show_trace_log_lvl+0x1c4/0x2df
>> ? show_trace_log_lvl+0x1c4/0x2df
>> ? mlx5_vdpa_free+0x3d/0x150 [mlx5_vdpa]
>> ? __die_body.cold+0x8/0xd
>> ? page_fault_oops+0x134/0x170
>> ? __irq_work_queue_local+0x2b/0xc0
>> ? irq_work_queue+0x2c/0x50
>> ? exc_page_fault+0x62/0x150
>> ? asm_exc_page_fault+0x22/0x30
>> ? __pfx_mlx5_vdpa_free+0x10/0x10 [mlx5_vdpa]
>> ? vhost_iotlb_del_range+0xf/0xe0 [vhost_iotlb]
>> mlx5_vdpa_free+0x3d/0x150 [mlx5_vdpa]
>> vdpa_release_dev+0x1e/0x50 [vdpa]
>> device_release+0x31/0x90
>> kobject_cleanup+0x37/0x130
>> mlx5_vdpa_dev_add+0x2d2/0x7a0 [mlx5_vdpa]
>> vdpa_nl_cmd_dev_add_set_doit+0x277/0x4c0 [vdpa]
>> genl_family_rcv_msg_doit+0xd9/0x130
>> genl_family_rcv_msg+0x14d/0x220
>> ? __pfx_vdpa_nl_cmd_dev_add_set_doit+0x10/0x10 [vdpa]
>> ? _copy_to_user+0x1a/0x30
>> ? move_addr_to_user+0x4b/0xe0
>> genl_rcv_msg+0x47/0xa0
>> ? __import_iovec+0x46/0x150
>> ? __pfx_genl_rcv_msg+0x10/0x10
>> netlink_rcv_skb+0x54/0x100
>> genl_rcv+0x24/0x40
>> netlink_unicast+0x245/0x370
>> netlink_sendmsg+0x206/0x440
>> __sys_sendto+0x1dc/0x1f0
>> ? do_read_fault+0x10c/0x1d0
>> ? do_pte_missing+0x10d/0x190
>> __x64_sys_sendto+0x20/0x30
>> do_syscall_64+0x5c/0xf0
>> ? __count_memcg_events+0x4f/0xb0
>> ? mm_account_fault+0x6c/0x100
>> ? handle_mm_fault+0x116/0x270
>> ? do_user_addr_fault+0x1d6/0x6a0
>> ? do_syscall_64+0x6b/0xf0
>> ? clear_bhb_loop+0x25/0x80
>> ? clear_bhb_loop+0x25/0x80
>> ? clear_bhb_loop+0x25/0x80
>> ? clear_bhb_loop+0x25/0x80
>> ? clear_bhb_loop+0x25/0x80
>> entry_SYSCALL_64_after_hwframe+0x78/0x80
>>
>> Fixes: ("vdpa/mlx5: Decouple cvq iotlb handling from hw mapping code")
> The fix looks fine to me, but how come this is the commit that introduced the 
> problem? Can you help clarify?
> 
The crash happens due to prune_iotlb() being called on an uninitialized
value. prune_iotlb() was moved in mlx5_vdpa_destroy_mr_resources() in
this change. But the function was called mlx5_vdpa_destroy_mr() back
then and it was used a bit differently.

This fix could have only checked the validity of the iotlb member. But
there are some locks being taken in the called function which are also
not initialized. Hence the check for the resource valid flag.

Thanks,
Dragos

> Reviewed-by: Si-Wei Liu 
> 
> Thanks,
> -Siwei
> 
>> Signed-off-by: Dragos Tatulea 
>> Reviewed-by: Cosmin Ratiu 
>> ---
>>   drivers/vdpa/mlx5/core/mr.c | 3 +++
>>   1 file changed, 3 insertions(+)
>>
>> diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c
>> index 4758914ccf86..bf56f3d69625 100644
>> --- a/drivers/vdpa/mlx5/core/mr.c
>> +++ b/drivers/vdpa/mlx5/core/mr.c
>> @@ -581,6 +581,9 @@ static void mlx5_vdpa_show_mr_leaks(struct mlx5_vdpa_dev 
>> *mvdev)
>> void mlx5_vdpa_destroy_mr_resources(struct mlx5_vdpa_dev *mvdev)
>>   {
>> +if (!mvdev->res.valid)
>> +return;
>> +
>>   for (int i = 0; i < MLX5_VDPA_NUM_AS; i++)
>>   mlx5_vdpa_update_mr(mvdev, NULL, i);
>>   
> 



Re: [PATCH] vdpa/mlx5: Use random MAC address when no nic vport MAC set

2024-08-28 Thread Dragos Tatulea



On 28.08.24 07:54, Si-Wei Liu wrote:
> 
> 
> On 8/27/2024 9:02 AM, Dragos Tatulea wrote:
>> When the vdpa device is configured without a specific MAC
>> address, the vport MAC address is used. However, this
>> address can be 0 which prevents the driver from properly
>> configuring the MPFS and breaks steering.
>>
>> The solution is to simply generate a random MAC address
>> when no MAC is set on the nic vport.
>>
>> Now it's possible to create a vdpa device without a
>> MAC address and run qemu with this device without needing
>> to configure an explicit MAC address.
>>
>> Signed-off-by: Dragos Tatulea 
>> Reviewed-by: Jiri Pirko 
>> ---
>>   drivers/vdpa/mlx5/net/mlx5_vnet.c | 3 +++
>>   1 file changed, 3 insertions(+)
>>
>> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
>> b/drivers/vdpa/mlx5/net/mlx5_vnet.c
>> index fa78e8288ebb..1c26139d02fe 100644
>> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
>> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
>> @@ -3824,6 +3824,9 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev 
>> *v_mdev, const char *name,
>>   err = mlx5_query_nic_vport_mac_address(mdev, 0, 0, config->mac);
>>   if (err)
>>   goto err_alloc;
>> +
>> +if (is_zero_ether_addr(config->mac))
>> +eth_random_addr(config->mac);
> I wonder with this change we no longer honor the historical behaviour to 
> retain the zero mac address and clear the _F_MAC bit, should we head to 
> remove the below logic? It looks to me below would become dead code 
> effectively.
> 
It is still possible to create a vdpa device with a zero mac address
explicitly, right? 

> } else if ((add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) == 0) 
> {
> /*
>  * We used to clear _F_MAC feature bit if seeing
>  * zero mac address when device features are not
>  * specifically provisioned. Keep the behaviour
>  * so old scripts do not break.
>  */
> device_features &= ~BIT_ULL(VIRTIO_NET_F_MAC);
> 
> If we are not going to honor old behaviour any more, looks to me we should 
> also block users from creating vdpa device with zero mac address, if the mac 
> attribute is specified. There's more sorrow than help the zero mac address 
> could buy for users.
That makes sense. There is a small risk of breaking user's scripts that
do this by accident...

Thanks,
Dragos



Re: [RFC] Why is set_config not supported in mlx5_vnet?

2024-08-27 Thread Dragos Tatulea



On 27.08.24 04:03, Jason Wang wrote:
> On Tue, Aug 27, 2024 at 12:11 AM Dragos Tatulea  wrote:
>>
>>
>> On 26.08.24 16:24, Andrew Lunn wrote:
>>> On Mon, Aug 26, 2024 at 11:06:09AM +0200, Dragos Tatulea wrote:
>>>>
>>>>
>>>> On 23.08.24 18:54, Carlos Bilbao wrote:
>>>>> Hello,
>>>>>
>>>>> I'm debugging my vDPA setup, and when using ioctl to retrieve the
>>>>> configuration, I noticed that it's running in half duplex mode:
>>>>>
>>>>> Configuration data (24 bytes):
>>>>>   MAC address: (Mac address)
>>>>>   Status: 0x0001
>>>>>   Max virtqueue pairs: 8
>>>>>   MTU: 1500
>>>>>   Speed: 0 Mb
>>>>>   Duplex: Half Duplex
>>>>>   RSS max key size: 0
>>>>>   RSS max indirection table length: 0
>>>>>   Supported hash types: 0x
>>>>>
>>>>> I believe this might be contributing to the underperformance of vDPA.
>>>> mlx5_vdpa vDPA devicess currently do not support the 
>>>> VIRTIO_NET_F_SPEED_DUPLEX
>>>> feature which reports speed and duplex. You can check the state on the
>>>> PF.
>>>
>>> Then it should probably report DUPLEX_UNKNOWN.
>>>
>>> The speed of 0 also suggests SPEED_UNKNOWN is not being returned. So
>>> this just looks buggy in general.
>>>
>> The virtio spec doesn't mention what those values should be when
>> VIRTIO_NET_F_SPEED_DUPLEX is not supported.
>>
>> Jason, should vdpa_dev_net_config_fill() initialize the speed/duplex
>> fields to SPEED/DUPLEX_UNKNOWN instead of 0?
> 
> Spec said
> 
> """
> The following two fields, speed and duplex, only exist if
> VIRTIO_NET_F_SPEED_DUPLEX is set.
> """
> 
> So my understanding is that it is undefined behaviour, and those
> fields seems useless before feature negotiation. For safety, it might
> be better to initialize them as UNKOWN.
> 
After a closer look my statement doesn't make sense: the device will copy
the virtio_net_config bytes on top.

The solution is to initialize these fields to UNKNOWN in the driver. Will send
a patch to fix this.

Thanks,
Dragos



[PATCH] vdpa/mlx5: Fix invalid mr resource destroy

2024-08-27 Thread Dragos Tatulea
Certain error paths from mlx5_vdpa_dev_add() can end up releasing mr
resources which never got initialized in the first place.

This patch adds the missing check in mlx5_vdpa_destroy_mr_resources()
to block releasing non-initialized mr resources.

Reference trace:

  mlx5_core :08:00.2: mlx5_vdpa_dev_add:3274:(pid 2700) warning: No mac 
address provisioned?
  BUG: kernel NULL pointer dereference, address: 
  #PF: supervisor read access in kernel mode
  #PF: error_code(0x) - not-present page
  PGD 140216067 P4D 0
  Oops:  [#1] PREEMPT SMP NOPTI
  CPU: 8 PID: 2700 Comm: vdpa Kdump: loaded Not tainted 5.14.0-496.el9.x86_64 #1
  Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
  RIP: 0010:vhost_iotlb_del_range+0xf/0xe0 [vhost_iotlb]
  Code: [...]
  RSP: 0018:ff1c823ac23077f0 EFLAGS: 00010246
  RAX: c1a21a60 RBX: 899567a0 RCX: 
  RDX:  RSI:  RDI: 
  RBP: ff1bda1f7c21e800 R08:  R09: ff1c823ac2307670
  R10: ff1c823ac2307668 R11: 8a9e7b68 R12: 
  R13:  R14: ff1bda1f43e341a0 R15: ffea
  FS:  7f56eba7c740() GS:ff1bda269f80() knlGS:
  CS:  0010 DS:  ES:  CR0: 80050033
  CR2:  CR3: 000104d90001 CR4: 00771ef0
  DR0:  DR1:  DR2: 
  DR3:  DR6: fffe0ff0 DR7: 0400
  PKRU: 5554
  Call Trace:

   ? show_trace_log_lvl+0x1c4/0x2df
   ? show_trace_log_lvl+0x1c4/0x2df
   ? mlx5_vdpa_free+0x3d/0x150 [mlx5_vdpa]
   ? __die_body.cold+0x8/0xd
   ? page_fault_oops+0x134/0x170
   ? __irq_work_queue_local+0x2b/0xc0
   ? irq_work_queue+0x2c/0x50
   ? exc_page_fault+0x62/0x150
   ? asm_exc_page_fault+0x22/0x30
   ? __pfx_mlx5_vdpa_free+0x10/0x10 [mlx5_vdpa]
   ? vhost_iotlb_del_range+0xf/0xe0 [vhost_iotlb]
   mlx5_vdpa_free+0x3d/0x150 [mlx5_vdpa]
   vdpa_release_dev+0x1e/0x50 [vdpa]
   device_release+0x31/0x90
   kobject_cleanup+0x37/0x130
   mlx5_vdpa_dev_add+0x2d2/0x7a0 [mlx5_vdpa]
   vdpa_nl_cmd_dev_add_set_doit+0x277/0x4c0 [vdpa]
   genl_family_rcv_msg_doit+0xd9/0x130
   genl_family_rcv_msg+0x14d/0x220
   ? __pfx_vdpa_nl_cmd_dev_add_set_doit+0x10/0x10 [vdpa]
   ? _copy_to_user+0x1a/0x30
   ? move_addr_to_user+0x4b/0xe0
   genl_rcv_msg+0x47/0xa0
   ? __import_iovec+0x46/0x150
   ? __pfx_genl_rcv_msg+0x10/0x10
   netlink_rcv_skb+0x54/0x100
   genl_rcv+0x24/0x40
   netlink_unicast+0x245/0x370
   netlink_sendmsg+0x206/0x440
   __sys_sendto+0x1dc/0x1f0
   ? do_read_fault+0x10c/0x1d0
   ? do_pte_missing+0x10d/0x190
   __x64_sys_sendto+0x20/0x30
   do_syscall_64+0x5c/0xf0
   ? __count_memcg_events+0x4f/0xb0
   ? mm_account_fault+0x6c/0x100
   ? handle_mm_fault+0x116/0x270
   ? do_user_addr_fault+0x1d6/0x6a0
   ? do_syscall_64+0x6b/0xf0
   ? clear_bhb_loop+0x25/0x80
   ? clear_bhb_loop+0x25/0x80
   ? clear_bhb_loop+0x25/0x80
   ? clear_bhb_loop+0x25/0x80
   ? clear_bhb_loop+0x25/0x80
   entry_SYSCALL_64_after_hwframe+0x78/0x80

Fixes: 512c0cdd80c1 ("vdpa/mlx5: Decouple cvq iotlb handling from hw mapping 
code")
Signed-off-by: Dragos Tatulea 
Reviewed-by: Cosmin Ratiu 
---
 drivers/vdpa/mlx5/core/mr.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c
index 4758914ccf86..bf56f3d69625 100644
--- a/drivers/vdpa/mlx5/core/mr.c
+++ b/drivers/vdpa/mlx5/core/mr.c
@@ -581,6 +581,9 @@ static void mlx5_vdpa_show_mr_leaks(struct mlx5_vdpa_dev 
*mvdev)
 
 void mlx5_vdpa_destroy_mr_resources(struct mlx5_vdpa_dev *mvdev)
 {
+   if (!mvdev->res.valid)
+   return;
+
for (int i = 0; i < MLX5_VDPA_NUM_AS; i++)
mlx5_vdpa_update_mr(mvdev, NULL, i);
 
-- 
2.45.1




[PATCH] vdpa/mlx5: Use random MAC address when no nic vport MAC set

2024-08-27 Thread Dragos Tatulea
When the vdpa device is configured without a specific MAC
address, the vport MAC address is used. However, this
address can be 0 which prevents the driver from properly
configuring the MPFS and breaks steering.

The solution is to simply generate a random MAC address
when no MAC is set on the nic vport.

Now it's possible to create a vdpa device without a
MAC address and run qemu with this device without needing
to configure an explicit MAC address.

Signed-off-by: Dragos Tatulea 
Reviewed-by: Jiri Pirko 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index fa78e8288ebb..1c26139d02fe 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -3824,6 +3824,9 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev 
*v_mdev, const char *name,
err = mlx5_query_nic_vport_mac_address(mdev, 0, 0, config->mac);
if (err)
goto err_alloc;
+
+   if (is_zero_ether_addr(config->mac))
+   eth_random_addr(config->mac);
}
 
if (!is_zero_ether_addr(config->mac)) {
-- 
2.45.1




Re: [RFC] Why is set_config not supported in mlx5_vnet?

2024-08-26 Thread Dragos Tatulea


On 26.08.24 16:24, Andrew Lunn wrote:
> On Mon, Aug 26, 2024 at 11:06:09AM +0200, Dragos Tatulea wrote:
>>
>>
>> On 23.08.24 18:54, Carlos Bilbao wrote:
>>> Hello,
>>>
>>> I'm debugging my vDPA setup, and when using ioctl to retrieve the
>>> configuration, I noticed that it's running in half duplex mode:
>>>
>>> Configuration data (24 bytes):
>>>   MAC address: (Mac address)
>>>   Status: 0x0001
>>>   Max virtqueue pairs: 8
>>>   MTU: 1500
>>>   Speed: 0 Mb
>>>   Duplex: Half Duplex
>>>   RSS max key size: 0
>>>   RSS max indirection table length: 0
>>>   Supported hash types: 0x
>>>
>>> I believe this might be contributing to the underperformance of vDPA.
>> mlx5_vdpa vDPA devicess currently do not support the 
>> VIRTIO_NET_F_SPEED_DUPLEX
>> feature which reports speed and duplex. You can check the state on the
>> PF.
> 
> Then it should probably report DUPLEX_UNKNOWN.
> 
> The speed of 0 also suggests SPEED_UNKNOWN is not being returned. So
> this just looks buggy in general.
>
The virtio spec doesn't mention what those values should be when
VIRTIO_NET_F_SPEED_DUPLEX is not supported.

Jason, should vdpa_dev_net_config_fill() initialize the speed/duplex
fields to SPEED/DUPLEX_UNKNOWN instead of 0?

Thanks,
Dragos



Re: [RFC] Why is set_config not supported in mlx5_vnet?

2024-08-26 Thread Dragos Tatulea



On 26.08.24 16:26, Carlos Bilbao wrote:
> Hello Dragos,
> 
> On 8/26/24 4:06 AM, Dragos Tatulea wrote:
>>
>> On 23.08.24 18:54, Carlos Bilbao wrote:
>>> Hello,
>>>
>>> I'm debugging my vDPA setup, and when using ioctl to retrieve the
>>> configuration, I noticed that it's running in half duplex mode:
>>>
>>> Configuration data (24 bytes):
>>>   MAC address: (Mac address)
>>>   Status: 0x0001
>>>   Max virtqueue pairs: 8
>>>   MTU: 1500
>>>   Speed: 0 Mb
>>>   Duplex: Half Duplex
>>>   RSS max key size: 0
>>>   RSS max indirection table length: 0
>>>   Supported hash types: 0x
>>>
>>> I believe this might be contributing to the underperformance of vDPA.
>> mlx5_vdpa vDPA devicess currently do not support the 
>> VIRTIO_NET_F_SPEED_DUPLEX
>> feature which reports speed and duplex. You can check the state on the
>> PF.
> 
> 
> According to ethtool, all my devices are running at full duplex. I assume I
> can disregard this configuration output from the module then.
> 
Yep.

> 
>>
>>> While looking into how to change this option for Mellanox, I read the 
>>> following
>>> kernel code in mlx5_vnet.c:
>>>
>>> static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int 
>>> offset, const void *buf,
>>>  unsigned int len)
>>> {
>>> /* not supported */
>>> }
>>>
>>> I was wondering why this is the case.
>> TBH, I don't know why it was not added. But in general, the control VQ is the
>> better way as it's dynamic.
>>
>>> Is there another way for me to change
>>> these configuration settings?
>>>
>> The configuration is done using control VQ for most things (MTU, MAC, VQs,
>> etc). Make sure that you have the CTRL_VQ feature set (should be on by
>> default). It should appear in `vdpa mgmtdev show` and `vdpa dev config
>> show`.
> 
> 
> I see that CTRL_VQ is indeed enabled. Is there any documentation on how to
> use the control VQ to get/set vDPA configuration values?
> 
>
You are most likely using it already through through qemu. You can check
if the CTR_VQ feature also shows up in the output of `vdpa dev config show`.

What values are you trying to configure btw?

Thanks,
Dragos



Re: [RFC] Why is set_config not supported in mlx5_vnet?

2024-08-26 Thread Dragos Tatulea



On 23.08.24 18:54, Carlos Bilbao wrote:
> Hello,
> 
> I'm debugging my vDPA setup, and when using ioctl to retrieve the
> configuration, I noticed that it's running in half duplex mode:
> 
> Configuration data (24 bytes):
>   MAC address: (Mac address)
>   Status: 0x0001
>   Max virtqueue pairs: 8
>   MTU: 1500
>   Speed: 0 Mb
>   Duplex: Half Duplex
>   RSS max key size: 0
>   RSS max indirection table length: 0
>   Supported hash types: 0x
> 
> I believe this might be contributing to the underperformance of vDPA.
mlx5_vdpa vDPA devicess currently do not support the VIRTIO_NET_F_SPEED_DUPLEX
feature which reports speed and duplex. You can check the state on the
PF.


> While looking into how to change this option for Mellanox, I read the 
> following
> kernel code in mlx5_vnet.c:
> 
> static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int 
> offset, const void *buf,
>  unsigned int len)
> {
> /* not supported */
> }
> 
> I was wondering why this is the case.
TBH, I don't know why it was not added. But in general, the control VQ is the
better way as it's dynamic.

> Is there another way for me to change
> these configuration settings?
> 
The configuration is done using control VQ for most things (MTU, MAC, VQs,
etc). Make sure that you have the CTRL_VQ feature set (should be on by
default). It should appear in `vdpa mgmtdev show` and `vdpa dev config
show`.

Thanks,
Dragos



[PATCH vhost 7/7] vdpa/mlx5: Postpone MR deletion

2024-08-21 Thread Dragos Tatulea
Currently, when a new MR is set up, the old MR is deleted. MR deletion
is about 30-40% the time of MR creation. As deleting the old MR is not
important for the process of setting up the new MR, this operation
can be postponed.

This series adds a workqueue that does MR garbage collection at a later
point. If the MR lock is taken, the handler will back off and
reschedule. The exception during shutdown: then the handler must
not postpone the work.

Note that this is only a speculative optimization: if there is some
mapping operation that is triggered while the garbage collector handler
has the lock taken, this operation it will have to wait for the handler
to finish.

Signed-off-by: Dragos Tatulea 
Reviewed-by: Cosmin Ratiu 
---
 drivers/vdpa/mlx5/core/mlx5_vdpa.h | 10 ++
 drivers/vdpa/mlx5/core/mr.c| 51 --
 drivers/vdpa/mlx5/net/mlx5_vnet.c  |  3 +-
 3 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h 
b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
index c3e17bc888e8..2cedf7e2dbc4 100644
--- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h
+++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
@@ -86,8 +86,18 @@ enum {
 struct mlx5_vdpa_mr_resources {
struct mlx5_vdpa_mr *mr[MLX5_VDPA_NUM_AS];
unsigned int group2asid[MLX5_VDPA_NUMVQ_GROUPS];
+
+   /* Pre-deletion mr list */
struct list_head mr_list_head;
+
+   /* Deferred mr list */
+   struct list_head mr_gc_list_head;
+   struct workqueue_struct *wq_gc;
+   struct delayed_work gc_dwork_ent;
+
struct mutex lock;
+
+   atomic_t shutdown;
 };
 
 struct mlx5_vdpa_dev {
diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c
index ec75f165f832..43fce6b39cf2 100644
--- a/drivers/vdpa/mlx5/core/mr.c
+++ b/drivers/vdpa/mlx5/core/mr.c
@@ -653,14 +653,46 @@ static void _mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev 
*mvdev, struct mlx5_vdpa_
kfree(mr);
 }
 
+#define MLX5_VDPA_MR_GC_TRIGGER_MS 2000
+
+static void mlx5_vdpa_mr_gc_handler(struct work_struct *work)
+{
+   struct mlx5_vdpa_mr_resources *mres;
+   struct mlx5_vdpa_mr *mr, *tmp;
+   struct mlx5_vdpa_dev *mvdev;
+
+   mres = container_of(work, struct mlx5_vdpa_mr_resources, 
gc_dwork_ent.work);
+
+   if (atomic_read(&mres->shutdown)) {
+   mutex_lock(&mres->lock);
+   } else if (!mutex_trylock(&mres->lock)) {
+   queue_delayed_work(mres->wq_gc, &mres->gc_dwork_ent,
+  
msecs_to_jiffies(MLX5_VDPA_MR_GC_TRIGGER_MS));
+   return;
+   }
+
+   mvdev = container_of(mres, struct mlx5_vdpa_dev, mres);
+
+   list_for_each_entry_safe(mr, tmp, &mres->mr_gc_list_head, mr_list) {
+   _mlx5_vdpa_destroy_mr(mvdev, mr);
+   }
+
+   mutex_unlock(&mres->lock);
+}
+
 static void _mlx5_vdpa_put_mr(struct mlx5_vdpa_dev *mvdev,
  struct mlx5_vdpa_mr *mr)
 {
+   struct mlx5_vdpa_mr_resources *mres = &mvdev->mres;
+
if (!mr)
return;
 
-   if (refcount_dec_and_test(&mr->refcount))
-   _mlx5_vdpa_destroy_mr(mvdev, mr);
+   if (refcount_dec_and_test(&mr->refcount)) {
+   list_move_tail(&mr->mr_list, &mres->mr_gc_list_head);
+   queue_delayed_work(mres->wq_gc, &mres->gc_dwork_ent,
+  
msecs_to_jiffies(MLX5_VDPA_MR_GC_TRIGGER_MS));
+   }
 }
 
 void mlx5_vdpa_put_mr(struct mlx5_vdpa_dev *mvdev,
@@ -848,9 +880,17 @@ int mlx5_vdpa_init_mr_resources(struct mlx5_vdpa_dev 
*mvdev)
 {
struct mlx5_vdpa_mr_resources *mres = &mvdev->mres;
 
-   INIT_LIST_HEAD(&mres->mr_list_head);
+   mres->wq_gc = create_singlethread_workqueue("mlx5_vdpa_mr_gc");
+   if (!mres->wq_gc)
+   return -ENOMEM;
+
+   INIT_DELAYED_WORK(&mres->gc_dwork_ent, mlx5_vdpa_mr_gc_handler);
+
mutex_init(&mres->lock);
 
+   INIT_LIST_HEAD(&mres->mr_list_head);
+   INIT_LIST_HEAD(&mres->mr_gc_list_head);
+
return 0;
 }
 
@@ -858,5 +898,10 @@ void mlx5_vdpa_destroy_mr_resources(struct mlx5_vdpa_dev 
*mvdev)
 {
struct mlx5_vdpa_mr_resources *mres = &mvdev->mres;
 
+   atomic_set(&mres->shutdown, 1);
+
+   flush_delayed_work(&mres->gc_dwork_ent);
+   destroy_workqueue(mres->wq_gc);
+   mres->wq_gc = NULL;
mutex_destroy(&mres->lock);
 }
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 1cadcb05a5c7..ee9482ef51e6 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -3435,6 +3435,8 @@ static void mlx5_vdpa_free(struct vdpa_device *vdev)
free_fixed_resources(ndev);
mlx5_vdpa_clean_mrs(mvdev);
mlx5_vdpa_destroy_mr_resources(&ndev-&g

[PATCH vhost 5/7] vdpa/mlx5: Rename mr_mtx -> lock

2024-08-21 Thread Dragos Tatulea
Now that the mr resources have their own namespace in the
struct, give the lock a clearer name.

Signed-off-by: Dragos Tatulea 
Reviewed-by: Cosmin Ratiu 
---
 drivers/vdpa/mlx5/core/mlx5_vdpa.h |  2 +-
 drivers/vdpa/mlx5/core/mr.c| 20 ++--
 drivers/vdpa/mlx5/core/resources.c |  6 +++---
 drivers/vdpa/mlx5/net/mlx5_vnet.c  |  4 ++--
 4 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h 
b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
index 5ae6deea2a8a..89b564cecddf 100644
--- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h
+++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
@@ -87,7 +87,7 @@ struct mlx5_vdpa_mr_resources {
struct mlx5_vdpa_mr *mr[MLX5_VDPA_NUM_AS];
unsigned int group2asid[MLX5_VDPA_NUMVQ_GROUPS];
struct list_head mr_list_head;
-   struct mutex mr_mtx;
+   struct mutex lock;
 };
 
 struct mlx5_vdpa_dev {
diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c
index 2c8660e5c0de..f20f2a8a701d 100644
--- a/drivers/vdpa/mlx5/core/mr.c
+++ b/drivers/vdpa/mlx5/core/mr.c
@@ -666,9 +666,9 @@ static void _mlx5_vdpa_put_mr(struct mlx5_vdpa_dev *mvdev,
 void mlx5_vdpa_put_mr(struct mlx5_vdpa_dev *mvdev,
  struct mlx5_vdpa_mr *mr)
 {
-   mutex_lock(&mvdev->mres.mr_mtx);
+   mutex_lock(&mvdev->mres.lock);
_mlx5_vdpa_put_mr(mvdev, mr);
-   mutex_unlock(&mvdev->mres.mr_mtx);
+   mutex_unlock(&mvdev->mres.lock);
 }
 
 static void _mlx5_vdpa_get_mr(struct mlx5_vdpa_dev *mvdev,
@@ -683,9 +683,9 @@ static void _mlx5_vdpa_get_mr(struct mlx5_vdpa_dev *mvdev,
 void mlx5_vdpa_get_mr(struct mlx5_vdpa_dev *mvdev,
  struct mlx5_vdpa_mr *mr)
 {
-   mutex_lock(&mvdev->mres.mr_mtx);
+   mutex_lock(&mvdev->mres.lock);
_mlx5_vdpa_get_mr(mvdev, mr);
-   mutex_unlock(&mvdev->mres.mr_mtx);
+   mutex_unlock(&mvdev->mres.lock);
 }
 
 void mlx5_vdpa_update_mr(struct mlx5_vdpa_dev *mvdev,
@@ -694,19 +694,19 @@ void mlx5_vdpa_update_mr(struct mlx5_vdpa_dev *mvdev,
 {
struct mlx5_vdpa_mr *old_mr = mvdev->mres.mr[asid];
 
-   mutex_lock(&mvdev->mres.mr_mtx);
+   mutex_lock(&mvdev->mres.lock);
 
_mlx5_vdpa_put_mr(mvdev, old_mr);
mvdev->mres.mr[asid] = new_mr;
 
-   mutex_unlock(&mvdev->mres.mr_mtx);
+   mutex_unlock(&mvdev->mres.lock);
 }
 
 static void mlx5_vdpa_show_mr_leaks(struct mlx5_vdpa_dev *mvdev)
 {
struct mlx5_vdpa_mr *mr;
 
-   mutex_lock(&mvdev->mres.mr_mtx);
+   mutex_lock(&mvdev->mres.lock);
 
list_for_each_entry(mr, &mvdev->mres.mr_list_head, mr_list) {
 
@@ -715,7 +715,7 @@ static void mlx5_vdpa_show_mr_leaks(struct mlx5_vdpa_dev 
*mvdev)
   mr, mr->mkey, 
refcount_read(&mr->refcount));
}
 
-   mutex_unlock(&mvdev->mres.mr_mtx);
+   mutex_unlock(&mvdev->mres.lock);
 
 }
 
@@ -779,9 +779,9 @@ struct mlx5_vdpa_mr *mlx5_vdpa_create_mr(struct 
mlx5_vdpa_dev *mvdev,
if (!mr)
return ERR_PTR(-ENOMEM);
 
-   mutex_lock(&mvdev->mres.mr_mtx);
+   mutex_lock(&mvdev->mres.lock);
err = _mlx5_vdpa_create_mr(mvdev, mr, iotlb);
-   mutex_unlock(&mvdev->mres.mr_mtx);
+   mutex_unlock(&mvdev->mres.lock);
 
if (err)
goto out_err;
diff --git a/drivers/vdpa/mlx5/core/resources.c 
b/drivers/vdpa/mlx5/core/resources.c
index 3e3b3049cb08..fe2ca3458f6c 100644
--- a/drivers/vdpa/mlx5/core/resources.c
+++ b/drivers/vdpa/mlx5/core/resources.c
@@ -256,7 +256,7 @@ int mlx5_vdpa_alloc_resources(struct mlx5_vdpa_dev *mvdev)
mlx5_vdpa_warn(mvdev, "resources already allocated\n");
return -EINVAL;
}
-   mutex_init(&mvdev->mres.mr_mtx);
+   mutex_init(&mvdev->mres.lock);
res->uar = mlx5_get_uars_page(mdev);
if (IS_ERR(res->uar)) {
err = PTR_ERR(res->uar);
@@ -301,7 +301,7 @@ int mlx5_vdpa_alloc_resources(struct mlx5_vdpa_dev *mvdev)
 err_uctx:
mlx5_put_uars_page(mdev, res->uar);
 err_uars:
-   mutex_destroy(&mvdev->mres.mr_mtx);
+   mutex_destroy(&mvdev->mres.lock);
return err;
 }
 
@@ -318,7 +318,7 @@ void mlx5_vdpa_free_resources(struct mlx5_vdpa_dev *mvdev)
dealloc_pd(mvdev, res->pdn, res->uid);
destroy_uctx(mvdev, res->uid);
mlx5_put_uars_page(mvdev->mdev, res->uar);
-   mutex_destroy(&mvdev->mres.mr_mtx);
+   mutex_destroy(&mvdev->mres.lock);
res->valid = false;
 }
 
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 3e55a7f1afcd..8a51c492a62a 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -3639,10 +3639,10 @@ static int mlx5_s

[PATCH vhost 0/7] vdpa/mlx5: Optimze MKEY operations

2024-08-21 Thread Dragos Tatulea
This series improves the time of .set_map() operations by parallelizing
the MKEY creation and deletion for direct MKEYs. Looking at the top
level MKEY creation/deletion functions, the following improvement can be
seen:

|---+-|
| operation | improvement |
|---+-|
| create_user_mr()  | 3-5x|
| destroy_user_mr() | 8x  |
|---+-|

The last part of the series introduces lazy MKEY deletion which
postpones the MKEY deletion to a later point in a workqueue.

As this series and the previous ones were targeting live migration,
we can also observe improvements on this front:

|---+--+--|
| Stage | Downtime #1 (ms) | Downtime #2 (ms) |
|---+--+--|
| Baseline  | 3140 | 3630 |
| Parallel MKEY ops | 1200 | 2000 |
| Deferred deletion | 1014 | 1253 |
|---+--+--|

Test configuration: 256 GB VM, 32 CPUs x 2 threads per core, 4 x mlx5
vDPA devices x 32 VQs (16 VQPs)

This series must be applied on top of the parallel VQ suspend/resume
series [0].

[0] https://lore.kernel.org/all/20240816090159.1967650-1-dtatu...@nvidia.com/

Dragos Tatulea (7):
  vdpa/mlx5: Create direct MKEYs in parallel
  vdpa/mlx5: Delete direct MKEYs in parallel
  vdpa/mlx5: Rename function
  vdpa/mlx5: Extract mr members in own resource struct
  vdpa/mlx5: Rename mr_mtx -> lock
  vdpa/mlx5: Introduce init/destroy for MR resources
  vdpa/mlx5: Postpone MR deletion

 drivers/vdpa/mlx5/core/mlx5_vdpa.h |  25 ++-
 drivers/vdpa/mlx5/core/mr.c| 284 -
 drivers/vdpa/mlx5/core/resources.c |   3 -
 drivers/vdpa/mlx5/net/mlx5_vnet.c  |  53 +++---
 4 files changed, 293 insertions(+), 72 deletions(-)

-- 
2.45.1




[PATCH vhost 6/7] vdpa/mlx5: Introduce init/destroy for MR resources

2024-08-21 Thread Dragos Tatulea
There's currently not a lot of action happening during
the init/destroy of MR resources. But more will be added
in the upcoming patches.

Signed-off-by: Dragos Tatulea 
Reviewed-by: Cosmin Ratiu 
---
 drivers/vdpa/mlx5/core/mlx5_vdpa.h |  2 ++
 drivers/vdpa/mlx5/core/mr.c| 17 +
 drivers/vdpa/mlx5/core/resources.c |  3 ---
 drivers/vdpa/mlx5/net/mlx5_vnet.c  | 10 --
 4 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h 
b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
index 89b564cecddf..c3e17bc888e8 100644
--- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h
+++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
@@ -138,6 +138,8 @@ int mlx5_vdpa_create_mkey(struct mlx5_vdpa_dev *mvdev, u32 
*mkey, u32 *in,
 int mlx5_vdpa_destroy_mkey(struct mlx5_vdpa_dev *mvdev, u32 mkey);
 struct mlx5_vdpa_mr *mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev,
 struct vhost_iotlb *iotlb);
+int mlx5_vdpa_init_mr_resources(struct mlx5_vdpa_dev *mvdev);
+void mlx5_vdpa_destroy_mr_resources(struct mlx5_vdpa_dev *mvdev);
 void mlx5_vdpa_clean_mrs(struct mlx5_vdpa_dev *mvdev);
 void mlx5_vdpa_get_mr(struct mlx5_vdpa_dev *mvdev,
  struct mlx5_vdpa_mr *mr);
diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c
index f20f2a8a701d..ec75f165f832 100644
--- a/drivers/vdpa/mlx5/core/mr.c
+++ b/drivers/vdpa/mlx5/core/mr.c
@@ -843,3 +843,20 @@ int mlx5_vdpa_reset_mr(struct mlx5_vdpa_dev *mvdev, 
unsigned int asid)
 
return 0;
 }
+
+int mlx5_vdpa_init_mr_resources(struct mlx5_vdpa_dev *mvdev)
+{
+   struct mlx5_vdpa_mr_resources *mres = &mvdev->mres;
+
+   INIT_LIST_HEAD(&mres->mr_list_head);
+   mutex_init(&mres->lock);
+
+   return 0;
+}
+
+void mlx5_vdpa_destroy_mr_resources(struct mlx5_vdpa_dev *mvdev)
+{
+   struct mlx5_vdpa_mr_resources *mres = &mvdev->mres;
+
+   mutex_destroy(&mres->lock);
+}
diff --git a/drivers/vdpa/mlx5/core/resources.c 
b/drivers/vdpa/mlx5/core/resources.c
index fe2ca3458f6c..aeae31d0cefa 100644
--- a/drivers/vdpa/mlx5/core/resources.c
+++ b/drivers/vdpa/mlx5/core/resources.c
@@ -256,7 +256,6 @@ int mlx5_vdpa_alloc_resources(struct mlx5_vdpa_dev *mvdev)
mlx5_vdpa_warn(mvdev, "resources already allocated\n");
return -EINVAL;
}
-   mutex_init(&mvdev->mres.lock);
res->uar = mlx5_get_uars_page(mdev);
if (IS_ERR(res->uar)) {
err = PTR_ERR(res->uar);
@@ -301,7 +300,6 @@ int mlx5_vdpa_alloc_resources(struct mlx5_vdpa_dev *mvdev)
 err_uctx:
mlx5_put_uars_page(mdev, res->uar);
 err_uars:
-   mutex_destroy(&mvdev->mres.lock);
return err;
 }
 
@@ -318,7 +316,6 @@ void mlx5_vdpa_free_resources(struct mlx5_vdpa_dev *mvdev)
dealloc_pd(mvdev, res->pdn, res->uid);
destroy_uctx(mvdev, res->uid);
mlx5_put_uars_page(mvdev->mdev, res->uar);
-   mutex_destroy(&mvdev->mres.lock);
res->valid = false;
 }
 
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 8a51c492a62a..1cadcb05a5c7 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -3434,6 +3434,7 @@ static void mlx5_vdpa_free(struct vdpa_device *vdev)
 
free_fixed_resources(ndev);
mlx5_vdpa_clean_mrs(mvdev);
+   mlx5_vdpa_destroy_mr_resources(&ndev->mvdev);
if (!is_zero_ether_addr(ndev->config.mac)) {
pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
mlx5_mpfs_del_mac(pfmdev, ndev->config.mac);
@@ -3962,12 +3963,15 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev 
*v_mdev, const char *name,
if (err)
goto err_mpfs;
 
-   INIT_LIST_HEAD(&mvdev->mres.mr_list_head);
+   err = mlx5_vdpa_init_mr_resources(mvdev);
+   if (err)
+   goto err_res;
+
 
if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
err = mlx5_vdpa_create_dma_mr(mvdev);
if (err)
-   goto err_res;
+   goto err_mr_res;
}
 
err = alloc_fixed_resources(ndev);
@@ -4009,6 +4013,8 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev 
*v_mdev, const char *name,
free_fixed_resources(ndev);
 err_mr:
mlx5_vdpa_clean_mrs(mvdev);
+err_mr_res:
+   mlx5_vdpa_destroy_mr_resources(mvdev);
 err_res:
mlx5_vdpa_free_resources(&ndev->mvdev);
 err_mpfs:
-- 
2.45.1




[PATCH vhost 4/7] vdpa/mlx5: Extract mr members in own resource struct

2024-08-21 Thread Dragos Tatulea
Group all mapping related resources into their own structure.

Upcoming patches will add more members in this new structure.

Signed-off-by: Dragos Tatulea 
Reviewed-by: Cosmin Ratiu 
---
 drivers/vdpa/mlx5/core/mlx5_vdpa.h | 13 ++-
 drivers/vdpa/mlx5/core/mr.c| 30 -
 drivers/vdpa/mlx5/core/resources.c |  6 ++---
 drivers/vdpa/mlx5/net/mlx5_vnet.c  | 36 +++---
 4 files changed, 44 insertions(+), 41 deletions(-)

diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h 
b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
index 4d217d18239c..5ae6deea2a8a 100644
--- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h
+++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
@@ -83,10 +83,18 @@ enum {
MLX5_VDPA_NUM_AS = 2
 };
 
+struct mlx5_vdpa_mr_resources {
+   struct mlx5_vdpa_mr *mr[MLX5_VDPA_NUM_AS];
+   unsigned int group2asid[MLX5_VDPA_NUMVQ_GROUPS];
+   struct list_head mr_list_head;
+   struct mutex mr_mtx;
+};
+
 struct mlx5_vdpa_dev {
struct vdpa_device vdev;
struct mlx5_core_dev *mdev;
struct mlx5_vdpa_resources res;
+   struct mlx5_vdpa_mr_resources mres;
 
u64 mlx_features;
u64 actual_features;
@@ -95,13 +103,8 @@ struct mlx5_vdpa_dev {
u16 max_idx;
u32 generation;
 
-   struct mlx5_vdpa_mr *mr[MLX5_VDPA_NUM_AS];
-   struct list_head mr_list_head;
-   /* serialize mr access */
-   struct mutex mr_mtx;
struct mlx5_control_vq cvq;
struct workqueue_struct *wq;
-   unsigned int group2asid[MLX5_VDPA_NUMVQ_GROUPS];
bool suspended;
 
struct mlx5_async_ctx async_ctx;
diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c
index 149edea09c8f..2c8660e5c0de 100644
--- a/drivers/vdpa/mlx5/core/mr.c
+++ b/drivers/vdpa/mlx5/core/mr.c
@@ -666,9 +666,9 @@ static void _mlx5_vdpa_put_mr(struct mlx5_vdpa_dev *mvdev,
 void mlx5_vdpa_put_mr(struct mlx5_vdpa_dev *mvdev,
  struct mlx5_vdpa_mr *mr)
 {
-   mutex_lock(&mvdev->mr_mtx);
+   mutex_lock(&mvdev->mres.mr_mtx);
_mlx5_vdpa_put_mr(mvdev, mr);
-   mutex_unlock(&mvdev->mr_mtx);
+   mutex_unlock(&mvdev->mres.mr_mtx);
 }
 
 static void _mlx5_vdpa_get_mr(struct mlx5_vdpa_dev *mvdev,
@@ -683,39 +683,39 @@ static void _mlx5_vdpa_get_mr(struct mlx5_vdpa_dev *mvdev,
 void mlx5_vdpa_get_mr(struct mlx5_vdpa_dev *mvdev,
  struct mlx5_vdpa_mr *mr)
 {
-   mutex_lock(&mvdev->mr_mtx);
+   mutex_lock(&mvdev->mres.mr_mtx);
_mlx5_vdpa_get_mr(mvdev, mr);
-   mutex_unlock(&mvdev->mr_mtx);
+   mutex_unlock(&mvdev->mres.mr_mtx);
 }
 
 void mlx5_vdpa_update_mr(struct mlx5_vdpa_dev *mvdev,
 struct mlx5_vdpa_mr *new_mr,
 unsigned int asid)
 {
-   struct mlx5_vdpa_mr *old_mr = mvdev->mr[asid];
+   struct mlx5_vdpa_mr *old_mr = mvdev->mres.mr[asid];
 
-   mutex_lock(&mvdev->mr_mtx);
+   mutex_lock(&mvdev->mres.mr_mtx);
 
_mlx5_vdpa_put_mr(mvdev, old_mr);
-   mvdev->mr[asid] = new_mr;
+   mvdev->mres.mr[asid] = new_mr;
 
-   mutex_unlock(&mvdev->mr_mtx);
+   mutex_unlock(&mvdev->mres.mr_mtx);
 }
 
 static void mlx5_vdpa_show_mr_leaks(struct mlx5_vdpa_dev *mvdev)
 {
struct mlx5_vdpa_mr *mr;
 
-   mutex_lock(&mvdev->mr_mtx);
+   mutex_lock(&mvdev->mres.mr_mtx);
 
-   list_for_each_entry(mr, &mvdev->mr_list_head, mr_list) {
+   list_for_each_entry(mr, &mvdev->mres.mr_list_head, mr_list) {
 
mlx5_vdpa_warn(mvdev, "mkey still alive after resource delete: "
  "mr: %p, mkey: 0x%x, refcount: %u\n",
   mr, mr->mkey, 
refcount_read(&mr->refcount));
}
 
-   mutex_unlock(&mvdev->mr_mtx);
+   mutex_unlock(&mvdev->mres.mr_mtx);
 
 }
 
@@ -753,7 +753,7 @@ static int _mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev,
if (err)
goto err_iotlb;
 
-   list_add_tail(&mr->mr_list, &mvdev->mr_list_head);
+   list_add_tail(&mr->mr_list, &mvdev->mres.mr_list_head);
 
return 0;
 
@@ -779,9 +779,9 @@ struct mlx5_vdpa_mr *mlx5_vdpa_create_mr(struct 
mlx5_vdpa_dev *mvdev,
if (!mr)
return ERR_PTR(-ENOMEM);
 
-   mutex_lock(&mvdev->mr_mtx);
+   mutex_lock(&mvdev->mres.mr_mtx);
err = _mlx5_vdpa_create_mr(mvdev, mr, iotlb);
-   mutex_unlock(&mvdev->mr_mtx);
+   mutex_unlock(&mvdev->mres.mr_mtx);
 
if (err)
goto out_err;
@@ -801,7 +801,7 @@ int mlx5_vdpa_update_cvq_iotlb(struct mlx5_vdpa_dev *mvdev,
 {
int err;
 
-   if (mvdev->group2asid[MLX5_VDPA_CVQ_GROUP] != asid)
+   if (mvdev->mres.group2asid[MLX5_VDPA_CVQ_GROUP]

[PATCH vhost 2/7] vdpa/mlx5: Delete direct MKEYs in parallel

2024-08-21 Thread Dragos Tatulea
Use the async interface to issue MTT MKEY deletion.

This makes destroy_user_mr() on average 8x times faster.
This number is also dependent on the size of the MR being
deleted.

Signed-off-by: Dragos Tatulea 
Reviewed-by: Cosmin Ratiu 
---
 drivers/vdpa/mlx5/core/mr.c | 66 +
 1 file changed, 66 insertions(+)

diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c
index 66e6a15f823f..8cedf2969991 100644
--- a/drivers/vdpa/mlx5/core/mr.c
+++ b/drivers/vdpa/mlx5/core/mr.c
@@ -55,6 +55,11 @@ struct mlx5_create_mkey_mem {
DECLARE_FLEX_ARRAY(__be64, mtt);
 };
 
+struct mlx5_destroy_mkey_mem {
+   u8 out[MLX5_ST_SZ_BYTES(destroy_mkey_out)];
+   u8 in[MLX5_ST_SZ_BYTES(destroy_mkey_in)];
+};
+
 static void fill_create_direct_mr(struct mlx5_vdpa_dev *mvdev,
  struct mlx5_vdpa_direct_mr *mr,
  struct mlx5_create_mkey_mem *mem)
@@ -91,6 +96,17 @@ static void create_direct_mr_end(struct mlx5_vdpa_dev *mvdev,
mr->mr = mlx5_idx_to_mkey(mkey_index);
 }
 
+static void fill_destroy_direct_mr(struct mlx5_vdpa_dev *mvdev,
+  struct mlx5_vdpa_direct_mr *mr,
+  struct mlx5_destroy_mkey_mem *mem)
+{
+   void *in = &mem->in;
+
+   MLX5_SET(destroy_mkey_in, in, uid, mvdev->res.uid);
+   MLX5_SET(destroy_mkey_in, in, opcode, MLX5_CMD_OP_DESTROY_MKEY);
+   MLX5_SET(destroy_mkey_in, in, mkey_index, mlx5_mkey_to_idx(mr->mr));
+}
+
 static void destroy_direct_mr(struct mlx5_vdpa_dev *mvdev, struct 
mlx5_vdpa_direct_mr *mr)
 {
if (!mr->mr)
@@ -255,6 +271,55 @@ static int create_direct_keys(struct mlx5_vdpa_dev *mvdev, 
struct mlx5_vdpa_mr *
return err;
 }
 
+static int destroy_direct_keys(struct mlx5_vdpa_dev *mvdev, struct 
mlx5_vdpa_mr *mr)
+{
+   struct mlx5_destroy_mkey_mem *cmd_mem;
+   struct mlx5_vdpa_async_cmd *cmds;
+   struct mlx5_vdpa_direct_mr *dmr;
+   int err = 0;
+   int i = 0;
+
+   cmds = kvcalloc(mr->num_directs, sizeof(*cmds), GFP_KERNEL);
+   cmd_mem = kvcalloc(mr->num_directs, sizeof(*cmd_mem), GFP_KERNEL);
+   if (!cmds || !cmd_mem) {
+   err = -ENOMEM;
+   goto done;
+   }
+
+   list_for_each_entry(dmr, &mr->head, list) {
+   cmds[i].out = cmd_mem[i].out;
+   cmds[i].outlen = sizeof(cmd_mem[i].out);
+   cmds[i].in = cmd_mem[i].in;
+   cmds[i].inlen = sizeof(cmd_mem[i].in);
+   fill_destroy_direct_mr(mvdev, dmr, &cmd_mem[i]);
+   i++;
+   }
+
+   err = mlx5_vdpa_exec_async_cmds(mvdev, cmds, mr->num_directs);
+   if (err) {
+
+   mlx5_vdpa_err(mvdev, "error issuing MTT mkey deletion for 
direct mrs: %d\n", err);
+   goto done;
+   }
+
+   i = 0;
+   list_for_each_entry(dmr, &mr->head, list) {
+   struct mlx5_vdpa_async_cmd *cmd = &cmds[i++];
+
+   dmr->mr = 0;
+   if (cmd->err) {
+   err = err ? err : cmd->err;
+   mlx5_vdpa_err(mvdev, "error deleting MTT mkey [0x%llx, 
0x%llx]: %d\n",
+   dmr->start, dmr->end, cmd->err);
+   }
+   }
+
+done:
+   kvfree(cmd_mem);
+   kvfree(cmds);
+   return err;
+}
+
 static int create_indirect_key(struct mlx5_vdpa_dev *mvdev, struct 
mlx5_vdpa_mr *mr)
 {
int inlen;
@@ -563,6 +628,7 @@ static void destroy_user_mr(struct mlx5_vdpa_dev *mvdev, 
struct mlx5_vdpa_mr *mr
struct mlx5_vdpa_direct_mr *n;
 
destroy_indirect_key(mvdev, mr);
+   destroy_direct_keys(mvdev, mr);
list_for_each_entry_safe_reverse(dmr, n, &mr->head, list) {
list_del_init(&dmr->list);
unmap_direct_mr(mvdev, dmr);
-- 
2.45.1




[PATCH vhost 3/7] vdpa/mlx5: Rename function

2024-08-21 Thread Dragos Tatulea
A followup patch will use this name for something else.

Signed-off-by: Dragos Tatulea 
Reviewed-by: Cosmin Ratiu 
---
 drivers/vdpa/mlx5/core/mlx5_vdpa.h | 2 +-
 drivers/vdpa/mlx5/core/mr.c| 2 +-
 drivers/vdpa/mlx5/net/mlx5_vnet.c  | 8 
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h 
b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
index 24fa00afb24f..4d217d18239c 100644
--- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h
+++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
@@ -135,7 +135,7 @@ int mlx5_vdpa_create_mkey(struct mlx5_vdpa_dev *mvdev, u32 
*mkey, u32 *in,
 int mlx5_vdpa_destroy_mkey(struct mlx5_vdpa_dev *mvdev, u32 mkey);
 struct mlx5_vdpa_mr *mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev,
 struct vhost_iotlb *iotlb);
-void mlx5_vdpa_destroy_mr_resources(struct mlx5_vdpa_dev *mvdev);
+void mlx5_vdpa_clean_mrs(struct mlx5_vdpa_dev *mvdev);
 void mlx5_vdpa_get_mr(struct mlx5_vdpa_dev *mvdev,
  struct mlx5_vdpa_mr *mr);
 void mlx5_vdpa_put_mr(struct mlx5_vdpa_dev *mvdev,
diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c
index 8cedf2969991..149edea09c8f 100644
--- a/drivers/vdpa/mlx5/core/mr.c
+++ b/drivers/vdpa/mlx5/core/mr.c
@@ -719,7 +719,7 @@ static void mlx5_vdpa_show_mr_leaks(struct mlx5_vdpa_dev 
*mvdev)
 
 }
 
-void mlx5_vdpa_destroy_mr_resources(struct mlx5_vdpa_dev *mvdev)
+void mlx5_vdpa_clean_mrs(struct mlx5_vdpa_dev *mvdev)
 {
for (int i = 0; i < MLX5_VDPA_NUM_AS; i++)
mlx5_vdpa_update_mr(mvdev, NULL, i);
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 822092eccb32..cf2b77ebc72b 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -3223,7 +3223,7 @@ static void mlx5_vdpa_set_status(struct vdpa_device 
*vdev, u8 status)
 err_driver:
unregister_link_notifier(ndev);
 err_setup:
-   mlx5_vdpa_destroy_mr_resources(&ndev->mvdev);
+   mlx5_vdpa_clean_mrs(&ndev->mvdev);
ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED;
 err_clear:
up_write(&ndev->reslock);
@@ -3275,7 +3275,7 @@ static int mlx5_vdpa_compat_reset(struct vdpa_device 
*vdev, u32 flags)
}
 
if (flags & VDPA_RESET_F_CLEAN_MAP)
-   mlx5_vdpa_destroy_mr_resources(&ndev->mvdev);
+   mlx5_vdpa_clean_mrs(&ndev->mvdev);
ndev->mvdev.status = 0;
ndev->mvdev.suspended = false;
ndev->cur_num_vqs = MLX5V_DEFAULT_VQ_COUNT;
@@ -3433,7 +3433,7 @@ static void mlx5_vdpa_free(struct vdpa_device *vdev)
ndev = to_mlx5_vdpa_ndev(mvdev);
 
free_fixed_resources(ndev);
-   mlx5_vdpa_destroy_mr_resources(mvdev);
+   mlx5_vdpa_clean_mrs(mvdev);
if (!is_zero_ether_addr(ndev->config.mac)) {
pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
mlx5_mpfs_del_mac(pfmdev, ndev->config.mac);
@@ -4008,7 +4008,7 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev 
*v_mdev, const char *name,
 err_res2:
free_fixed_resources(ndev);
 err_mr:
-   mlx5_vdpa_destroy_mr_resources(mvdev);
+   mlx5_vdpa_clean_mrs(mvdev);
 err_res:
mlx5_vdpa_free_resources(&ndev->mvdev);
 err_mpfs:
-- 
2.45.1




[PATCH vhost 1/7] vdpa/mlx5: Create direct MKEYs in parallel

2024-08-21 Thread Dragos Tatulea
Use the async interface to issue MTT MKEY creation.
Extra care is taken at the allocation of FW input commands
due to the MTT tables having variable sizes depending on
MR.

The indirect MKEY is still created synchronously at the
end as the direct MKEYs need to be filled in.

This makes create_user_mr() 3-5x faster, depending on
the size of the MR.

Signed-off-by: Dragos Tatulea 
Reviewed-by: Cosmin Ratiu 
---
 drivers/vdpa/mlx5/core/mr.c | 118 +---
 1 file changed, 96 insertions(+), 22 deletions(-)

diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c
index 4758914ccf86..66e6a15f823f 100644
--- a/drivers/vdpa/mlx5/core/mr.c
+++ b/drivers/vdpa/mlx5/core/mr.c
@@ -49,17 +49,18 @@ static void populate_mtts(struct mlx5_vdpa_direct_mr *mr, 
__be64 *mtt)
}
 }
 
-static int create_direct_mr(struct mlx5_vdpa_dev *mvdev, struct 
mlx5_vdpa_direct_mr *mr)
+struct mlx5_create_mkey_mem {
+   u8 out[MLX5_ST_SZ_BYTES(create_mkey_out)];
+   u8 in[MLX5_ST_SZ_BYTES(create_mkey_in)];
+   DECLARE_FLEX_ARRAY(__be64, mtt);
+};
+
+static void fill_create_direct_mr(struct mlx5_vdpa_dev *mvdev,
+ struct mlx5_vdpa_direct_mr *mr,
+ struct mlx5_create_mkey_mem *mem)
 {
-   int inlen;
+   void *in = &mem->in;
void *mkc;
-   void *in;
-   int err;
-
-   inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + 
roundup(MLX5_ST_SZ_BYTES(mtt) * mr->nsg, 16);
-   in = kvzalloc(inlen, GFP_KERNEL);
-   if (!in)
-   return -ENOMEM;
 
MLX5_SET(create_mkey_in, in, uid, mvdev->res.uid);
mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
@@ -76,18 +77,25 @@ static int create_direct_mr(struct mlx5_vdpa_dev *mvdev, 
struct mlx5_vdpa_direct
MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
 get_octo_len(mr->end - mr->start, mr->log_size));
populate_mtts(mr, MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt));
-   err = mlx5_vdpa_create_mkey(mvdev, &mr->mr, in, inlen);
-   kvfree(in);
-   if (err) {
-   mlx5_vdpa_warn(mvdev, "Failed to create direct MR\n");
-   return err;
-   }
 
-   return 0;
+   MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY);
+   MLX5_SET(create_mkey_in, in, uid, mvdev->res.uid);
+}
+
+static void create_direct_mr_end(struct mlx5_vdpa_dev *mvdev,
+struct mlx5_vdpa_direct_mr *mr,
+struct mlx5_create_mkey_mem *mem)
+{
+   u32 mkey_index = MLX5_GET(create_mkey_out, mem->out, mkey_index);
+
+   mr->mr = mlx5_idx_to_mkey(mkey_index);
 }
 
 static void destroy_direct_mr(struct mlx5_vdpa_dev *mvdev, struct 
mlx5_vdpa_direct_mr *mr)
 {
+   if (!mr->mr)
+   return;
+
mlx5_vdpa_destroy_mkey(mvdev, mr->mr);
 }
 
@@ -179,6 +187,74 @@ static int klm_byte_size(int nklms)
return 16 * ALIGN(nklms, 4);
 }
 
+static int create_direct_keys(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr 
*mr)
+{
+   struct mlx5_vdpa_async_cmd *cmds = NULL;
+   struct mlx5_vdpa_direct_mr *dmr;
+   int err = 0;
+   int i = 0;
+
+   cmds = kvcalloc(mr->num_directs, sizeof(*cmds), GFP_KERNEL);
+   if (!cmds)
+   return -ENOMEM;
+
+   list_for_each_entry(dmr, &mr->head, list) {
+   struct mlx5_create_mkey_mem *cmd_mem;
+   int mttlen, mttcount;
+
+   mttlen = roundup(MLX5_ST_SZ_BYTES(mtt) * dmr->nsg, 16);
+   mttcount = mttlen / sizeof(cmd_mem->mtt[0]);
+   cmd_mem = kvcalloc(1, struct_size(cmd_mem, mtt, mttcount), 
GFP_KERNEL);
+   if (!cmd_mem) {
+   err = -ENOMEM;
+   goto done;
+   }
+
+   cmds[i].out = cmd_mem->out;
+   cmds[i].outlen = sizeof(cmd_mem->out);
+   cmds[i].in = cmd_mem->in;
+   cmds[i].inlen = struct_size(cmd_mem, mtt, mttcount);
+
+   fill_create_direct_mr(mvdev, dmr, cmd_mem);
+
+   i++;
+   }
+
+   err = mlx5_vdpa_exec_async_cmds(mvdev, cmds, mr->num_directs);
+   if (err) {
+
+   mlx5_vdpa_err(mvdev, "error issuing MTT mkey creation for 
direct mrs: %d\n", err);
+   goto done;
+   }
+
+   i = 0;
+   list_for_each_entry(dmr, &mr->head, list) {
+   struct mlx5_vdpa_async_cmd *cmd = &cmds[i++];
+   struct mlx5_create_mkey_mem *cmd_mem;
+
+   cmd_mem = container_of(cmd->out, struct mlx5_create_mkey_mem, 
out);
+
+   if (!cmd->err) {
+   create_direct_mr_end(mvdev, dmr, cmd_mem);
+   } else {
+   err = err ? err : cmd->err;
+   mlx5_vdpa_err(mvd

Re: [PATCH vhost 0/7] vdpa/mlx5: Parallelize device suspend/resume

2024-08-16 Thread Dragos Tatulea



On 02.08.24 15:14, Michael S. Tsirkin wrote:
> On Fri, Aug 02, 2024 at 10:20:17AM +0300, Dragos Tatulea wrote:
>> This series parallelizes the mlx5_vdpa device suspend and resume
>> operations through the firmware async API. The purpose is to reduce live
>> migration downtime.
>>
>> The series starts with changing the VQ suspend and resume commands
>> to the async API. After that, the switch is made to issue multiple
>> commands of the same type in parallel.
>>
>> Finally, a bonus improvement is thrown in: keep the notifierd enabled
>> during suspend but make it a NOP. Upon resume make sure that the link
>> state is forwarded. This shaves around 30ms per device constant time.
>>
>> For 1 vDPA device x 32 VQs (16 VQPs), on a large VM (256 GB RAM, 32 CPUs
>> x 2 threads per core), the improvements are:
>>
>> +---+++---+
>> | operation | Before | After  | Reduction |
>> |---+++---|
>> | mlx5_vdpa_suspend | 37 ms  | 2.5 ms | 14x   |
>> | mlx5_vdpa_resume  | 16 ms  | 5 ms   |  3x   |
>> +---+++---+
>>
>> Note for the maintainers:
>> The first patch contains changes for mlx5_core. This must be applied
>> into the mlx5-vhost tree [0] first. Once this patch is applied on
>> mlx5-vhost, the change has to be pulled from mlx5-vdpa into the vhost
>> tree and only then the remaining patches can be applied.
> 
> Or maintainer just acks it and I apply directly.
> 
Tariq reviewed the patch, he is a mlx5_core maintainer. So consider it acked.
Just sent the v2 with the same note in the cover letter.

Thanks,
Dragos

> Let me know when all this can happen.
> 
>> [0] 
>> https://git.kernel.org/pub/scm/linux/kernel/git/mellanox/linux.git/log/?h=mlx5-vhost
>>
>> Dragos Tatulea (7):
>>   net/mlx5: Support throttled commands from async API
>>   vdpa/mlx5: Introduce error logging function
>>   vdpa/mlx5: Use async API for vq query command
>>   vdpa/mlx5: Use async API for vq modify commands
>>   vdpa/mlx5: Parallelize device suspend
>>   vdpa/mlx5: Parallelize device resume
>>   vdpa/mlx5: Keep notifiers during suspend but ignore
>>
>>  drivers/net/ethernet/mellanox/mlx5/core/cmd.c |  21 +-
>>  drivers/vdpa/mlx5/core/mlx5_vdpa.h|   7 +
>>  drivers/vdpa/mlx5/net/mlx5_vnet.c | 435 +-
>>  3 files changed, 333 insertions(+), 130 deletions(-)
>>
>> -- 
>> 2.45.2
> 




[PATCH vhost v2 10/10] vdpa/mlx5: Parallelize VQ suspend/resume for CVQ MQ command

2024-08-16 Thread Dragos Tatulea
change_num_qps() is still suspending/resuming VQs one by one.
This change switches to parallel suspend/resume.

When increasing the number of queues the flow has changed a bit for
simplicity: the setup_vq() function will always be called before
resume_vqs(). If the VQ is initialized, setup_vq() will exit early. If
the VQ is not initialized, setup_vq() will create it and resume_vqs()
will resume it.

Signed-off-by: Dragos Tatulea 
Reviewed-by: Tariq Toukan 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 22 --
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index d1a01c229110..822092eccb32 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -2229,25 +2229,27 @@ static int change_num_qps(struct mlx5_vdpa_dev *mvdev, 
int newqps)
if (err)
return err;
 
-   for (i = cur_vqs - 1; i >= new_vqs; i--) {
-   struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[i];
-
-   if (is_resumable(ndev))
-   suspend_vq(ndev, mvq);
-   else
-   teardown_vq(ndev, mvq);
+   if (is_resumable(ndev)) {
+   suspend_vqs(ndev, new_vqs, cur_vqs - new_vqs);
+   } else {
+   for (i = new_vqs; i < cur_vqs; i++)
+   teardown_vq(ndev, &ndev->vqs[i]);
}
 
ndev->cur_num_vqs = new_vqs;
} else {
ndev->cur_num_vqs = new_vqs;
-   for (i = cur_vqs; i < new_vqs; i++) {
-   struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[i];
 
-   err = mvq->initialized ? resume_vq(ndev, mvq) : 
setup_vq(ndev, mvq, true);
+   for (i = cur_vqs; i < new_vqs; i++) {
+   err = setup_vq(ndev, &ndev->vqs[i], false);
if (err)
goto clean_added;
}
+
+   err = resume_vqs(ndev, cur_vqs, new_vqs - cur_vqs);
+   if (err)
+   goto clean_added;
+
err = modify_rqt(ndev, new_vqs);
if (err)
goto clean_added;
-- 
2.45.1




[PATCH vhost v2 09/10] vdpa/mlx5: Small improvement for change_num_qps()

2024-08-16 Thread Dragos Tatulea
change_num_qps() has a lot of multiplications by 2 to convert
the number of VQ pairs to number of VQs. This patch simplifies
the code by doing the VQP -> VQ count conversion at the beginning
in a variable.

Signed-off-by: Dragos Tatulea 
Reviewed-by: Tariq Toukan 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 21 +++--
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 65063c507130..d1a01c229110 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -2219,16 +2219,17 @@ static virtio_net_ctrl_ack handle_ctrl_mac(struct 
mlx5_vdpa_dev *mvdev, u8 cmd)
 static int change_num_qps(struct mlx5_vdpa_dev *mvdev, int newqps)
 {
struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
-   int cur_qps = ndev->cur_num_vqs / 2;
+   int cur_vqs = ndev->cur_num_vqs;
+   int new_vqs = newqps * 2;
int err;
int i;
 
-   if (cur_qps > newqps) {
-   err = modify_rqt(ndev, 2 * newqps);
+   if (cur_vqs > new_vqs) {
+   err = modify_rqt(ndev, new_vqs);
if (err)
return err;
 
-   for (i = ndev->cur_num_vqs - 1; i >= 2 * newqps; i--) {
+   for (i = cur_vqs - 1; i >= new_vqs; i--) {
struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[i];
 
if (is_resumable(ndev))
@@ -2237,27 +2238,27 @@ static int change_num_qps(struct mlx5_vdpa_dev *mvdev, 
int newqps)
teardown_vq(ndev, mvq);
}
 
-   ndev->cur_num_vqs = 2 * newqps;
+   ndev->cur_num_vqs = new_vqs;
} else {
-   ndev->cur_num_vqs = 2 * newqps;
-   for (i = cur_qps * 2; i < 2 * newqps; i++) {
+   ndev->cur_num_vqs = new_vqs;
+   for (i = cur_vqs; i < new_vqs; i++) {
struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[i];
 
err = mvq->initialized ? resume_vq(ndev, mvq) : 
setup_vq(ndev, mvq, true);
if (err)
goto clean_added;
}
-   err = modify_rqt(ndev, 2 * newqps);
+   err = modify_rqt(ndev, new_vqs);
if (err)
goto clean_added;
}
return 0;
 
 clean_added:
-   for (--i; i >= 2 * cur_qps; --i)
+   for (--i; i >= cur_vqs; --i)
teardown_vq(ndev, &ndev->vqs[i]);
 
-   ndev->cur_num_vqs = 2 * cur_qps;
+   ndev->cur_num_vqs = cur_vqs;
 
return err;
 }
-- 
2.45.1




[PATCH vhost v2 08/10] vdpa/mlx5: Keep notifiers during suspend but ignore

2024-08-16 Thread Dragos Tatulea
Unregistering notifiers is a costly operation. Instead of removing
the notifiers during device suspend and adding them back at resume,
simply ignore the call when the device is suspended.

At resume time call queue_link_work() to make sure that the device state
is propagated in case there were changes.

For 1 vDPA device x 32 VQs (16 VQPs) attached to a large VM (256 GB RAM,
32 CPUs x 2 threads per core), the device suspend time is reduced from
~13 ms to ~2.5 ms.

Signed-off-by: Dragos Tatulea 
Reviewed-by: Tariq Toukan 
Acked-by: Eugenio Pérez 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 0773bec917be..65063c507130 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -2850,6 +2850,9 @@ static int event_handler(struct notifier_block *nb, 
unsigned long event, void *p
struct mlx5_eqe *eqe = param;
int ret = NOTIFY_DONE;
 
+   if (ndev->mvdev.suspended)
+   return NOTIFY_DONE;
+
if (event == MLX5_EVENT_TYPE_PORT_CHANGE) {
switch (eqe->sub_type) {
case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
@@ -3595,7 +3598,6 @@ static int mlx5_vdpa_suspend(struct vdpa_device *vdev)
mlx5_vdpa_info(mvdev, "suspending device\n");
 
down_write(&ndev->reslock);
-   unregister_link_notifier(ndev);
err = suspend_vqs(ndev, 0, ndev->cur_num_vqs);
mlx5_vdpa_cvq_suspend(mvdev);
mvdev->suspended = true;
@@ -3617,7 +3619,7 @@ static int mlx5_vdpa_resume(struct vdpa_device *vdev)
down_write(&ndev->reslock);
mvdev->suspended = false;
err = resume_vqs(ndev, 0, ndev->cur_num_vqs);
-   register_link_notifier(ndev);
+   queue_link_work(ndev);
up_write(&ndev->reslock);
 
return err;
-- 
2.45.1




[PATCH vhost v2 07/10] vdpa/mlx5: Parallelize device resume

2024-08-16 Thread Dragos Tatulea
Currently device resume works on vqs serially. Building up on previous
changes that converted vq operations to the async api, this patch
parallelizes the device resume.

For 1 vDPA device x 32 VQs (16 VQPs) attached to a large VM (256 GB RAM,
32 CPUs x 2 threads per core), the device resume time is reduced from
~16 ms to ~4.5 ms.

Signed-off-by: Dragos Tatulea 
Reviewed-by: Tariq Toukan 
Acked-by: Eugenio Pérez 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 40 +++
 1 file changed, 14 insertions(+), 26 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 5fba16c80dbb..0773bec917be 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1675,10 +1675,15 @@ static int suspend_vq(struct mlx5_vdpa_net *ndev, 
struct mlx5_vdpa_virtqueue *mv
return suspend_vqs(ndev, mvq->index, 1);
 }
 
-static int resume_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue 
*mvq)
+static int resume_vqs(struct mlx5_vdpa_net *ndev, int start_vq, int num_vqs)
 {
+   struct mlx5_vdpa_virtqueue *mvq;
int err;
 
+   if (start_vq >= ndev->mvdev.max_vqs)
+   return -EINVAL;
+
+   mvq = &ndev->vqs[start_vq];
if (!mvq->initialized)
return 0;
 
@@ -1690,13 +1695,9 @@ static int resume_vq(struct mlx5_vdpa_net *ndev, struct 
mlx5_vdpa_virtqueue *mvq
/* Due to a FW quirk we need to modify the VQ fields first then 
change state.
 * This should be fixed soon. After that, a single command can 
be used.
 */
-   err = modify_virtqueues(ndev, mvq->index, 1, mvq->fw_state);
-   if (err) {
-   mlx5_vdpa_err(&ndev->mvdev,
-   "modify vq properties failed for vq %u, err: 
%d\n",
-   mvq->index, err);
+   err = modify_virtqueues(ndev, start_vq, num_vqs, mvq->fw_state);
+   if (err)
return err;
-   }
break;
case MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND:
if (!is_resumable(ndev)) {
@@ -1712,25 +1713,12 @@ static int resume_vq(struct mlx5_vdpa_net *ndev, struct 
mlx5_vdpa_virtqueue *mvq
return -EINVAL;
}
 
-   err = modify_virtqueues(ndev, mvq->index, 1, 
MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY);
-   if (err)
-   mlx5_vdpa_err(&ndev->mvdev, "modify to resume failed for vq %u, 
err: %d\n",
- mvq->index, err);
-
-   return err;
+   return modify_virtqueues(ndev, start_vq, num_vqs, 
MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY);
 }
 
-static int resume_vqs(struct mlx5_vdpa_net *ndev)
+static int resume_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue 
*mvq)
 {
-   int err = 0;
-
-   for (int i = 0; i < ndev->cur_num_vqs; i++) {
-   int local_err = resume_vq(ndev, &ndev->vqs[i]);
-
-   err = local_err ? local_err : err;
-   }
-
-   return err;
+   return resume_vqs(ndev, mvq->index, 1);
 }
 
 static void teardown_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue 
*mvq)
@@ -3080,7 +3068,7 @@ static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev 
*mvdev,
return err;
}
 
-   resume_vqs(ndev);
+   resume_vqs(ndev, 0, ndev->cur_num_vqs);
 
return 0;
 }
@@ -3204,7 +3192,7 @@ static void mlx5_vdpa_set_status(struct vdpa_device 
*vdev, u8 status)
teardown_vq_resources(ndev);
 
if (ndev->setup) {
-   err = resume_vqs(ndev);
+   err = resume_vqs(ndev, 0, ndev->cur_num_vqs);
if (err) {
mlx5_vdpa_warn(mvdev, "failed to resume 
VQs\n");
goto err_driver;
@@ -3628,7 +3616,7 @@ static int mlx5_vdpa_resume(struct vdpa_device *vdev)
 
down_write(&ndev->reslock);
mvdev->suspended = false;
-   err = resume_vqs(ndev);
+   err = resume_vqs(ndev, 0, ndev->cur_num_vqs);
register_link_notifier(ndev);
up_write(&ndev->reslock);
 
-- 
2.45.1




[PATCH vhost v2 06/10] vdpa/mlx5: Parallelize device suspend

2024-08-16 Thread Dragos Tatulea
Currently device suspend works on vqs serially. Building up on previous
changes that converted vq operations to the async api, this patch
parallelizes the device suspend:
1) Suspend all active vqs parallel.
2) Query suspended vqs in parallel.

For 1 vDPA device x 32 VQs (16 VQPs) attached to a large VM (256 GB RAM,
32 CPUs x 2 threads per core), the device suspend time is reduced from
~37 ms to ~13 ms.

A later patch will remove the link unregister operation which will make
it even faster.

Signed-off-by: Dragos Tatulea 
Reviewed-by: Tariq Toukan 
Acked-by: Eugenio Pérez 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 56 ---
 1 file changed, 29 insertions(+), 27 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 9be7a88d71a7..5fba16c80dbb 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1630,49 +1630,51 @@ static int modify_virtqueues(struct mlx5_vdpa_net 
*ndev, int start_vq, int num_v
return err;
 }
 
-static int suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue 
*mvq)
+static int suspend_vqs(struct mlx5_vdpa_net *ndev, int start_vq, int num_vqs)
 {
-   struct mlx5_virtq_attr attr;
+   struct mlx5_vdpa_virtqueue *mvq;
+   struct mlx5_virtq_attr *attrs;
+   int vq_idx, i;
int err;
 
+   if (start_vq >= ndev->cur_num_vqs)
+   return -EINVAL;
+
+   mvq = &ndev->vqs[start_vq];
if (!mvq->initialized)
return 0;
 
if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY)
return 0;
 
-   err = modify_virtqueues(ndev, mvq->index, 1, 
MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND);
-   if (err) {
-   mlx5_vdpa_err(&ndev->mvdev, "modify to suspend failed, err: 
%d\n", err);
-   return err;
-   }
-
-   err = query_virtqueues(ndev, mvq->index, 1, &attr);
-   if (err) {
-   mlx5_vdpa_err(&ndev->mvdev, "failed to query virtqueue, err: 
%d\n", err);
+   err = modify_virtqueues(ndev, start_vq, num_vqs, 
MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND);
+   if (err)
return err;
-   }
-
-   mvq->avail_idx = attr.available_index;
-   mvq->used_idx = attr.used_index;
-
-   return 0;
-}
 
-static int suspend_vqs(struct mlx5_vdpa_net *ndev)
-{
-   int err = 0;
-   int i;
+   attrs = kcalloc(num_vqs, sizeof(struct mlx5_virtq_attr), GFP_KERNEL);
+   if (!attrs)
+   return -ENOMEM;
 
-   for (i = 0; i < ndev->cur_num_vqs; i++) {
-   int local_err = suspend_vq(ndev, &ndev->vqs[i]);
+   err = query_virtqueues(ndev, start_vq, num_vqs, attrs);
+   if (err)
+   goto done;
 
-   err = local_err ? local_err : err;
+   for (i = 0, vq_idx = start_vq; i < num_vqs; i++, vq_idx++) {
+   mvq = &ndev->vqs[vq_idx];
+   mvq->avail_idx = attrs[i].available_index;
+   mvq->used_idx = attrs[i].used_index;
}
 
+done:
+   kfree(attrs);
return err;
 }
 
+static int suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue 
*mvq)
+{
+   return suspend_vqs(ndev, mvq->index, 1);
+}
+
 static int resume_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue 
*mvq)
 {
int err;
@@ -3053,7 +3055,7 @@ static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev 
*mvdev,
bool teardown = !is_resumable(ndev);
int err;
 
-   suspend_vqs(ndev);
+   suspend_vqs(ndev, 0, ndev->cur_num_vqs);
if (teardown) {
err = save_channels_info(ndev);
if (err)
@@ -3606,7 +3608,7 @@ static int mlx5_vdpa_suspend(struct vdpa_device *vdev)
 
down_write(&ndev->reslock);
unregister_link_notifier(ndev);
-   err = suspend_vqs(ndev);
+   err = suspend_vqs(ndev, 0, ndev->cur_num_vqs);
mlx5_vdpa_cvq_suspend(mvdev);
mvdev->suspended = true;
up_write(&ndev->reslock);
-- 
2.45.1




[PATCH vhost v2 05/10] vdpa/mlx5: Use async API for vq modify commands

2024-08-16 Thread Dragos Tatulea
Switch firmware vq modify command to be issued via the async API to
allow future parallelization. The new refactored function applies the
modify on a range of vqs and waits for their execution to complete.

For now the command is still used in a serial fashion. A later patch
will switch to modifying multiple vqs in parallel.

Signed-off-by: Dragos Tatulea 
Reviewed-by: Tariq Toukan 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 154 --
 1 file changed, 106 insertions(+), 48 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 413b24398ef2..9be7a88d71a7 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1189,6 +1189,11 @@ struct mlx5_virtqueue_query_mem {
u8 out[MLX5_ST_SZ_BYTES(query_virtio_net_q_out)];
 };
 
+struct mlx5_virtqueue_modify_mem {
+   u8 in[MLX5_ST_SZ_BYTES(modify_virtio_net_q_in)];
+   u8 out[MLX5_ST_SZ_BYTES(modify_virtio_net_q_out)];
+};
+
 static void fill_query_virtqueue_cmd(struct mlx5_vdpa_net *ndev,
 struct mlx5_vdpa_virtqueue *mvq,
 struct mlx5_virtqueue_query_mem *cmd)
@@ -1298,51 +1303,30 @@ static bool modifiable_virtqueue_fields(struct 
mlx5_vdpa_virtqueue *mvq)
return true;
 }
 
-static int modify_virtqueue(struct mlx5_vdpa_net *ndev,
-   struct mlx5_vdpa_virtqueue *mvq,
-   int state)
+static void fill_modify_virtqueue_cmd(struct mlx5_vdpa_net *ndev,
+ struct mlx5_vdpa_virtqueue *mvq,
+ int state,
+ struct mlx5_virtqueue_modify_mem *cmd)
 {
-   int inlen = MLX5_ST_SZ_BYTES(modify_virtio_net_q_in);
-   u32 out[MLX5_ST_SZ_DW(modify_virtio_net_q_out)] = {};
struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
struct mlx5_vdpa_mr *desc_mr = NULL;
struct mlx5_vdpa_mr *vq_mr = NULL;
-   bool state_change = false;
void *obj_context;
void *cmd_hdr;
void *vq_ctx;
-   void *in;
-   int err;
-
-   if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_NONE)
-   return 0;
-
-   if (!modifiable_virtqueue_fields(mvq))
-   return -EINVAL;
 
-   in = kzalloc(inlen, GFP_KERNEL);
-   if (!in)
-   return -ENOMEM;
-
-   cmd_hdr = MLX5_ADDR_OF(modify_virtio_net_q_in, in, 
general_obj_in_cmd_hdr);
+   cmd_hdr = MLX5_ADDR_OF(modify_virtio_net_q_in, cmd->in, 
general_obj_in_cmd_hdr);
 
MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, 
MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, 
MLX5_OBJ_TYPE_VIRTIO_NET_Q);
MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
 
-   obj_context = MLX5_ADDR_OF(modify_virtio_net_q_in, in, obj_context);
+   obj_context = MLX5_ADDR_OF(modify_virtio_net_q_in, cmd->in, 
obj_context);
vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, 
virtio_q_context);
 
-   if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_STATE) {
-   if (!is_valid_state_change(mvq->fw_state, state, 
is_resumable(ndev))) {
-   err = -EINVAL;
-   goto done;
-   }
-
+   if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_STATE)
MLX5_SET(virtio_net_q_object, obj_context, state, state);
-   state_change = true;
-   }
 
if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS) {
MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr);
@@ -1388,38 +1372,36 @@ static int modify_virtqueue(struct mlx5_vdpa_net *ndev,
}
 
MLX5_SET64(virtio_net_q_object, obj_context, modify_field_select, 
mvq->modified_fields);
-   err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
-   if (err)
-   goto done;
+}
 
-   if (state_change)
-   mvq->fw_state = state;
+static void modify_virtqueue_end(struct mlx5_vdpa_net *ndev,
+struct mlx5_vdpa_virtqueue *mvq,
+int state)
+{
+   struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
 
if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY) {
+   unsigned int asid = mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP];
+   struct mlx5_vdpa_mr *vq_mr = mvdev->mr[asid];
+
mlx5_vdpa_put_mr(mvdev, mvq->vq_mr);
mlx5_vdpa_get_mr(mvdev, vq_mr);
mvq->vq_mr = vq_mr;
}
 
if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY) {
+   unsigned int asid = 
mvdev->group2as

[PATCH vhost v2 04/10] vdpa/mlx5: Use async API for vq query command

2024-08-16 Thread Dragos Tatulea
Switch firmware vq query command to be issued via the async API to
allow future parallelization.

For now the command is still serial but the infrastructure is there
to issue commands in parallel, including ratelimiting the number
of issued async commands to firmware.

A later patch will switch to issuing more commands at a time.

Signed-off-by: Dragos Tatulea 
Reviewed-by: Tariq Toukan 
---
 drivers/vdpa/mlx5/core/mlx5_vdpa.h |   2 +
 drivers/vdpa/mlx5/net/mlx5_vnet.c  | 101 ++---
 2 files changed, 78 insertions(+), 25 deletions(-)

diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h 
b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
index b34e9b93d56e..24fa00afb24f 100644
--- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h
+++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
@@ -103,6 +103,8 @@ struct mlx5_vdpa_dev {
struct workqueue_struct *wq;
unsigned int group2asid[MLX5_VDPA_NUMVQ_GROUPS];
bool suspended;
+
+   struct mlx5_async_ctx async_ctx;
 };
 
 struct mlx5_vdpa_async_cmd {
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 12133e5d1285..413b24398ef2 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1184,40 +1184,87 @@ struct mlx5_virtq_attr {
u16 used_index;
 };
 
-static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct 
mlx5_vdpa_virtqueue *mvq,
-  struct mlx5_virtq_attr *attr)
-{
-   int outlen = MLX5_ST_SZ_BYTES(query_virtio_net_q_out);
-   u32 in[MLX5_ST_SZ_DW(query_virtio_net_q_in)] = {};
-   void *out;
-   void *obj_context;
-   void *cmd_hdr;
-   int err;
-
-   out = kzalloc(outlen, GFP_KERNEL);
-   if (!out)
-   return -ENOMEM;
+struct mlx5_virtqueue_query_mem {
+   u8 in[MLX5_ST_SZ_BYTES(query_virtio_net_q_in)];
+   u8 out[MLX5_ST_SZ_BYTES(query_virtio_net_q_out)];
+};
 
-   cmd_hdr = MLX5_ADDR_OF(query_virtio_net_q_in, in, 
general_obj_in_cmd_hdr);
+static void fill_query_virtqueue_cmd(struct mlx5_vdpa_net *ndev,
+struct mlx5_vdpa_virtqueue *mvq,
+struct mlx5_virtqueue_query_mem *cmd)
+{
+   void *cmd_hdr = MLX5_ADDR_OF(query_virtio_net_q_in, cmd->in, 
general_obj_in_cmd_hdr);
 
MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, 
MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, 
MLX5_OBJ_TYPE_VIRTIO_NET_Q);
MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
-   err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, outlen);
-   if (err)
-   goto err_cmd;
+}
+
+static void query_virtqueue_end(struct mlx5_vdpa_net *ndev,
+   struct mlx5_virtqueue_query_mem *cmd,
+   struct mlx5_virtq_attr *attr)
+{
+   void *obj_context = MLX5_ADDR_OF(query_virtio_net_q_out, cmd->out, 
obj_context);
 
-   obj_context = MLX5_ADDR_OF(query_virtio_net_q_out, out, obj_context);
memset(attr, 0, sizeof(*attr));
attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);
attr->available_index = MLX5_GET(virtio_net_q_object, obj_context, 
hw_available_index);
attr->used_index = MLX5_GET(virtio_net_q_object, obj_context, 
hw_used_index);
-   kfree(out);
-   return 0;
+}
 
-err_cmd:
-   kfree(out);
+static int query_virtqueues(struct mlx5_vdpa_net *ndev,
+   int start_vq,
+   int num_vqs,
+   struct mlx5_virtq_attr *attrs)
+{
+   struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
+   struct mlx5_virtqueue_query_mem *cmd_mem;
+   struct mlx5_vdpa_async_cmd *cmds;
+   int err = 0;
+
+   WARN(start_vq + num_vqs > mvdev->max_vqs, "query vq range invalid [%d, 
%d), max_vqs: %u\n",
+start_vq, start_vq + num_vqs, mvdev->max_vqs);
+
+   cmds = kvcalloc(num_vqs, sizeof(*cmds), GFP_KERNEL);
+   cmd_mem = kvcalloc(num_vqs, sizeof(*cmd_mem), GFP_KERNEL);
+   if (!cmds || !cmd_mem) {
+   err = -ENOMEM;
+   goto done;
+   }
+
+   for (int i = 0; i < num_vqs; i++) {
+   cmds[i].in = &cmd_mem[i].in;
+   cmds[i].inlen = sizeof(cmd_mem[i].in);
+   cmds[i].out = &cmd_mem[i].out;
+   cmds[i].outlen = sizeof(cmd_mem[i].out);
+   fill_query_virtqueue_cmd(ndev, &ndev->vqs[start_vq + i], 
&cmd_mem[i]);
+   }
+
+   err = mlx5_vdpa_exec_async_cmds(&ndev->mvdev, cmds, num_vqs);
+   if (err) {
+   mlx5_vdpa_err(mvdev, "error issuing query cmd for vq range [%d, 
%d): %d\n",
+ start_vq, start_vq + num_vqs, err);
+   goto done;
+   }
+
+   for 

[PATCH vhost v2 03/10] vdpa/mlx5: Introduce async fw command wrapper

2024-08-16 Thread Dragos Tatulea
Introduce a new function mlx5_vdpa_exec_async_cmds() which
wraps the mlx5_core async firmware command API in a way
that will be used to parallelize certain operation in this
driver.

The wrapper deals with the case when mlx5_cmd_exec_cb() returns
EBUSY due to the command being throttled.

Signed-off-by: Dragos Tatulea 
Reviewed-by: Tariq Toukan 
---
 drivers/vdpa/mlx5/core/mlx5_vdpa.h | 15 ++
 drivers/vdpa/mlx5/core/resources.c | 73 ++
 2 files changed, 88 insertions(+)

diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h 
b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
index 424d445ebee4..b34e9b93d56e 100644
--- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h
+++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
@@ -105,6 +105,18 @@ struct mlx5_vdpa_dev {
bool suspended;
 };
 
+struct mlx5_vdpa_async_cmd {
+   int err;
+   struct mlx5_async_work cb_work;
+   struct completion cmd_done;
+
+   void *in;
+   size_t inlen;
+
+   void *out;
+   size_t outlen;
+};
+
 int mlx5_vdpa_create_tis(struct mlx5_vdpa_dev *mvdev, void *in, u32 *tisn);
 void mlx5_vdpa_destroy_tis(struct mlx5_vdpa_dev *mvdev, u32 tisn);
 int mlx5_vdpa_create_rqt(struct mlx5_vdpa_dev *mvdev, void *in, int inlen, u32 
*rqtn);
@@ -134,6 +146,9 @@ int mlx5_vdpa_update_cvq_iotlb(struct mlx5_vdpa_dev *mvdev,
unsigned int asid);
 int mlx5_vdpa_create_dma_mr(struct mlx5_vdpa_dev *mvdev);
 int mlx5_vdpa_reset_mr(struct mlx5_vdpa_dev *mvdev, unsigned int asid);
+int mlx5_vdpa_exec_async_cmds(struct mlx5_vdpa_dev *mvdev,
+ struct mlx5_vdpa_async_cmd *cmds,
+ int num_cmds);
 
 #define mlx5_vdpa_err(__dev, format, ...)  
\
dev_err((__dev)->mdev->device, "%s:%d:(pid %d) error: " format, 
__func__, __LINE__,\
diff --git a/drivers/vdpa/mlx5/core/resources.c 
b/drivers/vdpa/mlx5/core/resources.c
index 5c5a41b64bfc..22ea32fe007b 100644
--- a/drivers/vdpa/mlx5/core/resources.c
+++ b/drivers/vdpa/mlx5/core/resources.c
@@ -321,3 +321,76 @@ void mlx5_vdpa_free_resources(struct mlx5_vdpa_dev *mvdev)
mutex_destroy(&mvdev->mr_mtx);
res->valid = false;
 }
+
+static void virtqueue_cmd_callback(int status, struct mlx5_async_work *context)
+{
+   struct mlx5_vdpa_async_cmd *cmd =
+   container_of(context, struct mlx5_vdpa_async_cmd, cb_work);
+
+   cmd->err = mlx5_cmd_check(context->ctx->dev, status, cmd->in, cmd->out);
+   complete(&cmd->cmd_done);
+}
+
+static int issue_async_cmd(struct mlx5_vdpa_dev *mvdev,
+  struct mlx5_vdpa_async_cmd *cmds,
+  int issued,
+  int *completed)
+
+{
+   struct mlx5_vdpa_async_cmd *cmd = &cmds[issued];
+   int err;
+
+retry:
+   err = mlx5_cmd_exec_cb(&mvdev->async_ctx,
+  cmd->in, cmd->inlen,
+  cmd->out, cmd->outlen,
+  virtqueue_cmd_callback,
+  &cmd->cb_work);
+   if (err == -EBUSY) {
+   if (*completed < issued) {
+   /* Throttled by own commands: wait for oldest 
completion. */
+   wait_for_completion(&cmds[*completed].cmd_done);
+   (*completed)++;
+
+   goto retry;
+   } else {
+   /* Throttled by external commands: switch to sync api. 
*/
+   err = mlx5_cmd_exec(mvdev->mdev,
+   cmd->in, cmd->inlen,
+   cmd->out, cmd->outlen);
+   if (!err)
+   (*completed)++;
+   }
+   }
+
+   return err;
+}
+
+int mlx5_vdpa_exec_async_cmds(struct mlx5_vdpa_dev *mvdev,
+ struct mlx5_vdpa_async_cmd *cmds,
+ int num_cmds)
+{
+   int completed = 0;
+   int issued = 0;
+   int err = 0;
+
+   for (int i = 0; i < num_cmds; i++)
+   init_completion(&cmds[i].cmd_done);
+
+   while (issued < num_cmds) {
+
+   err = issue_async_cmd(mvdev, cmds, issued, &completed);
+   if (err) {
+   mlx5_vdpa_err(mvdev, "error issuing command %d of %d: 
%d\n",
+ issued, num_cmds, err);
+   break;
+   }
+
+   issued++;
+   }
+
+   while (completed < issued)
+   wait_for_completion(&cmds[completed++].cmd_done);
+
+   return err;
+}
-- 
2.45.1




[PATCH mlx5-vhost v2 01/10] net/mlx5: Support throttled commands from async API

2024-08-16 Thread Dragos Tatulea
Currently, commands that qualify as throttled can't be used via the
async API. That's due to the fact that the throttle semaphore can sleep
but the async API can't.

This patch allows throttling in the async API by using the tentative
variant of the semaphore and upon failure (semaphore at 0) returns EBUSY
to signal to the caller that they need to wait for the completion of
previously issued commands.

Furthermore, make sure that the semaphore is released in the callback.

Signed-off-by: Dragos Tatulea 
Cc: Leon Romanovsky 
Reviewed-by: Tariq Toukan 
---
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 21 ++-
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c 
b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index 20768ef2e9d2..f69c977c1569 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -1882,10 +1882,12 @@ static int cmd_exec(struct mlx5_core_dev *dev, void 
*in, int in_size, void *out,
 
throttle_op = mlx5_cmd_is_throttle_opcode(opcode);
if (throttle_op) {
-   /* atomic context may not sleep */
-   if (callback)
-   return -EINVAL;
-   down(&dev->cmd.vars.throttle_sem);
+   if (callback) {
+   if (down_trylock(&dev->cmd.vars.throttle_sem))
+   return -EBUSY;
+   } else {
+   down(&dev->cmd.vars.throttle_sem);
+   }
}
 
pages_queue = is_manage_pages(in);
@@ -2091,10 +2093,19 @@ static void mlx5_cmd_exec_cb_handler(int status, void 
*_work)
 {
struct mlx5_async_work *work = _work;
struct mlx5_async_ctx *ctx;
+   struct mlx5_core_dev *dev;
+   u16 opcode;
 
ctx = work->ctx;
-   status = cmd_status_err(ctx->dev, status, work->opcode, work->op_mod, 
work->out);
+   dev = ctx->dev;
+   opcode = work->opcode;
+   status = cmd_status_err(dev, status, work->opcode, work->op_mod, 
work->out);
work->user_callback(status, work);
+   /* Can't access "work" from this point on. It could have been freed in
+* the callback.
+*/
+   if (mlx5_cmd_is_throttle_opcode(opcode))
+   up(&dev->cmd.vars.throttle_sem);
if (atomic_dec_and_test(&ctx->num_inflight))
complete(&ctx->inflight_done);
 }
-- 
2.45.1




[PATCH vhost v2 02/10] vdpa/mlx5: Introduce error logging function

2024-08-16 Thread Dragos Tatulea
mlx5_vdpa_err() was missing. This patch adds it and uses it in the
necessary places.

Signed-off-by: Dragos Tatulea 
Reviewed-by: Tariq Toukan 
Acked-by: Eugenio Pérez 
---
 drivers/vdpa/mlx5/core/mlx5_vdpa.h |  5 +
 drivers/vdpa/mlx5/net/mlx5_vnet.c  | 24 
 2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h 
b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
index 50aac8fe57ef..424d445ebee4 100644
--- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h
+++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
@@ -135,6 +135,11 @@ int mlx5_vdpa_update_cvq_iotlb(struct mlx5_vdpa_dev *mvdev,
 int mlx5_vdpa_create_dma_mr(struct mlx5_vdpa_dev *mvdev);
 int mlx5_vdpa_reset_mr(struct mlx5_vdpa_dev *mvdev, unsigned int asid);
 
+#define mlx5_vdpa_err(__dev, format, ...)  
\
+   dev_err((__dev)->mdev->device, "%s:%d:(pid %d) error: " format, 
__func__, __LINE__,\
+current->pid, ##__VA_ARGS__)
+
+
 #define mlx5_vdpa_warn(__dev, format, ...) 
\
dev_warn((__dev)->mdev->device, "%s:%d:(pid %d) warning: " format, 
__func__, __LINE__, \
 current->pid, ##__VA_ARGS__)
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index fa78e8288ebb..12133e5d1285 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1538,13 +1538,13 @@ static int suspend_vq(struct mlx5_vdpa_net *ndev, 
struct mlx5_vdpa_virtqueue *mv
 
err = modify_virtqueue_state(ndev, mvq, 
MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND);
if (err) {
-   mlx5_vdpa_warn(&ndev->mvdev, "modify to suspend failed, err: 
%d\n", err);
+   mlx5_vdpa_err(&ndev->mvdev, "modify to suspend failed, err: 
%d\n", err);
return err;
}
 
err = query_virtqueue(ndev, mvq, &attr);
if (err) {
-   mlx5_vdpa_warn(&ndev->mvdev, "failed to query virtqueue, err: 
%d\n", err);
+   mlx5_vdpa_err(&ndev->mvdev, "failed to query virtqueue, err: 
%d\n", err);
return err;
}
 
@@ -1585,7 +1585,7 @@ static int resume_vq(struct mlx5_vdpa_net *ndev, struct 
mlx5_vdpa_virtqueue *mvq
 */
err = modify_virtqueue(ndev, mvq, 0);
if (err) {
-   mlx5_vdpa_warn(&ndev->mvdev,
+   mlx5_vdpa_err(&ndev->mvdev,
"modify vq properties failed for vq %u, err: 
%d\n",
mvq->index, err);
return err;
@@ -1600,15 +1600,15 @@ static int resume_vq(struct mlx5_vdpa_net *ndev, struct 
mlx5_vdpa_virtqueue *mvq
case MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY:
return 0;
default:
-   mlx5_vdpa_warn(&ndev->mvdev, "resume vq %u called from bad 
state %d\n",
+   mlx5_vdpa_err(&ndev->mvdev, "resume vq %u called from bad state 
%d\n",
   mvq->index, mvq->fw_state);
return -EINVAL;
}
 
err = modify_virtqueue_state(ndev, mvq, 
MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY);
if (err)
-   mlx5_vdpa_warn(&ndev->mvdev, "modify to resume failed for vq 
%u, err: %d\n",
-  mvq->index, err);
+   mlx5_vdpa_err(&ndev->mvdev, "modify to resume failed for vq %u, 
err: %d\n",
+ mvq->index, err);
 
return err;
 }
@@ -2002,13 +2002,13 @@ static int setup_steering(struct mlx5_vdpa_net *ndev)
 
ns = mlx5_get_flow_namespace(ndev->mvdev.mdev, 
MLX5_FLOW_NAMESPACE_BYPASS);
if (!ns) {
-   mlx5_vdpa_warn(&ndev->mvdev, "failed to get flow namespace\n");
+   mlx5_vdpa_err(&ndev->mvdev, "failed to get flow namespace\n");
return -EOPNOTSUPP;
}
 
ndev->rxft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr);
if (IS_ERR(ndev->rxft)) {
-   mlx5_vdpa_warn(&ndev->mvdev, "failed to create flow table\n");
+   mlx5_vdpa_err(&ndev->mvdev, "failed to create flow table\n");
return PTR_ERR(ndev->rxft);
}
mlx5_vdpa_add_rx_flow_table(ndev);
@@ -2530,7 +2530,7 @@ static int mlx5_vdpa_get_vq_state(struct vdpa_device 
*vdev, u16 idx, struct vdpa
 
err = query_virtqueue(ndev, mvq, &attr);
if (err) {
-   mlx5_vdpa_warn(mvdev, "failed to query virtqueue\n");
+   mlx5_vdpa_err(mvdev, "failed to query virtqueue\n");
 

[PATCH vhost v2 00/10] vdpa/mlx5: Parallelize device suspend/resume

2024-08-16 Thread Dragos Tatulea
This series parallelizes the mlx5_vdpa device suspend and resume
operations through the firmware async API. The purpose is to reduce live
migration downtime.

The series starts with changing the VQ suspend and resume commands
to the async API. After that, the switch is made to issue multiple
commands of the same type in parallel.

Then, the an additional improvement is added: keep the notifiers enabled
during suspend but make it a NOP. Upon resume make sure that the link
state is forwarded. This shaves around 30ms per device constant time.

Finally, use parallel VQ suspend and resume during the CVQ MQ command.

For 1 vDPA device x 32 VQs (16 VQPs), on a large VM (256 GB RAM, 32 CPUs
x 2 threads per core), the improvements are:

+---+++---+
| operation | Before | After  | Reduction |
|---+++---|
| mlx5_vdpa_suspend | 37 ms  | 2.5 ms | 14x   |
| mlx5_vdpa_resume  | 16 ms  | 5 ms   |  3x   |
+---+++---+

---
v2:
- Changed to parallel VQ suspend/resume during CVQ MQ command.
  Support added in the last 2 patches.
- Made the fw async command more generic and moved it to resources.c.
  Did that because the following series (parallel mkey ops) needs this
  code as well.
  Dropped Acked-by from Eugenio on modified patches.
- Fixed kfree -> kvfree.
- Removed extra newline caught during review.
- As discussed in the v1, the series can be pulled in completely in
  the vhost tree [0]. The mlx5_core patch was reviewed by Tariq who is
  also a maintainer for mlx5_core.

[0] - 
https://lore.kernel.org/virtualization/6582792d-8db2-4bc0-bf3a-248fe5c8f...@nvidia.com/T/#maefabb2fde5adfb322d16ca16ae64d540f75b7d2

Dragos Tatulea (10):
  net/mlx5: Support throttled commands from async API
  vdpa/mlx5: Introduce error logging function
  vdpa/mlx5: Introduce async fw command wrapper
  vdpa/mlx5: Use async API for vq query command
  vdpa/mlx5: Use async API for vq modify commands
  vdpa/mlx5: Parallelize device suspend
  vdpa/mlx5: Parallelize device resume
  vdpa/mlx5: Keep notifiers during suspend but ignore
  vdpa/mlx5: Small improvement for change_num_qps()
  vdpa/mlx5: Parallelize VQ suspend/resume for CVQ MQ command

 drivers/net/ethernet/mellanox/mlx5/core/cmd.c |  21 +-
 drivers/vdpa/mlx5/core/mlx5_vdpa.h|  22 +
 drivers/vdpa/mlx5/core/resources.c|  73 
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 396 +++---
 4 files changed, 361 insertions(+), 151 deletions(-)

-- 
2.45.1




Re: [RFC PATCH] vhost_vdpa: assign irq bypass producer token correctly

2024-08-14 Thread Dragos Tatulea



On 14.08.24 07:29, Jason Wang wrote:
> On Tue, Aug 13, 2024 at 8:53 PM Dragos Tatulea  wrote:
>>
>>
>>
>> On 13.08.24 08:26, Jason Wang wrote:
>>> On Mon, Aug 12, 2024 at 7:22 PM Dragos Tatulea  wrote:
>>>>
>>>>
>>>>
>>>> On 12.08.24 08:49, Jason Wang wrote:
>>>>> On Mon, Aug 12, 2024 at 1:47 PM Jason Wang  wrote:
>>>>>>
>>>>>> On Fri, Aug 9, 2024 at 2:04 AM Dragos Tatulea  
>>>>>> wrote:
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> On 08.08.24 10:20, Jason Wang wrote:
>>>>>>>> We used to call irq_bypass_unregister_producer() in
>>>>>>>> vhost_vdpa_setup_vq_irq() which is problematic as we don't know if the
>>>>>>>> token pointer is still valid or not.
>>>>>>>>
>>>>>>>> Actually, we use the eventfd_ctx as the token so the life cycle of the
>>>>>>>> token should be bound to the VHOST_SET_VRING_CALL instead of
>>>>>>>> vhost_vdpa_setup_vq_irq() which could be called by set_status().
>>>>>>>>
>>>>>>>> Fixing this by setting up  irq bypass producer's token when handling
>>>>>>>> VHOST_SET_VRING_CALL and un-registering the producer before calling
>>>>>>>> vhost_vring_ioctl() to prevent a possible use after free as eventfd
>>>>>>>> could have been released in vhost_vring_ioctl().
>>>>>>>>
>>>>>>>> Fixes: 2cf1ba9a4d15 ("vhost_vdpa: implement IRQ offloading in 
>>>>>>>> vhost_vdpa")
>>>>>>>> Signed-off-by: Jason Wang 
>>>>>>>> ---
>>>>>>>> Note for Dragos: Please check whether this fixes your issue. I
>>>>>>>> slightly test it with vp_vdpa in L2.
>>>>>>>> ---
>>>>>>>>  drivers/vhost/vdpa.c | 12 +---
>>>>>>>>  1 file changed, 9 insertions(+), 3 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
>>>>>>>> index e31ec9ebc4ce..388226a48bcc 100644
>>>>>>>> --- a/drivers/vhost/vdpa.c
>>>>>>>> +++ b/drivers/vhost/vdpa.c
>>>>>>>> @@ -209,11 +209,9 @@ static void vhost_vdpa_setup_vq_irq(struct 
>>>>>>>> vhost_vdpa *v, u16 qid)
>>>>>>>>   if (irq < 0)
>>>>>>>>   return;
>>>>>>>>
>>>>>>>> - irq_bypass_unregister_producer(&vq->call_ctx.producer);
>>>>>>>>   if (!vq->call_ctx.ctx)
>>>>>>>>   return;
>>>>>>>>
>>>>>>>> - vq->call_ctx.producer.token = vq->call_ctx.ctx;
>>>>>>>>   vq->call_ctx.producer.irq = irq;
>>>>>>>>   ret = irq_bypass_register_producer(&vq->call_ctx.producer);
>>>>>>>>   if (unlikely(ret))
>>>>>>>> @@ -709,6 +707,12 @@ static long vhost_vdpa_vring_ioctl(struct 
>>>>>>>> vhost_vdpa *v, unsigned int cmd,
>>>>>>>>   vq->last_avail_idx = vq_state.split.avail_index;
>>>>>>>>   }
>>>>>>>>   break;
>>>>>>>> + case VHOST_SET_VRING_CALL:
>>>>>>>> + if (vq->call_ctx.ctx) {
>>>>>>>> + vhost_vdpa_unsetup_vq_irq(v, idx);
>>>>>>>> + vq->call_ctx.producer.token = NULL;
>>>>>>>> + }
>>>>>>>> + break;
>>>>>>>>   }
>>>>>>>>
>>>>>>>>   r = vhost_vring_ioctl(&v->vdev, cmd, argp);
>>>>>>>> @@ -747,13 +751,14 @@ static long vhost_vdpa_vring_ioctl(struct 
>>>>>>>> vhost_vdpa *v, unsigned int cmd,
>>>>>>>>   cb.callback = vhost_vdpa_virtqueue_cb;
>>>>>>>>   cb.private = vq;
>>>>>>>>   cb.trigger = vq->call_ctx.ctx;
>>>>>>>> +   

Re: [RFC PATCH] vhost_vdpa: assign irq bypass producer token correctly

2024-08-13 Thread Dragos Tatulea



On 13.08.24 08:26, Jason Wang wrote:
> On Mon, Aug 12, 2024 at 7:22 PM Dragos Tatulea  wrote:
>>
>>
>>
>> On 12.08.24 08:49, Jason Wang wrote:
>>> On Mon, Aug 12, 2024 at 1:47 PM Jason Wang  wrote:
>>>>
>>>> On Fri, Aug 9, 2024 at 2:04 AM Dragos Tatulea  wrote:
>>>>>
>>>>>
>>>>>
>>>>> On 08.08.24 10:20, Jason Wang wrote:
>>>>>> We used to call irq_bypass_unregister_producer() in
>>>>>> vhost_vdpa_setup_vq_irq() which is problematic as we don't know if the
>>>>>> token pointer is still valid or not.
>>>>>>
>>>>>> Actually, we use the eventfd_ctx as the token so the life cycle of the
>>>>>> token should be bound to the VHOST_SET_VRING_CALL instead of
>>>>>> vhost_vdpa_setup_vq_irq() which could be called by set_status().
>>>>>>
>>>>>> Fixing this by setting up  irq bypass producer's token when handling
>>>>>> VHOST_SET_VRING_CALL and un-registering the producer before calling
>>>>>> vhost_vring_ioctl() to prevent a possible use after free as eventfd
>>>>>> could have been released in vhost_vring_ioctl().
>>>>>>
>>>>>> Fixes: 2cf1ba9a4d15 ("vhost_vdpa: implement IRQ offloading in 
>>>>>> vhost_vdpa")
>>>>>> Signed-off-by: Jason Wang 
>>>>>> ---
>>>>>> Note for Dragos: Please check whether this fixes your issue. I
>>>>>> slightly test it with vp_vdpa in L2.
>>>>>> ---
>>>>>>  drivers/vhost/vdpa.c | 12 +---
>>>>>>  1 file changed, 9 insertions(+), 3 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
>>>>>> index e31ec9ebc4ce..388226a48bcc 100644
>>>>>> --- a/drivers/vhost/vdpa.c
>>>>>> +++ b/drivers/vhost/vdpa.c
>>>>>> @@ -209,11 +209,9 @@ static void vhost_vdpa_setup_vq_irq(struct 
>>>>>> vhost_vdpa *v, u16 qid)
>>>>>>   if (irq < 0)
>>>>>>   return;
>>>>>>
>>>>>> - irq_bypass_unregister_producer(&vq->call_ctx.producer);
>>>>>>   if (!vq->call_ctx.ctx)
>>>>>>   return;
>>>>>>
>>>>>> - vq->call_ctx.producer.token = vq->call_ctx.ctx;
>>>>>>   vq->call_ctx.producer.irq = irq;
>>>>>>   ret = irq_bypass_register_producer(&vq->call_ctx.producer);
>>>>>>   if (unlikely(ret))
>>>>>> @@ -709,6 +707,12 @@ static long vhost_vdpa_vring_ioctl(struct 
>>>>>> vhost_vdpa *v, unsigned int cmd,
>>>>>>   vq->last_avail_idx = vq_state.split.avail_index;
>>>>>>   }
>>>>>>   break;
>>>>>> + case VHOST_SET_VRING_CALL:
>>>>>> + if (vq->call_ctx.ctx) {
>>>>>> + vhost_vdpa_unsetup_vq_irq(v, idx);
>>>>>> + vq->call_ctx.producer.token = NULL;
>>>>>> + }
>>>>>> + break;
>>>>>>   }
>>>>>>
>>>>>>   r = vhost_vring_ioctl(&v->vdev, cmd, argp);
>>>>>> @@ -747,13 +751,14 @@ static long vhost_vdpa_vring_ioctl(struct 
>>>>>> vhost_vdpa *v, unsigned int cmd,
>>>>>>   cb.callback = vhost_vdpa_virtqueue_cb;
>>>>>>   cb.private = vq;
>>>>>>   cb.trigger = vq->call_ctx.ctx;
>>>>>> + vq->call_ctx.producer.token = vq->call_ctx.ctx;
>>>>>> + vhost_vdpa_setup_vq_irq(v, idx);
>>>>>>   } else {
>>>>>>   cb.callback = NULL;
>>>>>>   cb.private = NULL;
>>>>>>   cb.trigger = NULL;
>>>>>>   }
>>>>>>   ops->set_vq_cb(vdpa, idx, &cb);
>>>>>> - vhost_vdpa_setup_vq_irq(v, idx);
>>>>>>   break;
>>>>>>
>>>>>>   case VHOST_SET_VRING_NUM:
>>>>>> @

Re: [RFC PATCH] vhost_vdpa: assign irq bypass producer token correctly

2024-08-12 Thread Dragos Tatulea



On 12.08.24 08:49, Jason Wang wrote:
> On Mon, Aug 12, 2024 at 1:47 PM Jason Wang  wrote:
>>
>> On Fri, Aug 9, 2024 at 2:04 AM Dragos Tatulea  wrote:
>>>
>>>
>>>
>>> On 08.08.24 10:20, Jason Wang wrote:
>>>> We used to call irq_bypass_unregister_producer() in
>>>> vhost_vdpa_setup_vq_irq() which is problematic as we don't know if the
>>>> token pointer is still valid or not.
>>>>
>>>> Actually, we use the eventfd_ctx as the token so the life cycle of the
>>>> token should be bound to the VHOST_SET_VRING_CALL instead of
>>>> vhost_vdpa_setup_vq_irq() which could be called by set_status().
>>>>
>>>> Fixing this by setting up  irq bypass producer's token when handling
>>>> VHOST_SET_VRING_CALL and un-registering the producer before calling
>>>> vhost_vring_ioctl() to prevent a possible use after free as eventfd
>>>> could have been released in vhost_vring_ioctl().
>>>>
>>>> Fixes: 2cf1ba9a4d15 ("vhost_vdpa: implement IRQ offloading in vhost_vdpa")
>>>> Signed-off-by: Jason Wang 
>>>> ---
>>>> Note for Dragos: Please check whether this fixes your issue. I
>>>> slightly test it with vp_vdpa in L2.
>>>> ---
>>>>  drivers/vhost/vdpa.c | 12 +---
>>>>  1 file changed, 9 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
>>>> index e31ec9ebc4ce..388226a48bcc 100644
>>>> --- a/drivers/vhost/vdpa.c
>>>> +++ b/drivers/vhost/vdpa.c
>>>> @@ -209,11 +209,9 @@ static void vhost_vdpa_setup_vq_irq(struct vhost_vdpa 
>>>> *v, u16 qid)
>>>>   if (irq < 0)
>>>>   return;
>>>>
>>>> - irq_bypass_unregister_producer(&vq->call_ctx.producer);
>>>>   if (!vq->call_ctx.ctx)
>>>>   return;
>>>>
>>>> - vq->call_ctx.producer.token = vq->call_ctx.ctx;
>>>>   vq->call_ctx.producer.irq = irq;
>>>>   ret = irq_bypass_register_producer(&vq->call_ctx.producer);
>>>>   if (unlikely(ret))
>>>> @@ -709,6 +707,12 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa 
>>>> *v, unsigned int cmd,
>>>>   vq->last_avail_idx = vq_state.split.avail_index;
>>>>   }
>>>>   break;
>>>> + case VHOST_SET_VRING_CALL:
>>>> + if (vq->call_ctx.ctx) {
>>>> + vhost_vdpa_unsetup_vq_irq(v, idx);
>>>> + vq->call_ctx.producer.token = NULL;
>>>> + }
>>>> + break;
>>>>   }
>>>>
>>>>   r = vhost_vring_ioctl(&v->vdev, cmd, argp);
>>>> @@ -747,13 +751,14 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa 
>>>> *v, unsigned int cmd,
>>>>   cb.callback = vhost_vdpa_virtqueue_cb;
>>>>   cb.private = vq;
>>>>   cb.trigger = vq->call_ctx.ctx;
>>>> + vq->call_ctx.producer.token = vq->call_ctx.ctx;
>>>> + vhost_vdpa_setup_vq_irq(v, idx);
>>>>   } else {
>>>>   cb.callback = NULL;
>>>>   cb.private = NULL;
>>>>   cb.trigger = NULL;
>>>>   }
>>>>   ops->set_vq_cb(vdpa, idx, &cb);
>>>> - vhost_vdpa_setup_vq_irq(v, idx);
>>>>   break;
>>>>
>>>>   case VHOST_SET_VRING_NUM:
>>>> @@ -1419,6 +1424,7 @@ static int vhost_vdpa_open(struct inode *inode, 
>>>> struct file *filep)
>>>>   for (i = 0; i < nvqs; i++) {
>>>>   vqs[i] = &v->vqs[i];
>>>>   vqs[i]->handle_kick = handle_vq_kick;
>>>> + vqs[i]->call_ctx.ctx = NULL;
>>>>   }
>>>>   vhost_dev_init(dev, vqs, nvqs, 0, 0, 0, false,
>>>>  vhost_vdpa_process_iotlb_msg);
>>>
>>> No more crashes, but now getting a lot of:
>>>  vhost-vdpa-X: vq Y, irq bypass producer (token a66e28ab) 
>>> registration fails, ret =  -16
>>>
>>> ... seems like the irq_bypass_unregister_producer()

Re: [RFC PATCH] vhost_vdpa: assign irq bypass producer token correctly

2024-08-08 Thread Dragos Tatulea



On 08.08.24 10:20, Jason Wang wrote:
> We used to call irq_bypass_unregister_producer() in
> vhost_vdpa_setup_vq_irq() which is problematic as we don't know if the
> token pointer is still valid or not.
> 
> Actually, we use the eventfd_ctx as the token so the life cycle of the
> token should be bound to the VHOST_SET_VRING_CALL instead of
> vhost_vdpa_setup_vq_irq() which could be called by set_status().
> 
> Fixing this by setting up  irq bypass producer's token when handling
> VHOST_SET_VRING_CALL and un-registering the producer before calling
> vhost_vring_ioctl() to prevent a possible use after free as eventfd
> could have been released in vhost_vring_ioctl().
> 
> Fixes: 2cf1ba9a4d15 ("vhost_vdpa: implement IRQ offloading in vhost_vdpa")
> Signed-off-by: Jason Wang 
> ---
> Note for Dragos: Please check whether this fixes your issue. I
> slightly test it with vp_vdpa in L2.
> ---
>  drivers/vhost/vdpa.c | 12 +---
>  1 file changed, 9 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
> index e31ec9ebc4ce..388226a48bcc 100644
> --- a/drivers/vhost/vdpa.c
> +++ b/drivers/vhost/vdpa.c
> @@ -209,11 +209,9 @@ static void vhost_vdpa_setup_vq_irq(struct vhost_vdpa 
> *v, u16 qid)
>   if (irq < 0)
>   return;
>  
> - irq_bypass_unregister_producer(&vq->call_ctx.producer);
>   if (!vq->call_ctx.ctx)
>   return;
>  
> - vq->call_ctx.producer.token = vq->call_ctx.ctx;
>   vq->call_ctx.producer.irq = irq;
>   ret = irq_bypass_register_producer(&vq->call_ctx.producer);
>   if (unlikely(ret))
> @@ -709,6 +707,12 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, 
> unsigned int cmd,
>   vq->last_avail_idx = vq_state.split.avail_index;
>   }
>   break;
> + case VHOST_SET_VRING_CALL:
> + if (vq->call_ctx.ctx) {
> + vhost_vdpa_unsetup_vq_irq(v, idx);
> + vq->call_ctx.producer.token = NULL;
> + }
> + break;
>   }
>  
>   r = vhost_vring_ioctl(&v->vdev, cmd, argp);
> @@ -747,13 +751,14 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa 
> *v, unsigned int cmd,
>   cb.callback = vhost_vdpa_virtqueue_cb;
>   cb.private = vq;
>   cb.trigger = vq->call_ctx.ctx;
> + vq->call_ctx.producer.token = vq->call_ctx.ctx;
> + vhost_vdpa_setup_vq_irq(v, idx);
>   } else {
>   cb.callback = NULL;
>   cb.private = NULL;
>   cb.trigger = NULL;
>   }
>   ops->set_vq_cb(vdpa, idx, &cb);
> - vhost_vdpa_setup_vq_irq(v, idx);
>   break;
>  
>   case VHOST_SET_VRING_NUM:
> @@ -1419,6 +1424,7 @@ static int vhost_vdpa_open(struct inode *inode, struct 
> file *filep)
>   for (i = 0; i < nvqs; i++) {
>   vqs[i] = &v->vqs[i];
>   vqs[i]->handle_kick = handle_vq_kick;
> + vqs[i]->call_ctx.ctx = NULL;
>   }
>   vhost_dev_init(dev, vqs, nvqs, 0, 0, 0, false,
>  vhost_vdpa_process_iotlb_msg);

No more crashes, but now getting a lot of:
 vhost-vdpa-X: vq Y, irq bypass producer (token a66e28ab) registration 
fails, ret =  -16

... seems like the irq_bypass_unregister_producer() that was removed
might still be needed somewhere?

Thanks,
Dragos




Re: [PATCH vhost 0/7] vdpa/mlx5: Parallelize device suspend/resume

2024-08-07 Thread Dragos Tatulea



On 07.08.24 15:25, Eugenio Perez Martin wrote:
> On Fri, Aug 2, 2024 at 9:24 AM Dragos Tatulea  wrote:
>>
>> This series parallelizes the mlx5_vdpa device suspend and resume
>> operations through the firmware async API. The purpose is to reduce live
>> migration downtime.
>>
>> The series starts with changing the VQ suspend and resume commands
>> to the async API. After that, the switch is made to issue multiple
>> commands of the same type in parallel.
>>
> 
> There is a missed opportunity processing the CVQ MQ command here,
> isn't it? It can be applied on top in another series for sure.
> 
Initially I considered that it would complicate the code too much in
change_num_qps(). But in the current state of the patches it's doable.

Will send a V2 with an extra patch for this.

>> Finally, a bonus improvement is thrown in: keep the notifierd enabled
>> during suspend but make it a NOP. Upon resume make sure that the link
>> state is forwarded. This shaves around 30ms per device constant time.
>>
>> For 1 vDPA device x 32 VQs (16 VQPs), on a large VM (256 GB RAM, 32 CPUs
>> x 2 threads per core), the improvements are:
>>
>> +---+++---+
>> | operation | Before | After  | Reduction |
>> |---+++---|
>> | mlx5_vdpa_suspend | 37 ms  | 2.5 ms | 14x   |
>> | mlx5_vdpa_resume  | 16 ms  | 5 ms   |  3x   |
>> +---+++---+
>>
> 
> Looks great :).
> 
> Apart from the nitpick,
>
> Acked-by: Eugenio Pérez 
> 
> For the vhost part.
Thanks!

> 
> Thanks!
> 
>> Note for the maintainers:
>> The first patch contains changes for mlx5_core. This must be applied
>> into the mlx5-vhost tree [0] first. Once this patch is applied on
>> mlx5-vhost, the change has to be pulled from mlx5-vdpa into the vhost
>> tree and only then the remaining patches can be applied.
>>
>> [0] 
>> https://git.kernel.org/pub/scm/linux/kernel/git/mellanox/linux.git/log/?h=mlx5-vhost
>>
>> Dragos Tatulea (7):
>>   net/mlx5: Support throttled commands from async API
>>   vdpa/mlx5: Introduce error logging function
>>   vdpa/mlx5: Use async API for vq query command
>>   vdpa/mlx5: Use async API for vq modify commands
>>   vdpa/mlx5: Parallelize device suspend
>>   vdpa/mlx5: Parallelize device resume
>>   vdpa/mlx5: Keep notifiers during suspend but ignore
>>
>>  drivers/net/ethernet/mellanox/mlx5/core/cmd.c |  21 +-
>>  drivers/vdpa/mlx5/core/mlx5_vdpa.h|   7 +
>>  drivers/vdpa/mlx5/net/mlx5_vnet.c | 435 +-
>>  3 files changed, 333 insertions(+), 130 deletions(-)
>>
>> --
>> 2.45.2
>>
> 




Re: [RFC PATCH vhost] vhost-vdpa: Fix invalid irq bypass unregister

2024-08-06 Thread Dragos Tatulea



On 06.08.24 10:18, Dragos Tatulea wrote:
> (Re-sending. I messed up the previous message, sorry about that.)
> 
> On 06.08.24 04:57, Jason Wang wrote:
>> On Mon, Aug 5, 2024 at 11:59 PM Dragos Tatulea  wrote:
>>>
>>> On 05.08.24 05:17, Jason Wang wrote:
>>>> On Fri, Aug 2, 2024 at 2:51 PM Dragos Tatulea  wrote:
>>>>>
>>>>> On Fri, 2024-08-02 at 11:29 +0800, Jason Wang wrote:
>>>>>> On Thu, Aug 1, 2024 at 11:38 PM Dragos Tatulea  
>>>>>> wrote:
>>>>>>>
>>>>>>> The following workflow triggers the crash referenced below:
>>>>>>>
>>>>>>> 1) vhost_vdpa_unsetup_vq_irq() unregisters the irq bypass producer
>>>>>>>but the producer->token is still valid.
>>>>>>> 2) vq context gets released and reassigned to another vq.
>>>>>>
>>>>>> Just to make sure I understand here, which structure is referred to as
>>>>>> "vq context" here? I guess it's not call_ctx as it is a part of the vq
>>>>>> itself.
>>>>>>
>>>>>>> 3) That other vq registers it's producer with the same vq context
>>>>>>>pointer as token in vhost_vdpa_setup_vq_irq().
>>>>>>
>>>>>> Or did you mean when a single eventfd is shared among different vqs?
>>>>>>
>>>>> Yes, that's what I mean: vq->call_ctx.ctx which is a eventfd_ctx.
>>>>>
>>>>> But I don't think it's shared in this case, only that the old eventfd_ctx 
>>>>> value
>>>>> is lingering in producer->token. And this old eventfd_ctx is assigned now 
>>>>> to
>>>>> another vq.
>>>>
>>>> Just to make sure I understand the issue. The eventfd_ctx should be
>>>> still valid until a new VHOST_SET_VRING_CALL().
>>>>
>>> I think it's not about the validity of the eventfd_ctx. More about
>>> the lingering ctx value of the producer after vhost_vdpa_unsetup_vq_irq().
>>
>> Probably, but
>>
>>> That value is the eventfd ctx, but it could be anything else really...
>>
>> I mean we hold a refcnt of the eventfd so it should be valid until the
>> next set_vring_call() or vhost_dev_cleanup().
>>
>> But I do spot some possible issue:
>>
>> 1) We swap and assign new ctx in vhost_vring_ioctl():
>>
>> swap(ctx, vq->call_ctx.ctx);
>>
>> 2) and old ctx will be put there as well:
>>
>> if (!IS_ERR_OR_NULL(ctx))
>> eventfd_ctx_put(ctx);
>>
>> 3) but in vdpa, we try to unregister the producer with the new token:
>>
>> static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
>>void __user *argp)
>> {
>> ...
>> r = vhost_vring_ioctl(&v->vdev, cmd, argp);
>> ...
>> switch (cmd) {
>> ...
>> case VHOST_SET_VRING_CALL:
>> if (vq->call_ctx.ctx) {
>> cb.callback = vhost_vdpa_virtqueue_cb;
>> cb.private = vq;
>> cb.trigger = vq->call_ctx.ctx;
>> } else {
>> cb.callback = NULL;
>> cb.private = NULL;
>> cb.trigger = NULL;
>> }
>> ops->set_vq_cb(vdpa, idx, &cb);
>> vhost_vdpa_setup_vq_irq(v, idx);
>>
>> in vhost_vdpa_setup_vq_irq() we had:
>>
>> irq_bypass_unregister_producer(&vq->call_ctx.producer);
>>
>> here the producer->token still points to the old one...
>>
>> Is this what you have seen?
> Yup. That is the issue. The unregister already happened at
> vhost_vdpa_unsetup_vq_irq(). So this second unregister will
> work on an already unregistered element due to the token still
> being set.
> 
>>
>>>
>>>
>>>> I may miss something but the only way to assign exactly the same
>>>> eventfd_ctx value to another vq is where the guest tries to share the
>>>> MSI-X vector among virtqueues, then qemu will use a single eventfd as
>>>> the callback for multiple virtqueues. If this is true:
>>>>
>>> I don't think this is the case. I see the issue happening when running qemu 
>>> vdpa
>>> live migration tests on the same host. From a vdpa device it's basically a 
>>> device
>>> starting on a VM over and over.
>>>
>>>> For bypass registering, only the first registering can succeed as the
>>>> following registering will fail because the irq bypass manager already
>>>> had exactly the same producer token.
>>>> For registering, all unregistering can succeed:
>>>>
>>>> 1) the first unregistering will do the real job that unregister the token
>>>> 2) the following unregistering will do nothing by iterating the
>>>> producer token list without finding a match one
>>>>
>>>> Maybe you can show me the userspace behaviour (ioctls) when you see this?
>>>>
>>> Sure, what would you need? qemu traces?
>>
>> Yes, that would be helpful.
>>
> Will try to get them.
As the traces are quite large (~5MB), I uploaded them in this location [0].
I used the following qemu traces:
--trace vhost_vdpa* --trace virtio_net_handle*

[0] 
https://drive.google.com/file/d/1XyXYyockJ_O7zMgI7vot6AxYjze9Ljju/view?usp=sharing

Thanks,
Dragos




Re: [RFC PATCH vhost] vhost-vdpa: Fix invalid irq bypass unregister

2024-08-06 Thread Dragos Tatulea
(Re-sending. I messed up the previous message, sorry about that.)

On 06.08.24 04:57, Jason Wang wrote:
> On Mon, Aug 5, 2024 at 11:59 PM Dragos Tatulea  wrote:
>>
>> On 05.08.24 05:17, Jason Wang wrote:
>>> On Fri, Aug 2, 2024 at 2:51 PM Dragos Tatulea  wrote:
>>>>
>>>> On Fri, 2024-08-02 at 11:29 +0800, Jason Wang wrote:
>>>>> On Thu, Aug 1, 2024 at 11:38 PM Dragos Tatulea  
>>>>> wrote:
>>>>>>
>>>>>> The following workflow triggers the crash referenced below:
>>>>>>
>>>>>> 1) vhost_vdpa_unsetup_vq_irq() unregisters the irq bypass producer
>>>>>>but the producer->token is still valid.
>>>>>> 2) vq context gets released and reassigned to another vq.
>>>>>
>>>>> Just to make sure I understand here, which structure is referred to as
>>>>> "vq context" here? I guess it's not call_ctx as it is a part of the vq
>>>>> itself.
>>>>>
>>>>>> 3) That other vq registers it's producer with the same vq context
>>>>>>pointer as token in vhost_vdpa_setup_vq_irq().
>>>>>
>>>>> Or did you mean when a single eventfd is shared among different vqs?
>>>>>
>>>> Yes, that's what I mean: vq->call_ctx.ctx which is a eventfd_ctx.
>>>>
>>>> But I don't think it's shared in this case, only that the old eventfd_ctx 
>>>> value
>>>> is lingering in producer->token. And this old eventfd_ctx is assigned now 
>>>> to
>>>> another vq.
>>>
>>> Just to make sure I understand the issue. The eventfd_ctx should be
>>> still valid until a new VHOST_SET_VRING_CALL().
>>>
>> I think it's not about the validity of the eventfd_ctx. More about
>> the lingering ctx value of the producer after vhost_vdpa_unsetup_vq_irq().
> 
> Probably, but
> 
>> That value is the eventfd ctx, but it could be anything else really...
> 
> I mean we hold a refcnt of the eventfd so it should be valid until the
> next set_vring_call() or vhost_dev_cleanup().
> 
> But I do spot some possible issue:
> 
> 1) We swap and assign new ctx in vhost_vring_ioctl():
> 
> swap(ctx, vq->call_ctx.ctx);
> 
> 2) and old ctx will be put there as well:
> 
> if (!IS_ERR_OR_NULL(ctx))
> eventfd_ctx_put(ctx);
> 
> 3) but in vdpa, we try to unregister the producer with the new token:
> 
> static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
>void __user *argp)
> {
> ...
> r = vhost_vring_ioctl(&v->vdev, cmd, argp);
> ...
> switch (cmd) {
> ...
> case VHOST_SET_VRING_CALL:
> if (vq->call_ctx.ctx) {
> cb.callback = vhost_vdpa_virtqueue_cb;
> cb.private = vq;
> cb.trigger = vq->call_ctx.ctx;
> } else {
> cb.callback = NULL;
> cb.private = NULL;
> cb.trigger = NULL;
> }
> ops->set_vq_cb(vdpa, idx, &cb);
> vhost_vdpa_setup_vq_irq(v, idx);
> 
> in vhost_vdpa_setup_vq_irq() we had:
> 
> irq_bypass_unregister_producer(&vq->call_ctx.producer);
> 
> here the producer->token still points to the old one...
> 
> Is this what you have seen?
Yup. That is the issue. The unregister already happened at
vhost_vdpa_unsetup_vq_irq(). So this second unregister will
work on an already unregistered element due to the token still
being set.

> 
>>
>>
>>> I may miss something but the only way to assign exactly the same
>>> eventfd_ctx value to another vq is where the guest tries to share the
>>> MSI-X vector among virtqueues, then qemu will use a single eventfd as
>>> the callback for multiple virtqueues. If this is true:
>>>
>> I don't think this is the case. I see the issue happening when running qemu 
>> vdpa
>> live migration tests on the same host. From a vdpa device it's basically a 
>> device
>> starting on a VM over and over.
>>
>>> For bypass registering, only the first registering can succeed as the
>>> following registering will fail because the irq bypass manager already
>>> had exactly the same producer token.
>>> For registering, all unregistering can succeed:
>>>
>>> 1) the first unregis

Re: [RFC PATCH vhost] vhost-vdpa: Fix invalid irq bypass unregister

2024-08-05 Thread Dragos Tatulea
On 05.08.24 05:17, Jason Wang wrote:
> On Fri, Aug 2, 2024 at 2:51 PM Dragos Tatulea  wrote:
>>
>> On Fri, 2024-08-02 at 11:29 +0800, Jason Wang wrote:
>>> On Thu, Aug 1, 2024 at 11:38 PM Dragos Tatulea  wrote:
>>>>
>>>> The following workflow triggers the crash referenced below:
>>>>
>>>> 1) vhost_vdpa_unsetup_vq_irq() unregisters the irq bypass producer
>>>>but the producer->token is still valid.
>>>> 2) vq context gets released and reassigned to another vq.
>>>
>>> Just to make sure I understand here, which structure is referred to as
>>> "vq context" here? I guess it's not call_ctx as it is a part of the vq
>>> itself.
>>>
>>>> 3) That other vq registers it's producer with the same vq context
>>>>pointer as token in vhost_vdpa_setup_vq_irq().
>>>
>>> Or did you mean when a single eventfd is shared among different vqs?
>>>
>> Yes, that's what I mean: vq->call_ctx.ctx which is a eventfd_ctx.
>>
>> But I don't think it's shared in this case, only that the old eventfd_ctx 
>> value
>> is lingering in producer->token. And this old eventfd_ctx is assigned now to
>> another vq.
> 
> Just to make sure I understand the issue. The eventfd_ctx should be
> still valid until a new VHOST_SET_VRING_CALL().
> 
I think it's not about the validity of the eventfd_ctx. More about
the lingering ctx value of the producer after vhost_vdpa_unsetup_vq_irq().
That value is the eventfd ctx, but it could be anything else really...


> I may miss something but the only way to assign exactly the same
> eventfd_ctx value to another vq is where the guest tries to share the
> MSI-X vector among virtqueues, then qemu will use a single eventfd as
> the callback for multiple virtqueues. If this is true:
> 
I don't think this is the case. I see the issue happening when running qemu vdpa
live migration tests on the same host. From a vdpa device it's basically a 
device
starting on a VM over and over.

> For bypass registering, only the first registering can succeed as the
> following registering will fail because the irq bypass manager already
> had exactly the same producer token.
> For registering, all unregistering can succeed:
> 
> 1) the first unregistering will do the real job that unregister the token
> 2) the following unregistering will do nothing by iterating the
> producer token list without finding a match one
> 
> Maybe you can show me the userspace behaviour (ioctls) when you see this?
> 
Sure, what would you need? qemu traces?

Thanks,
Dragos

> Thanks
> 
>>
>>>> 4) The original vq tries to unregister it's producer which it has
>>>>already unlinked in step 1. irq_bypass_unregister_producer() will go
>>>>ahead and unlink the producer once again. That happens because:
>>>>   a) The producer has a token.
>>>>   b) An element with that token is found. But that element comes
>>>>  from step 3.
>>>>
>>>> I see 3 ways to fix this:
>>>> 1) Fix the vhost-vdpa part. What this patch does. vfio has a different
>>>>workflow.
>>>> 2) Set the token to NULL directly in irq_bypass_unregister_producer()
>>>>after unlinking the producer. But that makes the API asymmetrical.
>>>> 3) Make irq_bypass_unregister_producer() also compare the pointer
>>>>elements not just the tokens and do the unlink only on match.
>>>>
>>>> Any thoughts?
>>>>
>>>> Oops: general protection fault, probably for non-canonical address 
>>>> 0xdead0108:  [#1] SMP
>>>> CPU: 8 PID: 5190 Comm: qemu-system-x86 Not tainted 6.10.0-rc7+ #6
>>>> Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
>>>> rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
>>>> RIP: 0010:irq_bypass_unregister_producer+0xa5/0xd0
>>>> RSP: 0018:c900034d7e50 EFLAGS: 00010246
>>>> RAX: dead0122 RBX: 888353d12718 RCX: 88810336a000
>>>> RDX: dead0100 RSI: 829243a0 RDI: 
>>>> RBP: 888353c42000 R08: 888104882738 R09: 88810336a000
>>>> R10: 888448ab2050 R11:  R12: 888353d126a0
>>>> R13: 0004 R14: 0055 R15: 0004
>>>> FS:  7f9df9403c80() GS:88852cc0() 
>>>> knlGS:
>>>> CS:  0010 DS:  ES:  CR0: 80050033
>>>> CR2: 562dff

[PATCH vhost 7/7] vdpa/mlx5: Keep notifiers during suspend but ignore

2024-08-02 Thread Dragos Tatulea
Unregistering notifiers is a costly operation. Instead of removing
the notifiers during device suspend and adding them back at resume,
simply ignore the call when the device is suspended.

At resume time call queue_link_work() to make sure that the device state
is propagated in case there were changes.

For 1 vDPA device x 32 VQs (16 VQPs) attached to a large VM (256 GB RAM,
32 CPUs x 2 threads per core), the device suspend time is reduced from
~13 ms to ~2.5 ms.

Signed-off-by: Dragos Tatulea 
Reviewed-by: Tariq Toukan 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 87d355aba380..af96e49697d0 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -2934,6 +2934,9 @@ static int event_handler(struct notifier_block *nb, 
unsigned long event, void *p
struct mlx5_eqe *eqe = param;
int ret = NOTIFY_DONE;
 
+   if (ndev->mvdev.suspended)
+   return NOTIFY_DONE;
+
if (event == MLX5_EVENT_TYPE_PORT_CHANGE) {
switch (eqe->sub_type) {
case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
@@ -3679,7 +3682,6 @@ static int mlx5_vdpa_suspend(struct vdpa_device *vdev)
mlx5_vdpa_info(mvdev, "suspending device\n");
 
down_write(&ndev->reslock);
-   unregister_link_notifier(ndev);
err = suspend_vqs(ndev, 0, ndev->cur_num_vqs);
mlx5_vdpa_cvq_suspend(mvdev);
mvdev->suspended = true;
@@ -3701,7 +3703,7 @@ static int mlx5_vdpa_resume(struct vdpa_device *vdev)
down_write(&ndev->reslock);
mvdev->suspended = false;
err = resume_vqs(ndev, 0, ndev->cur_num_vqs);
-   register_link_notifier(ndev);
+   queue_link_work(ndev);
up_write(&ndev->reslock);
 
return err;
-- 
2.45.2




[PATCH vhost 6/7] vdpa/mlx5: Parallelize device resume

2024-08-02 Thread Dragos Tatulea
Currently device resume works on vqs serially. Building up on previous
changes that converted vq operations to the async api, this patch
parallelizes the device resume.

For 1 vDPA device x 32 VQs (16 VQPs) attached to a large VM (256 GB RAM,
32 CPUs x 2 threads per core), the device resume time is reduced from
~16 ms to ~4.5 ms.

Signed-off-by: Dragos Tatulea 
Reviewed-by: Tariq Toukan 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 40 +++
 1 file changed, 14 insertions(+), 26 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 1887939c5673..87d355aba380 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1759,10 +1759,15 @@ static int suspend_vq(struct mlx5_vdpa_net *ndev, 
struct mlx5_vdpa_virtqueue *mv
return suspend_vqs(ndev, mvq->index, 1);
 }
 
-static int resume_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue 
*mvq)
+static int resume_vqs(struct mlx5_vdpa_net *ndev, int start_vq, int num_vqs)
 {
+   struct mlx5_vdpa_virtqueue *mvq;
int err;
 
+   if (start_vq >= ndev->mvdev.max_vqs)
+   return -EINVAL;
+
+   mvq = &ndev->vqs[start_vq];
if (!mvq->initialized)
return 0;
 
@@ -1774,13 +1779,9 @@ static int resume_vq(struct mlx5_vdpa_net *ndev, struct 
mlx5_vdpa_virtqueue *mvq
/* Due to a FW quirk we need to modify the VQ fields first then 
change state.
 * This should be fixed soon. After that, a single command can 
be used.
 */
-   err = modify_virtqueues(ndev, mvq->index, 1, mvq->fw_state);
-   if (err) {
-   mlx5_vdpa_err(&ndev->mvdev,
-   "modify vq properties failed for vq %u, err: 
%d\n",
-   mvq->index, err);
+   err = modify_virtqueues(ndev, start_vq, num_vqs, mvq->fw_state);
+   if (err)
return err;
-   }
break;
case MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND:
if (!is_resumable(ndev)) {
@@ -1796,25 +1797,12 @@ static int resume_vq(struct mlx5_vdpa_net *ndev, struct 
mlx5_vdpa_virtqueue *mvq
return -EINVAL;
}
 
-   err = modify_virtqueues(ndev, mvq->index, 1, 
MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY);
-   if (err)
-   mlx5_vdpa_err(&ndev->mvdev, "modify to resume failed for vq %u, 
err: %d\n",
- mvq->index, err);
-
-   return err;
+   return modify_virtqueues(ndev, start_vq, num_vqs, 
MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY);
 }
 
-static int resume_vqs(struct mlx5_vdpa_net *ndev)
+static int resume_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue 
*mvq)
 {
-   int err = 0;
-
-   for (int i = 0; i < ndev->cur_num_vqs; i++) {
-   int local_err = resume_vq(ndev, &ndev->vqs[i]);
-
-   err = local_err ? local_err : err;
-   }
-
-   return err;
+   return resume_vqs(ndev, mvq->index, 1);
 }
 
 static void teardown_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue 
*mvq)
@@ -3164,7 +3152,7 @@ static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev 
*mvdev,
return err;
}
 
-   resume_vqs(ndev);
+   resume_vqs(ndev, 0, ndev->cur_num_vqs);
 
return 0;
 }
@@ -3288,7 +3276,7 @@ static void mlx5_vdpa_set_status(struct vdpa_device 
*vdev, u8 status)
teardown_vq_resources(ndev);
 
if (ndev->setup) {
-   err = resume_vqs(ndev);
+   err = resume_vqs(ndev, 0, ndev->cur_num_vqs);
if (err) {
mlx5_vdpa_warn(mvdev, "failed to resume 
VQs\n");
goto err_driver;
@@ -3712,7 +3700,7 @@ static int mlx5_vdpa_resume(struct vdpa_device *vdev)
 
down_write(&ndev->reslock);
mvdev->suspended = false;
-   err = resume_vqs(ndev);
+   err = resume_vqs(ndev, 0, ndev->cur_num_vqs);
register_link_notifier(ndev);
up_write(&ndev->reslock);
 
-- 
2.45.2




[PATCH vhost 5/7] vdpa/mlx5: Parallelize device suspend

2024-08-02 Thread Dragos Tatulea
Currently device suspend works on vqs serially. Building up on previous
changes that converted vq operations to the async api, this patch
parallelizes the device suspend:
1) Suspend all active vqs parallel.
2) Query suspended vqs in parallel.

For 1 vDPA device x 32 VQs (16 VQPs) attached to a large VM (256 GB RAM,
32 CPUs x 2 threads per core), the device suspend time is reduced from
~37 ms to ~13 ms.

A later patch will remove the link unregister operation which will make
it even faster.

Signed-off-by: Dragos Tatulea 
Reviewed-by: Tariq Toukan 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 56 ---
 1 file changed, 29 insertions(+), 27 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index e56a0ee1b725..1887939c5673 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1714,49 +1714,51 @@ static int modify_virtqueues(struct mlx5_vdpa_net 
*ndev, int start_vq, int num_v
return err;
 }
 
-static int suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue 
*mvq)
+static int suspend_vqs(struct mlx5_vdpa_net *ndev, int start_vq, int num_vqs)
 {
-   struct mlx5_virtq_attr attr;
+   struct mlx5_vdpa_virtqueue *mvq;
+   struct mlx5_virtq_attr *attrs;
+   int vq_idx, i;
int err;
 
+   if (start_vq >= ndev->cur_num_vqs)
+   return -EINVAL;
+
+   mvq = &ndev->vqs[start_vq];
if (!mvq->initialized)
return 0;
 
if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY)
return 0;
 
-   err = modify_virtqueues(ndev, mvq->index, 1, 
MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND);
-   if (err) {
-   mlx5_vdpa_err(&ndev->mvdev, "modify to suspend failed, err: 
%d\n", err);
-   return err;
-   }
-
-   err = query_virtqueues(ndev, mvq->index, 1, &attr);
-   if (err) {
-   mlx5_vdpa_err(&ndev->mvdev, "failed to query virtqueue, err: 
%d\n", err);
+   err = modify_virtqueues(ndev, start_vq, num_vqs, 
MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND);
+   if (err)
return err;
-   }
-
-   mvq->avail_idx = attr.available_index;
-   mvq->used_idx = attr.used_index;
-
-   return 0;
-}
 
-static int suspend_vqs(struct mlx5_vdpa_net *ndev)
-{
-   int err = 0;
-   int i;
+   attrs = kcalloc(num_vqs, sizeof(struct mlx5_virtq_attr), GFP_KERNEL);
+   if (!attrs)
+   return -ENOMEM;
 
-   for (i = 0; i < ndev->cur_num_vqs; i++) {
-   int local_err = suspend_vq(ndev, &ndev->vqs[i]);
+   err = query_virtqueues(ndev, start_vq, num_vqs, attrs);
+   if (err)
+   goto done;
 
-   err = local_err ? local_err : err;
+   for (i = 0, vq_idx = start_vq; i < num_vqs; i++, vq_idx++) {
+   mvq = &ndev->vqs[vq_idx];
+   mvq->avail_idx = attrs[i].available_index;
+   mvq->used_idx = attrs[i].used_index;
}
 
+done:
+   kfree(attrs);
return err;
 }
 
+static int suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue 
*mvq)
+{
+   return suspend_vqs(ndev, mvq->index, 1);
+}
+
 static int resume_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue 
*mvq)
 {
int err;
@@ -3137,7 +3139,7 @@ static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev 
*mvdev,
bool teardown = !is_resumable(ndev);
int err;
 
-   suspend_vqs(ndev);
+   suspend_vqs(ndev, 0, ndev->cur_num_vqs);
if (teardown) {
err = save_channels_info(ndev);
if (err)
@@ -3690,7 +3692,7 @@ static int mlx5_vdpa_suspend(struct vdpa_device *vdev)
 
down_write(&ndev->reslock);
unregister_link_notifier(ndev);
-   err = suspend_vqs(ndev);
+   err = suspend_vqs(ndev, 0, ndev->cur_num_vqs);
mlx5_vdpa_cvq_suspend(mvdev);
mvdev->suspended = true;
up_write(&ndev->reslock);
-- 
2.45.2




[PATCH vhost 4/7] vdpa/mlx5: Use async API for vq modify commands

2024-08-02 Thread Dragos Tatulea
Switch firmware vq modify command to be issued via the async API to
allow future parallelization. The new refactored function applies the
modify on a range of vqs and waits for their execution to complete.

For now the command is still used in a serial fashion. A later patch
will switch to modifying multiple vqs in parallel.

Signed-off-by: Dragos Tatulea 
Reviewed-by: Tariq Toukan 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 150 --
 1 file changed, 103 insertions(+), 47 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index be8df9d9f4df..e56a0ee1b725 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1189,6 +1189,12 @@ struct mlx5_virtqueue_query_mem {
u8 out[MLX5_ST_SZ_BYTES(query_virtio_net_q_out)];
 };
 
+struct mlx5_virtqueue_modify_mem {
+   u8 in[MLX5_ST_SZ_BYTES(modify_virtio_net_q_in)];
+   u8 out[MLX5_ST_SZ_BYTES(modify_virtio_net_q_out)];
+};
+
+
 struct mlx5_vdpa_async_virtqueue_cmd {
int err;
struct mlx5_async_work cb_work;
@@ -1202,6 +1208,7 @@ struct mlx5_vdpa_async_virtqueue_cmd {
 
union {
struct mlx5_virtqueue_query_mem query;
+   struct mlx5_virtqueue_modify_mem modify;
};
 };
 
@@ -1384,51 +1391,35 @@ static bool modifiable_virtqueue_fields(struct 
mlx5_vdpa_virtqueue *mvq)
return true;
 }
 
-static int modify_virtqueue(struct mlx5_vdpa_net *ndev,
-   struct mlx5_vdpa_virtqueue *mvq,
-   int state)
+static void fill_modify_virtqueue_cmd(struct mlx5_vdpa_net *ndev,
+ struct mlx5_vdpa_virtqueue *mvq,
+ int state,
+ struct mlx5_vdpa_async_virtqueue_cmd *cmd)
 {
-   int inlen = MLX5_ST_SZ_BYTES(modify_virtio_net_q_in);
-   u32 out[MLX5_ST_SZ_DW(modify_virtio_net_q_out)] = {};
struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
struct mlx5_vdpa_mr *desc_mr = NULL;
struct mlx5_vdpa_mr *vq_mr = NULL;
-   bool state_change = false;
void *obj_context;
void *cmd_hdr;
void *vq_ctx;
-   void *in;
-   int err;
 
-   if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_NONE)
-   return 0;
-
-   if (!modifiable_virtqueue_fields(mvq))
-   return -EINVAL;
-
-   in = kzalloc(inlen, GFP_KERNEL);
-   if (!in)
-   return -ENOMEM;
+   cmd->in = &cmd->modify.in;
+   cmd->inlen = sizeof(cmd->modify.in);
+   cmd->out = &cmd->modify.out;
+   cmd->outlen = sizeof(cmd->modify.out);
 
-   cmd_hdr = MLX5_ADDR_OF(modify_virtio_net_q_in, in, 
general_obj_in_cmd_hdr);
+   cmd_hdr = MLX5_ADDR_OF(modify_virtio_net_q_in, cmd->in, 
general_obj_in_cmd_hdr);
 
MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, 
MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, 
MLX5_OBJ_TYPE_VIRTIO_NET_Q);
MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
 
-   obj_context = MLX5_ADDR_OF(modify_virtio_net_q_in, in, obj_context);
+   obj_context = MLX5_ADDR_OF(modify_virtio_net_q_in, cmd->in, 
obj_context);
vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, 
virtio_q_context);
 
-   if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_STATE) {
-   if (!is_valid_state_change(mvq->fw_state, state, 
is_resumable(ndev))) {
-   err = -EINVAL;
-   goto done;
-   }
-
+   if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_STATE)
MLX5_SET(virtio_net_q_object, obj_context, state, state);
-   state_change = true;
-   }
 
if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS) {
MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr);
@@ -1474,38 +1465,36 @@ static int modify_virtqueue(struct mlx5_vdpa_net *ndev,
}
 
MLX5_SET64(virtio_net_q_object, obj_context, modify_field_select, 
mvq->modified_fields);
-   err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
-   if (err)
-   goto done;
+}
 
-   if (state_change)
-   mvq->fw_state = state;
+static void modify_virtqueue_end(struct mlx5_vdpa_net *ndev,
+struct mlx5_vdpa_virtqueue *mvq,
+int state)
+{
+   struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
 
if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY) {
+   unsigned int asid = mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP];
+   struct mlx5_vdpa_mr *vq_mr = mvdev->mr[asid];
+
m

[PATCH vhost 3/7] vdpa/mlx5: Use async API for vq query command

2024-08-02 Thread Dragos Tatulea
Switch firmware vq query command to be issued via the async API to
allow future parallelization.

exec_virtqueue_async_cmds() is a generic execution function that will be
used to issue other async operations as well. Handling of throttled
commands is built in.

For now the command is still serial but the infrastructure is there
to issue commands in parallel, including ratelimiting the number
of issued async commands to firmware.

A later patch will switch to issuing more commands at a time.

Signed-off-by: Dragos Tatulea 
Reviewed-by: Tariq Toukan 
---
 drivers/vdpa/mlx5/core/mlx5_vdpa.h |   2 +
 drivers/vdpa/mlx5/net/mlx5_vnet.c  | 181 +
 2 files changed, 161 insertions(+), 22 deletions(-)

diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h 
b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
index 424d445ebee4..12136163d8ad 100644
--- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h
+++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
@@ -103,6 +103,8 @@ struct mlx5_vdpa_dev {
struct workqueue_struct *wq;
unsigned int group2asid[MLX5_VDPA_NUMVQ_GROUPS];
bool suspended;
+
+   struct mlx5_async_ctx async_ctx;
 };
 
 int mlx5_vdpa_create_tis(struct mlx5_vdpa_dev *mvdev, void *in, u32 *tisn);
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 12133e5d1285..be8df9d9f4df 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1184,40 +1184,173 @@ struct mlx5_virtq_attr {
u16 used_index;
 };
 
-static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct 
mlx5_vdpa_virtqueue *mvq,
-  struct mlx5_virtq_attr *attr)
-{
-   int outlen = MLX5_ST_SZ_BYTES(query_virtio_net_q_out);
-   u32 in[MLX5_ST_SZ_DW(query_virtio_net_q_in)] = {};
+struct mlx5_virtqueue_query_mem {
+   u8 in[MLX5_ST_SZ_BYTES(query_virtio_net_q_in)];
+   u8 out[MLX5_ST_SZ_BYTES(query_virtio_net_q_out)];
+};
+
+struct mlx5_vdpa_async_virtqueue_cmd {
+   int err;
+   struct mlx5_async_work cb_work;
+   struct completion cmd_done;
+
+   void *in;
+   size_t inlen;
+
void *out;
-   void *obj_context;
-   void *cmd_hdr;
+   size_t outlen;
+
+   union {
+   struct mlx5_virtqueue_query_mem query;
+   };
+};
+
+static void virtqueue_cmd_callback(int status, struct mlx5_async_work *context)
+{
+   struct mlx5_vdpa_async_virtqueue_cmd *cmd =
+   container_of(context, struct mlx5_vdpa_async_virtqueue_cmd, 
cb_work);
+
+   cmd->err = mlx5_cmd_check(context->ctx->dev, status, cmd->in, cmd->out);
+   complete(&cmd->cmd_done);
+}
+
+static int issue_async_cmd(struct mlx5_vdpa_net *ndev,
+  struct mlx5_vdpa_async_virtqueue_cmd *cmds,
+  int issued,
+  int *completed)
+
+{
+   struct mlx5_vdpa_async_virtqueue_cmd *cmd = &cmds[issued];
int err;
 
-   out = kzalloc(outlen, GFP_KERNEL);
-   if (!out)
-   return -ENOMEM;
+retry:
+   err = mlx5_cmd_exec_cb(&ndev->mvdev.async_ctx,
+  cmd->in, cmd->inlen,
+  cmd->out, cmd->outlen,
+  virtqueue_cmd_callback,
+  &cmd->cb_work);
+   if (err == -EBUSY) {
+   if (*completed < issued) {
+   /* Throttled by own commands: wait for oldest 
completion. */
+   wait_for_completion(&cmds[*completed].cmd_done);
+   (*completed)++;
+
+   goto retry;
+   } else {
+   /* Throttled by external commands: switch to sync api. 
*/
+   err = mlx5_cmd_exec(ndev->mvdev.mdev,
+   cmd->in, cmd->inlen,
+   cmd->out, cmd->outlen);
+   if (!err)
+   (*completed)++;
+   }
+   }
+
+   return err;
+}
+
+static int exec_virtqueue_async_cmds(struct mlx5_vdpa_net *ndev,
+struct mlx5_vdpa_async_virtqueue_cmd *cmds,
+int num_cmds)
+{
+   int completed = 0;
+   int issued = 0;
+   int err = 0;
+
+   for (int i = 0; i < num_cmds; i++)
+   init_completion(&cmds[i].cmd_done);
+
+   while (issued < num_cmds) {
+
+   err = issue_async_cmd(ndev, cmds, issued, &completed);
+   if (err) {
+   mlx5_vdpa_err(&ndev->mvdev, "error issuing command %d 
of %d: %d\n",
+ issued, num_cmds, err);
+   break;
+   }
+
+   issued++;
+   }
+
+   while (completed < issued)
+   wait_for_completion(&

[PATCH vhost 2/7] vdpa/mlx5: Introduce error logging function

2024-08-02 Thread Dragos Tatulea
mlx5_vdpa_err() was missing. This patch adds it and uses it in the
necessary places.

Signed-off-by: Dragos Tatulea 
Reviewed-by: Tariq Toukan 
---
 drivers/vdpa/mlx5/core/mlx5_vdpa.h |  5 +
 drivers/vdpa/mlx5/net/mlx5_vnet.c  | 24 
 2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h 
b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
index 50aac8fe57ef..424d445ebee4 100644
--- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h
+++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
@@ -135,6 +135,11 @@ int mlx5_vdpa_update_cvq_iotlb(struct mlx5_vdpa_dev *mvdev,
 int mlx5_vdpa_create_dma_mr(struct mlx5_vdpa_dev *mvdev);
 int mlx5_vdpa_reset_mr(struct mlx5_vdpa_dev *mvdev, unsigned int asid);
 
+#define mlx5_vdpa_err(__dev, format, ...)  
\
+   dev_err((__dev)->mdev->device, "%s:%d:(pid %d) error: " format, 
__func__, __LINE__,\
+current->pid, ##__VA_ARGS__)
+
+
 #define mlx5_vdpa_warn(__dev, format, ...) 
\
dev_warn((__dev)->mdev->device, "%s:%d:(pid %d) warning: " format, 
__func__, __LINE__, \
 current->pid, ##__VA_ARGS__)
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index fa78e8288ebb..12133e5d1285 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1538,13 +1538,13 @@ static int suspend_vq(struct mlx5_vdpa_net *ndev, 
struct mlx5_vdpa_virtqueue *mv
 
err = modify_virtqueue_state(ndev, mvq, 
MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND);
if (err) {
-   mlx5_vdpa_warn(&ndev->mvdev, "modify to suspend failed, err: 
%d\n", err);
+   mlx5_vdpa_err(&ndev->mvdev, "modify to suspend failed, err: 
%d\n", err);
return err;
}
 
err = query_virtqueue(ndev, mvq, &attr);
if (err) {
-   mlx5_vdpa_warn(&ndev->mvdev, "failed to query virtqueue, err: 
%d\n", err);
+   mlx5_vdpa_err(&ndev->mvdev, "failed to query virtqueue, err: 
%d\n", err);
return err;
}
 
@@ -1585,7 +1585,7 @@ static int resume_vq(struct mlx5_vdpa_net *ndev, struct 
mlx5_vdpa_virtqueue *mvq
 */
err = modify_virtqueue(ndev, mvq, 0);
if (err) {
-   mlx5_vdpa_warn(&ndev->mvdev,
+   mlx5_vdpa_err(&ndev->mvdev,
"modify vq properties failed for vq %u, err: 
%d\n",
mvq->index, err);
return err;
@@ -1600,15 +1600,15 @@ static int resume_vq(struct mlx5_vdpa_net *ndev, struct 
mlx5_vdpa_virtqueue *mvq
case MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY:
return 0;
default:
-   mlx5_vdpa_warn(&ndev->mvdev, "resume vq %u called from bad 
state %d\n",
+   mlx5_vdpa_err(&ndev->mvdev, "resume vq %u called from bad state 
%d\n",
   mvq->index, mvq->fw_state);
return -EINVAL;
}
 
err = modify_virtqueue_state(ndev, mvq, 
MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY);
if (err)
-   mlx5_vdpa_warn(&ndev->mvdev, "modify to resume failed for vq 
%u, err: %d\n",
-  mvq->index, err);
+   mlx5_vdpa_err(&ndev->mvdev, "modify to resume failed for vq %u, 
err: %d\n",
+ mvq->index, err);
 
return err;
 }
@@ -2002,13 +2002,13 @@ static int setup_steering(struct mlx5_vdpa_net *ndev)
 
ns = mlx5_get_flow_namespace(ndev->mvdev.mdev, 
MLX5_FLOW_NAMESPACE_BYPASS);
if (!ns) {
-   mlx5_vdpa_warn(&ndev->mvdev, "failed to get flow namespace\n");
+   mlx5_vdpa_err(&ndev->mvdev, "failed to get flow namespace\n");
return -EOPNOTSUPP;
}
 
ndev->rxft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr);
if (IS_ERR(ndev->rxft)) {
-   mlx5_vdpa_warn(&ndev->mvdev, "failed to create flow table\n");
+   mlx5_vdpa_err(&ndev->mvdev, "failed to create flow table\n");
return PTR_ERR(ndev->rxft);
}
mlx5_vdpa_add_rx_flow_table(ndev);
@@ -2530,7 +2530,7 @@ static int mlx5_vdpa_get_vq_state(struct vdpa_device 
*vdev, u16 idx, struct vdpa
 
err = query_virtqueue(ndev, mvq, &attr);
if (err) {
-   mlx5_vdpa_warn(mvdev, "failed to query virtqueue\n");
+   mlx5_vdpa_err(mvdev, "failed to query virtqueue\n");
return err;
}
state->

[PATCH vhost 0/7] vdpa/mlx5: Parallelize device suspend/resume

2024-08-02 Thread Dragos Tatulea
This series parallelizes the mlx5_vdpa device suspend and resume
operations through the firmware async API. The purpose is to reduce live
migration downtime.

The series starts with changing the VQ suspend and resume commands
to the async API. After that, the switch is made to issue multiple
commands of the same type in parallel.

Finally, a bonus improvement is thrown in: keep the notifierd enabled
during suspend but make it a NOP. Upon resume make sure that the link
state is forwarded. This shaves around 30ms per device constant time.

For 1 vDPA device x 32 VQs (16 VQPs), on a large VM (256 GB RAM, 32 CPUs
x 2 threads per core), the improvements are:

+---+++---+
| operation | Before | After  | Reduction |
|---+++---|
| mlx5_vdpa_suspend | 37 ms  | 2.5 ms | 14x   |
| mlx5_vdpa_resume  | 16 ms  | 5 ms   |  3x   |
+---+++---+

Note for the maintainers:
The first patch contains changes for mlx5_core. This must be applied
into the mlx5-vhost tree [0] first. Once this patch is applied on
mlx5-vhost, the change has to be pulled from mlx5-vdpa into the vhost
tree and only then the remaining patches can be applied.

[0] 
https://git.kernel.org/pub/scm/linux/kernel/git/mellanox/linux.git/log/?h=mlx5-vhost

Dragos Tatulea (7):
  net/mlx5: Support throttled commands from async API
  vdpa/mlx5: Introduce error logging function
  vdpa/mlx5: Use async API for vq query command
  vdpa/mlx5: Use async API for vq modify commands
  vdpa/mlx5: Parallelize device suspend
  vdpa/mlx5: Parallelize device resume
  vdpa/mlx5: Keep notifiers during suspend but ignore

 drivers/net/ethernet/mellanox/mlx5/core/cmd.c |  21 +-
 drivers/vdpa/mlx5/core/mlx5_vdpa.h|   7 +
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 435 +-
 3 files changed, 333 insertions(+), 130 deletions(-)

-- 
2.45.2




Re: [RFC PATCH vhost] vhost-vdpa: Fix invalid irq bypass unregister

2024-08-01 Thread Dragos Tatulea
On Fri, 2024-08-02 at 11:29 +0800, Jason Wang wrote:
> On Thu, Aug 1, 2024 at 11:38 PM Dragos Tatulea  wrote:
> > 
> > The following workflow triggers the crash referenced below:
> > 
> > 1) vhost_vdpa_unsetup_vq_irq() unregisters the irq bypass producer
> >but the producer->token is still valid.
> > 2) vq context gets released and reassigned to another vq.
> 
> Just to make sure I understand here, which structure is referred to as
> "vq context" here? I guess it's not call_ctx as it is a part of the vq
> itself.
> 
> > 3) That other vq registers it's producer with the same vq context
> >pointer as token in vhost_vdpa_setup_vq_irq().
> 
> Or did you mean when a single eventfd is shared among different vqs?
> 
Yes, that's what I mean: vq->call_ctx.ctx which is a eventfd_ctx.

But I don't think it's shared in this case, only that the old eventfd_ctx value
is lingering in producer->token. And this old eventfd_ctx is assigned now to
another vq.

> > 4) The original vq tries to unregister it's producer which it has
> >already unlinked in step 1. irq_bypass_unregister_producer() will go
> >ahead and unlink the producer once again. That happens because:
> >   a) The producer has a token.
> >   b) An element with that token is found. But that element comes
> >  from step 3.
> > 
> > I see 3 ways to fix this:
> > 1) Fix the vhost-vdpa part. What this patch does. vfio has a different
> >workflow.
> > 2) Set the token to NULL directly in irq_bypass_unregister_producer()
> >after unlinking the producer. But that makes the API asymmetrical.
> > 3) Make irq_bypass_unregister_producer() also compare the pointer
> >elements not just the tokens and do the unlink only on match.
> > 
> > Any thoughts?
> > 
> > Oops: general protection fault, probably for non-canonical address 
> > 0xdead0108:  [#1] SMP
> > CPU: 8 PID: 5190 Comm: qemu-system-x86 Not tainted 6.10.0-rc7+ #6
> > Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
> > rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
> > RIP: 0010:irq_bypass_unregister_producer+0xa5/0xd0
> > RSP: 0018:c900034d7e50 EFLAGS: 00010246
> > RAX: dead0122 RBX: 888353d12718 RCX: 88810336a000
> > RDX: dead0100 RSI: 829243a0 RDI: 
> > RBP: 888353c42000 R08: 888104882738 R09: 88810336a000
> > R10: 888448ab2050 R11:  R12: 888353d126a0
> > R13: 0004 R14: 0055 R15: 0004
> > FS:  7f9df9403c80() GS:88852cc0() knlGS:
> > CS:  0010 DS:  ES:  CR0: 80050033
> > CR2: 562dffc6b568 CR3: 00012efbb006 CR4: 00772ef0
> > PKRU: 5554
> > Call Trace:
> >  
> >  ? die_addr+0x36/0x90
> >  ? exc_general_protection+0x1a8/0x390
> >  ? asm_exc_general_protection+0x26/0x30
> >  ? irq_bypass_unregister_producer+0xa5/0xd0
> >  vhost_vdpa_setup_vq_irq+0x5a/0xc0 [vhost_vdpa]
> >  vhost_vdpa_unlocked_ioctl+0xdcd/0xe00 [vhost_vdpa]
> >  ? vhost_vdpa_config_cb+0x30/0x30 [vhost_vdpa]
> >  __x64_sys_ioctl+0x90/0xc0
> >  do_syscall_64+0x4f/0x110
> >  entry_SYSCALL_64_after_hwframe+0x4b/0x53
> > RIP: 0033:0x7f9df930774f
> > RSP: 002b:7ffc55013080 EFLAGS: 0246 ORIG_RAX: 0010
> > RAX: ffda RBX: 562dfe134d20 RCX: 7f9df930774f
> > RDX: 7ffc55013200 RSI: 4008af21 RDI: 0011
> > RBP: 7ffc55013200 R08: 0002 R09: 
> > R10:  R11: 0246 R12: 562dfe134360
> > R13: 562dfe134d20 R14:  R15: 7f9df801e190
> > 
> > Signed-off-by: Dragos Tatulea 
> > ---
> >  drivers/vhost/vdpa.c | 1 +
> >  1 file changed, 1 insertion(+)
> > 
> > diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
> > index 478cd46a49ed..d4a7a3918d86 100644
> > --- a/drivers/vhost/vdpa.c
> > +++ b/drivers/vhost/vdpa.c
> > @@ -226,6 +226,7 @@ static void vhost_vdpa_unsetup_vq_irq(struct vhost_vdpa 
> > *v, u16 qid)
> > struct vhost_virtqueue *vq = &v->vqs[qid];
> > 
> > irq_bypass_unregister_producer(&vq->call_ctx.producer);
> > +   vq->call_ctx.producer.token = NULL;
> >  }
> > 
> >  static int _compat_vdpa_reset(struct vhost_vdpa *v)
> > --
> > 2.45.2
> > 
> 
Thanks



[RFC PATCH vhost] vhost-vdpa: Fix invalid irq bypass unregister

2024-08-01 Thread Dragos Tatulea
The following workflow triggers the crash referenced below:

1) vhost_vdpa_unsetup_vq_irq() unregisters the irq bypass producer
   but the producer->token is still valid.
2) vq context gets released and reassigned to another vq.
3) That other vq registers it's producer with the same vq context
   pointer as token in vhost_vdpa_setup_vq_irq().
4) The original vq tries to unregister it's producer which it has
   already unlinked in step 1. irq_bypass_unregister_producer() will go
   ahead and unlink the producer once again. That happens because:
  a) The producer has a token.
  b) An element with that token is found. But that element comes
 from step 3.

I see 3 ways to fix this:
1) Fix the vhost-vdpa part. What this patch does. vfio has a different
   workflow.
2) Set the token to NULL directly in irq_bypass_unregister_producer()
   after unlinking the producer. But that makes the API asymmetrical.
3) Make irq_bypass_unregister_producer() also compare the pointer
   elements not just the tokens and do the unlink only on match.

Any thoughts?

Oops: general protection fault, probably for non-canonical address 
0xdead0108:  [#1] SMP
CPU: 8 PID: 5190 Comm: qemu-system-x86 Not tainted 6.10.0-rc7+ #6
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
RIP: 0010:irq_bypass_unregister_producer+0xa5/0xd0
RSP: 0018:c900034d7e50 EFLAGS: 00010246
RAX: dead0122 RBX: 888353d12718 RCX: 88810336a000
RDX: dead0100 RSI: 829243a0 RDI: 
RBP: 888353c42000 R08: 888104882738 R09: 88810336a000
R10: 888448ab2050 R11:  R12: 888353d126a0
R13: 0004 R14: 0055 R15: 0004
FS:  7f9df9403c80() GS:88852cc0() knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2: 562dffc6b568 CR3: 00012efbb006 CR4: 00772ef0
PKRU: 5554
Call Trace:
 
 ? die_addr+0x36/0x90
 ? exc_general_protection+0x1a8/0x390
 ? asm_exc_general_protection+0x26/0x30
 ? irq_bypass_unregister_producer+0xa5/0xd0
 vhost_vdpa_setup_vq_irq+0x5a/0xc0 [vhost_vdpa]
 vhost_vdpa_unlocked_ioctl+0xdcd/0xe00 [vhost_vdpa]
 ? vhost_vdpa_config_cb+0x30/0x30 [vhost_vdpa]
 __x64_sys_ioctl+0x90/0xc0
 do_syscall_64+0x4f/0x110
 entry_SYSCALL_64_after_hwframe+0x4b/0x53
RIP: 0033:0x7f9df930774f
RSP: 002b:7ffc55013080 EFLAGS: 0246 ORIG_RAX: 0010
RAX: ffda RBX: 562dfe134d20 RCX: 7f9df930774f
RDX: 7ffc55013200 RSI: 4008af21 RDI: 0011
RBP: 7ffc55013200 R08: 0002 R09: 
R10:  R11: 0246 R12: 562dfe134360
R13: 562dfe134d20 R14:  R15: 7f9df801e190

Signed-off-by: Dragos Tatulea 
---
 drivers/vhost/vdpa.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 478cd46a49ed..d4a7a3918d86 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -226,6 +226,7 @@ static void vhost_vdpa_unsetup_vq_irq(struct vhost_vdpa *v, 
u16 qid)
struct vhost_virtqueue *vq = &v->vqs[qid];
 
irq_bypass_unregister_producer(&vq->call_ctx.producer);
+   vq->call_ctx.producer.token = NULL;
 }
 
 static int _compat_vdpa_reset(struct vhost_vdpa *v)
-- 
2.45.2




Re: [PATH v5 3/3] vdpa/mlx5: Add the support of set mac address

2024-07-23 Thread Dragos Tatulea
On Tue, 2024-07-23 at 13:39 +0800, Cindy Lu wrote:
> Add the function to support setting the MAC address.
> For vdpa/mlx5, the function will use mlx5_mpfs_add_mac
> to set the mac address
> 
> Tested in ConnectX-6 Dx device
> 
> Signed-off-by: Cindy Lu 
> ---
>  drivers/vdpa/mlx5/net/mlx5_vnet.c | 28 
>  1 file changed, 28 insertions(+)
> 
> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
> b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> index ecfc16151d61..7fce952d650f 100644
> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> @@ -3785,10 +3785,38 @@ static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev 
> *v_mdev, struct vdpa_device *
>   destroy_workqueue(wq);
>   mgtdev->ndev = NULL;
>  }
> +static int mlx5_vdpa_set_attr(struct vdpa_mgmt_dev *v_mdev,
> +   struct vdpa_device *dev,
> +   const struct vdpa_dev_set_config *add_config)
> +{
> + struct virtio_net_config *config;
> + struct mlx5_core_dev *pfmdev;
> + struct mlx5_vdpa_dev *mvdev;
> + struct mlx5_vdpa_net *ndev;
> + struct mlx5_core_dev *mdev;
> + int err = -EINVAL;
> +
> + mvdev = to_mvdev(dev);
> + ndev = to_mlx5_vdpa_ndev(mvdev);
> + mdev = mvdev->mdev;
> + config = &ndev->config;
> +
> + down_write(&ndev->reslock);
> + if (add_config->mask & (1 << VDPA_ATTR_DEV_NET_CFG_MACADDR)) {
> + pfmdev = pci_get_drvdata(pci_physfn(mdev->pdev));
> +         err = mlx5_mpfs_add_mac(pfmdev, config->mac);
> + if (0 == err)
if (!err) would be nicer. Not a deal breaker though.

Reviewed-by: Dragos Tatulea 

> + memcpy(config->mac, add_config->net.mac, ETH_ALEN);
> + }
> +
> + up_write(&ndev->reslock);
> + return err;
> +}
>  
>  static const struct vdpa_mgmtdev_ops mdev_ops = {
>   .dev_add = mlx5_vdpa_dev_add,
>   .dev_del = mlx5_vdpa_dev_del,
> + .dev_set_attr = mlx5_vdpa_set_attr,
>  };
>  
>  static struct virtio_device_id id_table[] = {



Re: [PATH v4 3/3] vdpa/mlx5: Add the support of set mac address

2024-07-22 Thread Dragos Tatulea
On Mon, 2024-07-22 at 15:48 +0800, Jason Wang wrote:
> On Mon, Jul 22, 2024 at 9:06 AM Cindy Lu  wrote:
> > 
> > Add the function to support setting the MAC address.
> > For vdpa/mlx5, the function will use mlx5_mpfs_add_mac
> > to set the mac address
> > 
> > Tested in ConnectX-6 Dx device
> > 
> > Signed-off-by: Cindy Lu 
> > ---
> >  drivers/vdpa/mlx5/net/mlx5_vnet.c | 25 +
> >  1 file changed, 25 insertions(+)
> > 
> > diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
> > b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> > index ecfc16151d61..415b527a9c72 100644
> > --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
> > +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> > @@ -3785,10 +3785,35 @@ static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev 
> > *v_mdev, struct vdpa_device *
> > destroy_workqueue(wq);
> > mgtdev->ndev = NULL;
> >  }
> > +static int mlx5_vdpa_set_attr(struct vdpa_mgmt_dev *v_mdev,
> > + struct vdpa_device *dev,
> > + const struct vdpa_dev_set_config *add_config)
> > +{
> > +   struct mlx5_vdpa_dev *mvdev;
> > +   struct mlx5_vdpa_net *ndev;
> > +   struct mlx5_core_dev *mdev;
> > +   struct virtio_net_config *config;
> > +   struct mlx5_core_dev *pfmdev;
Reverse xmas tree?

> > +   int err = -EOPNOTSUPP;
> > +
> > +   mvdev = to_mvdev(dev);
> > +   ndev = to_mlx5_vdpa_ndev(mvdev);
> > +   mdev = mvdev->mdev;
> > +   config = &ndev->config;
> > +
You still need to take the ndev->reslock.

> > +   if (add_config->mask & (1 << VDPA_ATTR_DEV_NET_CFG_MACADDR)) {
> > +   pfmdev = pci_get_drvdata(pci_physfn(mdev->pdev));
> > +   err = mlx5_mpfs_add_mac(pfmdev, config->mac);
> > +   if (!err)
> > +   memcpy(config->mac, add_config->net.mac, ETH_ALEN);
What is the expected behaviour when the device is in use?

> > +   }
> > +   return err;
> 
> Similar to net simulator, how could be serialize the modification to
> mac address:
> 
> 1) from vdpa tool
> 2) via control virtqueue
> 
> Thanks
> 
> > +}
> > 
> >  static const struct vdpa_mgmtdev_ops mdev_ops = {
> > .dev_add = mlx5_vdpa_dev_add,
> > .dev_del = mlx5_vdpa_dev_del,
> > +   .dev_set_attr = mlx5_vdpa_set_attr,
> >  };
> > 
> >  static struct virtio_device_id id_table[] = {
> > --
> > 2.45.0
> > 
> 
Thanks,
Dragos


Re: [PATCH vhost 20/23] vdpa/mlx5: Pre-create hardware VQs at vdpa .dev_add time

2024-07-08 Thread Dragos Tatulea
Hi Zhu Yanjun,

On Mon, 2024-07-08 at 18:22 +0200, Zhu Yanjun wrote:
> 在 2024/6/17 17:07, Dragos Tatulea 写道:
> > Currently, hardware VQs are created right when the vdpa device gets into
> > DRIVER_OK state. That is easier because most of the VQ state is known by
> > then.
> > 
> > This patch switches to creating all VQs and their associated resources
> > at device creation time. The motivation is to reduce the vdpa device
> > live migration downtime by moving the expensive operation of creating
> > all the hardware VQs and their associated resources out of downtime on
> > the destination VM.
> 
> Hi, Dragos Tatulea
> 
>  From the above, when a device is created, all the VQs and their 
> associated resources are also created.
> If VM live migration does not occur, how much resources are wasted?
> 
> I mean, to achieve a better downtime, how much resource are used?
> 
When you use the vdpa device there are no resources wasted. The HW VQs that were
previously created at VM boot (during DRIVER_OK state) are now created at vdpa
device add time.

The trade-off here is that if you configure different VQ sizes then you will pay
the price of re-creating the VQs.

This could be mitigated by adding a default VQ size parameter that is setable
via the vdpa tool. But this part is not implemented in this series.

Ah, one more thing to keep in mind: the MSIX interrupts will be now allocated at
vdpa device creation time instead of VM startup.

> "
> On a 64 CPU, 256 GB VM with 1 vDPA device of 16 VQps, the full VQ
> resource creation + resume time was ~370ms. Now it's down to 60 ms
> (only VQ config and resume). The measurements were done on a ConnectX6DX
> based vDPA device.
> "
>  From the above, the performance is amazing.
> If we expect to use it in the production hosts, how much resources 
> should we prepare to achieve this downtime?
> 

You do need to have the latest FW (22.41.1000) to be able to get the full
benefit of the optimization.

Thanks,
Dragos
> Zhu Yanjun
> 
> > 
> > The VQs are now created in a blank state. The VQ configuration will
> > happen later, on DRIVER_OK. Then the configuration will be applied when
> > the VQs are moved to the Ready state.
> > 
> > When .set_vq_ready() is called on a VQ before DRIVER_OK, special care is
> > needed: now that the VQ is already created a resume_vq() will be
> > triggered too early when no mr has been configured yet. Skip calling
> > resume_vq() in this case, let it be handled during DRIVER_OK.
> > 
> > For virtio-vdpa, the device configuration is done earlier during
> > .vdpa_dev_add() by vdpa_register_device(). Avoid calling
> > setup_vq_resources() a second time in that case.
> > 
> > Signed-off-by: Dragos Tatulea 
> > Reviewed-by: Cosmin Ratiu 
> > ---
> >   drivers/vdpa/mlx5/net/mlx5_vnet.c | 37 
> > -
> >   1 file changed, 32 insertions(+), 5 deletions(-)
> > 
> > diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
> > b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> > index 249b5afbe34a..b2836fd3d1dd 100644
> > --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
> > +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> > @@ -2444,7 +2444,7 @@ static void mlx5_vdpa_set_vq_ready(struct vdpa_device 
> > *vdev, u16 idx, bool ready
> > mvq = &ndev->vqs[idx];
> > if (!ready) {
> > suspend_vq(ndev, mvq);
> > -   } else {
> > +   } else if (mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK) {
> > if (resume_vq(ndev, mvq))
> > ready = false;
> > }
> > @@ -3078,10 +3078,18 @@ static void mlx5_vdpa_set_status(struct vdpa_device 
> > *vdev, u8 status)
> > goto err_setup;
> > }
> > register_link_notifier(ndev);
> > -   err = setup_vq_resources(ndev, true);
> > -   if (err) {
> > -   mlx5_vdpa_warn(mvdev, "failed to setup 
> > driver\n");
> > -   goto err_driver;
> > +   if (ndev->setup) {
> > +   err = resume_vqs(ndev);
> > +   if (err) {
> > +   mlx5_vdpa_warn(mvdev, "failed to resume 
> > VQs\n");
> > +   goto err_driver;
> > +   }
> > +   } else {
> > +   err = setup_vq_resources(ndev, true);
> > +   if (err) {
> > +  

[PATCH vhost v3 23/24] vdpa/mlx5: Don't reset VQs more than necessary

2024-07-08 Thread Dragos Tatulea
The vdpa device can be reset many times in sequence without any
significant state changes in between. Previously this was not a problem:
VQs were torn down only on first reset. But after VQ pre-creation was
introduced, each reset will delete and re-create the hardware VQs and
their associated resources.

To solve this problem, avoid resetting hardware VQs if the VQs are still
in a blank state.

Reviewed-by: Cosmin Ratiu 
Acked-by: Eugenio Pérez 
Signed-off-by: Dragos Tatulea 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 30 +++---
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index f2af0ba82dd2..4e464a22381b 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -3134,18 +3134,41 @@ static void init_group_to_asid_map(struct mlx5_vdpa_dev 
*mvdev)
mvdev->group2asid[i] = 0;
 }
 
+static bool needs_vqs_reset(const struct mlx5_vdpa_dev *mvdev)
+{
+   struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+   struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[0];
+
+   if (mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK)
+   return true;
+
+   if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT)
+   return true;
+
+   return mvq->modified_fields & (
+   MLX5_VIRTQ_MODIFY_MASK_STATE |
+   MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS |
+   MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_AVAIL_IDX |
+   MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_USED_IDX
+   );
+}
+
 static int mlx5_vdpa_compat_reset(struct vdpa_device *vdev, u32 flags)
 {
struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+   bool vq_reset;
 
print_status(mvdev, 0, true);
mlx5_vdpa_info(mvdev, "performing device reset\n");
 
down_write(&ndev->reslock);
unregister_link_notifier(ndev);
-   teardown_vq_resources(ndev);
-   mvqs_set_defaults(ndev);
+   vq_reset = needs_vqs_reset(mvdev);
+   if (vq_reset) {
+   teardown_vq_resources(ndev);
+   mvqs_set_defaults(ndev);
+   }
 
if (flags & VDPA_RESET_F_CLEAN_MAP)
mlx5_vdpa_destroy_mr_resources(&ndev->mvdev);
@@ -3165,7 +3188,8 @@ static int mlx5_vdpa_compat_reset(struct vdpa_device 
*vdev, u32 flags)
if (mlx5_vdpa_create_dma_mr(mvdev))
mlx5_vdpa_warn(mvdev, "create MR failed\n");
}
-   setup_vq_resources(ndev, false);
+   if (vq_reset)
+   setup_vq_resources(ndev, false);
up_write(&ndev->reslock);
 
return 0;

-- 
2.45.2




[PATCH vhost v3 20/24] vdpa/mlx5: Use suspend/resume during VQP change

2024-07-08 Thread Dragos Tatulea
Resume a VQ if it is already created when the number of VQ pairs
increases. This is done in preparation for VQ pre-creation which is
coming in a later patch. It is necessary because calling setup_vq() on
an already created VQ will return early and will not enable the queue.

For symmetry, suspend a VQ instead of tearing it down when the number of
VQ pairs decreases. But only if the resume operation is supported.

Reviewed-by: Cosmin Ratiu 
Acked-by: Eugenio Pérez 
Signed-off-by: Dragos Tatulea 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 14 +++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index ce1f6a1f36cd..324604b16b91 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -2130,14 +2130,22 @@ static int change_num_qps(struct mlx5_vdpa_dev *mvdev, 
int newqps)
if (err)
return err;
 
-   for (i = ndev->cur_num_vqs - 1; i >= 2 * newqps; i--)
-   teardown_vq(ndev, &ndev->vqs[i]);
+   for (i = ndev->cur_num_vqs - 1; i >= 2 * newqps; i--) {
+   struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[i];
+
+   if (is_resumable(ndev))
+   suspend_vq(ndev, mvq);
+   else
+   teardown_vq(ndev, mvq);
+   }
 
ndev->cur_num_vqs = 2 * newqps;
} else {
ndev->cur_num_vqs = 2 * newqps;
for (i = cur_qps * 2; i < 2 * newqps; i++) {
-   err = setup_vq(ndev, &ndev->vqs[i], true);
+   struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[i];
+
+   err = mvq->initialized ? resume_vq(ndev, mvq) : 
setup_vq(ndev, mvq, true);
if (err)
goto clean_added;
}

-- 
2.45.2




[PATCH vhost v3 24/24] vdpa/mlx5: Don't enable non-active VQs in .set_vq_ready()

2024-07-08 Thread Dragos Tatulea
VQ indices in the range [cur_num_qps, max_vqs) represent queues that
have not yet been activated. .set_vq_ready should not activate these
VQs.

Reviewed-by: Cosmin Ratiu 
Acked-by: Eugenio Pérez 
Signed-off-by: Dragos Tatulea 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 4e464a22381b..4557b2d29c02 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1575,6 +1575,9 @@ static int resume_vq(struct mlx5_vdpa_net *ndev, struct 
mlx5_vdpa_virtqueue *mvq
if (!mvq->initialized)
return 0;
 
+   if (mvq->index >= ndev->cur_num_vqs)
+   return 0;
+
switch (mvq->fw_state) {
case MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT:
/* Due to a FW quirk we need to modify the VQ fields first then 
change state.

-- 
2.45.2




[PATCH vhost v3 19/24] vdpa/mlx5: Forward error in suspend/resume device

2024-07-08 Thread Dragos Tatulea
Start using the suspend/resume_vq() error return codes previously added.

Reviewed-by: Cosmin Ratiu 
Reviewed-by: Zhu Yanjun 
Reviewed-by: Eugenio Pérez 
Signed-off-by: Dragos Tatulea 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 12 
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index e65d488f7a08..ce1f6a1f36cd 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -3436,22 +3436,25 @@ static int mlx5_vdpa_suspend(struct vdpa_device *vdev)
 {
struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+   int err;
 
mlx5_vdpa_info(mvdev, "suspending device\n");
 
down_write(&ndev->reslock);
unregister_link_notifier(ndev);
-   suspend_vqs(ndev);
+   err = suspend_vqs(ndev);
mlx5_vdpa_cvq_suspend(mvdev);
mvdev->suspended = true;
up_write(&ndev->reslock);
-   return 0;
+
+   return err;
 }
 
 static int mlx5_vdpa_resume(struct vdpa_device *vdev)
 {
struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
struct mlx5_vdpa_net *ndev;
+   int err;
 
ndev = to_mlx5_vdpa_ndev(mvdev);
 
@@ -3459,10 +3462,11 @@ static int mlx5_vdpa_resume(struct vdpa_device *vdev)
 
down_write(&ndev->reslock);
mvdev->suspended = false;
-   resume_vqs(ndev);
+   err = resume_vqs(ndev);
register_link_notifier(ndev);
up_write(&ndev->reslock);
-   return 0;
+
+   return err;
 }
 
 static int mlx5_set_group_asid(struct vdpa_device *vdev, u32 group,

-- 
2.45.2




[PATCH vhost v3 22/24] vdpa/mlx5: Re-create HW VQs under certain conditions

2024-07-08 Thread Dragos Tatulea
There are a few conditions under which the hardware VQs need a full
teardown and setup:

- VQ size changed to something else than default value. Hardware VQ size
  modification is not supported.

- User turns off certain device features: mergeable buffers, checksum
  virtio 1.0 compliance. In these cases, the TIR and RQT need to be
  re-created.

Add a needs_teardown configuration variable and set it when detecting
the above scenarios. On next DRIVER_OK, the resources will be torn down
first.

Reviewed-by: Cosmin Ratiu 
Acked-by: Eugenio Pérez 
Signed-off-by: Dragos Tatulea 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 15 +++
 drivers/vdpa/mlx5/net/mlx5_vnet.h |  1 +
 2 files changed, 16 insertions(+)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 1747f5607838..f2af0ba82dd2 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -2390,6 +2390,7 @@ static void mlx5_vdpa_set_vq_num(struct vdpa_device 
*vdev, u16 idx, u32 num)
 }
 
mvq = &ndev->vqs[idx];
+   ndev->needs_teardown = num != mvq->num_ent;
mvq->num_ent = num;
 }
 
@@ -2800,6 +2801,7 @@ static int mlx5_vdpa_set_driver_features(struct 
vdpa_device *vdev, u64 features)
struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
u64 old_features = mvdev->actual_features;
+   u64 diff_features;
int err;
 
print_features(mvdev, features, true);
@@ -2822,6 +2824,14 @@ static int mlx5_vdpa_set_driver_features(struct 
vdpa_device *vdev, u64 features)
}
}
 
+   /* When below features diverge from initial device features, VQs need a 
full teardown. */
+#define NEEDS_TEARDOWN_MASK (BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) | \
+BIT_ULL(VIRTIO_NET_F_CSUM) | \
+BIT_ULL(VIRTIO_F_VERSION_1))
+
+   diff_features = mvdev->mlx_features ^ mvdev->actual_features;
+   ndev->needs_teardown = !!(diff_features & NEEDS_TEARDOWN_MASK);
+
update_cvq_info(mvdev);
return err;
 }
@@ -3038,6 +3048,7 @@ static void teardown_vq_resources(struct mlx5_vdpa_net 
*ndev)
destroy_rqt(ndev);
teardown_virtqueues(ndev);
ndev->setup = false;
+   ndev->needs_teardown = false;
 }
 
 static int setup_cvq_vring(struct mlx5_vdpa_dev *mvdev)
@@ -3078,6 +3089,10 @@ static void mlx5_vdpa_set_status(struct vdpa_device 
*vdev, u8 status)
goto err_setup;
}
register_link_notifier(ndev);
+
+   if (ndev->needs_teardown)
+   teardown_vq_resources(ndev);
+
if (ndev->setup) {
err = resume_vqs(ndev);
if (err) {
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.h 
b/drivers/vdpa/mlx5/net/mlx5_vnet.h
index 90b556a57971..00e79a7d0be8 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.h
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.h
@@ -56,6 +56,7 @@ struct mlx5_vdpa_net {
struct dentry *rx_dent;
struct dentry *rx_table_dent;
bool setup;
+   bool needs_teardown;
u32 cur_num_vqs;
u32 rqt_size;
bool nb_registered;

-- 
2.45.2




[PATCH vhost v3 18/24] vdpa/mlx5: Consolidate all VQ modify to Ready to use resume_vq()

2024-07-08 Thread Dragos Tatulea
There are a few more places modifying the VQ to Ready directly. Let's
consolidate them into resume_vq().

The redundant warnings for resume_vq() errors can also be dropped.

There is one special case that needs to be handled for virtio-vdpa:
the initialized flag must be set to true earlier in setup_vq() so that
resume_vq() doesn't return early.

Reviewed-by: Cosmin Ratiu 
Acked-by: Eugenio Pérez 
Signed-off-by: Dragos Tatulea 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 ++
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 8ab5cf1bbc43..e65d488f7a08 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -160,6 +160,7 @@ static void free_fixed_resources(struct mlx5_vdpa_net 
*ndev);
 static void mvqs_set_defaults(struct mlx5_vdpa_net *ndev);
 static int setup_vq_resources(struct mlx5_vdpa_net *ndev, bool filled);
 static void teardown_vq_resources(struct mlx5_vdpa_net *ndev);
+static int resume_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue 
*mvq);
 
 static bool mlx5_vdpa_debug;
 
@@ -1500,16 +1501,14 @@ static int setup_vq(struct mlx5_vdpa_net *ndev,
if (err)
goto err_vq;
 
+   mvq->initialized = true;
+
if (mvq->ready) {
-   err = modify_virtqueue_state(ndev, mvq, 
MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY);
-   if (err) {
-   mlx5_vdpa_warn(&ndev->mvdev, "failed to modify to ready 
vq idx %d(%d)\n",
-  idx, err);
+   err = resume_vq(ndev, mvq);
+   if (err)
goto err_modify;
-   }
}
 
-   mvq->initialized = true;
return 0;
 
 err_modify:
@@ -2422,7 +2421,6 @@ static void mlx5_vdpa_set_vq_ready(struct vdpa_device 
*vdev, u16 idx, bool ready
struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
struct mlx5_vdpa_virtqueue *mvq;
-   int err;
 
if (!mvdev->actual_features)
return;
@@ -2439,14 +2437,10 @@ static void mlx5_vdpa_set_vq_ready(struct vdpa_device 
*vdev, u16 idx, bool ready
if (!ready) {
suspend_vq(ndev, mvq);
} else {
-   err = modify_virtqueue_state(ndev, mvq, 
MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY);
-   if (err) {
-   mlx5_vdpa_warn(mvdev, "modify VQ %d to ready failed 
(%d)\n", idx, err);
+   if (resume_vq(ndev, mvq))
ready = false;
-   }
}
 
-
mvq->ready = ready;
 }
 

-- 
2.45.2




[PATCH vhost v3 21/24] vdpa/mlx5: Pre-create hardware VQs at vdpa .dev_add time

2024-07-08 Thread Dragos Tatulea
Currently, hardware VQs are created right when the vdpa device gets into
DRIVER_OK state. That is easier because most of the VQ state is known by
then.

This patch switches to creating all VQs and their associated resources
at device creation time. The motivation is to reduce the vdpa device
live migration downtime by moving the expensive operation of creating
all the hardware VQs and their associated resources out of downtime on
the destination VM.

The VQs are now created in a blank state. The VQ configuration will
happen later, on DRIVER_OK. Then the configuration will be applied when
the VQs are moved to the Ready state.

When .set_vq_ready() is called on a VQ before DRIVER_OK, special care is
needed: now that the VQ is already created a resume_vq() will be
triggered too early when no mr has been configured yet. Skip calling
resume_vq() in this case, let it be handled during DRIVER_OK.

On a 64 CPU, 256 GB VM with 1 vDPA device of 16 VQps, the full VQ
resource creation + resume time was ~370ms. Now it's down to 60 ms
(only VQ config and resume). The measurements were done on a ConnectX6DX
based vDPA device.

Signed-off-by: Dragos Tatulea 
Reviewed-by: Cosmin Ratiu 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 39 ++-
 1 file changed, 34 insertions(+), 5 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 324604b16b91..1747f5607838 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -2444,7 +2444,7 @@ static void mlx5_vdpa_set_vq_ready(struct vdpa_device 
*vdev, u16 idx, bool ready
mvq = &ndev->vqs[idx];
if (!ready) {
suspend_vq(ndev, mvq);
-   } else {
+   } else if (mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK) {
if (resume_vq(ndev, mvq))
ready = false;
}
@@ -3078,10 +3078,18 @@ static void mlx5_vdpa_set_status(struct vdpa_device 
*vdev, u8 status)
goto err_setup;
}
register_link_notifier(ndev);
-   err = setup_vq_resources(ndev, true);
-   if (err) {
-   mlx5_vdpa_warn(mvdev, "failed to setup 
driver\n");
-   goto err_driver;
+   if (ndev->setup) {
+   err = resume_vqs(ndev);
+   if (err) {
+   mlx5_vdpa_warn(mvdev, "failed to resume 
VQs\n");
+   goto err_driver;
+   }
+   } else {
+   err = setup_vq_resources(ndev, true);
+   if (err) {
+   mlx5_vdpa_warn(mvdev, "failed to setup 
driver\n");
+   goto err_driver;
+   }
}
} else {
mlx5_vdpa_warn(mvdev, "did not expect DRIVER_OK to be 
cleared\n");
@@ -3142,6 +3150,7 @@ static int mlx5_vdpa_compat_reset(struct vdpa_device 
*vdev, u32 flags)
if (mlx5_vdpa_create_dma_mr(mvdev))
mlx5_vdpa_warn(mvdev, "create MR failed\n");
}
+   setup_vq_resources(ndev, false);
up_write(&ndev->reslock);
 
return 0;
@@ -3835,8 +3844,23 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev 
*v_mdev, const char *name,
goto err_reg;
 
mgtdev->ndev = ndev;
+
+   /* The VQs might have been pre-created during device register.
+* This happens when virtio_vdpa is loaded before the vdpa device is 
added.
+*/
+   if (!ndev->setup)
+   return 0;
+
+   down_write(&ndev->reslock);
+   err = setup_vq_resources(ndev, false);
+   up_write(&ndev->reslock);
+   if (err)
+   goto err_setup_vq_res;
+
return 0;
 
+err_setup_vq_res:
+   _vdpa_unregister_device(&mvdev->vdev);
 err_reg:
destroy_workqueue(mvdev->wq);
 err_res2:
@@ -3862,6 +3886,11 @@ static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev 
*v_mdev, struct vdpa_device *
 
unregister_link_notifier(ndev);
_vdpa_unregister_device(dev);
+
+   down_write(&ndev->reslock);
+   teardown_vq_resources(ndev);
+   up_write(&ndev->reslock);
+
wq = mvdev->wq;
mvdev->wq = NULL;
destroy_workqueue(wq);

-- 
2.45.2




[PATCH vhost v3 16/24] vdpa/mlx5: Accept Init -> Ready VQ transition in resume_vq()

2024-07-08 Thread Dragos Tatulea
Until now resume_vq() was used only for the suspend/resume scenario.
This change also allows calling resume_vq() to bring it from Init to
Ready state (VQ initialization).

Reviewed-by: Cosmin Ratiu 
Acked-by: Eugenio Pérez 
Signed-off-by: Dragos Tatulea 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 24 ++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 0a62ce0b4af8..adcc4d63cf83 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1557,11 +1557,31 @@ static void suspend_vqs(struct mlx5_vdpa_net *ndev)
 
 static void resume_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue 
*mvq)
 {
-   if (!mvq->initialized || !is_resumable(ndev))
+   if (!mvq->initialized)
return;
 
-   if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND)
+   switch (mvq->fw_state) {
+   case MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT:
+   /* Due to a FW quirk we need to modify the VQ fields first then 
change state.
+* This should be fixed soon. After that, a single command can 
be used.
+*/
+   if (modify_virtqueue(ndev, mvq, 0))
+   mlx5_vdpa_warn(&ndev->mvdev,
+   "modify vq properties failed for vq %u\n", 
mvq->index);
+   break;
+   case MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND:
+   if (!is_resumable(ndev)) {
+   mlx5_vdpa_warn(&ndev->mvdev, "vq %d is not 
resumable\n", mvq->index);
+   return;
+   }
+   break;
+   case MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY:
return;
+   default:
+   mlx5_vdpa_warn(&ndev->mvdev, "resume vq %u called from bad 
state %d\n",
+  mvq->index, mvq->fw_state);
+   return;
+   }
 
if (modify_virtqueue_state(ndev, mvq, 
MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY))
mlx5_vdpa_warn(&ndev->mvdev, "modify to resume failed for vq 
%u\n", mvq->index);

-- 
2.45.2




[PATCH vhost v3 17/24] vdpa/mlx5: Add error code for suspend/resume VQ

2024-07-08 Thread Dragos Tatulea
Instead of blindly calling suspend/resume_vqs(), make then return error
codes.

To keep compatibility, keep suspending or resuming VQs on error and
return the last error code. The assumption here is that the error code
would be the same.

Reviewed-by: Cosmin Ratiu 
Acked-by: Eugenio Pérez 
Signed-off-by: Dragos Tatulea 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 77 +++
 1 file changed, 54 insertions(+), 23 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index adcc4d63cf83..8ab5cf1bbc43 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1526,71 +1526,102 @@ static int setup_vq(struct mlx5_vdpa_net *ndev,
return err;
 }
 
-static void suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue 
*mvq)
+static int suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue 
*mvq)
 {
struct mlx5_virtq_attr attr;
+   int err;
 
if (!mvq->initialized)
-   return;
+   return 0;
 
if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY)
-   return;
+   return 0;
 
-   if (modify_virtqueue_state(ndev, mvq, 
MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND))
-   mlx5_vdpa_warn(&ndev->mvdev, "modify to suspend failed\n");
+   err = modify_virtqueue_state(ndev, mvq, 
MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND);
+   if (err) {
+   mlx5_vdpa_warn(&ndev->mvdev, "modify to suspend failed, err: 
%d\n", err);
+   return err;
+   }
 
-   if (query_virtqueue(ndev, mvq, &attr)) {
-   mlx5_vdpa_warn(&ndev->mvdev, "failed to query virtqueue\n");
-   return;
+   err = query_virtqueue(ndev, mvq, &attr);
+   if (err) {
+   mlx5_vdpa_warn(&ndev->mvdev, "failed to query virtqueue, err: 
%d\n", err);
+   return err;
}
+
mvq->avail_idx = attr.available_index;
mvq->used_idx = attr.used_index;
+
+   return 0;
 }
 
-static void suspend_vqs(struct mlx5_vdpa_net *ndev)
+static int suspend_vqs(struct mlx5_vdpa_net *ndev)
 {
+   int err = 0;
int i;
 
-   for (i = 0; i < ndev->cur_num_vqs; i++)
-   suspend_vq(ndev, &ndev->vqs[i]);
+   for (i = 0; i < ndev->cur_num_vqs; i++) {
+   int local_err = suspend_vq(ndev, &ndev->vqs[i]);
+
+   err = local_err ? local_err : err;
+   }
+
+   return err;
 }
 
-static void resume_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue 
*mvq)
+static int resume_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue 
*mvq)
 {
+   int err;
+
if (!mvq->initialized)
-   return;
+   return 0;
 
switch (mvq->fw_state) {
case MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT:
/* Due to a FW quirk we need to modify the VQ fields first then 
change state.
 * This should be fixed soon. After that, a single command can 
be used.
 */
-   if (modify_virtqueue(ndev, mvq, 0))
+   err = modify_virtqueue(ndev, mvq, 0);
+   if (err) {
mlx5_vdpa_warn(&ndev->mvdev,
-   "modify vq properties failed for vq %u\n", 
mvq->index);
+   "modify vq properties failed for vq %u, err: 
%d\n",
+   mvq->index, err);
+   return err;
+   }
break;
case MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND:
if (!is_resumable(ndev)) {
mlx5_vdpa_warn(&ndev->mvdev, "vq %d is not 
resumable\n", mvq->index);
-   return;
+   return -EINVAL;
}
break;
case MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY:
-   return;
+   return 0;
default:
mlx5_vdpa_warn(&ndev->mvdev, "resume vq %u called from bad 
state %d\n",
   mvq->index, mvq->fw_state);
-   return;
+   return -EINVAL;
}
 
-   if (modify_virtqueue_state(ndev, mvq, 
MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY))
-   mlx5_vdpa_warn(&ndev->mvdev, "modify to resume failed for vq 
%u\n", mvq->index);
+   err = modify_virtqueue_state(ndev, mvq, 
MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY);
+   if (err)
+   mlx5_vdpa_warn(&ndev->mvdev, "modify to resume failed for vq 
%u, err: %d\n",
+  mvq->index, err);
+
+   return err;
 }
 
-static void resume_vqs(struct mlx5_vdpa_net *ndev)
+static int resume_vqs(struct mlx5_vdpa_net *ndev)
 {
-

[PATCH vhost v3 14/24] vdpa/mlx5: Set mkey modified flags on all VQs

2024-07-08 Thread Dragos Tatulea
Otherwise, when virtqueues are moved from INIT to READY the latest mkey
will not be set appropriately.

Reviewed-by: Cosmin Ratiu 
Acked-by: Eugenio Pérez 
Signed-off-by: Dragos Tatulea 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 7f1551aa1f78..a8ac542f30f7 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -2868,7 +2868,7 @@ static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev 
*mvdev,
 
mlx5_vdpa_update_mr(mvdev, new_mr, asid);
 
-   for (int i = 0; i < ndev->cur_num_vqs; i++)
+   for (int i = 0; i < mvdev->max_vqs; i++)
ndev->vqs[i].modified_fields |= 
MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY |

MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY;
 

-- 
2.45.2




[PATCH vhost v3 13/24] vdpa/mlx5: Start off rqt_size with max VQPs

2024-07-08 Thread Dragos Tatulea
Currently rqt_size is initialized during device flag configuration.
That's because it is the earliest moment when device knows if MQ
(multi queue) is on or off.

Shift this configuration earlier to device creation time. This implies
that non-MQ devices will have a larger RQT size. But the configuration
will still be correct.

This is done in preparation for the pre-creation of hardware virtqueues
at device add time. When that change will be added, RQT will be created
at device creation time so it needs to be initialized to its max size.

Reviewed-by: Cosmin Ratiu 
Acked-by: Eugenio Pérez 
Signed-off-by: Dragos Tatulea 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 406cc590fe42..7f1551aa1f78 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -2731,10 +2731,6 @@ static int mlx5_vdpa_set_driver_features(struct 
vdpa_device *vdev, u64 features)
return err;
 
ndev->mvdev.actual_features = features & ndev->mvdev.mlx_features;
-   if (ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_MQ))
-   ndev->rqt_size = mlx5vdpa16_to_cpu(mvdev, 
ndev->config.max_virtqueue_pairs);
-   else
-   ndev->rqt_size = 1;
 
/* Interested in changes of vq features only. */
if (get_features(old_features) != get_features(mvdev->actual_features)) 
{
@@ -3718,8 +3714,12 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev 
*v_mdev, const char *name,
goto err_alloc;
}
 
-   if (device_features & BIT_ULL(VIRTIO_NET_F_MQ))
+   if (device_features & BIT_ULL(VIRTIO_NET_F_MQ)) {
config->max_virtqueue_pairs = cpu_to_mlx5vdpa16(mvdev, max_vqs 
/ 2);
+   ndev->rqt_size = max_vqs / 2;
+   } else {
+   ndev->rqt_size = 1;
+   }
 
ndev->mvdev.mlx_features = device_features;
mvdev->vdev.dma_dev = &mdev->pdev->dev;

-- 
2.45.2




[PATCH vhost v3 12/24] vdpa/mlx5: Set an initial size on the VQ

2024-07-08 Thread Dragos Tatulea
The virtqueue size is a pre-requisite for setting up any virtqueue
resources. For the upcoming optimization of creating virtqueues at
device add, the virtqueue size has to be configured.

The queue size check in setup_vq() will always be false. So remove it.

Signed-off-by: Dragos Tatulea 
Reviewed-by: Cosmin Ratiu 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index db86e541b788..406cc590fe42 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -58,6 +58,8 @@ MODULE_LICENSE("Dual BSD/GPL");
  */
 #define MLX5V_DEFAULT_VQ_COUNT 2
 
+#define MLX5V_DEFAULT_VQ_SIZE 256
+
 struct mlx5_vdpa_cq_buf {
struct mlx5_frag_buf_ctrl fbc;
struct mlx5_frag_buf frag_buf;
@@ -1445,9 +1447,6 @@ static int setup_vq(struct mlx5_vdpa_net *ndev, struct 
mlx5_vdpa_virtqueue *mvq)
u16 idx = mvq->index;
int err;
 
-   if (!mvq->num_ent)
-   return 0;
-
if (mvq->initialized)
return 0;
 
@@ -3523,6 +3522,7 @@ static void mvqs_set_defaults(struct mlx5_vdpa_net *ndev)
mvq->ndev = ndev;
mvq->fwqp.fw = true;
mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_NONE;
+   mvq->num_ent = MLX5V_DEFAULT_VQ_SIZE;
}
 }
 

-- 
2.45.2




[PATCH vhost v3 15/24] vdpa/mlx5: Allow creation of blank VQs

2024-07-08 Thread Dragos Tatulea
Based on the filled flag, create VQs that are filled or blank.
Blank VQs will be filled in later through VQ modify.

Later patches will make use of this to pre-create blank VQs at
vdpa device creation.

Reviewed-by: Cosmin Ratiu 
Acked-by: Eugenio Pérez 
Signed-off-by: Dragos Tatulea 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 85 +--
 1 file changed, 55 insertions(+), 30 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index a8ac542f30f7..0a62ce0b4af8 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -158,7 +158,7 @@ static bool is_index_valid(struct mlx5_vdpa_dev *mvdev, u16 
idx)
 
 static void free_fixed_resources(struct mlx5_vdpa_net *ndev);
 static void mvqs_set_defaults(struct mlx5_vdpa_net *ndev);
-static int setup_vq_resources(struct mlx5_vdpa_net *ndev);
+static int setup_vq_resources(struct mlx5_vdpa_net *ndev, bool filled);
 static void teardown_vq_resources(struct mlx5_vdpa_net *ndev);
 
 static bool mlx5_vdpa_debug;
@@ -874,13 +874,16 @@ static bool msix_mode_supported(struct mlx5_vdpa_dev 
*mvdev)
pci_msix_can_alloc_dyn(mvdev->mdev->pdev);
 }
 
-static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct 
mlx5_vdpa_virtqueue *mvq)
+static int create_virtqueue(struct mlx5_vdpa_net *ndev,
+   struct mlx5_vdpa_virtqueue *mvq,
+   bool filled)
 {
int inlen = MLX5_ST_SZ_BYTES(create_virtio_net_q_in);
u32 out[MLX5_ST_SZ_DW(create_virtio_net_q_out)] = {};
struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
struct mlx5_vdpa_mr *vq_mr;
struct mlx5_vdpa_mr *vq_desc_mr;
+   u64 features = filled ? mvdev->actual_features : mvdev->mlx_features;
void *obj_context;
u16 mlx_features;
void *cmd_hdr;
@@ -898,7 +901,7 @@ static int create_virtqueue(struct mlx5_vdpa_net *ndev, 
struct mlx5_vdpa_virtque
goto err_alloc;
}
 
-   mlx_features = get_features(ndev->mvdev.actual_features);
+   mlx_features = get_features(features);
cmd_hdr = MLX5_ADDR_OF(create_virtio_net_q_in, in, 
general_obj_in_cmd_hdr);
 
MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, 
MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
@@ -906,8 +909,6 @@ static int create_virtqueue(struct mlx5_vdpa_net *ndev, 
struct mlx5_vdpa_virtque
MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
 
obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in, obj_context);
-   MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, 
mvq->avail_idx);
-   MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, 
mvq->used_idx);
MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_12_3,
 mlx_features >> 3);
MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_2_0,
@@ -929,17 +930,36 @@ static int create_virtqueue(struct mlx5_vdpa_net *ndev, 
struct mlx5_vdpa_virtque
MLX5_SET(virtio_q, vq_ctx, queue_index, mvq->index);
MLX5_SET(virtio_q, vq_ctx, queue_size, mvq->num_ent);
MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0,
-!!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_F_VERSION_1)));
-   MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr);
-   MLX5_SET64(virtio_q, vq_ctx, used_addr, mvq->device_addr);
-   MLX5_SET64(virtio_q, vq_ctx, available_addr, mvq->driver_addr);
-   vq_mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP]];
-   if (vq_mr)
-   MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, vq_mr->mkey);
-
-   vq_desc_mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_DESC_GROUP]];
-   if (vq_desc_mr && MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, 
desc_group_mkey_supported))
-   MLX5_SET(virtio_q, vq_ctx, desc_group_mkey, vq_desc_mr->mkey);
+!!(features & BIT_ULL(VIRTIO_F_VERSION_1)));
+
+   if (filled) {
+   MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, 
mvq->avail_idx);
+   MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, 
mvq->used_idx);
+
+   MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr);
+   MLX5_SET64(virtio_q, vq_ctx, used_addr, mvq->device_addr);
+   MLX5_SET64(virtio_q, vq_ctx, available_addr, mvq->driver_addr);
+
+   vq_mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP]];
+   if (vq_mr)
+   MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, vq_mr->mkey);
+
+   vq_desc_mr = 
mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_DESC_GROUP]];
+   if (vq_desc_mr &&
+   MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, 
desc_group_mkey_supported))
+   MLX5_SE

[PATCH vhost v3 07/24] vdpa/mlx5: Initialize and reset device with one queue pair

2024-07-08 Thread Dragos Tatulea
The virtio spec says that a vdpa device should start off with one queue
pair. The driver is already compliant.

This patch moves the initialization to device add and reset times. This
is done in preparation for the pre-creation of hardware virtqueues at
device add time.

Reviewed-by: Cosmin Ratiu 
Acked-by: Eugenio Pérez 
Signed-off-by: Dragos Tatulea 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 23 ---
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index eca6f68c2eda..c8b5c87f001d 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -48,6 +48,16 @@ MODULE_LICENSE("Dual BSD/GPL");
 
 #define MLX5V_UNTAGGED 0x1000
 
+/* Device must start with 1 queue pair, as per VIRTIO v1.2 spec, section
+ * 5.1.6.5.5 "Device operation in multiqueue mode":
+ *
+ * Multiqueue is disabled by default.
+ * The driver enables multiqueue by sending a command using class
+ * VIRTIO_NET_CTRL_MQ. The command selects the mode of multiqueue
+ * operation, as follows: ...
+ */
+#define MLX5V_DEFAULT_VQ_COUNT 2
+
 struct mlx5_vdpa_cq_buf {
struct mlx5_frag_buf_ctrl fbc;
struct mlx5_frag_buf frag_buf;
@@ -2713,16 +2723,6 @@ static int mlx5_vdpa_set_driver_features(struct 
vdpa_device *vdev, u64 features)
else
ndev->rqt_size = 1;
 
-   /* Device must start with 1 queue pair, as per VIRTIO v1.2 spec, section
-* 5.1.6.5.5 "Device operation in multiqueue mode":
-*
-* Multiqueue is disabled by default.
-* The driver enables multiqueue by sending a command using class
-* VIRTIO_NET_CTRL_MQ. The command selects the mode of multiqueue
-* operation, as follows: ...
-*/
-   ndev->cur_num_vqs = 2;
-
update_cvq_info(mvdev);
return err;
 }
@@ -3040,7 +3040,7 @@ static int mlx5_vdpa_compat_reset(struct vdpa_device 
*vdev, u32 flags)
mlx5_vdpa_destroy_mr_resources(&ndev->mvdev);
ndev->mvdev.status = 0;
ndev->mvdev.suspended = false;
-   ndev->cur_num_vqs = 0;
+   ndev->cur_num_vqs = MLX5V_DEFAULT_VQ_COUNT;
ndev->mvdev.cvq.received_desc = 0;
ndev->mvdev.cvq.completed_desc = 0;
memset(ndev->event_cbs, 0, sizeof(*ndev->event_cbs) * (mvdev->max_vqs + 
1));
@@ -3643,6 +3643,7 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev 
*v_mdev, const char *name,
err = -ENOMEM;
goto err_alloc;
}
+   ndev->cur_num_vqs = MLX5V_DEFAULT_VQ_COUNT;
 
init_mvqs(ndev);
allocate_irqs(ndev);

-- 
2.45.2




[PATCH vhost v3 03/24] vdpa/mlx5: Drop redundant code

2024-07-08 Thread Dragos Tatulea
Originally, the second loop initialized the CVQ. But (acde3929492b
("vdpa/mlx5: Use consistent RQT size") initialized all the queues in the
first loop, so the second iteration in init_mvqs() is never called
because the first one will iterate up to max_vqs.

Reviewed-by: Cosmin Ratiu 
Acked-by: Eugenio Pérez 
Signed-off-by: Dragos Tatulea 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 1ad281cbc541..b4d9ef4f66c8 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -3519,12 +3519,6 @@ static void init_mvqs(struct mlx5_vdpa_net *ndev)
mvq->fwqp.fw = true;
mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_NONE;
}
-   for (; i < ndev->mvdev.max_vqs; i++) {
-   mvq = &ndev->vqs[i];
-   memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
-   mvq->index = i;
-   mvq->ndev = ndev;
-   }
 }
 
 struct mlx5_vdpa_mgmtdev {

-- 
2.45.2




[PATCH vhost v3 06/24] vdpa/mlx5: Remove duplicate suspend code

2024-07-08 Thread Dragos Tatulea
Use the dedicated suspend_vqs() function instead.

Reviewed-by: Cosmin Ratiu 
Reviewed-by: Eugenio Pérez 
Signed-off-by: Dragos Tatulea 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 7 +--
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 51630b1935f4..eca6f68c2eda 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -3355,17 +3355,12 @@ static int mlx5_vdpa_suspend(struct vdpa_device *vdev)
 {
struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
-   struct mlx5_vdpa_virtqueue *mvq;
-   int i;
 
mlx5_vdpa_info(mvdev, "suspending device\n");
 
down_write(&ndev->reslock);
unregister_link_notifier(ndev);
-   for (i = 0; i < ndev->cur_num_vqs; i++) {
-   mvq = &ndev->vqs[i];
-   suspend_vq(ndev, mvq);
-   }
+   suspend_vqs(ndev);
mlx5_vdpa_cvq_suspend(mvdev);
mvdev->suspended = true;
up_write(&ndev->reslock);

-- 
2.45.2




[PATCH vhost v3 11/24] vdpa/mlx5: Add support for modifying the VQ features field

2024-07-08 Thread Dragos Tatulea
This is done in preparation for the pre-creation of hardware virtqueues
at device add time.

Signed-off-by: Dragos Tatulea 
Reviewed-by: Cosmin Ratiu 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c  | 12 +++-
 include/linux/mlx5/mlx5_ifc_vdpa.h |  1 +
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index b104849f8477..db86e541b788 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1287,6 +1287,15 @@ static int modify_virtqueue(struct mlx5_vdpa_net *ndev,
MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0,
!!(ndev->mvdev.actual_features & 
BIT_ULL(VIRTIO_F_VERSION_1)));
 
+   if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_QUEUE_FEATURES) {
+   u16 mlx_features = get_features(ndev->mvdev.actual_features);
+
+   MLX5_SET(virtio_net_q_object, obj_context, 
queue_feature_bit_mask_12_3,
+mlx_features >> 3);
+   MLX5_SET(virtio_net_q_object, obj_context, 
queue_feature_bit_mask_2_0,
+mlx_features & 7);
+   }
+
if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY) {
vq_mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP]];
 
@@ -2734,7 +2743,8 @@ static int mlx5_vdpa_set_driver_features(struct 
vdpa_device *vdev, u64 features)
struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[i];
 
mvq->modified_fields |= (
-   MLX5_VIRTQ_MODIFY_MASK_QUEUE_VIRTIO_VERSION
+   MLX5_VIRTQ_MODIFY_MASK_QUEUE_VIRTIO_VERSION |
+   MLX5_VIRTQ_MODIFY_MASK_QUEUE_FEATURES
);
}
}
diff --git a/include/linux/mlx5/mlx5_ifc_vdpa.h 
b/include/linux/mlx5/mlx5_ifc_vdpa.h
index 34f27c01cec9..58dfa2ee7c83 100644
--- a/include/linux/mlx5/mlx5_ifc_vdpa.h
+++ b/include/linux/mlx5/mlx5_ifc_vdpa.h
@@ -150,6 +150,7 @@ enum {
MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_USED_IDX= (u64)1 << 8,
MLX5_VIRTQ_MODIFY_MASK_QUEUE_VIRTIO_VERSION = (u64)1 << 10,
MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY= (u64)1 << 11,
+   MLX5_VIRTQ_MODIFY_MASK_QUEUE_FEATURES   = (u64)1 << 12,
MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY  = (u64)1 << 14,
 };
 

-- 
2.45.2




[PATCH vhost v3 10/24] vdpa/mlx5: Add support for modifying the virtio_version VQ field

2024-07-08 Thread Dragos Tatulea
This is done in preparation for the pre-creation of hardware virtqueues
at device add time.

Signed-off-by: Dragos Tatulea 
Reviewed-by: Cosmin Ratiu 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c  | 16 
 include/linux/mlx5/mlx5_ifc_vdpa.h |  1 +
 2 files changed, 17 insertions(+)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 739c2886fc33..b104849f8477 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1283,6 +1283,10 @@ static int modify_virtqueue(struct mlx5_vdpa_net *ndev,
if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_USED_IDX)
MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, 
mvq->used_idx);
 
+   if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_QUEUE_VIRTIO_VERSION)
+   MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0,
+   !!(ndev->mvdev.actual_features & 
BIT_ULL(VIRTIO_F_VERSION_1)));
+
if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY) {
vq_mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP]];
 
@@ -2709,6 +2713,7 @@ static int mlx5_vdpa_set_driver_features(struct 
vdpa_device *vdev, u64 features)
 {
struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+   u64 old_features = mvdev->actual_features;
int err;
 
print_features(mvdev, features, true);
@@ -2723,6 +2728,17 @@ static int mlx5_vdpa_set_driver_features(struct 
vdpa_device *vdev, u64 features)
else
ndev->rqt_size = 1;
 
+   /* Interested in changes of vq features only. */
+   if (get_features(old_features) != get_features(mvdev->actual_features)) 
{
+   for (int i = 0; i < mvdev->max_vqs; ++i) {
+   struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[i];
+
+   mvq->modified_fields |= (
+   MLX5_VIRTQ_MODIFY_MASK_QUEUE_VIRTIO_VERSION
+   );
+   }
+   }
+
update_cvq_info(mvdev);
return err;
 }
diff --git a/include/linux/mlx5/mlx5_ifc_vdpa.h 
b/include/linux/mlx5/mlx5_ifc_vdpa.h
index 40371c916cf9..34f27c01cec9 100644
--- a/include/linux/mlx5/mlx5_ifc_vdpa.h
+++ b/include/linux/mlx5/mlx5_ifc_vdpa.h
@@ -148,6 +148,7 @@ enum {
MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS   = (u64)1 << 6,
MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_AVAIL_IDX   = (u64)1 << 7,
MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_USED_IDX= (u64)1 << 8,
+   MLX5_VIRTQ_MODIFY_MASK_QUEUE_VIRTIO_VERSION = (u64)1 << 10,
MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY= (u64)1 << 11,
MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY  = (u64)1 << 14,
 };

-- 
2.45.2




[PATCH vhost v3 04/24] vdpa/mlx5: Drop redundant check in teardown_virtqueues()

2024-07-08 Thread Dragos Tatulea
The check is done inside teardown_vq().

Reviewed-by: Cosmin Ratiu 
Reviewed-by: Eugenio Pérez 
Signed-off-by: Dragos Tatulea 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 10 ++
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index b4d9ef4f66c8..96782b34e2b2 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -2559,16 +2559,10 @@ static int setup_virtqueues(struct mlx5_vdpa_dev *mvdev)
 
 static void teardown_virtqueues(struct mlx5_vdpa_net *ndev)
 {
-   struct mlx5_vdpa_virtqueue *mvq;
int i;
 
-   for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {
-   mvq = &ndev->vqs[i];
-   if (!mvq->initialized)
-   continue;
-
-   teardown_vq(ndev, mvq);
-   }
+   for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--)
+   teardown_vq(ndev, &ndev->vqs[i]);
 }
 
 static void update_cvq_info(struct mlx5_vdpa_dev *mvdev)

-- 
2.45.2




[PATCH vhost v3 09/24] vdpa/mlx5: Rename init_mvqs

2024-07-08 Thread Dragos Tatulea
Function is used to set default values, so name it accordingly.

Reviewed-by: Eugenio Pérez 
Signed-off-by: Dragos Tatulea 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index de013b5a2815..739c2886fc33 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -155,7 +155,7 @@ static bool is_index_valid(struct mlx5_vdpa_dev *mvdev, u16 
idx)
 }
 
 static void free_fixed_resources(struct mlx5_vdpa_net *ndev);
-static void init_mvqs(struct mlx5_vdpa_net *ndev);
+static void mvqs_set_defaults(struct mlx5_vdpa_net *ndev);
 static int setup_vq_resources(struct mlx5_vdpa_net *ndev);
 static void teardown_vq_resources(struct mlx5_vdpa_net *ndev);
 
@@ -2810,7 +2810,7 @@ static void restore_channels_info(struct mlx5_vdpa_net 
*ndev)
int i;
 
mlx5_clear_vqs(ndev);
-   init_mvqs(ndev);
+   mvqs_set_defaults(ndev);
for (i = 0; i < ndev->mvdev.max_vqs; i++) {
mvq = &ndev->vqs[i];
ri = &mvq->ri;
@@ -3023,7 +3023,7 @@ static int mlx5_vdpa_compat_reset(struct vdpa_device *vdev, u32 flags)
down_write(&ndev->reslock);
unregister_link_notifier(ndev);
teardown_vq_resources(ndev);
-   init_mvqs(ndev);
+   mvqs_set_defaults(ndev);
 
if (flags & VDPA_RESET_F_CLEAN_MAP)
mlx5_vdpa_destroy_mr_resources(&ndev->mvdev);
@@ -3485,7 +3485,7 @@ static void free_fixed_resources(struct mlx5_vdpa_net *ndev)
res->valid = false;
 }
 
-static void init_mvqs(struct mlx5_vdpa_net *ndev)
+static void mvqs_set_defaults(struct mlx5_vdpa_net *ndev)
 {
struct mlx5_vdpa_virtqueue *mvq;
int i;
@@ -3635,7 +3635,7 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name,
}
ndev->cur_num_vqs = MLX5V_DEFAULT_VQ_COUNT;
 
-   init_mvqs(ndev);
+   mvqs_set_defaults(ndev);
allocate_irqs(ndev);
init_rwsem(&ndev->reslock);
config = &ndev->config;

-- 
2.45.2




[PATCH vhost v3 08/24] vdpa/mlx5: Clear and reinitialize software VQ data on reset

2024-07-08 Thread Dragos Tatulea
The hardware VQ configuration is mirrored by data in struct
mlx5_vdpa_virtqueue. Instead of clearing just a few fields at reset,
fully clear the struct and initialize it with the appropriate default
values.

As clear_vqs_ready() is used only during reset, get rid of it.
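
The pattern being adopted looks roughly like the sketch below; the ndev
back-pointer re-initialization is an assumption for illustration, not
copied from init_mvqs():

for (i = 0; i < ndev->mvdev.max_vqs; i++) {
	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[i];

	/* Full clear: wipes ready, modified_fields and any stale state. */
	memset(mvq, 0, sizeof(*mvq));
	/* Re-apply identity defaults. */
	mvq->index = i;
	mvq->ndev = ndev;
}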

Reviewed-by: Cosmin Ratiu 
Acked-by: Eugenio Pérez 
Signed-off-by: Dragos Tatulea 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 16 +++-
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index c8b5c87f001d..de013b5a2815 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -2941,18 +2941,6 @@ static void teardown_vq_resources(struct mlx5_vdpa_net *ndev)
ndev->setup = false;
 }
 
-static void clear_vqs_ready(struct mlx5_vdpa_net *ndev)
-{
-   int i;
-
-   for (i = 0; i < ndev->mvdev.max_vqs; i++) {
-   ndev->vqs[i].ready = false;
-   ndev->vqs[i].modified_fields = 0;
-   }
-
-   ndev->mvdev.cvq.ready = false;
-}
-
 static int setup_cvq_vring(struct mlx5_vdpa_dev *mvdev)
 {
struct mlx5_control_vq *cvq = &mvdev->cvq;
@@ -3035,12 +3023,14 @@ static int mlx5_vdpa_compat_reset(struct vdpa_device *vdev, u32 flags)
down_write(&ndev->reslock);
unregister_link_notifier(ndev);
teardown_vq_resources(ndev);
-   clear_vqs_ready(ndev);
+   init_mvqs(ndev);
+
if (flags & VDPA_RESET_F_CLEAN_MAP)
mlx5_vdpa_destroy_mr_resources(&ndev->mvdev);
ndev->mvdev.status = 0;
ndev->mvdev.suspended = false;
ndev->cur_num_vqs = MLX5V_DEFAULT_VQ_COUNT;
+   ndev->mvdev.cvq.ready = false;
ndev->mvdev.cvq.received_desc = 0;
ndev->mvdev.cvq.completed_desc = 0;
memset(ndev->event_cbs, 0, sizeof(*ndev->event_cbs) * (mvdev->max_vqs + 1));

-- 
2.45.2




[PATCH vhost v3 02/24] vdpa/mlx5: Make setup/teardown_vq_resources() symmetrical

2024-07-08 Thread Dragos Tatulea
... by changing the setup_vq_resources() parameter type.

Reviewed-by: Cosmin Ratiu 
Acked-by: Eugenio Pérez 
Signed-off-by: Dragos Tatulea 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 3422da0e344b..1ad281cbc541 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -146,7 +146,7 @@ static bool is_index_valid(struct mlx5_vdpa_dev *mvdev, u16 idx)
 
 static void free_fixed_resources(struct mlx5_vdpa_net *ndev);
 static void init_mvqs(struct mlx5_vdpa_net *ndev);
-static int setup_vq_resources(struct mlx5_vdpa_dev *mvdev);
+static int setup_vq_resources(struct mlx5_vdpa_net *ndev);
 static void teardown_vq_resources(struct mlx5_vdpa_net *ndev);
 
 static bool mlx5_vdpa_debug;
@@ -2862,7 +2862,7 @@ static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev,
 
if (teardown) {
restore_channels_info(ndev);
-   err = setup_vq_resources(mvdev);
+   err = setup_vq_resources(ndev);
if (err)
return err;
}
@@ -2873,9 +2873,9 @@ static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev,
 }
 
 /* reslock must be held for this function */
-static int setup_vq_resources(struct mlx5_vdpa_dev *mvdev)
+static int setup_vq_resources(struct mlx5_vdpa_net *ndev)
 {
-   struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+   struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
int err;
 
WARN_ON(!rwsem_is_locked(&ndev->reslock));
@@ -2997,7 +2997,7 @@ static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status)
goto err_setup;
}
register_link_notifier(ndev);
-   err = setup_vq_resources(mvdev);
+   err = setup_vq_resources(ndev);
if (err) {
mlx5_vdpa_warn(mvdev, "failed to setup driver\n");
goto err_driver;

-- 
2.45.2




[PATCH vhost v3 05/24] vdpa/mlx5: Iterate over active VQs during suspend/resume

2024-07-08 Thread Dragos Tatulea
No need to iterate over the max number of VQs; only the currently
active VQs have to be suspended or resumed.
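
The distinction, expressed as a hypothetical helper (vq_is_active()
does not exist in the driver; it only illustrates the cur_num_vqs
semantics):

/* max_vqs: device capability, fixed at creation time.
 * cur_num_vqs: number of VQs currently in use, e.g. after the guest
 * negotiated a multi-queue count; always <= max_vqs.
 */
static bool vq_is_active(struct mlx5_vdpa_net *ndev, int idx)
{
	return idx < ndev->cur_num_vqs;
}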

Reviewed-by: Cosmin Ratiu 
Acked-by: Eugenio Pérez 
Signed-off-by: Dragos Tatulea 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 96782b34e2b2..51630b1935f4 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1504,7 +1504,7 @@ static void suspend_vqs(struct mlx5_vdpa_net *ndev)
 {
int i;
 
-   for (i = 0; i < ndev->mvdev.max_vqs; i++)
+   for (i = 0; i < ndev->cur_num_vqs; i++)
suspend_vq(ndev, &ndev->vqs[i]);
 }
 
@@ -1522,7 +1522,7 @@ static void resume_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mv
 
 static void resume_vqs(struct mlx5_vdpa_net *ndev)
 {
-   for (int i = 0; i < ndev->mvdev.max_vqs; i++)
+   for (int i = 0; i < ndev->cur_num_vqs; i++)
resume_vq(ndev, &ndev->vqs[i]);
 }
 

-- 
2.45.2




[PATCH vhost v3 01/24] vdpa/mlx5: Clarify meaning through function rename

2024-07-08 Thread Dragos Tatulea
setup_driver()/teardown_driver() are a bit vague. These functions are
used for virtqueue resources.

Same for alloc_resources()/free_resources(): they represent fixed
resources that are meant to exist during the device lifetime.

Reviewed-by: Cosmin Ratiu 
Acked-by: Eugenio Pérez 
Signed-off-by: Dragos Tatulea 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 28 ++--
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index ecfc16151d61..3422da0e344b 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -144,10 +144,10 @@ static bool is_index_valid(struct mlx5_vdpa_dev *mvdev, u16 idx)
return idx <= mvdev->max_idx;
 }
 
-static void free_resources(struct mlx5_vdpa_net *ndev);
+static void free_fixed_resources(struct mlx5_vdpa_net *ndev);
 static void init_mvqs(struct mlx5_vdpa_net *ndev);
-static int setup_driver(struct mlx5_vdpa_dev *mvdev);
-static void teardown_driver(struct mlx5_vdpa_net *ndev);
+static int setup_vq_resources(struct mlx5_vdpa_dev *mvdev);
+static void teardown_vq_resources(struct mlx5_vdpa_net *ndev);
 
 static bool mlx5_vdpa_debug;
 
@@ -2848,7 +2848,7 @@ static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev,
if (err)
return err;
 
-   teardown_driver(ndev);
+   teardown_vq_resources(ndev);
}
 
mlx5_vdpa_update_mr(mvdev, new_mr, asid);
@@ -2862,7 +2862,7 @@ static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev,
 
if (teardown) {
restore_channels_info(ndev);
-   err = setup_driver(mvdev);
+   err = setup_vq_resources(mvdev);
if (err)
return err;
}
@@ -2873,7 +2873,7 @@ static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev,
 }
 
 /* reslock must be held for this function */
-static int setup_driver(struct mlx5_vdpa_dev *mvdev)
+static int setup_vq_resources(struct mlx5_vdpa_dev *mvdev)
 {
struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
int err;
@@ -2931,7 +2931,7 @@ static int setup_driver(struct mlx5_vdpa_dev *mvdev)
 }
 
 /* reslock must be held for this function */
-static void teardown_driver(struct mlx5_vdpa_net *ndev)
+static void teardown_vq_resources(struct mlx5_vdpa_net *ndev)
 {
 
WARN_ON(!rwsem_is_locked(&ndev->reslock));
@@ -2997,7 +2997,7 @@ static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status)
goto err_setup;
}
register_link_notifier(ndev);
-   err = setup_driver(mvdev);
+   err = setup_vq_resources(mvdev);
if (err) {
mlx5_vdpa_warn(mvdev, "failed to setup driver\n");
goto err_driver;
@@ -3040,7 +3040,7 @@ static int mlx5_vdpa_compat_reset(struct vdpa_device *vdev, u32 flags)
 
down_write(&ndev->reslock);
unregister_link_notifier(ndev);
-   teardown_driver(ndev);
+   teardown_vq_resources(ndev);
clear_vqs_ready(ndev);
if (flags & VDPA_RESET_F_CLEAN_MAP)
mlx5_vdpa_destroy_mr_resources(&ndev->mvdev);
@@ -3197,7 +3197,7 @@ static void mlx5_vdpa_free(struct vdpa_device *vdev)
 
ndev = to_mlx5_vdpa_ndev(mvdev);
 
-   free_resources(ndev);
+   free_fixed_resources(ndev);
mlx5_vdpa_destroy_mr_resources(mvdev);
if (!is_zero_ether_addr(ndev->config.mac)) {
pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
@@ -3467,7 +3467,7 @@ static int query_mtu(struct mlx5_core_dev *mdev, u16 *mtu)
return 0;
 }
 
-static int alloc_resources(struct mlx5_vdpa_net *ndev)
+static int alloc_fixed_resources(struct mlx5_vdpa_net *ndev)
 {
struct mlx5_vdpa_net_resources *res = &ndev->res;
int err;
@@ -3494,7 +3494,7 @@ static int alloc_resources(struct mlx5_vdpa_net *ndev)
return err;
 }
 
-static void free_resources(struct mlx5_vdpa_net *ndev)
+static void free_fixed_resources(struct mlx5_vdpa_net *ndev)
 {
struct mlx5_vdpa_net_resources *res = &ndev->res;
 
@@ -3735,7 +3735,7 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name,
goto err_res;
}
 
-   err = alloc_resources(ndev);
+   err = alloc_fixed_resources(ndev);
if (err)
goto err_mr;
 
@@ -3758,7 +3758,7 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name,
 err_reg:
destroy_workqueue(mvdev->wq);
 err_res2:
-   free_resources(ndev);
+   free_fixed_resources(ndev);
 err_mr:
mlx5_vdpa_destroy_mr_resources(mvdev);
 err_res:

-- 
2.45.2



