[PATCH] vdpa/mlx5: should keep avail_index despite device status

2020-10-01 Thread Si-Wei Liu
A VM with mlx5 vDPA has below warnings while being reset:

vhost VQ 0 ring restore failed: -1: Resource temporarily unavailable (11)
vhost VQ 1 ring restore failed: -1: Resource temporarily unavailable (11)

Userspace emulating the virtio device should be able to
retrieve the vq's avail_index regardless of the vDPA device
status. Save the index that was last seen when the virtq was
stopped, so that userspace doesn't complain.

Signed-off-by: Si-Wei Liu 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 20 ++--
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 70676a6..74264e59 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1133,15 +1133,17 @@ static void suspend_vq(struct mlx5_vdpa_net *ndev, 
struct mlx5_vdpa_virtqueue *m
if (!mvq->initialized)
return;
 
-   if (query_virtqueue(ndev, mvq, &attr)) {
-   mlx5_vdpa_warn(&ndev->mvdev, "failed to query virtqueue\n");
-   return;
-   }
if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY)
return;
 
if (modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND))
mlx5_vdpa_warn(&ndev->mvdev, "modify to suspend failed\n");
+
+   if (query_virtqueue(ndev, mvq, &attr)) {
+   mlx5_vdpa_warn(&ndev->mvdev, "failed to query virtqueue\n");
+   return;
+   }
+   mvq->avail_idx = attr.available_index;
 }
 
 static void suspend_vqs(struct mlx5_vdpa_net *ndev)
@@ -1411,8 +1413,14 @@ static int mlx5_vdpa_get_vq_state(struct vdpa_device 
*vdev, u16 idx, struct vdpa
struct mlx5_virtq_attr attr;
int err;
 
-   if (!mvq->initialized)
-   return -EAGAIN;
+   /* If the virtq object was destroyed, use the value saved at
+* the last minute of suspend_vq. This caters for userspace
+* that cares about emulating the index after vq is stopped.
+*/
+   if (!mvq->initialized) {
+   state->avail_index = mvq->avail_idx;
+   return 0;
+   }
 
err = query_virtqueue(ndev, mvq, &attr);
if (err) {
-- 
1.8.3.1
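
The "Resource temporarily unavailable (11)" warnings above are errno
EAGAIN propagated to userspace when it reads the ring index back after
the device has been reset. Below is a minimal userspace sketch of that
read-back path; the device path and the amount of prior vhost setup are
assumptions for illustration only, this is not QEMU code.

/*
 * Minimal sketch (not QEMU code): read back the avail index of VQ 0
 * through the vhost-vdpa char device.  /dev/vhost-vdpa-0 and the
 * minimal setup shown are assumptions for illustration.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

int main(void)
{
	struct vhost_vring_state state = { .index = 0 };	/* VQ 0 */
	int fd = open("/dev/vhost-vdpa-0", O_RDWR);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	ioctl(fd, VHOST_SET_OWNER);
	/* Ends up in the backend's .get_vq_state(); without the patch this
	 * fails with EAGAIN (11) once the virtq object has been torn down. */
	if (ioctl(fd, VHOST_GET_VRING_BASE, &state) < 0)
		perror("VHOST_GET_VRING_BASE");
	else
		printf("vq0 avail index: %u\n", state.num);
	return 0;
}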



[PATCH] vhost-vdpa: fix page pinning leakage in error path

2020-10-01 Thread Si-Wei Liu
Pinned pages are not properly accounted for, particularly when
a mapping error occurs on IOTLB update. Clean up the dangling
pinned pages in the error path. Tracking the inflight pinned
pages, specifically for a memory region that strides across
multiple chunks, would need more than one free page for
bookkeeping and accounting. For simplicity, pin the pages for
all memory in the IOVA range in one go rather than issuing
multiple pin_user_pages() calls to make up the entire region.
This way it's easier to track and account for the pages
already mapped, particularly for clean-up in the error path.

Fixes: 20453a45fb06 ("vhost: introduce vDPA-based backend")
Signed-off-by: Si-Wei Liu 
---
 drivers/vhost/vdpa.c | 121 +++
 1 file changed, 73 insertions(+), 48 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 796fe97..abc4aa2 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -565,6 +565,8 @@ static int vhost_vdpa_map(struct vhost_vdpa *v,
  perm_to_iommu_flags(perm));
}
 
+   if (r)
+   vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1);
return r;
 }
 
@@ -592,21 +594,19 @@ static int vhost_vdpa_process_iotlb_update(struct 
vhost_vdpa *v,
struct vhost_dev *dev = &v->vdev;
struct vhost_iotlb *iotlb = dev->iotlb;
struct page **page_list;
-   unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
+   struct vm_area_struct **vmas;
unsigned int gup_flags = FOLL_LONGTERM;
-   unsigned long npages, cur_base, map_pfn, last_pfn = 0;
-   unsigned long locked, lock_limit, pinned, i;
+   unsigned long map_pfn, last_pfn = 0;
+   unsigned long npages, lock_limit;
+   unsigned long i, nmap = 0;
u64 iova = msg->iova;
+   long pinned;
int ret = 0;
 
if (vhost_iotlb_itree_first(iotlb, msg->iova,
msg->iova + msg->size - 1))
return -EEXIST;
 
-   page_list = (struct page **) __get_free_page(GFP_KERNEL);
-   if (!page_list)
-   return -ENOMEM;
-
if (msg->perm & VHOST_ACCESS_WO)
gup_flags |= FOLL_WRITE;
 
@@ -614,61 +614,86 @@ static int vhost_vdpa_process_iotlb_update(struct 
vhost_vdpa *v,
if (!npages)
return -EINVAL;
 
+   page_list = kvmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
+   vmas = kvmalloc_array(npages, sizeof(struct vm_area_struct *),
+ GFP_KERNEL);
+   if (!page_list || !vmas) {
+   ret = -ENOMEM;
+   goto free;
+   }
+
mmap_read_lock(dev->mm);
 
-   locked = atomic64_add_return(npages, &dev->mm->pinned_vm);
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-   if (locked > lock_limit) {
+   if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
ret = -ENOMEM;
-   goto out;
+   goto unlock;
}
 
-   cur_base = msg->uaddr & PAGE_MASK;
-   iova &= PAGE_MASK;
+   pinned = pin_user_pages(msg->uaddr & PAGE_MASK, npages, gup_flags,
+   page_list, vmas);
+   if (npages != pinned) {
+   if (pinned < 0) {
+   ret = pinned;
+   } else {
+   unpin_user_pages(page_list, pinned);
+   ret = -ENOMEM;
+   }
+   goto unlock;
+   }
 
-   while (npages) {
-   pinned = min_t(unsigned long, npages, list_size);
-   ret = pin_user_pages(cur_base, pinned,
-gup_flags, page_list, NULL);
-   if (ret != pinned)
-   goto out;
-
-   if (!last_pfn)
-   map_pfn = page_to_pfn(page_list[0]);
-
-   for (i = 0; i < ret; i++) {
-   unsigned long this_pfn = page_to_pfn(page_list[i]);
-   u64 csize;
-
-   if (last_pfn && (this_pfn != last_pfn + 1)) {
-   /* Pin a contiguous chunk of memory */
-   csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
-   if (vhost_vdpa_map(v, iova, csize,
-  map_pfn << PAGE_SHIFT,
-  msg->perm))
-   goto out;
-   map_pfn = this_pfn;
-   iova += csize;
+   iova &= PAGE_MASK;
+   map_pfn = page_to_pfn(page_list[0]);
+
+   /* One more iteration to avoid extra vdpa_map() call out of loop. */
+   for (i = 0; i <= npages; i++) {
+   unsigned long this_pfn;

[PATCH v2] vhost-vdpa: fix page pinning leakage in error path

2020-10-01 Thread Si-Wei Liu
Pinned pages are not properly accounted for, particularly when
a mapping error occurs on IOTLB update. Clean up the dangling
pinned pages in the error path. Tracking the inflight pinned
pages, specifically for a memory region that strides across
multiple chunks, would need more than one free page for
bookkeeping and accounting. For simplicity, pin the pages for
all memory in the IOVA range in one go rather than issuing
multiple pin_user_pages() calls to make up the entire region.
This way it's easier to track and account for the pages
already mapped, particularly for clean-up in the error path.

Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend")
Signed-off-by: Si-Wei Liu 
---
Changes in v2:
- Fix incorrect target SHA1 referenced

 drivers/vhost/vdpa.c | 121 +++
 1 file changed, 73 insertions(+), 48 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 796fe97..abc4aa2 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -565,6 +565,8 @@ static int vhost_vdpa_map(struct vhost_vdpa *v,
  perm_to_iommu_flags(perm));
}
 
+   if (r)
+   vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1);
return r;
 }
 
@@ -592,21 +594,19 @@ static int vhost_vdpa_process_iotlb_update(struct 
vhost_vdpa *v,
struct vhost_dev *dev = &v->vdev;
struct vhost_iotlb *iotlb = dev->iotlb;
struct page **page_list;
-   unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
+   struct vm_area_struct **vmas;
unsigned int gup_flags = FOLL_LONGTERM;
-   unsigned long npages, cur_base, map_pfn, last_pfn = 0;
-   unsigned long locked, lock_limit, pinned, i;
+   unsigned long map_pfn, last_pfn = 0;
+   unsigned long npages, lock_limit;
+   unsigned long i, nmap = 0;
u64 iova = msg->iova;
+   long pinned;
int ret = 0;
 
if (vhost_iotlb_itree_first(iotlb, msg->iova,
msg->iova + msg->size - 1))
return -EEXIST;
 
-   page_list = (struct page **) __get_free_page(GFP_KERNEL);
-   if (!page_list)
-   return -ENOMEM;
-
if (msg->perm & VHOST_ACCESS_WO)
gup_flags |= FOLL_WRITE;
 
@@ -614,61 +614,86 @@ static int vhost_vdpa_process_iotlb_update(struct 
vhost_vdpa *v,
if (!npages)
return -EINVAL;
 
+   page_list = kvmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
+   vmas = kvmalloc_array(npages, sizeof(struct vm_area_struct *),
+ GFP_KERNEL);
+   if (!page_list || !vmas) {
+   ret = -ENOMEM;
+   goto free;
+   }
+
mmap_read_lock(dev->mm);
 
-   locked = atomic64_add_return(npages, &dev->mm->pinned_vm);
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-   if (locked > lock_limit) {
+   if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
ret = -ENOMEM;
-   goto out;
+   goto unlock;
}
 
-   cur_base = msg->uaddr & PAGE_MASK;
-   iova &= PAGE_MASK;
+   pinned = pin_user_pages(msg->uaddr & PAGE_MASK, npages, gup_flags,
+   page_list, vmas);
+   if (npages != pinned) {
+   if (pinned < 0) {
+   ret = pinned;
+   } else {
+   unpin_user_pages(page_list, pinned);
+   ret = -ENOMEM;
+   }
+   goto unlock;
+   }
 
-   while (npages) {
-   pinned = min_t(unsigned long, npages, list_size);
-   ret = pin_user_pages(cur_base, pinned,
-gup_flags, page_list, NULL);
-   if (ret != pinned)
-   goto out;
-
-   if (!last_pfn)
-   map_pfn = page_to_pfn(page_list[0]);
-
-   for (i = 0; i < ret; i++) {
-   unsigned long this_pfn = page_to_pfn(page_list[i]);
-   u64 csize;
-
-   if (last_pfn && (this_pfn != last_pfn + 1)) {
-   /* Pin a contiguous chunk of memory */
-   csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
-   if (vhost_vdpa_map(v, iova, csize,
-  map_pfn << PAGE_SHIFT,
-  msg->perm))
-   goto out;
-   map_pfn = this_pfn;
-   iova += csize;
+   iova &= PAGE_MASK;
+   map_pfn = page_to_pfn(page_list[0]);
+
+   /* One more iteration to avoid extra vdpa_map() call out of loop. */
+   for (i = 0; i <= npa

Re: linux-next: Fixes tag needs some work in the vhost tree

2020-10-01 Thread si-wei liu

Thanks Stephen, the referenced SHA1 is not correct. A v2 patch has been posted.

Thanks,
-Siwei


On 10/1/2020 3:32 PM, Stephen Rothwell wrote:

Hi all,

In commit

   c9795f8fbb41 ("vhost-vdpa: fix page pinning leakage in error path")

Fixes tag

   Fixes: 20453a45fb06 ("vhost: introduce vDPA-based backend")

has these problem(s):

   - Target SHA1 does not exist

Maybe you meant

Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend")





Re: [PATCH] vdpa/mlx5: should keep avail_index despite device status

2020-10-02 Thread Si-Wei Liu
+ Eli.

On Thu, Oct 1, 2020 at 2:02 PM Si-Wei Liu  wrote:
>
> A VM with mlx5 vDPA has below warnings while being reset:
>
> vhost VQ 0 ring restore failed: -1: Resource temporarily unavailable (11)
> vhost VQ 1 ring restore failed: -1: Resource temporarily unavailable (11)
>
> We should allow userspace emulating the virtio device be
> able to get to vq's avail_index, regardless of vDPA device
> status. Save the index that was last seen when virtq was
> stopped, so that userspace doesn't complain.
>
> Signed-off-by: Si-Wei Liu 
> ---
>  drivers/vdpa/mlx5/net/mlx5_vnet.c | 20 ++--
>  1 file changed, 14 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
> b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> index 70676a6..74264e59 100644
> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> @@ -1133,15 +1133,17 @@ static void suspend_vq(struct mlx5_vdpa_net *ndev, 
> struct mlx5_vdpa_virtqueue *m
> if (!mvq->initialized)
> return;
>
> -   if (query_virtqueue(ndev, mvq, &attr)) {
> -   mlx5_vdpa_warn(&ndev->mvdev, "failed to query virtqueue\n");
> -   return;
> -   }
> if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY)
> return;
>
> if (modify_virtqueue(ndev, mvq, 
> MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND))
> mlx5_vdpa_warn(&ndev->mvdev, "modify to suspend failed\n");
> +
> +   if (query_virtqueue(ndev, mvq, &attr)) {
> +   mlx5_vdpa_warn(&ndev->mvdev, "failed to query virtqueue\n");
> +   return;
> +   }
> +   mvq->avail_idx = attr.available_index;
>  }
>
>  static void suspend_vqs(struct mlx5_vdpa_net *ndev)
> @@ -1411,8 +1413,14 @@ static int mlx5_vdpa_get_vq_state(struct vdpa_device 
> *vdev, u16 idx, struct vdpa
> struct mlx5_virtq_attr attr;
> int err;
>
> -   if (!mvq->initialized)
> -   return -EAGAIN;
> +   /* If the virtq object was destroyed, use the value saved at
> +* the last minute of suspend_vq. This caters for userspace
> +* that cares about emulating the index after vq is stopped.
> +*/
> +   if (!mvq->initialized) {
> +   state->avail_index = mvq->avail_idx;
> +   return 0;
> +   }
>
> err = query_virtqueue(ndev, mvq, &attr);
> if (err) {
> --
> 1.8.3.1
>


[PATCH v3 1/2] vhost-vdpa: fix vhost_vdpa_map() on error condition

2020-10-02 Thread Si-Wei Liu
vhost_vdpa_map() should remove the iotlb entry it just added
if the corresponding mapping fails to be set up properly.

Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend")
Signed-off-by: Si-Wei Liu 
---
 drivers/vhost/vdpa.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 796fe97..0f27919 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -565,6 +565,9 @@ static int vhost_vdpa_map(struct vhost_vdpa *v,
  perm_to_iommu_flags(perm));
}
 
+   if (r)
+   vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1);
+
return r;
 }
 
-- 
1.8.3.1



[PATCH v3 0/2] vhost-vdpa mapping error path fixes

2020-10-02 Thread Si-Wei Liu
Commit 4c8cf31885f6 ("vhost: introduce vDPA-based backend")
has the following issues in the failure path of IOTLB update:

1) vhost_vdpa_map() does not clean up the dangling iotlb entry
   upon mapping failure

2) vhost_vdpa_process_iotlb_update() leaks pinned pages in case
   of vhost_vdpa_map() failure

This patchset attempts to address the above issues.

Changes in v3:
- Factor out changes in vhost_vdpa_map() and the fix for
  page pinning leak to separate patches (Jason)

---
Si-Wei Liu (2):
  vhost-vdpa: fix vhost_vdpa_map() on error condition
  vhost-vdpa: fix page pinning leakage in error path

 drivers/vhost/vdpa.c | 122 +++
 1 file changed, 74 insertions(+), 48 deletions(-)

-- 
1.8.3.1



[PATCH v3 2/2] vhost-vdpa: fix page pinning leakage in error path

2020-10-02 Thread Si-Wei Liu
Pinned pages are not properly accounted for, particularly when
a mapping error occurs on IOTLB update. Clean up the dangling
pinned pages in the error path. Tracking the inflight pinned
pages, specifically for a memory region that strides across
multiple chunks, would need more than one free page for
bookkeeping and accounting. For simplicity, pin the pages for
all memory in the IOVA range in one go rather than issuing
multiple pin_user_pages() calls to make up the entire region.
This way it's easier to track and account for the pages
already mapped, particularly for clean-up in the error path.

Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend")
Signed-off-by: Si-Wei Liu 
---
Changes in v3:
- Factor out vhost_vdpa_map() change to a separate patch

Changes in v2:
- Fix incorrect target SHA1 referenced

 drivers/vhost/vdpa.c | 119 ++-
 1 file changed, 71 insertions(+), 48 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 0f27919..dad41dae 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -595,21 +595,19 @@ static int vhost_vdpa_process_iotlb_update(struct 
vhost_vdpa *v,
struct vhost_dev *dev = &v->vdev;
struct vhost_iotlb *iotlb = dev->iotlb;
struct page **page_list;
-   unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
+   struct vm_area_struct **vmas;
unsigned int gup_flags = FOLL_LONGTERM;
-   unsigned long npages, cur_base, map_pfn, last_pfn = 0;
-   unsigned long locked, lock_limit, pinned, i;
+   unsigned long map_pfn, last_pfn = 0;
+   unsigned long npages, lock_limit;
+   unsigned long i, nmap = 0;
u64 iova = msg->iova;
+   long pinned;
int ret = 0;
 
if (vhost_iotlb_itree_first(iotlb, msg->iova,
msg->iova + msg->size - 1))
return -EEXIST;
 
-   page_list = (struct page **) __get_free_page(GFP_KERNEL);
-   if (!page_list)
-   return -ENOMEM;
-
if (msg->perm & VHOST_ACCESS_WO)
gup_flags |= FOLL_WRITE;
 
@@ -617,61 +615,86 @@ static int vhost_vdpa_process_iotlb_update(struct 
vhost_vdpa *v,
if (!npages)
return -EINVAL;
 
+   page_list = kvmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
+   vmas = kvmalloc_array(npages, sizeof(struct vm_area_struct *),
+ GFP_KERNEL);
+   if (!page_list || !vmas) {
+   ret = -ENOMEM;
+   goto free;
+   }
+
mmap_read_lock(dev->mm);
 
-   locked = atomic64_add_return(npages, &dev->mm->pinned_vm);
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-   if (locked > lock_limit) {
+   if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
ret = -ENOMEM;
-   goto out;
+   goto unlock;
}
 
-   cur_base = msg->uaddr & PAGE_MASK;
-   iova &= PAGE_MASK;
+   pinned = pin_user_pages(msg->uaddr & PAGE_MASK, npages, gup_flags,
+   page_list, vmas);
+   if (npages != pinned) {
+   if (pinned < 0) {
+   ret = pinned;
+   } else {
+   unpin_user_pages(page_list, pinned);
+   ret = -ENOMEM;
+   }
+   goto unlock;
+   }
 
-   while (npages) {
-   pinned = min_t(unsigned long, npages, list_size);
-   ret = pin_user_pages(cur_base, pinned,
-gup_flags, page_list, NULL);
-   if (ret != pinned)
-   goto out;
-
-   if (!last_pfn)
-   map_pfn = page_to_pfn(page_list[0]);
-
-   for (i = 0; i < ret; i++) {
-   unsigned long this_pfn = page_to_pfn(page_list[i]);
-   u64 csize;
-
-   if (last_pfn && (this_pfn != last_pfn + 1)) {
-   /* Pin a contiguous chunk of memory */
-   csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
-   if (vhost_vdpa_map(v, iova, csize,
-  map_pfn << PAGE_SHIFT,
-  msg->perm))
-   goto out;
-   map_pfn = this_pfn;
-   iova += csize;
+   iova &= PAGE_MASK;
+   map_pfn = page_to_pfn(page_list[0]);
+
+   /* One more iteration to avoid extra vdpa_map() call out of loop. */
+   for (i = 0; i <= npages; i++) {
+   unsigned long this_pfn;
+   u64 csize;
+
+   /* The last chunk may have no valid PFN next t
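
To make the bookkeeping argument in the commit message concrete, here is
a simplified sketch of the pin-all-then-map-with-unwind pattern it
describes. It is not the literal patch: do_map()/do_unmap() are
hypothetical stand-ins for vhost_vdpa_map()/vhost_vdpa_unmap(), and
do_unmap() is assumed to only tear down the mapping without unpinning;
iova and uaddr are assumed page-aligned.

#include <linux/mm.h>
#include <linux/slab.h>

static int pin_and_map(struct vhost_vdpa *v, u64 iova, u64 uaddr,
		       unsigned long npages, unsigned int gup_flags, u32 perm)
{
	u64 start = iova;
	struct page **pages;
	unsigned long i, nmapped = 0;
	long pinned;
	int ret = 0;

	/* One array sized for the whole range: either every page gets
	 * pinned or none does, which keeps the accounting trivial. */
	pages = kvmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	pinned = pin_user_pages(uaddr & PAGE_MASK, npages, gup_flags,
				pages, NULL);
	if (pinned < 0 || (unsigned long)pinned != npages) {
		ret = pinned < 0 ? pinned : -ENOMEM;
		goto unpin;
	}

	/* Map each physically contiguous run of pages with one call. */
	for (i = 0; i < npages; i++) {
		unsigned long run = 1;

		while (i + run < npages &&
		       page_to_pfn(pages[i + run]) ==
		       page_to_pfn(pages[i]) + run)
			run++;

		ret = do_map(v, iova, run << PAGE_SHIFT,
			     page_to_pfn(pages[i]) << PAGE_SHIFT, perm);
		if (ret) {
			/* Unwind what was already mapped, then unpin all. */
			do_unmap(v, start, nmapped << PAGE_SHIFT);
			goto unpin;
		}
		iova += run << PAGE_SHIFT;
		nmapped += run;
		i += run - 1;
	}
	kvfree(pages);
	return 0;	/* success: pages stay pinned for the mapping's lifetime */

unpin:
	if (pinned > 0)
		unpin_user_pages(pages, pinned);
	kvfree(pages);
	return ret;
}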

Re: [PATCH v3 2/2] vhost-vdpa: fix page pinning leakage in error path

2020-10-15 Thread si-wei liu



On 10/15/2020 6:11 AM, Michael S. Tsirkin wrote:

On Thu, Oct 15, 2020 at 02:15:32PM +0800, Jason Wang wrote:

On 2020/10/14 上午7:42, si-wei liu wrote:


So what I suggest is to fix the pinning leakage first and do the
possible optimization on top (which is still questionable to me).

OK. Unfortunately, this was picked up and merged upstream. So I will
post a follow-up patch set to 1) revert the commit to the original
__get_free_page() implementation, and 2) fix the accounting and leakage
on top. Will that be fine?


Fine.

Thanks

Fine by me too.


Thanks, Michael & Jason. I will post the fix shortly. Stay tuned.

-Siwei



Re: [PATCH] vdpa/mlx5: Restore the hardware used index after change map

2021-02-02 Thread Si-Wei Liu
On Tue, Feb 2, 2021 at 6:34 AM Eli Cohen  wrote:
>
> When a change of memory map occurs, the hardware resources are destroyed
> and then re-created again with the new memory map. In such case, we need
> to restore the hardware available and used indices. The driver failed to
> restore the used index which is added here.
>
> Fixes 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices")
> Signed-off-by: Eli Cohen 
> ---
> This patch is being sent again a single patch the fixes hot memory
> addtion to a qemy process.
>
>  drivers/vdpa/mlx5/net/mlx5_vnet.c | 7 +++
>  1 file changed, 7 insertions(+)
>
> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
> b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> index 88dde3455bfd..839f57c64a6f 100644
> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> @@ -87,6 +87,7 @@ struct mlx5_vq_restore_info {
> u64 device_addr;
> u64 driver_addr;
> u16 avail_index;
> +   u16 used_index;
> bool ready;
> struct vdpa_callback cb;
> bool restore;
> @@ -121,6 +122,7 @@ struct mlx5_vdpa_virtqueue {
> u32 virtq_id;
> struct mlx5_vdpa_net *ndev;
> u16 avail_idx;
> +   u16 used_idx;
> int fw_state;
>
> /* keep last in the struct */
> @@ -804,6 +806,7 @@ static int create_virtqueue(struct mlx5_vdpa_net *ndev, 
> struct mlx5_vdpa_virtque
>
> obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in, obj_context);
> MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, 
> mvq->avail_idx);
> +   MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, 
> mvq->used_idx);

The saved indexes will apply to the new virtqueue object whenever it
is created. In the virtio spec, these indexes reset back to zero when
the virtio device is reset, but I don't see how that's done today. IOW,
I don't see where avail_idx and used_idx get cleared from the mvq on
device reset via set_status().

-Siwei


> MLX5_SET(virtio_net_q_object, obj_context, 
> queue_feature_bit_mask_12_3,
>  get_features_12_3(ndev->mvdev.actual_features));
> vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, 
> virtio_q_context);
> @@ -1022,6 +1025,7 @@ static int connect_qps(struct mlx5_vdpa_net *ndev, 
> struct mlx5_vdpa_virtqueue *m
>  struct mlx5_virtq_attr {
> u8 state;
> u16 available_index;
> +   u16 used_index;
>  };
>
>  static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct 
> mlx5_vdpa_virtqueue *mvq,
> @@ -1052,6 +1056,7 @@ static int query_virtqueue(struct mlx5_vdpa_net *ndev, 
> struct mlx5_vdpa_virtqueu
> memset(attr, 0, sizeof(*attr));
> attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);
> attr->available_index = MLX5_GET(virtio_net_q_object, obj_context, 
> hw_available_index);
> +   attr->used_index = MLX5_GET(virtio_net_q_object, obj_context, 
> hw_used_index);
> kfree(out);
> return 0;
>
> @@ -1610,6 +1615,7 @@ static int save_channel_info(struct mlx5_vdpa_net 
> *ndev, struct mlx5_vdpa_virtqu
> return err;
>
> ri->avail_index = attr.available_index;
> +   ri->used_index = attr.used_index;
> ri->ready = mvq->ready;
> ri->num_ent = mvq->num_ent;
> ri->desc_addr = mvq->desc_addr;
> @@ -1654,6 +1660,7 @@ static void restore_channels_info(struct mlx5_vdpa_net 
> *ndev)
> continue;
>
> mvq->avail_idx = ri->avail_index;
> +   mvq->used_idx = ri->used_index;
> mvq->ready = ri->ready;
> mvq->num_ent = ri->num_ent;
> mvq->desc_addr = ri->desc_addr;
> --
> 2.29.2
>


Re: [PATCH 1/2] vdpa/mlx5: Avoid unnecessary query virtqueue

2021-02-02 Thread Si-Wei Liu
On Tue, Feb 2, 2021 at 1:23 AM Eli Cohen  wrote:
>
> On Tue, Feb 02, 2021 at 12:38:51AM -0800, Si-Wei Liu wrote:
> > Thanks Eli and Jason for clarifications. See inline.
> >
> > On Mon, Feb 1, 2021 at 11:06 PM Eli Cohen  wrote:
> > >
> > > On Tue, Feb 02, 2021 at 02:02:25PM +0800, Jason Wang wrote:
> > > >
> > > > On 2021/2/2 下午12:15, Si-Wei Liu wrote:
> > > > > On Mon, Feb 1, 2021 at 7:13 PM Jason Wang  wrote:
> > > > > >
> > > > > > On 2021/2/2 上午3:17, Si-Wei Liu wrote:
> > > > > > > On Mon, Feb 1, 2021 at 10:51 AM Si-Wei Liu 
> > > > > > >  wrote:
> > > > > > > > On Thu, Jan 28, 2021 at 5:46 AM Eli Cohen  
> > > > > > > > wrote:
> > > > > > > > > suspend_vq should only suspend the VQ on not save the current 
> > > > > > > > > available
> > > > > > > > > index. This is done when a change of map occurs when the 
> > > > > > > > > driver calls
> > > > > > > > > save_channel_info().
> > > > > > > > Hmmm, suspend_vq() is also called by teardown_vq(), the latter 
> > > > > > > > of
> > > > > > > > which doesn't save the available index as save_channel_info() 
> > > > > > > > doesn't
> > > > > > > > get called in that path at all. How does it handle the case that
> > > > > > > > aget_vq_state() is called from userspace (e.g. QEMU) while the
> > > > > > > > hardware VQ object was torn down, but userspace still wants to 
> > > > > > > > access
> > > > > > > > the queue index?
> > > > > > > >
> > > > > > > > Refer to 
> > > > > > > > https://lore.kernel.org/netdev/1601583511-15138-1-git-send-email-si-wei@oracle.com/
> > > > > > > >
> > > > > > > > vhost VQ 0 ring restore failed: -1: Resource temporarily 
> > > > > > > > unavailable (11)
> > > > > > > > vhost VQ 1 ring restore failed: -1: Resource temporarily 
> > > > > > > > unavailable (11)
> > > > > > > >
> > > > > > > > QEMU will complain with the above warning while VM is being 
> > > > > > > > rebooted
> > > > > > > > or shut down.
> > > > > > > >
> > > > > > > > Looks to me either the kernel driver should cover this 
> > > > > > > > requirement, or
> > > > > > > > the userspace has to bear the burden in saving the index and 
> > > > > > > > not call
> > > > > > > > into kernel if VQ is destroyed.
> > > > > > > Actually, the userspace doesn't have the insights whether virt 
> > > > > > > queue
> > > > > > > will be destroyed if just changing the device status via 
> > > > > > > set_status().
> > > > > > > Looking at other vdpa driver in tree i.e. ifcvf it doesn't behave 
> > > > > > > like
> > > > > > > so. Hence this still looks to me to be Mellanox specifics and
> > > > > > > mlx5_vdpa implementation detail that shouldn't expose to 
> > > > > > > userspace.
> > > > > >
> > > > > > So I think we can simply drop this patch?
> > > > > Yep, I think so. To be honest I don't know why it has anything to do
> > > > > with the memory hotplug issue.
> > > >
> > > >
> > > > Eli may know more, my understanding is that, during memory hotplut, qemu
> > > > need to propagate new memory mappings via set_map(). For mellanox, it 
> > > > means
> > > > it needs to rebuild memory keys, so the virtqueue needs to be suspended.
> > > >
> > >
> > > I think Siwei was asking why the first patch was related to the hotplug
> > > issue.
> >
> > I was thinking how consistency is assured when saving/restoring this
> > h/w avail_index against the one in the virtq memory, particularly in
> > the region_add/.region_del() context (e.g. the hotplug case). Problem
> > is I don't see explicit memory barrier when guest thread updates the
> > avail_index, how does the device make sure the

Re: [PATCH] vdpa/mlx5: Restore the hardware used index after change map

2021-02-03 Thread Si-Wei Liu
On Tue, Feb 2, 2021 at 10:48 PM Eli Cohen  wrote:
>
> On Tue, Feb 02, 2021 at 09:14:02AM -0800, Si-Wei Liu wrote:
> > On Tue, Feb 2, 2021 at 6:34 AM Eli Cohen  wrote:
> > >
> > > When a change of memory map occurs, the hardware resources are destroyed
> > > and then re-created again with the new memory map. In such case, we need
> > > to restore the hardware available and used indices. The driver failed to
> > > restore the used index which is added here.
> > >
> > > Fixes 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 
> > > devices")
> > > Signed-off-by: Eli Cohen 
> > > ---
> > > This patch is being sent again a single patch the fixes hot memory
> > > addtion to a qemy process.
> > >
> > >  drivers/vdpa/mlx5/net/mlx5_vnet.c | 7 +++
> > >  1 file changed, 7 insertions(+)
> > >
> > > diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
> > > b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> > > index 88dde3455bfd..839f57c64a6f 100644
> > > --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
> > > +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> > > @@ -87,6 +87,7 @@ struct mlx5_vq_restore_info {
> > > u64 device_addr;
> > > u64 driver_addr;
> > > u16 avail_index;
> > > +   u16 used_index;
> > > bool ready;
> > > struct vdpa_callback cb;
> > > bool restore;
> > > @@ -121,6 +122,7 @@ struct mlx5_vdpa_virtqueue {
> > > u32 virtq_id;
> > > struct mlx5_vdpa_net *ndev;
> > > u16 avail_idx;
> > > +   u16 used_idx;
> > > int fw_state;
> > >
> > > /* keep last in the struct */
> > > @@ -804,6 +806,7 @@ static int create_virtqueue(struct mlx5_vdpa_net 
> > > *ndev, struct mlx5_vdpa_virtque
> > >
> > > obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in, 
> > > obj_context);
> > > MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, 
> > > mvq->avail_idx);
> > > +   MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, 
> > > mvq->used_idx);
> >
> > The saved indexes will apply to the new virtqueue object whenever it
> > is created. In virtio spec, these indexes will reset back to zero when
> > the virtio device is reset. But I don't see how it's done today. IOW,
> > I don't see where avail_idx and used_idx get cleared from the mvq for
> > device reset via set_status().
> >
>
> Right, but this is not strictly related to this patch. I will post
> another patch to fix this.

Better to post these two patches as a series, or else it may cause a VM
reboot problem, since that is where the device gets reset. The avail_index
did not have this problem, as the correct value will be written by the
driver right after, but the used_idx introduced by this patch is supplied
by the device, hence this patch alone would introduce a regression.

>
> BTW, can you describe a secnario that would cause a reset (through
> calling set_status()) that happens after the VQ has been used?

You can try rebooting the guest; that's the easy way to test it.

-Siwei

>
> > -Siwei
> >
> >
> > > MLX5_SET(virtio_net_q_object, obj_context, 
> > > queue_feature_bit_mask_12_3,
> > >  get_features_12_3(ndev->mvdev.actual_features));
> > > vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, 
> > > virtio_q_context);
> > > @@ -1022,6 +1025,7 @@ static int connect_qps(struct mlx5_vdpa_net *ndev, 
> > > struct mlx5_vdpa_virtqueue *m
> > >  struct mlx5_virtq_attr {
> > > u8 state;
> > > u16 available_index;
> > > +   u16 used_index;
> > >  };
> > >
> > >  static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct 
> > > mlx5_vdpa_virtqueue *mvq,
> > > @@ -1052,6 +1056,7 @@ static int query_virtqueue(struct mlx5_vdpa_net 
> > > *ndev, struct mlx5_vdpa_virtqueu
> > > memset(attr, 0, sizeof(*attr));
> > > attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);
> > > attr->available_index = MLX5_GET(virtio_net_q_object, 
> > > obj_context, hw_available_index);
> > > +   attr->used_index = MLX5_GET(virtio_net_q_object, obj_context, 
> > > hw_used_index);
> > > kfree(out);
> > > return 0;
> > >
> > > @@ -1610,6 +1615,7 @@ static int save_channel_info(struct mlx5_vdpa_net 
> > > *ndev, struct mlx5_vdpa_virtqu
> > > return err;
> > >
> > > ri->avail_index = attr.available_index;
> > > +   ri->used_index = attr.used_index;
> > > ri->ready = mvq->ready;
> > > ri->num_ent = mvq->num_ent;
> > > ri->desc_addr = mvq->desc_addr;
> > > @@ -1654,6 +1660,7 @@ static void restore_channels_info(struct 
> > > mlx5_vdpa_net *ndev)
> > > continue;
> > >
> > > mvq->avail_idx = ri->avail_index;
> > > +   mvq->used_idx = ri->used_index;
> > > mvq->ready = ri->ready;
> > > mvq->num_ent = ri->num_ent;
> > > mvq->desc_addr = ri->desc_addr;
> > > --
> > > 2.29.2
> > >


Re: [PATCH 1/2] vdpa/mlx5: Avoid unnecessary query virtqueue

2021-02-01 Thread Si-Wei Liu
On Thu, Jan 28, 2021 at 5:46 AM Eli Cohen  wrote:
>
> suspend_vq should only suspend the VQ on not save the current available
> index. This is done when a change of map occurs when the driver calls
> save_channel_info().

Hmmm, suspend_vq() is also called by teardown_vq(), the latter of
which doesn't save the available index as save_channel_info() doesn't
get called in that path at all. How does it handle the case that
get_vq_state() is called from userspace (e.g. QEMU) while the
hardware VQ object was torn down, but userspace still wants to access
the queue index?

Refer to 
https://lore.kernel.org/netdev/1601583511-15138-1-git-send-email-si-wei@oracle.com/

vhost VQ 0 ring restore failed: -1: Resource temporarily unavailable (11)
vhost VQ 1 ring restore failed: -1: Resource temporarily unavailable (11)

QEMU will complain with the above warning while VM is being rebooted
or shut down.

It looks to me that either the kernel driver should cover this
requirement, or userspace has to bear the burden of saving the index and
not call into the kernel if the VQ is destroyed.

-Siwei


>
> Signed-off-by: Eli Cohen 
> ---
>  drivers/vdpa/mlx5/net/mlx5_vnet.c | 8 
>  1 file changed, 8 deletions(-)
>
> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
> b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> index 88dde3455bfd..549ded074ff3 100644
> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> @@ -1148,8 +1148,6 @@ static int setup_vq(struct mlx5_vdpa_net *ndev, struct 
> mlx5_vdpa_virtqueue *mvq)
>
>  static void suspend_vq(struct mlx5_vdpa_net *ndev, struct 
> mlx5_vdpa_virtqueue *mvq)
>  {
> -   struct mlx5_virtq_attr attr;
> -
> if (!mvq->initialized)
> return;
>
> @@ -1158,12 +1156,6 @@ static void suspend_vq(struct mlx5_vdpa_net *ndev, 
> struct mlx5_vdpa_virtqueue *m
>
> if (modify_virtqueue(ndev, mvq, 
> MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND))
> mlx5_vdpa_warn(&ndev->mvdev, "modify to suspend failed\n");
> -
> -   if (query_virtqueue(ndev, mvq, &attr)) {
> -   mlx5_vdpa_warn(&ndev->mvdev, "failed to query virtqueue\n");
> -   return;
> -   }
> -   mvq->avail_idx = attr.available_index;
>  }
>
>  static void suspend_vqs(struct mlx5_vdpa_net *ndev)
> --
> 2.29.2
>


Re: [PATCH 1/2] vdpa/mlx5: Avoid unnecessary query virtqueue

2021-02-01 Thread Si-Wei Liu
On Mon, Feb 1, 2021 at 10:51 AM Si-Wei Liu  wrote:
>
> On Thu, Jan 28, 2021 at 5:46 AM Eli Cohen  wrote:
> >
> > suspend_vq should only suspend the VQ on not save the current available
> > index. This is done when a change of map occurs when the driver calls
> > save_channel_info().
>
> Hmmm, suspend_vq() is also called by teardown_vq(), the latter of
> which doesn't save the available index as save_channel_info() doesn't
> get called in that path at all. How does it handle the case that
> aget_vq_state() is called from userspace (e.g. QEMU) while the
> hardware VQ object was torn down, but userspace still wants to access
> the queue index?
>
> Refer to 
> https://lore.kernel.org/netdev/1601583511-15138-1-git-send-email-si-wei@oracle.com/
>
> vhost VQ 0 ring restore failed: -1: Resource temporarily unavailable (11)
> vhost VQ 1 ring restore failed: -1: Resource temporarily unavailable (11)
>
> QEMU will complain with the above warning while VM is being rebooted
> or shut down.
>
> Looks to me either the kernel driver should cover this requirement, or
> the userspace has to bear the burden in saving the index and not call
> into kernel if VQ is destroyed.

Actually, userspace has no insight into whether the virt queue will be
destroyed when just changing the device status via set_status().
Looking at the other in-tree vdpa driver, i.e. ifcvf, it doesn't behave
like that. Hence this still looks to me like Mellanox specifics, an
mlx5_vdpa implementation detail that shouldn't be exposed to userspace.
>
> -Siwei
>
>
> >
> > Signed-off-by: Eli Cohen 
> > ---
> >  drivers/vdpa/mlx5/net/mlx5_vnet.c | 8 
> >  1 file changed, 8 deletions(-)
> >
> > diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
> > b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> > index 88dde3455bfd..549ded074ff3 100644
> > --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
> > +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> > @@ -1148,8 +1148,6 @@ static int setup_vq(struct mlx5_vdpa_net *ndev, 
> > struct mlx5_vdpa_virtqueue *mvq)
> >
> >  static void suspend_vq(struct mlx5_vdpa_net *ndev, struct 
> > mlx5_vdpa_virtqueue *mvq)
> >  {
> > -   struct mlx5_virtq_attr attr;
> > -
> > if (!mvq->initialized)
> > return;
> >
> > @@ -1158,12 +1156,6 @@ static void suspend_vq(struct mlx5_vdpa_net *ndev, 
> > struct mlx5_vdpa_virtqueue *m
> >
> > if (modify_virtqueue(ndev, mvq, 
> > MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND))
> > mlx5_vdpa_warn(&ndev->mvdev, "modify to suspend failed\n");
> > -
> > -   if (query_virtqueue(ndev, mvq, &attr)) {
> > -   mlx5_vdpa_warn(&ndev->mvdev, "failed to query virtqueue\n");
> > -   return;
> > -   }
> > -   mvq->avail_idx = attr.available_index;
> >  }
> >
> >  static void suspend_vqs(struct mlx5_vdpa_net *ndev)
> > --
> > 2.29.2
> >


Re: [PATCH 1/2] vdpa/mlx5: Avoid unnecessary query virtqueue

2021-02-01 Thread Si-Wei Liu
On Mon, Feb 1, 2021 at 7:13 PM Jason Wang  wrote:
>
>
> On 2021/2/2 上午3:17, Si-Wei Liu wrote:
> > On Mon, Feb 1, 2021 at 10:51 AM Si-Wei Liu  wrote:
> >> On Thu, Jan 28, 2021 at 5:46 AM Eli Cohen  wrote:
> >>> suspend_vq should only suspend the VQ on not save the current available
> >>> index. This is done when a change of map occurs when the driver calls
> >>> save_channel_info().
> >> Hmmm, suspend_vq() is also called by teardown_vq(), the latter of
> >> which doesn't save the available index as save_channel_info() doesn't
> >> get called in that path at all. How does it handle the case that
> >> aget_vq_state() is called from userspace (e.g. QEMU) while the
> >> hardware VQ object was torn down, but userspace still wants to access
> >> the queue index?
> >>
> >> Refer to 
> >> https://lore.kernel.org/netdev/1601583511-15138-1-git-send-email-si-wei@oracle.com/
> >>
> >> vhost VQ 0 ring restore failed: -1: Resource temporarily unavailable (11)
> >> vhost VQ 1 ring restore failed: -1: Resource temporarily unavailable (11)
> >>
> >> QEMU will complain with the above warning while VM is being rebooted
> >> or shut down.
> >>
> >> Looks to me either the kernel driver should cover this requirement, or
> >> the userspace has to bear the burden in saving the index and not call
> >> into kernel if VQ is destroyed.
> > Actually, the userspace doesn't have the insights whether virt queue
> > will be destroyed if just changing the device status via set_status().
> > Looking at other vdpa driver in tree i.e. ifcvf it doesn't behave like
> > so. Hence this still looks to me to be Mellanox specifics and
> > mlx5_vdpa implementation detail that shouldn't expose to userspace.
>
>
> So I think we can simply drop this patch?

Yep, I think so. To be honest I don't know why it has anything to do
with the memory hotplug issue.

-Siwei

>
> Thanks
>
>
> >> -Siwei
> >>
> >>
> >>> Signed-off-by: Eli Cohen 
> >>> ---
> >>>   drivers/vdpa/mlx5/net/mlx5_vnet.c | 8 
> >>>   1 file changed, 8 deletions(-)
> >>>
> >>> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
> >>> b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> >>> index 88dde3455bfd..549ded074ff3 100644
> >>> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
> >>> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> >>> @@ -1148,8 +1148,6 @@ static int setup_vq(struct mlx5_vdpa_net *ndev, 
> >>> struct mlx5_vdpa_virtqueue *mvq)
> >>>
> >>>   static void suspend_vq(struct mlx5_vdpa_net *ndev, struct 
> >>> mlx5_vdpa_virtqueue *mvq)
> >>>   {
> >>> -   struct mlx5_virtq_attr attr;
> >>> -
> >>>  if (!mvq->initialized)
> >>>  return;
> >>>
> >>> @@ -1158,12 +1156,6 @@ static void suspend_vq(struct mlx5_vdpa_net *ndev, 
> >>> struct mlx5_vdpa_virtqueue *m
> >>>
> >>>  if (modify_virtqueue(ndev, mvq, 
> >>> MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND))
> >>>  mlx5_vdpa_warn(&ndev->mvdev, "modify to suspend 
> >>> failed\n");
> >>> -
> >>> -   if (query_virtqueue(ndev, mvq, &attr)) {
> >>> -   mlx5_vdpa_warn(&ndev->mvdev, "failed to query 
> >>> virtqueue\n");
> >>> -   return;
> >>> -   }
> >>> -   mvq->avail_idx = attr.available_index;
> >>>   }
> >>>
> >>>   static void suspend_vqs(struct mlx5_vdpa_net *ndev)
> >>> --
> >>> 2.29.2
> >>>
>


Re: [PATCH 1/2] vdpa/mlx5: Avoid unnecessary query virtqueue

2021-02-02 Thread Si-Wei Liu
Thanks Eli and Jason for clarifications. See inline.

On Mon, Feb 1, 2021 at 11:06 PM Eli Cohen  wrote:
>
> On Tue, Feb 02, 2021 at 02:02:25PM +0800, Jason Wang wrote:
> >
> > On 2021/2/2 下午12:15, Si-Wei Liu wrote:
> > > On Mon, Feb 1, 2021 at 7:13 PM Jason Wang  wrote:
> > > >
> > > > On 2021/2/2 上午3:17, Si-Wei Liu wrote:
> > > > > On Mon, Feb 1, 2021 at 10:51 AM Si-Wei Liu  
> > > > > wrote:
> > > > > > On Thu, Jan 28, 2021 at 5:46 AM Eli Cohen  wrote:
> > > > > > > suspend_vq should only suspend the VQ on not save the current 
> > > > > > > available
> > > > > > > index. This is done when a change of map occurs when the driver 
> > > > > > > calls
> > > > > > > save_channel_info().
> > > > > > Hmmm, suspend_vq() is also called by teardown_vq(), the latter of
> > > > > > which doesn't save the available index as save_channel_info() 
> > > > > > doesn't
> > > > > > get called in that path at all. How does it handle the case that
> > > > > > aget_vq_state() is called from userspace (e.g. QEMU) while the
> > > > > > hardware VQ object was torn down, but userspace still wants to 
> > > > > > access
> > > > > > the queue index?
> > > > > >
> > > > > > Refer to 
> > > > > > https://lore.kernel.org/netdev/1601583511-15138-1-git-send-email-si-wei@oracle.com/
> > > > > >
> > > > > > vhost VQ 0 ring restore failed: -1: Resource temporarily 
> > > > > > unavailable (11)
> > > > > > vhost VQ 1 ring restore failed: -1: Resource temporarily 
> > > > > > unavailable (11)
> > > > > >
> > > > > > QEMU will complain with the above warning while VM is being rebooted
> > > > > > or shut down.
> > > > > >
> > > > > > Looks to me either the kernel driver should cover this requirement, 
> > > > > > or
> > > > > > the userspace has to bear the burden in saving the index and not 
> > > > > > call
> > > > > > into kernel if VQ is destroyed.
> > > > > Actually, the userspace doesn't have the insights whether virt queue
> > > > > will be destroyed if just changing the device status via set_status().
> > > > > Looking at other vdpa driver in tree i.e. ifcvf it doesn't behave like
> > > > > so. Hence this still looks to me to be Mellanox specifics and
> > > > > mlx5_vdpa implementation detail that shouldn't expose to userspace.
> > > >
> > > > So I think we can simply drop this patch?
> > > Yep, I think so. To be honest I don't know why it has anything to do
> > > with the memory hotplug issue.
> >
> >
> > Eli may know more, my understanding is that, during memory hotplut, qemu
> > need to propagate new memory mappings via set_map(). For mellanox, it means
> > it needs to rebuild memory keys, so the virtqueue needs to be suspended.
> >
>
> I think Siwei was asking why the first patch was related to the hotplug
> issue.

I was thinking about how consistency is assured when saving/restoring
this h/w avail_index against the one in the virtq memory, particularly
in the .region_add()/.region_del() context (e.g. the hotplug case). The
problem is I don't see an explicit memory barrier when the guest thread
updates the avail_index; how does the device make sure the h/w
avail_index is up to date while the guest may race with updating the
virtq's avail_index in the meantime? Maybe I'm completely missing
something obvious?

Thanks,
-Siwei
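
For context on the ordering question above: for the split ring, the
virtio spec requires the driver to order its descriptor and avail-ring
writes before the write that publishes the new avail->idx, typically
with a write memory barrier, so a device that reads idx and then the
ring sees consistent contents. A minimal producer-side sketch of that
requirement, not taken from the guest driver or the mlx5 code:

#include <stdint.h>

/* Simplified split-ring avail structure for illustration only. */
struct avail_sketch {
	uint16_t flags;
	uint16_t idx;      /* next slot the driver will fill */
	uint16_t ring[];
};

static void publish_buffer(struct avail_sketch *avail, uint16_t qsize,
			   uint16_t head)
{
	uint16_t idx = avail->idx;

	avail->ring[idx % qsize] = head;          /* expose descriptor head */
	__atomic_thread_fence(__ATOMIC_RELEASE);  /* order ring before idx  */
	avail->idx = idx + 1;                     /* device may consume now */
}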

>
> But you're correct. When memory is added, I get a new memory map. This
> requires me to build a new memory key object which covers the new memory
> map. Since the virtqueue objects are referencing this memory key, I need
> to destroy them and build new ones referncing the new memory key.
>
> > Thanks
> >
> >
> > >
> > > -Siwei
> > >
> > > > Thanks
> > > >
> > > >
> > > > > > -Siwei
> > > > > >
> > > > > >
> > > > > > > Signed-off-by: Eli Cohen 
> > > > > > > ---
> > > > > > >drivers/vdpa/mlx5/net/mlx5_vnet.c | 8 
> > > > > > >1 file changed, 8 deletions(-)
> > > > > > >
> > > > > > > diff --gi

Re: [PATCH v1] vdpa/mlx5: Restore the hardware used index after change map

2021-02-10 Thread Si-Wei Liu




On 2/9/2021 7:53 PM, Jason Wang wrote:


On 2021/2/10 上午10:30, Si-Wei Liu wrote:



On 2/8/2021 10:37 PM, Jason Wang wrote:


On 2021/2/9 下午2:12, Eli Cohen wrote:

On Tue, Feb 09, 2021 at 11:20:14AM +0800, Jason Wang wrote:

On 2021/2/8 下午6:04, Eli Cohen wrote:

On Mon, Feb 08, 2021 at 05:04:27PM +0800, Jason Wang wrote:

On 2021/2/8 下午2:37, Eli Cohen wrote:

On Mon, Feb 08, 2021 at 12:27:18PM +0800, Jason Wang wrote:

On 2021/2/6 上午7:07, Si-Wei Liu wrote:

On 2/3/2021 11:36 PM, Eli Cohen wrote:
When a change of memory map occurs, the hardware resources 
are destroyed
and then re-created again with the new memory map. In such 
case, we need
to restore the hardware available and used indices. The 
driver failed to

restore the used index which is added here.

Also, since the driver also fails to reset the available and 
used
indices upon device reset, fix this here to avoid regression 
caused by

the fact that used index may not be zero upon device reset.

Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for 
supported mlx5

devices")
Signed-off-by: Eli Cohen
---
v0 -> v1:
Clear indices upon device reset

     drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 ++
     1 file changed, 18 insertions(+)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 88dde3455bfd..b5fe6d2ad22f 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -87,6 +87,7 @@ struct mlx5_vq_restore_info {
     u64 device_addr;
     u64 driver_addr;
     u16 avail_index;
+    u16 used_index;
     bool ready;
     struct vdpa_callback cb;
     bool restore;
@@ -121,6 +122,7 @@ struct mlx5_vdpa_virtqueue {
     u32 virtq_id;
     struct mlx5_vdpa_net *ndev;
     u16 avail_idx;
+    u16 used_idx;
     int fw_state;
       /* keep last in the struct */
@@ -804,6 +806,7 @@ static int create_virtqueue(struct 
mlx5_vdpa_net

*ndev, struct mlx5_vdpa_virtque
       obj_context = 
MLX5_ADDR_OF(create_virtio_net_q_in, in,

obj_context);
     MLX5_SET(virtio_net_q_object, obj_context, 
hw_available_index,

mvq->avail_idx);
+    MLX5_SET(virtio_net_q_object, obj_context, hw_used_index,
mvq->used_idx);
     MLX5_SET(virtio_net_q_object, obj_context,
queue_feature_bit_mask_12_3,
get_features_12_3(ndev->mvdev.actual_features));
     vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, 
obj_context,

virtio_q_context);
@@ -1022,6 +1025,7 @@ static int connect_qps(struct 
mlx5_vdpa_net

*ndev, struct mlx5_vdpa_virtqueue *m
     struct mlx5_virtq_attr {
     u8 state;
     u16 available_index;
+    u16 used_index;
     };
       static int query_virtqueue(struct mlx5_vdpa_net 
*ndev, struct

mlx5_vdpa_virtqueue *mvq,
@@ -1052,6 +1056,7 @@ static int query_virtqueue(struct
mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueu
     memset(attr, 0, sizeof(*attr));
     attr->state = MLX5_GET(virtio_net_q_object, 
obj_context, state);

     attr->available_index = MLX5_GET(virtio_net_q_object,
obj_context, hw_available_index);
+    attr->used_index = MLX5_GET(virtio_net_q_object, 
obj_context,

hw_used_index);
     kfree(out);
     return 0;
     @@ -1535,6 +1540,16 @@ static void 
teardown_virtqueues(struct

mlx5_vdpa_net *ndev)
     }
     }
     +static void clear_virtqueues(struct mlx5_vdpa_net *ndev)
+{
+    int i;
+
+    for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {
+    ndev->vqs[i].avail_idx = 0;
+    ndev->vqs[i].used_idx = 0;
+    }
+}
+
     /* TODO: cross-endian support */
     static inline bool mlx5_vdpa_is_little_endian(struct 
mlx5_vdpa_dev

*mvdev)
     {
@@ -1610,6 +1625,7 @@ static int save_channel_info(struct
mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu
     return err;
       ri->avail_index = attr.available_index;
+    ri->used_index = attr.used_index;
     ri->ready = mvq->ready;
     ri->num_ent = mvq->num_ent;
     ri->desc_addr = mvq->desc_addr;
@@ -1654,6 +1670,7 @@ static void restore_channels_info(struct
mlx5_vdpa_net *ndev)
     continue;
       mvq->avail_idx = ri->avail_index;
+    mvq->used_idx = ri->used_index;
     mvq->ready = ri->ready;
     mvq->num_ent = ri->num_ent;
     mvq->desc_addr = ri->desc_addr;
@@ -1768,6 +1785,7 @@ static void mlx5_vdpa_set_status(struct
vdpa_device *vdev, u8 status)
     if (!status) {
     mlx5_vdpa_info(mvdev, "performing device 
reset\n");

     teardown_driver(ndev);
+    clear_virtqueues(ndev);
The clearing looks fine at the first glance, as it aligns 
with the other
state cleanups floating around at the same place. However, 
the thing is
get_vq_state() is supposed to be called right after to get 
sync'ed with
the latest internal avail_index from device while vq is 
stopped. The
index was saved in the driver software a

[PATCH v2 3/3] vdpa/mlx5: defer clear_virtqueues to until DRIVER_OK

2021-02-10 Thread Si-Wei Liu
While the virtq is stopped, get_vq_state() is supposed to be called to
get synced with the latest internal avail_index from the device. The
saved avail_index is used to restate the virtq once the device is
started. Commit b35ccebe3ef7 introduced the clear_virtqueues() routine
to reset the saved avail_index; however, the index gets cleared a bit
too early, before get_vq_state() tries to read it. This would cause
consistency problems when the virtq is restarted, e.g. through a series
of link down and link up events. Defer the clearing of avail_index
until the device is about to be started, i.e. until
VIRTIO_CONFIG_S_DRIVER_OK is set again in set_status().

Fixes: b35ccebe3ef7 ("vdpa/mlx5: Restore the hardware used index after change 
map")
Signed-off-by: Si-Wei Liu 
Acked-by: Jason Wang 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 7c1f789..ce6aae8 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1777,7 +1777,6 @@ static void mlx5_vdpa_set_status(struct vdpa_device 
*vdev, u8 status)
if (!status) {
mlx5_vdpa_info(mvdev, "performing device reset\n");
teardown_driver(ndev);
-   clear_virtqueues(ndev);
mlx5_vdpa_destroy_mr(&ndev->mvdev);
ndev->mvdev.status = 0;
++mvdev->generation;
@@ -1786,6 +1785,7 @@ static void mlx5_vdpa_set_status(struct vdpa_device 
*vdev, u8 status)
 
if ((status ^ ndev->mvdev.status) & VIRTIO_CONFIG_S_DRIVER_OK) {
if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
+   clear_virtqueues(ndev);
err = setup_driver(ndev);
if (err) {
mlx5_vdpa_warn(mvdev, "failed to setup 
driver\n");
-- 
1.8.3.1



[PATCH v2 0/3] mlx5_vdpa bug fixes

2021-02-10 Thread Si-Wei Liu
This set attempts to fix a few independent issues recently found
in mlx5_vdpa driver. Please find details for each in the commit
message.

Patch 1 and patch 3 are already Ack'ed by Jason Wang. Patch 2 is
reworked to move virtio feature capability query to mlx5v_probe()
as suggested by Eli.

--
v1->v2: move feature capability query to probing (Eli)

Si-Wei Liu (3):
  vdpa/mlx5: should exclude header length and fcs from mtu
  vdpa/mlx5: fix feature negotiation across device reset
  vdpa/mlx5: defer clear_virtqueues to until DRIVER_OK

 drivers/vdpa/mlx5/core/mlx5_vdpa.h |  4 
 drivers/vdpa/mlx5/net/mlx5_vnet.c  | 42 +++---
 2 files changed, 34 insertions(+), 12 deletions(-)

-- 
1.8.3.1



[PATCH v2 2/3] vdpa/mlx5: fix feature negotiation across device reset

2021-02-10 Thread Si-Wei Liu
The mlx_features field denotes which set of virtio
features the device is capable of. In principle, this
field need not be cleared during virtio device reset,
as this capability is static and does not change
across resets.

In fact, the current code may have the assumption
that mlx_features can be reloaded from firmware
via the .get_features ops after the device is reset
(via the .set_status ops), which is unfortunately
not true. The userspace VMM might save a copy
of the backend's capable features and won't call into
the kernel again to get them on reset. This causes all
virtio features to get disabled on newly created
virtqs after device reset, while the guest holds a
mismatched view of the available features. For
example, the guest may still assume tx checksum
offload is available after reset and feature
negotiation, causing frames with bogus (incomplete)
checksums to be transmitted on the wire.

Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices")
Signed-off-by: Si-Wei Liu 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 25 +++--
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index b8416c4..7c1f789 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1486,16 +1486,8 @@ static u64 mlx_to_vritio_features(u16 dev_features)
 static u64 mlx5_vdpa_get_features(struct vdpa_device *vdev)
 {
struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
-   struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
-   u16 dev_features;
 
-   dev_features = MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, 
device_features_bits_mask);
-   ndev->mvdev.mlx_features = mlx_to_vritio_features(dev_features);
-   if (MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, virtio_version_1_0))
-   ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_F_VERSION_1);
-   ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_F_ACCESS_PLATFORM);
-   print_features(mvdev, ndev->mvdev.mlx_features, false);
-   return ndev->mvdev.mlx_features;
+   return mvdev->mlx_features;
 }
 
 static int verify_min_features(struct mlx5_vdpa_dev *mvdev, u64 features)
@@ -1788,7 +1780,6 @@ static void mlx5_vdpa_set_status(struct vdpa_device 
*vdev, u8 status)
clear_virtqueues(ndev);
mlx5_vdpa_destroy_mr(&ndev->mvdev);
ndev->mvdev.status = 0;
-   ndev->mvdev.mlx_features = 0;
++mvdev->generation;
return;
}
@@ -1907,6 +1898,19 @@ static int mlx5_get_vq_irq(struct vdpa_device *vdv, u16 
idx)
.free = mlx5_vdpa_free,
 };
 
+static void query_virtio_features(struct mlx5_vdpa_net *ndev)
+{
+   struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
+   u16 dev_features;
+
+   dev_features = MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, 
device_features_bits_mask);
+   mvdev->mlx_features = mlx_to_vritio_features(dev_features);
+   if (MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, virtio_version_1_0))
+   mvdev->mlx_features |= BIT_ULL(VIRTIO_F_VERSION_1);
+   mvdev->mlx_features |= BIT_ULL(VIRTIO_F_ACCESS_PLATFORM);
+   print_features(mvdev, mvdev->mlx_features, false);
+}
+
 static int query_mtu(struct mlx5_core_dev *mdev, u16 *mtu)
 {
u16 hw_mtu;
@@ -2005,6 +2009,7 @@ static int mlx5v_probe(struct auxiliary_device *adev,
init_mvqs(ndev);
mutex_init(&ndev->reslock);
config = &ndev->config;
+   query_virtio_features(ndev);
err = query_mtu(mdev, &ndev->mtu);
if (err)
goto err_mtu;
-- 
1.8.3.1



[PATCH v2 1/3] vdpa/mlx5: should exclude header length and fcs from mtu

2021-02-10 Thread Si-Wei Liu
When the feature VIRTIO_NET_F_MTU is negotiated on mlx5_vdpa,
22 extra bytes worth of MTU length is shown in the guest.
This is because the mlx5_query_port_max_mtu API returns
the "hardware" MTU value, which does not just cover the
Ethernet payload, but includes extra lengths from the
Ethernet header up to the FCS: 14 bytes of Ethernet header,
4 bytes of VLAN tag and 4 bytes of FCS, 22 bytes altogether.
Fix the MTU so packets won't get dropped silently.

Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices")
Signed-off-by: Si-Wei Liu 
Acked-by: Jason Wang 
Acked-by: Eli Cohen 
---
 drivers/vdpa/mlx5/core/mlx5_vdpa.h |  4 
 drivers/vdpa/mlx5/net/mlx5_vnet.c  | 15 ++-
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h 
b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
index 08f742f..b6cc53b 100644
--- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h
+++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
@@ -4,9 +4,13 @@
 #ifndef __MLX5_VDPA_H__
 #define __MLX5_VDPA_H__
 
+#include 
+#include 
 #include 
 #include 
 
+#define MLX5V_ETH_HARD_MTU (ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN)
+
 struct mlx5_vdpa_direct_mr {
u64 start;
u64 end;
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index dc88559..b8416c4 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1907,6 +1907,19 @@ static int mlx5_get_vq_irq(struct vdpa_device *vdv, u16 
idx)
.free = mlx5_vdpa_free,
 };
 
+static int query_mtu(struct mlx5_core_dev *mdev, u16 *mtu)
+{
+   u16 hw_mtu;
+   int err;
+
+   err = mlx5_query_nic_vport_mtu(mdev, &hw_mtu);
+   if (err)
+   return err;
+
+   *mtu = hw_mtu - MLX5V_ETH_HARD_MTU;
+   return 0;
+}
+
 static int alloc_resources(struct mlx5_vdpa_net *ndev)
 {
struct mlx5_vdpa_net_resources *res = &ndev->res;
@@ -1992,7 +2005,7 @@ static int mlx5v_probe(struct auxiliary_device *adev,
init_mvqs(ndev);
mutex_init(&ndev->reslock);
config = &ndev->config;
-   err = mlx5_query_nic_vport_mtu(mdev, &ndev->mtu);
+   err = query_mtu(mdev, &ndev->mtu);
if (err)
goto err_mtu;
 
-- 
1.8.3.1



Re: [PATCH] vdpa/mlx5: set_features should allow reset to zero

2021-02-23 Thread Si-Wei Liu




On 2/23/2021 5:26 AM, Michael S. Tsirkin wrote:

On Tue, Feb 23, 2021 at 10:03:57AM +0800, Jason Wang wrote:

On 2021/2/23 9:12 上午, Si-Wei Liu wrote:


On 2/21/2021 11:34 PM, Michael S. Tsirkin wrote:

On Mon, Feb 22, 2021 at 12:14:17PM +0800, Jason Wang wrote:

On 2021/2/19 7:54 下午, Si-Wei Liu wrote:

Commit 452639a64ad8 ("vdpa: make sure set_features is invoked
for legacy") made an exception for legacy guests to reset
features to 0, when config space is accessed before features
are set. We should relieve the verify_min_features() check
and allow features reset to 0 for this case.

It's worth noting that not just legacy guests could access
config space before features are set. For instance, when
feature VIRTIO_NET_F_MTU is advertised some modern driver
will try to access and validate the MTU present in the config
space before virtio features are set.

This looks like a spec violation:

"

The following driver-read-only field, mtu only exists if
VIRTIO_NET_F_MTU is
set.
This field specifies the maximum MTU for the driver to use.
"

Do we really want to workaround this?

Thanks

And also:

The driver MUST follow this sequence to initialize a device:
1. Reset the device.
2. Set the ACKNOWLEDGE status bit: the guest OS has noticed the device.
3. Set the DRIVER status bit: the guest OS knows how to drive the
device.
4. Read device feature bits, and write the subset of feature bits
understood by the OS and driver to the
device. During this step the driver MAY read (but MUST NOT write)
the device-specific configuration
fields to check that it can support the device before accepting it.
5. Set the FEATURES_OK status bit. The driver MUST NOT accept new
feature bits after this step.
6. Re-read device status to ensure the FEATURES_OK bit is still set:
otherwise, the device does not
support our subset of features and the device is unusable.
7. Perform device-specific setup, including discovery of virtqueues
for the device, optional per-bus setup,
reading and possibly writing the device’s virtio configuration
space, and population of virtqueues.
8. Set the DRIVER_OK status bit. At this point the device is “live”.


so accessing config space before FEATURES_OK is a spec violation, right?

It is, but it's not relevant to what this commit tries to address. I
thought the legacy guest still needs to be supported.

Having said that, a separate patch has to be posted to fix the guest driver
issue where this discrepancy was introduced in virtnet_validate() (since
commit fe36cbe067). But it's not technically related to this patch.

-Siwei


I think it's a bug to read config space in validate, we should move it to
virtnet_probe().

Thanks

I take it back, reading but not writing seems to be explicitly allowed by spec.
So our way to detect a legacy guest is bogus, need to think what is
the best way to handle this.
Then maybe revert commit fe36cbe067 and friends, and have QEMU detect 
legacy guest? Supposedly only config space write access needs to be 
guarded before setting FEATURES_OK.


-Siwei




Rejecting reset to 0
prematurely prevents the correct MTU and link status from being
loaded on the very first config space access, rendering issues like
the guest showing an inaccurate MTU value, or failure to reject
an out-of-range MTU.

Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for
supported mlx5 devices")
Signed-off-by: Si-Wei Liu 
---
    drivers/vdpa/mlx5/net/mlx5_vnet.c | 15 +--
    1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 7c1f789..540dd67 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1490,14 +1490,6 @@ static u64
mlx5_vdpa_get_features(struct vdpa_device *vdev)
    return mvdev->mlx_features;
    }
-static int verify_min_features(struct mlx5_vdpa_dev *mvdev,
u64 features)
-{
-    if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)))
-    return -EOPNOTSUPP;
-
-    return 0;
-}
-
    static int setup_virtqueues(struct mlx5_vdpa_net *ndev)
    {
    int err;
@@ -1558,18 +1550,13 @@ static int
mlx5_vdpa_set_features(struct vdpa_device *vdev, u64
features)
    {
    struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
    struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
-    int err;
    print_features(mvdev, features, true);
-    err = verify_min_features(mvdev, features);
-    if (err)
-    return err;
-
    ndev->mvdev.actual_features = features &
ndev->mvdev.mlx_features;
    ndev->config.mtu = cpu_to_mlx5vdpa16(mvdev, ndev->mtu);
    ndev->config.status |= cpu_to_mlx5vdpa16(mvdev,
VIRTIO_NET_S_LINK_UP);
-    return err;
+    return 0;
    }
    static void mlx5_vdpa_set_config_cb(struct vdpa_device
*vdev, struct vdpa_callback *cb)




Re: [PATCH] vdpa/mlx5: set_features should allow reset to zero

2021-02-24 Thread Si-Wei Liu




On 2/23/2021 9:04 PM, Michael S. Tsirkin wrote:

On Tue, Feb 23, 2021 at 11:35:57AM -0800, Si-Wei Liu wrote:


On 2/23/2021 5:26 AM, Michael S. Tsirkin wrote:

On Tue, Feb 23, 2021 at 10:03:57AM +0800, Jason Wang wrote:

On 2021/2/23 9:12 上午, Si-Wei Liu wrote:

On 2/21/2021 11:34 PM, Michael S. Tsirkin wrote:

On Mon, Feb 22, 2021 at 12:14:17PM +0800, Jason Wang wrote:

On 2021/2/19 7:54 下午, Si-Wei Liu wrote:

Commit 452639a64ad8 ("vdpa: make sure set_features is invoked
for legacy") made an exception for legacy guests to reset
features to 0, when config space is accessed before features
are set. We should relieve the verify_min_features() check
and allow features reset to 0 for this case.

It's worth noting that not just legacy guests could access
config space before features are set. For instance, when
feature VIRTIO_NET_F_MTU is advertised some modern driver
will try to access and validate the MTU present in the config
space before virtio features are set.

This looks like a spec violation:

"

The following driver-read-only field, mtu only exists if
VIRTIO_NET_F_MTU is
set.
This field specifies the maximum MTU for the driver to use.
"

Do we really want to workaround this?

Thanks

And also:

The driver MUST follow this sequence to initialize a device:
1. Reset the device.
2. Set the ACKNOWLEDGE status bit: the guest OS has noticed the device.
3. Set the DRIVER status bit: the guest OS knows how to drive the
device.
4. Read device feature bits, and write the subset of feature bits
understood by the OS and driver to the
device. During this step the driver MAY read (but MUST NOT write)
the device-specific configuration
fields to check that it can support the device before accepting it.
5. Set the FEATURES_OK status bit. The driver MUST NOT accept new
feature bits after this step.
6. Re-read device status to ensure the FEATURES_OK bit is still set:
otherwise, the device does not
support our subset of features and the device is unusable.
7. Perform device-specific setup, including discovery of virtqueues
for the device, optional per-bus setup,
reading and possibly writing the device’s virtio configuration
space, and population of virtqueues.
8. Set the DRIVER_OK status bit. At this point the device is “live”.


so accessing config space before FEATURES_OK is a spec violation, right?

It is, but it's not relevant to what this commit tries to address. I
thought the legacy guest still needs to be supported.

Having said that, a separate patch has to be posted to fix the guest driver
issue where this discrepancy was introduced in virtnet_validate() (since
commit fe36cbe067). But it's not technically related to this patch.

-Siwei

I think it's a bug to read config space in validate, we should move it to
virtnet_probe().

Thanks

I take it back, reading but not writing seems to be explicitly allowed by spec.
So our way to detect a legacy guest is bogus, need to think what is
the best way to handle this.

Then maybe revert commit fe36cbe067 and friends, and have QEMU detect legacy
guest? Supposedly only config space write access needs to be guarded before
setting FEATURES_OK.

-Siwei

Detecting it isn't enough though, we will need a new ioctl to notify
the kernel that it's a legacy guest. Ugh :(
Well, although I think adding an ioctl is doable, may I know what the
use case will be for the kernel to leverage such info directly? Is
there a case QEMU can't handle with dedicated ioctls later, if there's
indeed differentiation (legacy vs. modern) needed?


One of the reasons I asked is whether this ioctl becomes a mandate for
the vhost-vdpa kernel. Would QEMU reject initializing vhost-vdpa if it
doesn't see this ioctl?


If it's optional, suppose the kernel may need it only when it becomes 
necessary?


Thanks,
-Siwei






Rejecting reset to 0
prematurely prevents the correct MTU and link status from being
loaded on the very first config space access, rendering issues like
the guest showing an inaccurate MTU value, or failure to reject
an out-of-range MTU.

Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for
supported mlx5 devices")
Signed-off-by: Si-Wei Liu 
---
     drivers/vdpa/mlx5/net/mlx5_vnet.c | 15 +--
     1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 7c1f789..540dd67 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1490,14 +1490,6 @@ static u64
mlx5_vdpa_get_features(struct vdpa_device *vdev)
     return mvdev->mlx_features;
     }
-static int verify_min_features(struct mlx5_vdpa_dev *mvdev,
u64 features)
-{
-    if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)))
-    return -EOPNOTSUPP;
-
-    return 0;
-}
-
     static int setup_virtqueues(struct mlx5_vdpa_net *ndev)
     {
     int err;
@@ -1558,18 +1550,13 @@ static int
mlx5_vdpa_set_features(struct vdpa_device *vdev, u64

Re: [PATCH v3] vhost-vdpa: fix page pinning leakage in error path (rework)

2020-11-25 Thread si-wei liu



On 11/25/2020 1:30 AM, Michael S. Tsirkin wrote:

On Thu, Nov 05, 2020 at 06:26:33PM -0500, Si-Wei Liu wrote:

Pinned pages are not properly accounted particularly when
mapping error occurs on IOTLB update. Clean up dangling
pinned pages for the error path.

The memory usage for bookkeeping pinned pages is reverted
to what it was before: only one single free page is needed.
This helps reduce the host memory demand for VM with a large
amount of memory, or in the situation where host is running
short of free memory.

Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend")
Signed-off-by: Si-Wei Liu 


Not sure which tree this is against, I had to apply this with
minor tweaks. Pls take a look at the vhost tree and
let me know whether it looks ok to you.
Thanks Michael, the commit ad89653f79f1882d55d9df76c9b2b94f008c4e27 in 
the vhost tree looks good. Sorry, I don't think I ever attempted to 
merge with linux-next when v3 was posted, although I did it for the 
first two submissions. I will pay attention to it next time.


Thanks,
-Siwei




---
Changes in v3:
- Turn explicit last_pfn check to a WARN_ON() (Jason)

Changes in v2:
- Drop the reversion patch
- Fix unhandled page leak towards the end of page_list

  drivers/vhost/vdpa.c | 80 
  1 file changed, 62 insertions(+), 18 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index b6d9016..5b13dfd 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -560,6 +560,8 @@ static int vhost_vdpa_map(struct vhost_vdpa *v,
  
  	if (r)

vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1);
+   else
+   atomic64_add(size >> PAGE_SHIFT, &dev->mm->pinned_vm);
  
  	return r;

  }
@@ -591,14 +593,16 @@ static int vhost_vdpa_process_iotlb_update(struct 
vhost_vdpa *v,
unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
unsigned int gup_flags = FOLL_LONGTERM;
unsigned long npages, cur_base, map_pfn, last_pfn = 0;
-   unsigned long locked, lock_limit, pinned, i;
+   unsigned long lock_limit, sz2pin, nchunks, i;
u64 iova = msg->iova;
+   long pinned;
int ret = 0;
  
  	if (vhost_iotlb_itree_first(iotlb, msg->iova,

msg->iova + msg->size - 1))
return -EEXIST;
  
+	/* Limit the use of memory for bookkeeping */

page_list = (struct page **) __get_free_page(GFP_KERNEL);
if (!page_list)
return -ENOMEM;
@@ -607,52 +611,75 @@ static int vhost_vdpa_process_iotlb_update(struct 
vhost_vdpa *v,
gup_flags |= FOLL_WRITE;
  
  	npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> PAGE_SHIFT;

-   if (!npages)
-   return -EINVAL;
+   if (!npages) {
+   ret = -EINVAL;
+   goto free;
+   }
  
  	mmap_read_lock(dev->mm);
  
-	locked = atomic64_add_return(npages, &dev->mm->pinned_vm);

lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-   if (locked > lock_limit) {
+   if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
ret = -ENOMEM;
-   goto out;
+   goto unlock;
}
  
  	cur_base = msg->uaddr & PAGE_MASK;

iova &= PAGE_MASK;
+   nchunks = 0;
  
  	while (npages) {

-   pinned = min_t(unsigned long, npages, list_size);
-   ret = pin_user_pages(cur_base, pinned,
-gup_flags, page_list, NULL);
-   if (ret != pinned)
+   sz2pin = min_t(unsigned long, npages, list_size);
+   pinned = pin_user_pages(cur_base, sz2pin,
+   gup_flags, page_list, NULL);
+   if (sz2pin != pinned) {
+   if (pinned < 0) {
+   ret = pinned;
+   } else {
+   unpin_user_pages(page_list, pinned);
+   ret = -ENOMEM;
+   }
goto out;
+   }
+   nchunks++;
  
  		if (!last_pfn)

map_pfn = page_to_pfn(page_list[0]);
  
-		for (i = 0; i < ret; i++) {

+   for (i = 0; i < pinned; i++) {
unsigned long this_pfn = page_to_pfn(page_list[i]);
u64 csize;
  
  			if (last_pfn && (this_pfn != last_pfn + 1)) {

/* Pin a contiguous chunk of memory */
csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
-   if (vhost_vdpa_map(v, iova, csize,
-  map_pfn << PAGE_SHIFT,
-  msg->perm))
+ 
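
As an aside on the bookkeeping constraint described in the commit message
(a single free page tracks each chunk of pinned pages), here is a tiny
userspace model, not kernel code, assuming a 64-bit build with 4 KiB pages;
the loop mirrors how the region is split into pin-sized chunks and how only
pages that were actually pinned get accounted:

#include <stdio.h>

#define PAGE_SIZE   4096UL
#define PAGE_SHIFT  12

int main(void)
{
	/* One bookkeeping page holds this many page pointers on 64-bit. */
	unsigned long list_size = PAGE_SIZE / sizeof(void *);   /* 512 */
	unsigned long size = 64UL << 20;                         /* 64 MiB region */
	unsigned long npages = size >> PAGE_SHIFT;
	unsigned long pinned_vm = 0, chunks = 0;

	/* Pin in chunks no larger than one bookkeeping page can track. */
	while (npages) {
		unsigned long sz2pin = npages < list_size ? npages : list_size;

		/* a pin_user_pages(cur_base, sz2pin, ...) call would go here */
		pinned_vm += sz2pin;   /* account only what was actually pinned */
		chunks++;
		npages -= sz2pin;
	}

	printf("entries per bookkeeping page: %lu\n", list_size);
	printf("chunks for 64 MiB: %lu, pages accounted: %lu\n", chunks, pinned_vm);
	return 0;
}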

Re: [PATCH] vdpa/mlx5: set_features should allow reset to zero

2021-02-22 Thread Si-Wei Liu




On 2/21/2021 8:14 PM, Jason Wang wrote:


On 2021/2/19 7:54 下午, Si-Wei Liu wrote:

Commit 452639a64ad8 ("vdpa: make sure set_features is invoked
for legacy") made an exception for legacy guests to reset
features to 0, when config space is accessed before features
are set. We should relieve the verify_min_features() check
and allow features reset to 0 for this case.

It's worth noting that not just legacy guests could access
config space before features are set. For instance, when
feature VIRTIO_NET_F_MTU is advertised some modern driver
will try to access and validate the MTU present in the config
space before virtio features are set.



This looks like a spec violation:

"

The following driver-read-only field, mtu only exists if 
VIRTIO_NET_F_MTU is set. This field specifies the maximum MTU for the 
driver to use.

"

Do we really want to workaround this?


Isn't commit 452639a64ad8 itself a workaround for legacy guests?

I think the point is, since there are legacy guests we'd have to support,
this host-side workaround is unavoidable. Although I agree the violating
driver should be fixed (yes, it's in today's upstream kernel and has
existed for a while now).


-Siwei



Thanks



Rejecting reset to 0
prematurely prevents the correct MTU and link status from being
loaded on the very first config space access, rendering issues like
the guest showing an inaccurate MTU value, or failure to reject
an out-of-range MTU.

Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 
devices")

Signed-off-by: Si-Wei Liu 
---
  drivers/vdpa/mlx5/net/mlx5_vnet.c | 15 +--
  1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c

index 7c1f789..540dd67 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1490,14 +1490,6 @@ static u64 mlx5_vdpa_get_features(struct 
vdpa_device *vdev)

  return mvdev->mlx_features;
  }
  -static int verify_min_features(struct mlx5_vdpa_dev *mvdev, u64 
features)

-{
-    if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)))
-    return -EOPNOTSUPP;
-
-    return 0;
-}
-
  static int setup_virtqueues(struct mlx5_vdpa_net *ndev)
  {
  int err;
@@ -1558,18 +1550,13 @@ static int mlx5_vdpa_set_features(struct 
vdpa_device *vdev, u64 features)

  {
  struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
  struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
-    int err;
    print_features(mvdev, features, true);
  -    err = verify_min_features(mvdev, features);
-    if (err)
-    return err;
-
  ndev->mvdev.actual_features = features & ndev->mvdev.mlx_features;
  ndev->config.mtu = cpu_to_mlx5vdpa16(mvdev, ndev->mtu);
  ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, 
VIRTIO_NET_S_LINK_UP);

-    return err;
+    return 0;
  }
    static void mlx5_vdpa_set_config_cb(struct vdpa_device *vdev, 
struct vdpa_callback *cb)






Re: [PATCH] vdpa/mlx5: set_features should allow reset to zero

2021-02-22 Thread Si-Wei Liu




On 2/21/2021 11:34 PM, Michael S. Tsirkin wrote:

On Mon, Feb 22, 2021 at 12:14:17PM +0800, Jason Wang wrote:

On 2021/2/19 7:54 下午, Si-Wei Liu wrote:

Commit 452639a64ad8 ("vdpa: make sure set_features is invoked
for legacy") made an exception for legacy guests to reset
features to 0, when config space is accessed before features
are set. We should relieve the verify_min_features() check
and allow features reset to 0 for this case.

It's worth noting that not just legacy guests could access
config space before features are set. For instance, when
feature VIRTIO_NET_F_MTU is advertised some modern driver
will try to access and validate the MTU present in the config
space before virtio features are set.


This looks like a spec violation:

"

The following driver-read-only field, mtu only exists if VIRTIO_NET_F_MTU is
set.
This field specifies the maximum MTU for the driver to use.
"

Do we really want to workaround this?

Thanks

And also:

The driver MUST follow this sequence to initialize a device:
1. Reset the device.
2. Set the ACKNOWLEDGE status bit: the guest OS has noticed the device.
3. Set the DRIVER status bit: the guest OS knows how to drive the device.
4. Read device feature bits, and write the subset of feature bits understood by 
the OS and driver to the
device. During this step the driver MAY read (but MUST NOT write) the 
device-specific configuration
fields to check that it can support the device before accepting it.
5. Set the FEATURES_OK status bit. The driver MUST NOT accept new feature bits 
after this step.
6. Re-read device status to ensure the FEATURES_OK bit is still set: otherwise, 
the device does not
support our subset of features and the device is unusable.
7. Perform device-specific setup, including discovery of virtqueues for the 
device, optional per-bus setup,
reading and possibly writing the device’s virtio configuration space, and 
population of virtqueues.
8. Set the DRIVER_OK status bit. At this point the device is “live”.


so accessing config space before FEATURES_OK is a spec violation, right?
It is, but it's not relevant to what this commit tries to address. I 
thought the legacy guest still needs to be supported.


Having said that, a separate patch has to be posted to fix the guest driver
issue where this discrepancy was introduced in virtnet_validate() (since
commit fe36cbe067). But it's not technically related to this patch.


-Siwei





Rejecting reset to 0
prematurely prevents the correct MTU and link status from being
loaded on the very first config space access, rendering issues like
the guest showing an inaccurate MTU value, or failure to reject
an out-of-range MTU.

Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices")
Signed-off-by: Si-Wei Liu 
---
   drivers/vdpa/mlx5/net/mlx5_vnet.c | 15 +--
   1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 7c1f789..540dd67 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1490,14 +1490,6 @@ static u64 mlx5_vdpa_get_features(struct vdpa_device 
*vdev)
return mvdev->mlx_features;
   }
-static int verify_min_features(struct mlx5_vdpa_dev *mvdev, u64 features)
-{
-   if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)))
-   return -EOPNOTSUPP;
-
-   return 0;
-}
-
   static int setup_virtqueues(struct mlx5_vdpa_net *ndev)
   {
int err;
@@ -1558,18 +1550,13 @@ static int mlx5_vdpa_set_features(struct vdpa_device 
*vdev, u64 features)
   {
struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
-   int err;
print_features(mvdev, features, true);
-   err = verify_min_features(mvdev, features);
-   if (err)
-   return err;
-
ndev->mvdev.actual_features = features & ndev->mvdev.mlx_features;
ndev->config.mtu = cpu_to_mlx5vdpa16(mvdev, ndev->mtu);
ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
-   return err;
+   return 0;
   }
   static void mlx5_vdpa_set_config_cb(struct vdpa_device *vdev, struct 
vdpa_callback *cb)




Re: [PATCH 2/3] mlx5_vdpa: fix feature negotiation across device reset

2021-02-08 Thread Si-Wei Liu




On 2/7/2021 9:35 PM, Eli Cohen wrote:

On Sat, Feb 06, 2021 at 04:29:23AM -0800, Si-Wei Liu wrote:

The mlx_features field denotes which set of virtio
features the device is capable of supporting. In
principle, this field need not be cleared during
virtio device reset, as this capability is static
and does not change across resets.

In fact, the current code may have the assumption
that mlx_features can be reloaded from firmware
via the .get_features ops after device is reset
(via the .set_status ops), which is unfortunately
not true. The userspace VMM might save a copy
of backend capable features and won't call into
kernel again to get it on reset. This causes all
virtio features getting disabled on newly created
virtqs after device reset, while guest would hold
mismatched view of available features. For example,
the guest may still assume tx checksum offload
is available after reset and feature negotiation,
causing frames with bogus (incomplete) checksum
transmitted on the wire.

Signed-off-by: Si-Wei Liu 
---
  drivers/vdpa/mlx5/net/mlx5_vnet.c | 1 -
  1 file changed, 1 deletion(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index b8416c4..aa6f8cd 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1788,7 +1788,6 @@ static void mlx5_vdpa_set_status(struct vdpa_device 
*vdev, u8 status)
clear_virtqueues(ndev);
mlx5_vdpa_destroy_mr(&ndev->mvdev);
ndev->mvdev.status = 0;
-   ndev->mvdev.mlx_features = 0;
++mvdev->generation;
return;
}

Since we assume that device capabilities don't change, I think I would
get the features through a call done in mlx5v_probe after the netdev
object is created and change mlx5_vdpa_get_features() to just return
ndev->mvdev.mlx_features.
Yep, it makes sense. Will post a revised patch. If vdpa tool allows 
reconfiguration post probing, the code has to be reconciled then.




Did you actually see this issue in action? If you did, can you share
with us how you triggered this?
The issue is indeed seen in action. The mismatched tx-checksum offload
described in the commit message was one such example. You would need
a guest reboot though to reproduce it: the reboot triggers a device reset
via the .set_status ops, and the zeroed mlx_features is then used to
deduce the new actual_features for creating the h/w virtq object.


-Siwei



--
1.8.3.1
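
To illustrate the point discussed above, here is a minimal standalone
sketch (illustrative structure and function names only, not the mlx5_vdpa
code) of the approach Eli suggests: query the static device capability once
at probe time and leave it untouched on device reset, so only the
negotiated features are cleared:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical device model; names are illustrative only. */
struct vdpa_dev {
	uint64_t mlx_features;    /* device capability, static across resets */
	uint64_t actual_features; /* negotiated features, cleared on reset   */
};

static uint64_t hw_query_features(void)
{
	return 0x123ULL;          /* stand-in for the firmware query */
}

static void probe(struct vdpa_dev *d)
{
	/* Query once here, so .get_features can simply return the cache. */
	d->mlx_features = hw_query_features();
}

static void device_reset(struct vdpa_dev *d)
{
	d->actual_features = 0;   /* negotiated state is wiped ...           */
	/* ... but d->mlx_features is deliberately left untouched.           */
}

int main(void)
{
	struct vdpa_dev d = {0};

	probe(&d);
	device_reset(&d);
	printf("capability after reset: 0x%llx\n",
	       (unsigned long long)d.mlx_features);
	return 0;
}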





Re: [PATCH 3/3] mlx5_vdpa: defer clear_virtqueues to until DRIVER_OK

2021-02-08 Thread Si-Wei Liu




On 2/7/2021 9:48 PM, Eli Cohen wrote:

On Sat, Feb 06, 2021 at 04:29:24AM -0800, Si-Wei Liu wrote:

While virtq is stopped,  get_vq_state() is supposed to
be  called to  get  sync'ed  with  the latest internal
avail_index from device. The saved avail_index is used
to restate  the virtq  once device is started.  Commit
b35ccebe3ef7 introduced the clear_virtqueues() routine
to  reset  the saved  avail_index,  however, the index
gets cleared a bit earlier before get_vq_state() tries
to read it. This would cause consistency problems when
virtq is restarted, e.g. through a series of link down
and link up events. We  could  defer  the  clearing of
avail_index  to  until  the  device  is to be started,
i.e. until  VIRTIO_CONFIG_S_DRIVER_OK  is set again in
set_status().


Not sure I understand the scenario. You are talking about reset of the
device followed by up/down events on the interface. How can you trigger
this?
Currently it's not possible to trigger link up/down events with upstream
QEMU due to the lack of a config/control interrupt implementation. And live
migration could be another scenario that cannot be satisfied because of
inconsistent queue state. They share the same root cause as captured
here.


-Siwei




Fixes: b35ccebe3ef7 ("vdpa/mlx5: Restore the hardware used index after change 
map")
Signed-off-by: Si-Wei Liu 
---
  drivers/vdpa/mlx5/net/mlx5_vnet.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index aa6f8cd..444ab58 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1785,7 +1785,6 @@ static void mlx5_vdpa_set_status(struct vdpa_device 
*vdev, u8 status)
if (!status) {
mlx5_vdpa_info(mvdev, "performing device reset\n");
teardown_driver(ndev);
-   clear_virtqueues(ndev);
mlx5_vdpa_destroy_mr(&ndev->mvdev);
ndev->mvdev.status = 0;
++mvdev->generation;
@@ -1794,6 +1793,7 @@ static void mlx5_vdpa_set_status(struct vdpa_device 
*vdev, u8 status)
  
  	if ((status ^ ndev->mvdev.status) & VIRTIO_CONFIG_S_DRIVER_OK) {

if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
+   clear_virtqueues(ndev);
err = setup_driver(ndev);
if (err) {
mlx5_vdpa_warn(mvdev, "failed to setup 
driver\n");
--
1.8.3.1
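
A small standalone sketch (hypothetical names, not the driver's data
structures) of the ordering issue this patch addresses: if the saved index
is cleared at reset time, a later get_vq_state() reads back 0 instead of
the value captured at suspend; deferring the clear to DRIVER_OK keeps it
available to userspace:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct vq {
	bool     initialized;
	uint16_t avail_idx;   /* last avail index saved when the vq is stopped */
};

static void suspend_vq(struct vq *q, uint16_t hw_avail_idx)
{
	q->avail_idx = hw_avail_idx;  /* captured before the hw object goes away */
	q->initialized = false;
}

static uint16_t get_vq_state(const struct vq *q)
{
	/* With the hw object gone, fall back to the value saved at suspend. */
	return q->avail_idx;
}

static void driver_ok(struct vq *q)
{
	/* Deferred clearing: wipe the saved index only when the device is
	 * (re)started, after userspace has had its chance to read it. */
	q->avail_idx = 0;
	q->initialized = true;
}

int main(void)
{
	struct vq q = { .initialized = true };

	suspend_vq(&q, 5);                                   /* device reset path */
	printf("restored index: %u\n", (unsigned)get_vq_state(&q)); /* 5, not 0   */
	driver_ok(&q);                                       /* cleared only here */
	return 0;
}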





Re: [PATCH 3/3] mlx5_vdpa: defer clear_virtqueues to until DRIVER_OK

2021-02-09 Thread Si-Wei Liu




On 2/8/2021 7:37 PM, Jason Wang wrote:


On 2021/2/6 下午8:29, Si-Wei Liu wrote:

While virtq is stopped,  get_vq_state() is supposed to
be  called to  get  sync'ed  with  the latest internal
avail_index from device. The saved avail_index is used
to restate  the virtq  once device is started.  Commit
b35ccebe3ef7 introduced the clear_virtqueues() routine
to  reset  the saved  avail_index,  however, the index
gets cleared a bit earlier before get_vq_state() tries
to read it. This would cause consistency problems when
virtq is restarted, e.g. through a series of link down
and link up events. We  could  defer  the  clearing of
avail_index  to  until  the  device  is to be started,
i.e. until  VIRTIO_CONFIG_S_DRIVER_OK  is set again in
set_status().

Fixes: b35ccebe3ef7 ("vdpa/mlx5: Restore the hardware used index 
after change map")

Signed-off-by: Si-Wei Liu 
---
  drivers/vdpa/mlx5/net/mlx5_vnet.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c

index aa6f8cd..444ab58 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1785,7 +1785,6 @@ static void mlx5_vdpa_set_status(struct 
vdpa_device *vdev, u8 status)

  if (!status) {
  mlx5_vdpa_info(mvdev, "performing device reset\n");
  teardown_driver(ndev);
-    clear_virtqueues(ndev);
  mlx5_vdpa_destroy_mr(&ndev->mvdev);
  ndev->mvdev.status = 0;
  ++mvdev->generation;
@@ -1794,6 +1793,7 @@ static void mlx5_vdpa_set_status(struct 
vdpa_device *vdev, u8 status)

    if ((status ^ ndev->mvdev.status) & VIRTIO_CONFIG_S_DRIVER_OK) {
  if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
+    clear_virtqueues(ndev);



Rethink about this. As mentioned in another thread, this in fact 
breaks set_vq_state().  (See vhost_virtqueue_start() -> 
vhost_vdpa_set_vring_base() in qemu codes).

I assume that the clearing for vhost-vdpa would be done via (qemu code),

vhost_dev_start()->vhost_vdpa_dev_start()->vhost_vdpa_call(status | 
VIRTIO_CONFIG_S_DRIVER_OK)


which is _after_ vhost_virtqueue_start() gets called to restore the 
avail_idx to h/w in vhost_dev_start(). What am I missing here?


-Siwei




The issue is that the avail idx is forgotten; we need to keep it.

Thanks



  err = setup_driver(ndev);
  if (err) {
  mlx5_vdpa_warn(mvdev, "failed to setup driver\n");






Re: [PATCH v1] vdpa/mlx5: Restore the hardware used index after change map

2021-02-09 Thread Si-Wei Liu




On 2/8/2021 10:37 PM, Jason Wang wrote:


On 2021/2/9 下午2:12, Eli Cohen wrote:

On Tue, Feb 09, 2021 at 11:20:14AM +0800, Jason Wang wrote:

On 2021/2/8 下午6:04, Eli Cohen wrote:

On Mon, Feb 08, 2021 at 05:04:27PM +0800, Jason Wang wrote:

On 2021/2/8 下午2:37, Eli Cohen wrote:

On Mon, Feb 08, 2021 at 12:27:18PM +0800, Jason Wang wrote:

On 2021/2/6 上午7:07, Si-Wei Liu wrote:

On 2/3/2021 11:36 PM, Eli Cohen wrote:
When a change of memory map occurs, the hardware resources are 
destroyed
and then re-created again with the new memory map. In such 
case, we need
to restore the hardware available and used indices. The driver 
failed to

restore the used index which is added here.

Also, since the driver also fails to reset the available and used
indices upon device reset, fix this here to avoid regression 
caused by

the fact that used index may not be zero upon device reset.

Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported 
mlx5

devices")
Signed-off-by: Eli Cohen
---
v0 -> v1:
Clear indices upon device reset

     drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 ++
     1 file changed, 18 insertions(+)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 88dde3455bfd..b5fe6d2ad22f 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -87,6 +87,7 @@ struct mlx5_vq_restore_info {
     u64 device_addr;
     u64 driver_addr;
     u16 avail_index;
+    u16 used_index;
     bool ready;
     struct vdpa_callback cb;
     bool restore;
@@ -121,6 +122,7 @@ struct mlx5_vdpa_virtqueue {
     u32 virtq_id;
     struct mlx5_vdpa_net *ndev;
     u16 avail_idx;
+    u16 used_idx;
     int fw_state;
       /* keep last in the struct */
@@ -804,6 +806,7 @@ static int create_virtqueue(struct 
mlx5_vdpa_net

*ndev, struct mlx5_vdpa_virtque
       obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in,
obj_context);
     MLX5_SET(virtio_net_q_object, obj_context, 
hw_available_index,

mvq->avail_idx);
+    MLX5_SET(virtio_net_q_object, obj_context, hw_used_index,
mvq->used_idx);
     MLX5_SET(virtio_net_q_object, obj_context,
queue_feature_bit_mask_12_3,
get_features_12_3(ndev->mvdev.actual_features));
     vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context,
virtio_q_context);
@@ -1022,6 +1025,7 @@ static int connect_qps(struct mlx5_vdpa_net
*ndev, struct mlx5_vdpa_virtqueue *m
     struct mlx5_virtq_attr {
     u8 state;
     u16 available_index;
+    u16 used_index;
     };
       static int query_virtqueue(struct mlx5_vdpa_net *ndev, 
struct

mlx5_vdpa_virtqueue *mvq,
@@ -1052,6 +1056,7 @@ static int query_virtqueue(struct
mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueu
     memset(attr, 0, sizeof(*attr));
     attr->state = MLX5_GET(virtio_net_q_object, 
obj_context, state);

     attr->available_index = MLX5_GET(virtio_net_q_object,
obj_context, hw_available_index);
+    attr->used_index = MLX5_GET(virtio_net_q_object, 
obj_context,

hw_used_index);
     kfree(out);
     return 0;
     @@ -1535,6 +1540,16 @@ static void 
teardown_virtqueues(struct

mlx5_vdpa_net *ndev)
     }
     }
     +static void clear_virtqueues(struct mlx5_vdpa_net *ndev)
+{
+    int i;
+
+    for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {
+    ndev->vqs[i].avail_idx = 0;
+    ndev->vqs[i].used_idx = 0;
+    }
+}
+
     /* TODO: cross-endian support */
     static inline bool mlx5_vdpa_is_little_endian(struct 
mlx5_vdpa_dev

*mvdev)
     {
@@ -1610,6 +1625,7 @@ static int save_channel_info(struct
mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu
     return err;
       ri->avail_index = attr.available_index;
+    ri->used_index = attr.used_index;
     ri->ready = mvq->ready;
     ri->num_ent = mvq->num_ent;
     ri->desc_addr = mvq->desc_addr;
@@ -1654,6 +1670,7 @@ static void restore_channels_info(struct
mlx5_vdpa_net *ndev)
     continue;
       mvq->avail_idx = ri->avail_index;
+    mvq->used_idx = ri->used_index;
     mvq->ready = ri->ready;
     mvq->num_ent = ri->num_ent;
     mvq->desc_addr = ri->desc_addr;
@@ -1768,6 +1785,7 @@ static void mlx5_vdpa_set_status(struct
vdpa_device *vdev, u8 status)
     if (!status) {
     mlx5_vdpa_info(mvdev, "performing device reset\n");
     teardown_driver(ndev);
+    clear_virtqueues(ndev);
The clearing looks fine at the first glance, as it aligns with 
the other
state cleanups floating around at the same place. However, the 
thing is
get_vq_state() is supposed to be called right after to get 
sync'ed with
the latest internal avail_index from device while vq is 
stopped. The
index was saved in the driver software at vq suspension, but 
before the
virtq object is destroyed. We shouldn't clear the avai

Re: [PATCH] vdpa/mlx5: set_features should allow reset to zero

2021-02-25 Thread Si-Wei Liu



Hi Michael,

Are you okay living without this ioctl for now? I think QEMU is the one
that needs to be fixed and will have to be made legacy-guest aware. I
think the kernel can just honor the feature negotiation result done by
QEMU and do as it's told. Will you agree?


If that's fine, I would proceed with reverting commit fe36cbe067 and the
related code in question from the kernel.


Thanks,
-Siwei

On 2/24/2021 10:24 AM, Si-Wei Liu wrote:

Detecting it isn't enough though, we will need a new ioctl to notify
the kernel that it's a legacy guest. Ugh :(
Well, although I think adding an ioctl is doable, may I know what the
use case will be for the kernel to leverage such info directly? Is
there a case QEMU can't handle with dedicated ioctls later, if there's
indeed differentiation (legacy vs. modern) needed?


One of the reasons I asked is whether this ioctl becomes a mandate for
the vhost-vdpa kernel. Would QEMU reject initializing vhost-vdpa if it
doesn't see this ioctl?


If it's optional, suppose the kernel may need it only when it becomes 
necessary?


Thanks,
-Siwei




Re: [PATCH 1/2] vdpa/mlx5: Avoid unnecessary query virtqueue

2021-02-03 Thread Si-Wei Liu
On Tue, Feb 2, 2021 at 9:16 PM Jason Wang  wrote:
>
>
> On 2021/2/3 上午1:54, Si-Wei Liu wrote:
> > On Tue, Feb 2, 2021 at 1:23 AM Eli Cohen  wrote:
> >> On Tue, Feb 02, 2021 at 12:38:51AM -0800, Si-Wei Liu wrote:
> >>> Thanks Eli and Jason for clarifications. See inline.
> >>>
> >>> On Mon, Feb 1, 2021 at 11:06 PM Eli Cohen  wrote:
> >>>> On Tue, Feb 02, 2021 at 02:02:25PM +0800, Jason Wang wrote:
> >>>>> On 2021/2/2 下午12:15, Si-Wei Liu wrote:
> >>>>>> On Mon, Feb 1, 2021 at 7:13 PM Jason Wang  wrote:
> >>>>>>> On 2021/2/2 上午3:17, Si-Wei Liu wrote:
> >>>>>>>> On Mon, Feb 1, 2021 at 10:51 AM Si-Wei Liu  
> >>>>>>>> wrote:
> >>>>>>>>> On Thu, Jan 28, 2021 at 5:46 AM Eli Cohen  wrote:
> >>>>>>>>>> suspend_vq should only suspend the VQ on not save the current 
> >>>>>>>>>> available
> >>>>>>>>>> index. This is done when a change of map occurs when the driver 
> >>>>>>>>>> calls
> >>>>>>>>>> save_channel_info().
> >>>>>>>>> Hmmm, suspend_vq() is also called by teardown_vq(), the latter of
> >>>>>>>>> which doesn't save the available index as save_channel_info() 
> >>>>>>>>> doesn't
> >>>>>>>>> get called in that path at all. How does it handle the case that
> >>>>>>>>> aget_vq_state() is called from userspace (e.g. QEMU) while the
> >>>>>>>>> hardware VQ object was torn down, but userspace still wants to 
> >>>>>>>>> access
> >>>>>>>>> the queue index?
> >>>>>>>>>
> >>>>>>>>> Refer to 
> >>>>>>>>> https://lore.kernel.org/netdev/1601583511-15138-1-git-send-email-si-wei@oracle.com/
> >>>>>>>>>
> >>>>>>>>> vhost VQ 0 ring restore failed: -1: Resource temporarily 
> >>>>>>>>> unavailable (11)
> >>>>>>>>> vhost VQ 1 ring restore failed: -1: Resource temporarily 
> >>>>>>>>> unavailable (11)
> >>>>>>>>>
> >>>>>>>>> QEMU will complain with the above warning while VM is being rebooted
> >>>>>>>>> or shut down.
> >>>>>>>>>
> >>>>>>>>> Looks to me either the kernel driver should cover this requirement, 
> >>>>>>>>> or
> >>>>>>>>> the userspace has to bear the burden in saving the index and not 
> >>>>>>>>> call
> >>>>>>>>> into kernel if VQ is destroyed.
> >>>>>>>> Actually, the userspace doesn't have insight into whether the virt queue
> >>>>>>>> will be destroyed when just changing the device status via 
> >>>>>>>> set_status().
> >>>>>>>> Looking at the other vdpa driver in tree, i.e. ifcvf, it doesn't behave 
> >>>>>>>> like
> >>>>>>>> that. Hence this still looks to me like a Mellanox-specific
> >>>>>>>> mlx5_vdpa implementation detail that shouldn't be exposed to userspace.
> >>>>>>> So I think we can simply drop this patch?
> >>>>>> Yep, I think so. To be honest I don't know why it has anything to do
> >>>>>> with the memory hotplug issue.
> >>>>>
> >>>>> Eli may know more, my understanding is that, during memory hotplut, qemu
> >>>>> need to propagate new memory mappings via set_map(). For mellanox, it 
> >>>>> means
> >>>>> it needs to rebuild memory keys, so the virtqueue needs to be suspended.
> >>>>>
> >>>> I think Siwei was asking why the first patch was related to the hotplug
> >>>> issue.
> >>> I was thinking how consistency is assured when saving/restoring this
> >>> h/w avail_index against the one in the virtq memory, particularly in
> >>> the region_add/.region_del() context (e.g. the hotplug case). Problem
> >>> is I don't see explicit memory barrier when guest thread updates the
> >>> avail_index, how does the devic

Re: [PATCH 1/2] vdpa/mlx5: Fix suspend/resume index restoration

2021-02-17 Thread Si-Wei Liu




On 2/16/2021 8:20 AM, Eli Cohen wrote:

When we suspend the VM, the VDPA interface will be reset. When the VM is
resumed again, clear_virtqueues() will clear the available and used
indices resulting in hardware virtqueue objects becoming out of sync.
We can avoid this function altogether since qemu will clear them if
required, e.g. when the VM went through a reboot.

Moreover, since the hw available and used indices should always be
identical on query and should be restored to the same value
for virtqueues that complete in order, we set the single value provided
by set_vq_state(). In get_vq_state() we return the value of hardware
used index.

Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices")
Signed-off-by: Eli Cohen 

Acked-by: Si-Wei Liu 


---
  drivers/vdpa/mlx5/net/mlx5_vnet.c | 17 -
  1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index b8e9d525d66c..a51b0f86afe2 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1169,6 +1169,7 @@ static void suspend_vq(struct mlx5_vdpa_net *ndev, struct 
mlx5_vdpa_virtqueue *m
return;
}
mvq->avail_idx = attr.available_index;
+   mvq->used_idx = attr.used_index;
  }
  
  static void suspend_vqs(struct mlx5_vdpa_net *ndev)

@@ -1426,6 +1427,7 @@ static int mlx5_vdpa_set_vq_state(struct vdpa_device 
*vdev, u16 idx,
return -EINVAL;
}
  
+	mvq->used_idx = state->avail_index;

mvq->avail_idx = state->avail_index;
return 0;
  }
@@ -1443,7 +1445,7 @@ static int mlx5_vdpa_get_vq_state(struct vdpa_device 
*vdev, u16 idx, struct vdpa
 * that cares about emulating the index after vq is stopped.
 */
if (!mvq->initialized) {
-   state->avail_index = mvq->avail_idx;
+   state->avail_index = mvq->used_idx;
return 0;
}
  
@@ -1452,7 +1454,7 @@ static int mlx5_vdpa_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa

mlx5_vdpa_warn(mvdev, "failed to query virtqueue\n");
return err;
}
-   state->avail_index = attr.available_index;
+   state->avail_index = attr.used_index;
return 0;
  }
  
@@ -1532,16 +1534,6 @@ static void teardown_virtqueues(struct mlx5_vdpa_net *ndev)

}
  }
  
-static void clear_virtqueues(struct mlx5_vdpa_net *ndev)

-{
-   int i;
-
-   for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {
-   ndev->vqs[i].avail_idx = 0;
-   ndev->vqs[i].used_idx = 0;
-   }
-}
-
  /* TODO: cross-endian support */
  static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev)
  {
@@ -1777,7 +1769,6 @@ static void mlx5_vdpa_set_status(struct vdpa_device 
*vdev, u8 status)
if (!status) {
mlx5_vdpa_info(mvdev, "performing device reset\n");
teardown_driver(ndev);
-   clear_virtqueues(ndev);
mlx5_vdpa_destroy_mr(&ndev->mvdev);
ndev->mvdev.status = 0;
++mvdev->generation;
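
For readers following the index handling above, here is a tiny standalone
model (illustrative names only, not the driver code) of the convention this
patch settles on: a single saved value is written to both indices on
set_vq_state(), and get_vq_state() reports the hardware used index, which
equals the available index for a queue that completes in order:

#include <stdint.h>
#include <stdio.h>

struct vq {
	uint16_t avail_idx;
	uint16_t used_idx;
};

/* set_vq_state(): one value restores both sides of an in-order queue. */
static void set_vq_state(struct vq *q, uint16_t idx)
{
	q->avail_idx = idx;
	q->used_idx  = idx;
}

/* get_vq_state(): report the used index, which matches the avail index
 * once everything the queue consumed has completed. */
static uint16_t get_vq_state(const struct vq *q)
{
	return q->used_idx;
}

int main(void)
{
	struct vq q = {0};

	set_vq_state(&q, 42);      /* e.g. the value saved when the VM suspended */
	printf("state reported: %u\n", (unsigned)get_vq_state(&q));
	return 0;
}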




Re: [PATCH 1/2] vdpa/mlx5: Fix suspend/resume index restoration

2021-02-17 Thread Si-Wei Liu




On 2/17/2021 1:20 PM, Michael S. Tsirkin wrote:

On Wed, Feb 17, 2021 at 11:42:48AM -0800, Si-Wei Liu wrote:


On 2/16/2021 8:20 AM, Eli Cohen wrote:

When we suspend the VM, the VDPA interface will be reset. When the VM is
resumed again, clear_virtqueues() will clear the available and used
indices resulting in hardware virtqueue objects becoming out of sync.
We can avoid this function altogether since qemu will clear them if
required, e.g. when the VM went through a reboot.

Moreover, since the hw available and used indices should always be
identical on query and should be restored to the same value
for virtqueues that complete in order, we set the single value provided
by set_vq_state(). In get_vq_state() we return the value of hardware
used index.

Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices")
Signed-off-by: Eli Cohen 

Acked-by: Si-Wei Liu 


Seems to also fix b35ccebe3ef76168aa2edaa35809c0232cb3578e, right?

I think so. It should have both "Fixes" tags.

-Siwei



---
   drivers/vdpa/mlx5/net/mlx5_vnet.c | 17 -
   1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index b8e9d525d66c..a51b0f86afe2 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1169,6 +1169,7 @@ static void suspend_vq(struct mlx5_vdpa_net *ndev, struct 
mlx5_vdpa_virtqueue *m
return;
}
mvq->avail_idx = attr.available_index;
+   mvq->used_idx = attr.used_index;
   }
   static void suspend_vqs(struct mlx5_vdpa_net *ndev)
@@ -1426,6 +1427,7 @@ static int mlx5_vdpa_set_vq_state(struct vdpa_device 
*vdev, u16 idx,
return -EINVAL;
}
+   mvq->used_idx = state->avail_index;
mvq->avail_idx = state->avail_index;
return 0;
   }
@@ -1443,7 +1445,7 @@ static int mlx5_vdpa_get_vq_state(struct vdpa_device 
*vdev, u16 idx, struct vdpa
 * that cares about emulating the index after vq is stopped.
 */
if (!mvq->initialized) {
-   state->avail_index = mvq->avail_idx;
+   state->avail_index = mvq->used_idx;
return 0;
}
@@ -1452,7 +1454,7 @@ static int mlx5_vdpa_get_vq_state(struct vdpa_device 
*vdev, u16 idx, struct vdpa
mlx5_vdpa_warn(mvdev, "failed to query virtqueue\n");
return err;
}
-   state->avail_index = attr.available_index;
+   state->avail_index = attr.used_index;
return 0;
   }
@@ -1532,16 +1534,6 @@ static void teardown_virtqueues(struct mlx5_vdpa_net 
*ndev)
}
   }
-static void clear_virtqueues(struct mlx5_vdpa_net *ndev)
-{
-   int i;
-
-   for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {
-   ndev->vqs[i].avail_idx = 0;
-   ndev->vqs[i].used_idx = 0;
-   }
-}
-
   /* TODO: cross-endian support */
   static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev)
   {
@@ -1777,7 +1769,6 @@ static void mlx5_vdpa_set_status(struct vdpa_device 
*vdev, u8 status)
if (!status) {
mlx5_vdpa_info(mvdev, "performing device reset\n");
teardown_driver(ndev);
-   clear_virtqueues(ndev);
mlx5_vdpa_destroy_mr(&ndev->mvdev);
ndev->mvdev.status = 0;
++mvdev->generation;




Re: [PATCH v2 3/3] vdpa/mlx5: defer clear_virtqueues to until DRIVER_OK

2021-02-17 Thread Si-Wei Liu




On 2/16/2021 7:21 AM, Eli Cohen wrote:

On Thu, Feb 11, 2021 at 09:33:14AM +0200, Eli Cohen wrote:

On Wed, Feb 10, 2021 at 01:48:00PM -0800, Si-Wei Liu wrote:

While virtq is stopped,  get_vq_state() is supposed to
be  called to  get  sync'ed  with  the latest internal
avail_index from device. The saved avail_index is used
to restate  the virtq  once device is started.  Commit
b35ccebe3ef7 introduced the clear_virtqueues() routine
to  reset  the saved  avail_index,  however, the index
gets cleared a bit earlier before get_vq_state() tries
to read it. This would cause consistency problems when
virtq is restarted, e.g. through a series of link down
and link up events. We  could  defer  the  clearing of
avail_index  to  until  the  device  is to be started,
i.e. until  VIRTIO_CONFIG_S_DRIVER_OK  is set again in
set_status().

Fixes: b35ccebe3ef7 ("vdpa/mlx5: Restore the hardware used index after change 
map")
Signed-off-by: Si-Wei Liu 
Acked-by: Jason Wang 

Acked-by: Eli Cohen 


I take it back. I think we don't need to clear the indexes at all. In
case we need to restore indexes we'll get the right values through
set_vq_state(). If we suspend the virtqueue due to VM being suspended,
qemu will query first and will provide the queried value. In case of
VM reboot, it will provide 0 in set_vq_state().

I am sending a patch that addresses both reboot and suspend.
With set_vq_state() repurposed to restoring used_index I'm fine with 
this approach.


Do I have to repost a v3 of this series while dropping the 3rd patch?

-Siwei



---
  drivers/vdpa/mlx5/net/mlx5_vnet.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 7c1f789..ce6aae8 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1777,7 +1777,6 @@ static void mlx5_vdpa_set_status(struct vdpa_device 
*vdev, u8 status)
if (!status) {
mlx5_vdpa_info(mvdev, "performing device reset\n");
teardown_driver(ndev);
-   clear_virtqueues(ndev);
mlx5_vdpa_destroy_mr(&ndev->mvdev);
ndev->mvdev.status = 0;
++mvdev->generation;
@@ -1786,6 +1785,7 @@ static void mlx5_vdpa_set_status(struct vdpa_device 
*vdev, u8 status)
  
  	if ((status ^ ndev->mvdev.status) & VIRTIO_CONFIG_S_DRIVER_OK) {

if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
+   clear_virtqueues(ndev);
err = setup_driver(ndev);
if (err) {
mlx5_vdpa_warn(mvdev, "failed to setup 
driver\n");
--
1.8.3.1





Re: [PATCH v1] vdpa/mlx5: Restore the hardware used index after change map

2021-02-18 Thread Si-Wei Liu




On 2/17/2021 8:44 PM, Jason Wang wrote:


On 2021/2/10 下午4:59, Si-Wei Liu wrote:



On 2/9/2021 7:53 PM, Jason Wang wrote:


On 2021/2/10 上午10:30, Si-Wei Liu wrote:



On 2/8/2021 10:37 PM, Jason Wang wrote:


On 2021/2/9 下午2:12, Eli Cohen wrote:

On Tue, Feb 09, 2021 at 11:20:14AM +0800, Jason Wang wrote:

On 2021/2/8 下午6:04, Eli Cohen wrote:

On Mon, Feb 08, 2021 at 05:04:27PM +0800, Jason Wang wrote:

On 2021/2/8 下午2:37, Eli Cohen wrote:

On Mon, Feb 08, 2021 at 12:27:18PM +0800, Jason Wang wrote:

On 2021/2/6 上午7:07, Si-Wei Liu wrote:

On 2/3/2021 11:36 PM, Eli Cohen wrote:
When a change of memory map occurs, the hardware resources 
are destroyed
and then re-created again with the new memory map. In such 
case, we need
to restore the hardware available and used indices. The 
driver failed to

restore the used index which is added here.

Also, since the driver also fails to reset the available 
and used
indices upon device reset, fix this here to avoid 
regression caused by

the fact that used index may not be zero upon device reset.

Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for 
supported mlx5

devices")
Signed-off-by: Eli Cohen
---
v0 -> v1:
Clear indices upon device reset

     drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 
++

     1 file changed, 18 insertions(+)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 88dde3455bfd..b5fe6d2ad22f 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -87,6 +87,7 @@ struct mlx5_vq_restore_info {
     u64 device_addr;
     u64 driver_addr;
     u16 avail_index;
+    u16 used_index;
     bool ready;
     struct vdpa_callback cb;
     bool restore;
@@ -121,6 +122,7 @@ struct mlx5_vdpa_virtqueue {
     u32 virtq_id;
     struct mlx5_vdpa_net *ndev;
     u16 avail_idx;
+    u16 used_idx;
     int fw_state;
       /* keep last in the struct */
@@ -804,6 +806,7 @@ static int create_virtqueue(struct 
mlx5_vdpa_net

*ndev, struct mlx5_vdpa_virtque
       obj_context = 
MLX5_ADDR_OF(create_virtio_net_q_in, in,

obj_context);
     MLX5_SET(virtio_net_q_object, obj_context, 
hw_available_index,

mvq->avail_idx);
+    MLX5_SET(virtio_net_q_object, obj_context, 
hw_used_index,

mvq->used_idx);
     MLX5_SET(virtio_net_q_object, obj_context,
queue_feature_bit_mask_12_3,
get_features_12_3(ndev->mvdev.actual_features));
     vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, 
obj_context,

virtio_q_context);
@@ -1022,6 +1025,7 @@ static int connect_qps(struct 
mlx5_vdpa_net

*ndev, struct mlx5_vdpa_virtqueue *m
     struct mlx5_virtq_attr {
     u8 state;
     u16 available_index;
+    u16 used_index;
     };
       static int query_virtqueue(struct mlx5_vdpa_net 
*ndev, struct

mlx5_vdpa_virtqueue *mvq,
@@ -1052,6 +1056,7 @@ static int query_virtqueue(struct
mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueu
     memset(attr, 0, sizeof(*attr));
     attr->state = MLX5_GET(virtio_net_q_object, 
obj_context, state);
     attr->available_index = 
MLX5_GET(virtio_net_q_object,

obj_context, hw_available_index);
+    attr->used_index = MLX5_GET(virtio_net_q_object, 
obj_context,

hw_used_index);
     kfree(out);
     return 0;
     @@ -1535,6 +1540,16 @@ static void 
teardown_virtqueues(struct

mlx5_vdpa_net *ndev)
     }
     }
     +static void clear_virtqueues(struct mlx5_vdpa_net 
*ndev)

+{
+    int i;
+
+    for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {
+    ndev->vqs[i].avail_idx = 0;
+    ndev->vqs[i].used_idx = 0;
+    }
+}
+
     /* TODO: cross-endian support */
     static inline bool mlx5_vdpa_is_little_endian(struct 
mlx5_vdpa_dev

*mvdev)
     {
@@ -1610,6 +1625,7 @@ static int save_channel_info(struct
mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu
     return err;
       ri->avail_index = attr.available_index;
+    ri->used_index = attr.used_index;
     ri->ready = mvq->ready;
     ri->num_ent = mvq->num_ent;
     ri->desc_addr = mvq->desc_addr;
@@ -1654,6 +1670,7 @@ static void 
restore_channels_info(struct

mlx5_vdpa_net *ndev)
     continue;
       mvq->avail_idx = ri->avail_index;
+    mvq->used_idx = ri->used_index;
     mvq->ready = ri->ready;
     mvq->num_ent = ri->num_ent;
     mvq->desc_addr = ri->desc_addr;
@@ -1768,6 +1785,7 @@ static void mlx5_vdpa_set_status(struct
vdpa_device *vdev, u8 status)
     if (!status) {
     mlx5_vdpa_info(mvdev, "performing device 
reset\n");

     teardown_driver(ndev);
+    clear_virtqueues(ndev);
The clearing looks fine at the first glance, as it aligns 
with the other
state cleanups floating around at the same place. However, 
the thing is
get_vq_state() is supposed to be called right after to get 
sync'ed with
the latest internal

[PATCH] vdpa/mlx5: set_features should allow reset to zero

2021-02-19 Thread Si-Wei Liu
Commit 452639a64ad8 ("vdpa: make sure set_features is invoked
for legacy") made an exception for legacy guests to reset
features to 0, when config space is accessed before features
are set. We should relieve the verify_min_features() check
and allow features reset to 0 for this case.

It's worth noting that not just legacy guests could access
config space before features are set. For instance, when
feature VIRTIO_NET_F_MTU is advertised some modern driver
will try to access and validate the MTU present in the config
space before virtio features are set. Rejecting reset to 0
prematurely prevents the correct MTU and link status from being
loaded on the very first config space access, rendering issues like
the guest showing an inaccurate MTU value, or failure to reject
an out-of-range MTU.

Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices")
Signed-off-by: Si-Wei Liu 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 7c1f789..540dd67 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1490,14 +1490,6 @@ static u64 mlx5_vdpa_get_features(struct vdpa_device 
*vdev)
return mvdev->mlx_features;
 }
 
-static int verify_min_features(struct mlx5_vdpa_dev *mvdev, u64 features)
-{
-   if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)))
-   return -EOPNOTSUPP;
-
-   return 0;
-}
-
 static int setup_virtqueues(struct mlx5_vdpa_net *ndev)
 {
int err;
@@ -1558,18 +1550,13 @@ static int mlx5_vdpa_set_features(struct vdpa_device 
*vdev, u64 features)
 {
struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
-   int err;
 
print_features(mvdev, features, true);
 
-   err = verify_min_features(mvdev, features);
-   if (err)
-   return err;
-
ndev->mvdev.actual_features = features & ndev->mvdev.mlx_features;
ndev->config.mtu = cpu_to_mlx5vdpa16(mvdev, ndev->mtu);
ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
-   return err;
+   return 0;
 }
 
 static void mlx5_vdpa_set_config_cb(struct vdpa_device *vdev, struct 
vdpa_callback *cb)
-- 
1.8.3.1
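
As an aside, here is a minimal standalone model (hypothetical names, not
the driver's structures) of the behavior this patch aims for:
set_features() no longer enforces a minimum feature set, so the 0 that the
legacy path may pass is accepted, and the config-space MTU and link status
are still refreshed:

#include <stdint.h>
#include <stdio.h>

#define VIRTIO_NET_S_LINK_UP 1  /* link status bit from the virtio-net spec */

struct dev {
	uint64_t supported_features;
	uint64_t actual_features;
	uint16_t cfg_mtu;
	uint16_t cfg_status;
	uint16_t mtu;
};

/* No minimum-feature gate: 0 (the legacy reset case) is accepted too. */
static int set_features(struct dev *d, uint64_t features)
{
	d->actual_features = features & d->supported_features;
	d->cfg_mtu = d->mtu;                     /* config space stays valid ... */
	d->cfg_status |= VIRTIO_NET_S_LINK_UP;   /* ... even when features == 0  */
	return 0;
}

int main(void)
{
	struct dev d = { .supported_features = ~0ULL, .mtu = 1500 };

	set_features(&d, 0);   /* legacy guest touching config space first */
	printf("mtu=%u link_up=%u\n", (unsigned)d.cfg_mtu,
	       (unsigned)(d.cfg_status & VIRTIO_NET_S_LINK_UP));
	return 0;
}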



Re: [PATCH v1] vdpa/mlx5: Restore the hardware used index after change map

2021-02-19 Thread Si-Wei Liu




On 2/18/2021 7:10 PM, Jason Wang wrote:


On 2021/2/18 8:43 下午, Si-Wei Liu wrote:



On 2/17/2021 8:44 PM, Jason Wang wrote:


On 2021/2/10 下午4:59, Si-Wei Liu wrote:



On 2/9/2021 7:53 PM, Jason Wang wrote:


On 2021/2/10 上午10:30, Si-Wei Liu wrote:



On 2/8/2021 10:37 PM, Jason Wang wrote:


On 2021/2/9 下午2:12, Eli Cohen wrote:

On Tue, Feb 09, 2021 at 11:20:14AM +0800, Jason Wang wrote:

On 2021/2/8 下午6:04, Eli Cohen wrote:

On Mon, Feb 08, 2021 at 05:04:27PM +0800, Jason Wang wrote:

On 2021/2/8 下午2:37, Eli Cohen wrote:

On Mon, Feb 08, 2021 at 12:27:18PM +0800, Jason Wang wrote:

On 2021/2/6 上午7:07, Si-Wei Liu wrote:

On 2/3/2021 11:36 PM, Eli Cohen wrote:
When a change of memory map occurs, the hardware 
resources are destroyed
and then re-created again with the new memory map. In 
such case, we need
to restore the hardware available and used indices. The 
driver failed to

restore the used index which is added here.

Also, since the driver also fails to reset the available 
and used
indices upon device reset, fix this here to avoid regression caused by
the fact that used index may not be zero upon device reset.

Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices")
Signed-off-by: Eli Cohen
---
v0 -> v1:
Clear indices upon device reset

 drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 88dde3455bfd..b5fe6d2ad22f 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -87,6 +87,7 @@ struct mlx5_vq_restore_info {
	u64 device_addr;
	u64 driver_addr;
	u16 avail_index;
+	u16 used_index;
	bool ready;
	struct vdpa_callback cb;
	bool restore;
@@ -121,6 +122,7 @@ struct mlx5_vdpa_virtqueue {
	u32 virtq_id;
	struct mlx5_vdpa_net *ndev;
	u16 avail_idx;
+	u16 used_idx;
	int fw_state;

	/* keep last in the struct */
@@ -804,6 +806,7 @@ static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtque

	obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in, obj_context);
	MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx);
+	MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx);
	MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_12_3,
		 get_features_12_3(ndev->mvdev.actual_features));
	vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context);
@@ -1022,6 +1025,7 @@ static int connect_qps(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *m
 struct mlx5_virtq_attr {
	u8 state;
	u16 available_index;
+	u16 used_index;
 };

 static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
@@ -1052,6 +1056,7 @@ static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueu
	memset(attr, 0, sizeof(*attr));
	attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);
	attr->available_index = MLX5_GET(virtio_net_q_object, obj_context, hw_available_index);
+	attr->used_index = MLX5_GET(virtio_net_q_object, obj_context, hw_used_index);
	kfree(out);
	return 0;

@@ -1535,6 +1540,16 @@ static void teardown_virtqueues(struct mlx5_vdpa_net *ndev)
	}
 }

+static void clear_virtqueues(struct mlx5_vdpa_net *ndev)
+{
+	int i;
+
+	for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {
+		ndev->vqs[i].avail_idx = 0;
+		ndev->vqs[i].used_idx = 0;
+	}
+}
+
 /* TODO: cross-endian support */
 static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev)
 {
@@ -1610,6 +1625,7 @@ static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu
		return err;

	ri->avail_index = attr.available_index;
+	ri->used_index = attr.used_index;
	ri->ready = mvq->ready;
	ri->num_ent = mvq->num_ent;
	ri->desc_addr = mvq->desc_addr;
@@ -1654,6 +1670,7 @@ static void restore_channels_info(struct mlx5_vdpa_net *ndev)
			continue;

		mvq->avail_idx = ri->avail_index;
+		mvq->used_idx = ri->used_index;
		mvq->ready = ri->ready;
		mvq->num_ent = ri->num_ent;
		mvq->desc_addr = ri->desc_addr;
@@ -1768,6 +1785,7 @@ static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status)
	if (!status) {
		mlx5_vdpa_info(mvdev, "performing device reset\n");
		teardown_driver(ndev);
+		clear_virtqueues(ndev);

The clearing looks fine at the first glance, as it aligns with the other
state cleanups floating around at the same place. However, the thing is
get_vq_state() is supposed to be called right after to get sync'ed with
the latest internal avail_index from device while vq is stopped.

Re: [PATCH 1/2] vdpa/mlx5: Fix suspend/resume index restoration

2021-02-19 Thread Si-Wei Liu




On 2/17/2021 11:42 AM, Si-Wei Liu wrote:



On 2/16/2021 8:20 AM, Eli Cohen wrote:

When we suspend the VM, the VDPA interface will be reset. When the VM is
resumed again, clear_virtqueues() will clear the available and used
indices resulting in hardware virtqueue objects becoming out of sync.
We can avoid this function altogether since qemu will clear them if
required, e.g. when the VM went through a reboot.

Moreover, since the hw available and used indices should always be
identical on query and should be restored to the same value
for virtqueues that complete in order, we set the single value provided
by set_vq_state(). In get_vq_state() we return the value of hardware
used index.

Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 
devices")

Signed-off-by: Eli Cohen 

Acked-by: Si-Wei Liu 


I'd take it back for the moment, according to Jason's latest comment. 
The discussion is still ongoing.





---
  drivers/vdpa/mlx5/net/mlx5_vnet.c | 17 -
  1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c

index b8e9d525d66c..a51b0f86afe2 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1169,6 +1169,7 @@ static void suspend_vq(struct mlx5_vdpa_net 
*ndev, struct mlx5_vdpa_virtqueue *m

  return;
  }
  mvq->avail_idx = attr.available_index;
+    mvq->used_idx = attr.used_index;
  }
    static void suspend_vqs(struct mlx5_vdpa_net *ndev)
@@ -1426,6 +1427,7 @@ static int mlx5_vdpa_set_vq_state(struct 
vdpa_device *vdev, u16 idx,

  return -EINVAL;
  }
  +    mvq->used_idx = state->avail_index;
  mvq->avail_idx = state->avail_index;
This is where things start getting interesting. According to Jason, the 
original expectation of this API would be to restore the internal 
last_avail_index in the hardware. With the Mellanox network vDPA hardware 
implementation, there's no such last_avail_index in the hardware but only 
a single last_used_index representing both indices, which should always be 
the same and in sync with each other. So from a migration point of view, 
it appears logical to restore the saved last_avail_index to the 
last_used_index in the hardware, right? But what is the point of restoring 
mvq->avail_idx?


Actually, this code path is being repurposed to address the index reset 
problem in the device reset scenario, where mvq->avail_idx and 
mvq->used_idx are both reset to 0 once the device is reset. This is a bit 
of a boundary crossing to me.
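
For context, the vDPA state userspace exchanges here is just a single index;
a rough sketch of the round trip being discussed (struct layout simplified to
the vdpa core of that era, the two helper names are made up for illustration):

struct vdpa_vq_state {
	u16 avail_index;	/* the only index exposed via get/set_vq_state */
};

/* Restore: the one saved index seeds both hardware counters, which is
 * only valid for virtqueues that complete strictly in order. */
static void restore_vq_indices(struct mlx5_vdpa_virtqueue *mvq,
			       const struct vdpa_vq_state *state)
{
	mvq->used_idx  = state->avail_index;
	mvq->avail_idx = state->avail_index;
}

/* Save: the hardware used index is reported back as avail_index. */
static void save_vq_indices(const struct mlx5_vdpa_virtqueue *mvq,
			    struct vdpa_vq_state *state)
{
	state->avail_index = mvq->used_idx;
}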




  return 0;
  }
@@ -1443,7 +1445,7 @@ static int mlx5_vdpa_get_vq_state(struct 
vdpa_device *vdev, u16 idx, struct vdpa

   * that cares about emulating the index after vq is stopped.
   */
  if (!mvq->initialized) {
-    state->avail_index = mvq->avail_idx;
+    state->avail_index = mvq->used_idx;
This is where the last_used_index gets loaded from the hardware (when 
device is stopped).



  return 0;
  }
  @@ -1452,7 +1454,7 @@ static int mlx5_vdpa_get_vq_state(struct 
vdpa_device *vdev, u16 idx, struct vdpa

  mlx5_vdpa_warn(mvdev, "failed to query virtqueue\n");
  return err;
  }
-    state->avail_index = attr.available_index;
+    state->avail_index = attr.used_index;

This code path never gets called from userspace (when device is running).

-Siwei


  return 0;
  }
  @@ -1532,16 +1534,6 @@ static void teardown_virtqueues(struct 
mlx5_vdpa_net *ndev)

  }
  }
  -static void clear_virtqueues(struct mlx5_vdpa_net *ndev)
-{
-    int i;
-
-    for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {
-    ndev->vqs[i].avail_idx = 0;
-    ndev->vqs[i].used_idx = 0;
-    }
-}
-
  /* TODO: cross-endian support */
  static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev 
*mvdev)

  {
@@ -1777,7 +1769,6 @@ static void mlx5_vdpa_set_status(struct 
vdpa_device *vdev, u8 status)

  if (!status) {
  mlx5_vdpa_info(mvdev, "performing device reset\n");
  teardown_driver(ndev);
-    clear_virtqueues(ndev);
  mlx5_vdpa_destroy_mr(&ndev->mvdev);
  ndev->mvdev.status = 0;
  ++mvdev->generation;






Re: [PATCH v1] vdpa/mlx5: Restore the hardware used index after change map

2021-02-19 Thread Si-Wei Liu




On 2/19/2021 6:38 PM, Jason Wang wrote:



Right now the value is exposed to userspace via GET_VRING_BASE, so 
only last_avail_idx is synced. If we need sync last_used_idx, we 
should also sync pending indices which requires more thoughts.
Technically it doesn't sound right - it crosses the boundary a bit, even 
with the simplified form of the assumption. But depending on how userspace 
makes use of this API, it doesn't seem to break existing functionality for 
the moment.



I don't get here, maybe you can explain a little bit more? 

Please refer to the email I just sent.

https://lore.kernel.org/lkml/033b0806-4037-5755-a1fa-91dbb58ba...@oracle.com/

-Siwei


Re: [PATCH v1] vdpa/mlx5: Restore the hardware used index after change map

2021-02-05 Thread Si-Wei Liu




On 2/3/2021 11:36 PM, Eli Cohen wrote:

When a change of memory map occurs, the hardware resources are destroyed
and then re-created again with the new memory map. In such case, we need
to restore the hardware available and used indices. The driver failed to
restore the used index which is added here.

Also, since the driver also fails to reset the available and used
indices upon device reset, fix this here to avoid regression caused by
the fact that used index may not be zero upon device reset.

Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices")
Signed-off-by: Eli Cohen 
---
v0 -> v1:
Clear indices upon device reset

  drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 ++
  1 file changed, 18 insertions(+)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 88dde3455bfd..b5fe6d2ad22f 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -87,6 +87,7 @@ struct mlx5_vq_restore_info {
u64 device_addr;
u64 driver_addr;
u16 avail_index;
+   u16 used_index;
bool ready;
struct vdpa_callback cb;
bool restore;
@@ -121,6 +122,7 @@ struct mlx5_vdpa_virtqueue {
u32 virtq_id;
struct mlx5_vdpa_net *ndev;
u16 avail_idx;
+   u16 used_idx;
int fw_state;
  
  	/* keep last in the struct */

@@ -804,6 +806,7 @@ static int create_virtqueue(struct mlx5_vdpa_net *ndev, 
struct mlx5_vdpa_virtque
  
  	obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in, obj_context);

MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, 
mvq->avail_idx);
+   MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, 
mvq->used_idx);
MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_12_3,
 get_features_12_3(ndev->mvdev.actual_features));
vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, 
virtio_q_context);
@@ -1022,6 +1025,7 @@ static int connect_qps(struct mlx5_vdpa_net *ndev, struct 
mlx5_vdpa_virtqueue *m
  struct mlx5_virtq_attr {
u8 state;
u16 available_index;
+   u16 used_index;
  };
  
  static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,

@@ -1052,6 +1056,7 @@ static int query_virtqueue(struct mlx5_vdpa_net *ndev, 
struct mlx5_vdpa_virtqueu
memset(attr, 0, sizeof(*attr));
attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);
attr->available_index = MLX5_GET(virtio_net_q_object, obj_context, 
hw_available_index);
+   attr->used_index = MLX5_GET(virtio_net_q_object, obj_context, 
hw_used_index);
kfree(out);
return 0;
  
@@ -1535,6 +1540,16 @@ static void teardown_virtqueues(struct mlx5_vdpa_net *ndev)

}
  }
  
+static void clear_virtqueues(struct mlx5_vdpa_net *ndev)

+{
+   int i;
+
+   for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {
+   ndev->vqs[i].avail_idx = 0;
+   ndev->vqs[i].used_idx = 0;
+   }
+}
+
  /* TODO: cross-endian support */
  static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev)
  {
@@ -1610,6 +1625,7 @@ static int save_channel_info(struct mlx5_vdpa_net *ndev, 
struct mlx5_vdpa_virtqu
return err;
  
  	ri->avail_index = attr.available_index;

+   ri->used_index = attr.used_index;
ri->ready = mvq->ready;
ri->num_ent = mvq->num_ent;
ri->desc_addr = mvq->desc_addr;
@@ -1654,6 +1670,7 @@ static void restore_channels_info(struct mlx5_vdpa_net 
*ndev)
continue;
  
  		mvq->avail_idx = ri->avail_index;

+   mvq->used_idx = ri->used_index;
mvq->ready = ri->ready;
mvq->num_ent = ri->num_ent;
mvq->desc_addr = ri->desc_addr;
@@ -1768,6 +1785,7 @@ static void mlx5_vdpa_set_status(struct vdpa_device 
*vdev, u8 status)
if (!status) {
mlx5_vdpa_info(mvdev, "performing device reset\n");
teardown_driver(ndev);
+   clear_virtqueues(ndev);
The clearing looks fine at the first glance, as it aligns with the other 
state cleanups floating around at the same place. However, the thing is 
get_vq_state() is supposed to be called right after to get sync'ed with 
the latest internal avail_index from device while vq is stopped. The 
index was saved in the driver software at vq suspension, but before the 
virtq object is destroyed. We shouldn't clear the avail_index too early.


Possibly it can be postponed to where VIRTIO_CONFIG_S_DRIVER_OK gets set 
again, i.e. right before the setup_driver() in mlx5_vdpa_set_status()?
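
A rough sketch of that deferral, following the surrounding
mlx5_vdpa_set_status() structure (abridged and untested; just the idea,
with error handling trimmed):

static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status)
{
	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
	int err;

	if (!status) {
		mlx5_vdpa_info(mvdev, "performing device reset\n");
		teardown_driver(ndev);
		/* keep the saved avail_idx/used_idx here: userspace may
		 * still call get_vq_state() while the device is stopped */
		mlx5_vdpa_destroy_mr(&ndev->mvdev);
		ndev->mvdev.status = 0;
		++mvdev->generation;
		return;
	}

	if ((status ^ ndev->mvdev.status) & VIRTIO_CONFIG_S_DRIVER_OK) {
		if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
			clear_virtqueues(ndev);	/* clear only on restart */
			err = setup_driver(ndev);
			if (err)
				mlx5_vdpa_warn(mvdev, "failed to setup driver\n");
		}
	}

	ndev->mvdev.status = status;
}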


-Siwei


mlx5_vdpa_destroy_mr(&ndev->mvdev);
ndev->mvdev.status = 0;
ndev->mvdev.mlx_features = 0;




[PATCH 2/3] mlx5_vdpa: fix feature negotiation across device reset

2021-02-06 Thread Si-Wei Liu
The mlx_features field denotes which set of virtio features
the device is capable of. In principle, this field need not
be cleared during virtio device reset, as this capability is
static and does not change across resets.

In fact, the current code may carry the assumption that
mlx_features can be reloaded from firmware via the
.get_features ops after the device is reset (via the
.set_status ops), which is unfortunately not true. The
userspace VMM might save a copy of the backend's capable
features and won't call into the kernel again to get it on
reset. This causes all virtio features to get disabled on
newly created virtqs after device reset, while the guest
would hold a mismatched view of the available features. For
example, the guest may still assume tx checksum offload is
available after reset and feature negotiation, causing
frames with bogus (incomplete) checksums to be transmitted
on the wire.

Signed-off-by: Si-Wei Liu 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index b8416c4..aa6f8cd 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1788,7 +1788,6 @@ static void mlx5_vdpa_set_status(struct vdpa_device 
*vdev, u8 status)
clear_virtqueues(ndev);
mlx5_vdpa_destroy_mr(&ndev->mvdev);
ndev->mvdev.status = 0;
-   ndev->mvdev.mlx_features = 0;
++mvdev->generation;
return;
}
-- 
1.8.3.1
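
To illustrate the failure mode described in the patch above, a rough sketch
of the interaction (the masking step is an approximation of what the
driver's set_features path does; timing simplified):

/*
 * VMM / guest side                     mlx5_vdpa driver state
 * ----------------                     ----------------------
 * .get_features             ---->      mlx_features = <caps from firmware>
 * (VMM caches the value and never asks again)
 *
 * device reset, status = 0  ---->      mlx_features = 0   <-- line removed above
 *
 * .set_features(cached)     ---->      actual_features ~= cached & mlx_features
 *                                                       == 0
 *
 * New virtqueues are then created with no features enabled, while the
 * guest still believes e.g. tx checksum offload was negotiated.
 */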



[PATCH 3/3] mlx5_vdpa: defer clear_virtqueues to until DRIVER_OK

2021-02-06 Thread Si-Wei Liu
While virtq is stopped, get_vq_state() is supposed to be called to
get sync'ed with the latest internal avail_index from the device.
The saved avail_index is used to restate the virtq once the device
is started. Commit b35ccebe3ef7 introduced the clear_virtqueues()
routine to reset the saved avail_index; however, the index gets
cleared a bit too early, before get_vq_state() tries to read it.
This would cause consistency problems when the virtq is restarted,
e.g. through a series of link down and link up events. We could
defer the clearing of avail_index until the device is about to be
started, i.e. until VIRTIO_CONFIG_S_DRIVER_OK is set again in
set_status().

Fixes: b35ccebe3ef7 ("vdpa/mlx5: Restore the hardware used index after change 
map")
Signed-off-by: Si-Wei Liu 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index aa6f8cd..444ab58 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1785,7 +1785,6 @@ static void mlx5_vdpa_set_status(struct vdpa_device 
*vdev, u8 status)
if (!status) {
mlx5_vdpa_info(mvdev, "performing device reset\n");
teardown_driver(ndev);
-   clear_virtqueues(ndev);
mlx5_vdpa_destroy_mr(&ndev->mvdev);
ndev->mvdev.status = 0;
++mvdev->generation;
@@ -1794,6 +1793,7 @@ static void mlx5_vdpa_set_status(struct vdpa_device 
*vdev, u8 status)
 
if ((status ^ ndev->mvdev.status) & VIRTIO_CONFIG_S_DRIVER_OK) {
if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
+   clear_virtqueues(ndev);
err = setup_driver(ndev);
if (err) {
mlx5_vdpa_warn(mvdev, "failed to setup 
driver\n");
-- 
1.8.3.1



[PATCH 1/3] mlx5_vdpa: should exclude header length and fcs from mtu

2021-02-06 Thread Si-Wei Liu
When feature VIRTIO_NET_F_MTU is negotiated on mlx5_vdpa,
22 extra bytes worth of MTU length is shown in guest.
This is because the mlx5_query_port_max_mtu API returns
the "hardware" MTU value, which does not just contain the
Ethernet payload, but includes extra lengths starting
from the Ethernet header up to the FCS altogether.

Fix the MTU so packets won't get dropped silently.

Signed-off-by: Si-Wei Liu 
---
 drivers/vdpa/mlx5/core/mlx5_vdpa.h |  4 
 drivers/vdpa/mlx5/net/mlx5_vnet.c  | 15 ++-
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h 
b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
index 08f742f..b6cc53b 100644
--- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h
+++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
@@ -4,9 +4,13 @@
 #ifndef __MLX5_VDPA_H__
 #define __MLX5_VDPA_H__
 
+#include 
+#include 
 #include 
 #include 
 
+#define MLX5V_ETH_HARD_MTU (ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN)
+
 struct mlx5_vdpa_direct_mr {
u64 start;
u64 end;
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index dc88559..b8416c4 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1907,6 +1907,19 @@ static int mlx5_get_vq_irq(struct vdpa_device *vdv, u16 
idx)
.free = mlx5_vdpa_free,
 };
 
+static int query_mtu(struct mlx5_core_dev *mdev, u16 *mtu)
+{
+   u16 hw_mtu;
+   int err;
+
+   err = mlx5_query_nic_vport_mtu(mdev, &hw_mtu);
+   if (err)
+   return err;
+
+   *mtu = hw_mtu - MLX5V_ETH_HARD_MTU;
+   return 0;
+}
+
 static int alloc_resources(struct mlx5_vdpa_net *ndev)
 {
struct mlx5_vdpa_net_resources *res = &ndev->res;
@@ -1992,7 +2005,7 @@ static int mlx5v_probe(struct auxiliary_device *adev,
init_mvqs(ndev);
mutex_init(&ndev->reslock);
config = &ndev->config;
-   err = mlx5_query_nic_vport_mtu(mdev, &ndev->mtu);
+   err = query_mtu(mdev, &ndev->mtu);
if (err)
goto err_mtu;
 
-- 
1.8.3.1
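
For reference, the 22 bytes mentioned in the patch above are just the
Ethernet framing overhead that the new MLX5V_ETH_HARD_MTU macro subtracts;
the arithmetic with the standard kernel constants:

/* ETH_HLEN (14) + VLAN_HLEN (4) + ETH_FCS_LEN (4) = 22 bytes */
#define MLX5V_ETH_HARD_MTU	(ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN)

/* e.g. a hardware MTU of 1522 is reported to the guest as 1500 */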



Re: [PATCH] vdpa/mlx5: set_features should allow reset to zero

2021-03-01 Thread Si-Wei Liu




On 2/28/2021 1:27 PM, Michael S. Tsirkin wrote:

On Thu, Feb 25, 2021 at 04:56:42PM -0800, Si-Wei Liu wrote:

Hi Michael,

Are you okay to live without this ioctl for now? I think QEMU is the one
that needs to be fixed and will have to be made legacy-guest aware. I think
the kernel can just honor the feature negotiation result done by QEMU and do
as it's told. Will you agree?

If it's fine, I would proceed to reverting commit fe36cbe067 and related
code in question from the kernel.

Thanks,
-Siwei


Not really, I don't see why that's a good idea.  fe36cbe067 is the code
checking MTU before FEATURES_OK. Spec explicitly allows that.


Alright, but what I meant was this commit
452639a64ad8 ("vdpa: make sure set_features is invoked for legacy").

But I got why you need it in another email (for BE host/guest).

-Siwei


Re: [PATCH vhost v2 4/8] vdpa/mlx5: Mark vq addrs for modification in hw vq

2023-12-12 Thread Si-Wei Liu




On 12/12/2023 11:21 AM, Eugenio Perez Martin wrote:

On Tue, Dec 5, 2023 at 11:46 AM Dragos Tatulea  wrote:

Addresses get set by .set_vq_address. hw vq addresses will be updated on
next modify_virtqueue.

Signed-off-by: Dragos Tatulea 
Reviewed-by: Gal Pressman 
Acked-by: Eugenio Pérez 

I'm kind of ok with this patch and the next one about state, but I
didn't ack them in the previous series.

My main concern is that it is not valid to change the vq address after
DRIVER_OK in VirtIO, which vDPA follows. Only memory maps are ok to
change at this moment. I'm not sure about vq state in vDPA, but vhost
forbids changing it with an active backend.

Suspend is not defined in VirtIO at this moment though, so maybe it is
ok to decide that all of these parameters may change during suspend.
Maybe the best thing is to protect this with a vDPA feature flag.
I think protecting this with a vDPA feature flag could work. On the other 
hand, vDPA means vendor-specific optimization is possible around suspend 
and resume (in case it helps performance), which doesn't have to be backed 
by the virtio spec. The same applies to vhost-user backend features; 
variations there were not backed by the spec either. Of course, we should 
try our best to make the default behavior backward compatible with 
virtio-based backends, but that circles back to the lack of a suspend 
definition in the current virtio spec, for which I hope we don't cease 
development on vDPA indefinitely. After all, a virtio-based vDPA backend 
can well define its own feature flag to describe (minor differences in) 
the suspend behavior based on the later spec once it is formed in the future.
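
As a purely hypothetical illustration of the feature-flag idea (the flag
name, its value and the helper functions below are made up for this sketch
and are not defined by any spec or driver):

/* Hypothetical backend feature bit gating reconfiguration while suspended. */
#define VDPA_BACKEND_F_CFG_IN_SUSPEND_EXAMPLE	(1ULL << 63)

static int example_set_vq_address(struct vdpa_device *vdev, u16 idx,
				  u64 desc, u64 driver, u64 device)
{
	/* device_is_started(), backend_has_feature() and
	 * store_vq_addresses() are placeholder helpers. */
	if (device_is_started(vdev) &&
	    !backend_has_feature(vdev, VDPA_BACKEND_F_CFG_IN_SUSPEND_EXAMPLE))
		return -EBUSY;	/* not negotiated: reject post-DRIVER_OK change */

	return store_vq_addresses(vdev, idx, desc, driver, device);
}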


Regards,
-Siwei





Jason, what do you think?

Thanks!


---
  drivers/vdpa/mlx5/net/mlx5_vnet.c  | 9 +
  include/linux/mlx5/mlx5_ifc_vdpa.h | 1 +
  2 files changed, 10 insertions(+)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index f8f088cced50..80e066de0866 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1209,6 +1209,7 @@ static int modify_virtqueue(struct mlx5_vdpa_net *ndev,
 bool state_change = false;
 void *obj_context;
 void *cmd_hdr;
+   void *vq_ctx;
 void *in;
 int err;

@@ -1230,6 +1231,7 @@ static int modify_virtqueue(struct mlx5_vdpa_net *ndev,
 MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);

 obj_context = MLX5_ADDR_OF(modify_virtio_net_q_in, in, obj_context);
+   vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, 
virtio_q_context);

 if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_STATE) {
 if (!is_valid_state_change(mvq->fw_state, state, 
is_resumable(ndev))) {
@@ -1241,6 +1243,12 @@ static int modify_virtqueue(struct mlx5_vdpa_net *ndev,
 state_change = true;
 }

+   if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS) {
+   MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr);
+   MLX5_SET64(virtio_q, vq_ctx, used_addr, mvq->device_addr);
+   MLX5_SET64(virtio_q, vq_ctx, available_addr, mvq->driver_addr);
+   }
+
 MLX5_SET64(virtio_net_q_object, obj_context, modify_field_select, 
mvq->modified_fields);
 err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
 if (err)
@@ -2202,6 +2210,7 @@ static int mlx5_vdpa_set_vq_address(struct vdpa_device 
*vdev, u16 idx, u64 desc_
 mvq->desc_addr = desc_area;
 mvq->device_addr = device_area;
 mvq->driver_addr = driver_area;
+   mvq->modified_fields |= MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS;
 return 0;
  }

diff --git a/include/linux/mlx5/mlx5_ifc_vdpa.h 
b/include/linux/mlx5/mlx5_ifc_vdpa.h
index b86d51a855f6..9594ac405740 100644
--- a/include/linux/mlx5/mlx5_ifc_vdpa.h
+++ b/include/linux/mlx5/mlx5_ifc_vdpa.h
@@ -145,6 +145,7 @@ enum {
 MLX5_VIRTQ_MODIFY_MASK_STATE= (u64)1 << 0,
 MLX5_VIRTQ_MODIFY_MASK_DIRTY_BITMAP_PARAMS  = (u64)1 << 3,
 MLX5_VIRTQ_MODIFY_MASK_DIRTY_BITMAP_DUMP_ENABLE = (u64)1 << 4,
+   MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS   = (u64)1 << 6,
 MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY  = (u64)1 << 14,
  };

--
2.42.0






Re: [PATCH] vdpa/mlx5: Use random MAC address when no nic vport MAC set

2024-08-27 Thread Si-Wei Liu




On 8/27/2024 9:02 AM, Dragos Tatulea wrote:

When the vdpa device is configured without a specific MAC
address, the vport MAC address is used. However, this
address can be 0 which prevents the driver from properly
configuring the MPFS and breaks steering.

The solution is to simply generate a random MAC address
when no MAC is set on the nic vport.

Now it's possible to create a vdpa device without a
MAC address and run qemu with this device without needing
to configure an explicit MAC address.

Signed-off-by: Dragos Tatulea 
Reviewed-by: Jiri Pirko 
---
  drivers/vdpa/mlx5/net/mlx5_vnet.c | 3 +++
  1 file changed, 3 insertions(+)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index fa78e8288ebb..1c26139d02fe 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -3824,6 +3824,9 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev 
*v_mdev, const char *name,
err = mlx5_query_nic_vport_mac_address(mdev, 0, 0, config->mac);
if (err)
goto err_alloc;
+
+   if (is_zero_ether_addr(config->mac))
+   eth_random_addr(config->mac);
I wonder whether, with this change, we still honor the historical behaviour 
of retaining the zero mac address and clearing the _F_MAC bit. Should we go 
ahead and remove the logic below? It looks to me like it would effectively 
become dead code.


	} else if ((add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) == 0) {
		/*
		 * We used to clear _F_MAC feature bit if seeing
		 * zero mac address when device features are not
		 * specifically provisioned. Keep the behaviour
		 * so old scripts do not break.
		 */
		device_features &= ~BIT_ULL(VIRTIO_NET_F_MAC);

If we are not going to honor the old behaviour any more, it looks to me we 
should also block users from creating a vdpa device with a zero mac address 
whenever the mac attribute is explicitly specified. A zero mac address buys 
users more sorrow than help.
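
A minimal sketch of the suggested rejection in the dev_add path, reusing the
identifiers quoted below (whether -EINVAL is the right error code is open
for discussion):

	/* Reject an explicitly provisioned all-zero MAC address up front. */
	if ((add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR)) &&
	    is_zero_ether_addr(add_config->net.mac))
		return -EINVAL;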


	if (add_config->mask & (1 << VDPA_ATTR_DEV_NET_CFG_MACADDR)) {
		memcpy(ndev->config.mac, add_config->net.mac, ETH_ALEN);

Regards,
-Siwei


}
  
  	if (!is_zero_ether_addr(config->mac)) {





Re: [PATCH] vdpa/mlx5: Fix invalid mr resource destroy

2024-08-27 Thread Si-Wei Liu




On 8/27/2024 9:08 AM, Dragos Tatulea wrote:

Certain error paths from mlx5_vdpa_dev_add() can end up releasing mr
resources which never got initialized in the first place.

This patch adds the missing check in mlx5_vdpa_destroy_mr_resources()
to block releasing non-initialized mr resources.

Reference trace:

   mlx5_core :08:00.2: mlx5_vdpa_dev_add:3274:(pid 2700) warning: No mac 
address provisioned?
   BUG: kernel NULL pointer dereference, address: 
   #PF: supervisor read access in kernel mode
   #PF: error_code(0x) - not-present page
   PGD 140216067 P4D 0
   Oops:  [#1] PREEMPT SMP NOPTI
   CPU: 8 PID: 2700 Comm: vdpa Kdump: loaded Not tainted 5.14.0-496.el9.x86_64 
#1
   Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
   RIP: 0010:vhost_iotlb_del_range+0xf/0xe0 [vhost_iotlb]
   Code: [...]
   RSP: 0018:ff1c823ac23077f0 EFLAGS: 00010246
   RAX: c1a21a60 RBX: 899567a0 RCX: 
   RDX:  RSI:  RDI: 
   RBP: ff1bda1f7c21e800 R08:  R09: ff1c823ac2307670
   R10: ff1c823ac2307668 R11: 8a9e7b68 R12: 
   R13:  R14: ff1bda1f43e341a0 R15: ffea
   FS:  7f56eba7c740() GS:ff1bda269f80() knlGS:
   CS:  0010 DS:  ES:  CR0: 80050033
   CR2:  CR3: 000104d90001 CR4: 00771ef0
   DR0:  DR1:  DR2: 
   DR3:  DR6: fffe0ff0 DR7: 0400
   PKRU: 5554
   Call Trace:

? show_trace_log_lvl+0x1c4/0x2df
? show_trace_log_lvl+0x1c4/0x2df
? mlx5_vdpa_free+0x3d/0x150 [mlx5_vdpa]
? __die_body.cold+0x8/0xd
? page_fault_oops+0x134/0x170
? __irq_work_queue_local+0x2b/0xc0
? irq_work_queue+0x2c/0x50
? exc_page_fault+0x62/0x150
? asm_exc_page_fault+0x22/0x30
? __pfx_mlx5_vdpa_free+0x10/0x10 [mlx5_vdpa]
? vhost_iotlb_del_range+0xf/0xe0 [vhost_iotlb]
mlx5_vdpa_free+0x3d/0x150 [mlx5_vdpa]
vdpa_release_dev+0x1e/0x50 [vdpa]
device_release+0x31/0x90
kobject_cleanup+0x37/0x130
mlx5_vdpa_dev_add+0x2d2/0x7a0 [mlx5_vdpa]
vdpa_nl_cmd_dev_add_set_doit+0x277/0x4c0 [vdpa]
genl_family_rcv_msg_doit+0xd9/0x130
genl_family_rcv_msg+0x14d/0x220
? __pfx_vdpa_nl_cmd_dev_add_set_doit+0x10/0x10 [vdpa]
? _copy_to_user+0x1a/0x30
? move_addr_to_user+0x4b/0xe0
genl_rcv_msg+0x47/0xa0
? __import_iovec+0x46/0x150
? __pfx_genl_rcv_msg+0x10/0x10
netlink_rcv_skb+0x54/0x100
genl_rcv+0x24/0x40
netlink_unicast+0x245/0x370
netlink_sendmsg+0x206/0x440
__sys_sendto+0x1dc/0x1f0
? do_read_fault+0x10c/0x1d0
? do_pte_missing+0x10d/0x190
__x64_sys_sendto+0x20/0x30
do_syscall_64+0x5c/0xf0
? __count_memcg_events+0x4f/0xb0
? mm_account_fault+0x6c/0x100
? handle_mm_fault+0x116/0x270
? do_user_addr_fault+0x1d6/0x6a0
? do_syscall_64+0x6b/0xf0
? clear_bhb_loop+0x25/0x80
? clear_bhb_loop+0x25/0x80
? clear_bhb_loop+0x25/0x80
? clear_bhb_loop+0x25/0x80
? clear_bhb_loop+0x25/0x80
entry_SYSCALL_64_after_hwframe+0x78/0x80

Fixes: 512c0cdd80c1 ("vdpa/mlx5: Decouple cvq iotlb handling from hw mapping 
code")
The fix looks fine to me, but how come this is the commit that 
introduced the problem? Can you help clarify?


Reviewed-by: Si-Wei Liu 

Thanks,
-Siwei


Signed-off-by: Dragos Tatulea 
Reviewed-by: Cosmin Ratiu 
---
  drivers/vdpa/mlx5/core/mr.c | 3 +++
  1 file changed, 3 insertions(+)

diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c
index 4758914ccf86..bf56f3d69625 100644
--- a/drivers/vdpa/mlx5/core/mr.c
+++ b/drivers/vdpa/mlx5/core/mr.c
@@ -581,6 +581,9 @@ static void mlx5_vdpa_show_mr_leaks(struct mlx5_vdpa_dev 
*mvdev)
  
  void mlx5_vdpa_destroy_mr_resources(struct mlx5_vdpa_dev *mvdev)

  {
+   if (!mvdev->res.valid)
+   return;
+
for (int i = 0; i < MLX5_VDPA_NUM_AS; i++)
mlx5_vdpa_update_mr(mvdev, NULL, i);
  





Re: [PATCH v1] vdpa/mlx5: Restore the hardware used index after change map

2021-02-16 Thread Si-Wei Liu




On 2/10/2021 7:45 AM, Eli Cohen wrote:

On Wed, Feb 10, 2021 at 12:59:03AM -0800, Si-Wei Liu wrote:


On 2/9/2021 7:53 PM, Jason Wang wrote:

On 2021/2/10 10:30 AM, Si-Wei Liu wrote:


On 2/8/2021 10:37 PM, Jason Wang wrote:

On 2021/2/9 2:12 PM, Eli Cohen wrote:

On Tue, Feb 09, 2021 at 11:20:14AM +0800, Jason Wang wrote:

On 2021/2/8 6:04 PM, Eli Cohen wrote:

On Mon, Feb 08, 2021 at 05:04:27PM +0800, Jason Wang wrote:

On 2021/2/8 2:37 PM, Eli Cohen wrote:

On Mon, Feb 08, 2021 at 12:27:18PM +0800, Jason Wang wrote:

On 2021/2/6 7:07 AM, Si-Wei Liu wrote:

On 2/3/2021 11:36 PM, Eli Cohen wrote:

When a change of memory map occurs, the hardware resources are destroyed
and then re-created again with the new memory map. In such case, we need
to restore the hardware available and used indices. The driver failed to
restore the used index which is added here.

Also, since the driver also fails to reset the available and used
indices upon device reset, fix this here to avoid regression caused by
the fact that used index may not be zero upon device reset.

Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices")
Signed-off-by: Eli Cohen
---
v0 -> v1:
Clear indices upon device reset

 drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 88dde3455bfd..b5fe6d2ad22f 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -87,6 +87,7 @@ struct mlx5_vq_restore_info {
	u64 device_addr;
	u64 driver_addr;
	u16 avail_index;
+	u16 used_index;
	bool ready;
	struct vdpa_callback cb;
	bool restore;
@@ -121,6 +122,7 @@ struct mlx5_vdpa_virtqueue {
	u32 virtq_id;
	struct mlx5_vdpa_net *ndev;
	u16 avail_idx;
+	u16 used_idx;
	int fw_state;

	/* keep last in the struct */
@@ -804,6 +806,7 @@ static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtque

	obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in, obj_context);
	MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx);
+	MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx);
	MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_12_3,
		 get_features_12_3(ndev->mvdev.actual_features));
	vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context);
@@ -1022,6 +1025,7 @@ static int connect_qps(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *m
 struct mlx5_virtq_attr {
	u8 state;
	u16 available_index;
+	u16 used_index;
 };

 static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
@@ -1052,6 +1056,7 @@ static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueu
	memset(attr, 0, sizeof(*attr));
	attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);
	attr->available_index = MLX5_GET(virtio_net_q_object, obj_context, hw_available_index);
+	attr->used_index = MLX5_GET(virtio_net_q_object, obj_context, hw_used_index);
	kfree(out);
	return 0;

@@ -1535,6 +1540,16 @@ static void teardown_virtqueues(struct mlx5_vdpa_net *ndev)
	}
 }

+static void clear_virtqueues(struct mlx5_vdpa_net *ndev)
+{
+	int i;
+
+	for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {
+		ndev->vqs[i].avail_idx = 0;
+		ndev->vqs[i].used_idx = 0;
+	}
+}
+
 /* TODO: cross-endian support */
 static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev)
 {
@@ -1610,6 +1625,7 @@ static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu
		return err;

	ri->avail_index = attr.available_index;
+	ri->used_index = attr.used_index;
	ri->ready = mvq->ready;
	ri->num_ent = mvq->num_ent;
	ri->desc_addr = mvq->desc_addr;
@@ -1654,6 +1670,7 @@ static void restore_channels_info(struct mlx5_vdpa_net *ndev)
			continue;

		mvq->avail_idx = ri->avail_index;
+		mvq->used_idx = ri->used_index;
		mvq->ready = ri->ready;
		mvq->num_ent = ri->num_ent;
		mvq->desc_addr = ri->desc_addr;
@@ -1768,6 +1785,7 @@ static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status)
	if (!status) {
		mlx5_vdpa_info(mvdev, "performing device reset\n");
		teardown_driver(ndev);
+		clear_virtqueues(ndev);

The clearing looks fine at the first glance, as it aligns with the other
state cleanups floating around at the same place. However, the thing is
get_vq_state() is supposed to be called right after to get sync'ed with
the latest internal avail_index from device while vq is stopped.

Re: [PATCH v3] vhost-vdpa: fix page pinning leakage in error path (rework)

2020-11-10 Thread si-wei liu



On 11/9/2020 7:28 PM, Jason Wang wrote:


On 2020/11/6 7:26 AM, Si-Wei Liu wrote:

Pinned pages are not properly accounted particularly when
mapping error occurs on IOTLB update. Clean up dangling
pinned pages for the error path.

The memory usage for bookkeeping pinned pages is reverted
to what it was before: only one single free page is needed.
This helps reduce the host memory demand for VM with a large
amount of memory, or in the situation where host is running
short of free memory.

Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend")
Signed-off-by: Si-Wei Liu 
---
Changes in v3:
- Turn explicit last_pfn check to a WARN_ON() (Jason)

Changes in v2:
- Drop the reversion patch
- Fix unhandled page leak towards the end of page_list



Acked-by: Jason Wang 

Thanks



Thanks Jason for your review!


-Siwei




  drivers/vhost/vdpa.c | 80 


  1 file changed, 62 insertions(+), 18 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index b6d9016..5b13dfd 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -560,6 +560,8 @@ static int vhost_vdpa_map(struct vhost_vdpa *v,
if (r)
  vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1);
+else
+atomic64_add(size >> PAGE_SHIFT, &dev->mm->pinned_vm);
return r;
  }
@@ -591,14 +593,16 @@ static int 
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

  unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
  unsigned int gup_flags = FOLL_LONGTERM;
  unsigned long npages, cur_base, map_pfn, last_pfn = 0;
-unsigned long locked, lock_limit, pinned, i;
+unsigned long lock_limit, sz2pin, nchunks, i;
  u64 iova = msg->iova;
+long pinned;
  int ret = 0;
if (vhost_iotlb_itree_first(iotlb, msg->iova,
  msg->iova + msg->size - 1))
  return -EEXIST;
  +/* Limit the use of memory for bookkeeping */
  page_list = (struct page **) __get_free_page(GFP_KERNEL);
  if (!page_list)
  return -ENOMEM;
@@ -607,52 +611,75 @@ static int 
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

  gup_flags |= FOLL_WRITE;
npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> 
PAGE_SHIFT;

-if (!npages)
-return -EINVAL;
+if (!npages) {
+ret = -EINVAL;
+goto free;
+}
mmap_read_lock(dev->mm);
  -locked = atomic64_add_return(npages, &dev->mm->pinned_vm);
  lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-if (locked > lock_limit) {
+if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
  ret = -ENOMEM;
-goto out;
+goto unlock;
  }
cur_base = msg->uaddr & PAGE_MASK;
  iova &= PAGE_MASK;
+nchunks = 0;
while (npages) {
-pinned = min_t(unsigned long, npages, list_size);
-ret = pin_user_pages(cur_base, pinned,
- gup_flags, page_list, NULL);
-if (ret != pinned)
+sz2pin = min_t(unsigned long, npages, list_size);
+pinned = pin_user_pages(cur_base, sz2pin,
+gup_flags, page_list, NULL);
+if (sz2pin != pinned) {
+if (pinned < 0) {
+ret = pinned;
+} else {
+unpin_user_pages(page_list, pinned);
+ret = -ENOMEM;
+}
  goto out;
+}
+nchunks++;
if (!last_pfn)
  map_pfn = page_to_pfn(page_list[0]);
  -for (i = 0; i < ret; i++) {
+for (i = 0; i < pinned; i++) {
  unsigned long this_pfn = page_to_pfn(page_list[i]);
  u64 csize;
if (last_pfn && (this_pfn != last_pfn + 1)) {
  /* Pin a contiguous chunk of memory */
  csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
-if (vhost_vdpa_map(v, iova, csize,
-   map_pfn << PAGE_SHIFT,
-   msg->perm))
+ret = vhost_vdpa_map(v, iova, csize,
+ map_pfn << PAGE_SHIFT,
+ msg->perm);
+if (ret) {
+/*
+ * Unpin the pages that are left unmapped
+ * from this point on in the current
+ * page_list. The remaining outstanding
+ * ones which may stride across several
+ * chunks will be covered in the common
+ * error path subsequently.
+ */
+unpin_user_pages(&page_list[i],
+ pinned - i);
  goto out;
+}
+
  map_pfn = this_pfn;
  iova += csize;
+ 

Re: [PATCH v3] vhost-vdpa: fix page pinning leakage in error path (rework)

2020-11-19 Thread si-wei liu

A gentle reminder. Any chance this patch will get picked soon?

Thanks,
-Siwei

On 11/9/2020 7:28 PM, Jason Wang wrote:


On 2020/11/6 7:26 AM, Si-Wei Liu wrote:

Pinned pages are not properly accounted particularly when
mapping error occurs on IOTLB update. Clean up dangling
pinned pages for the error path.

The memory usage for bookkeeping pinned pages is reverted
to what it was before: only one single free page is needed.
This helps reduce the host memory demand for VM with a large
amount of memory, or in the situation where host is running
short of free memory.

Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend")
Signed-off-by: Si-Wei Liu 
---
Changes in v3:
- Turn explicit last_pfn check to a WARN_ON() (Jason)

Changes in v2:
- Drop the reversion patch
- Fix unhandled page leak towards the end of page_list



Acked-by: Jason Wang 

Thanks




  drivers/vhost/vdpa.c | 80 


  1 file changed, 62 insertions(+), 18 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index b6d9016..5b13dfd 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -560,6 +560,8 @@ static int vhost_vdpa_map(struct vhost_vdpa *v,
if (r)
  vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1);
+else
+atomic64_add(size >> PAGE_SHIFT, &dev->mm->pinned_vm);
return r;
  }
@@ -591,14 +593,16 @@ static int 
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

  unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
  unsigned int gup_flags = FOLL_LONGTERM;
  unsigned long npages, cur_base, map_pfn, last_pfn = 0;
-unsigned long locked, lock_limit, pinned, i;
+unsigned long lock_limit, sz2pin, nchunks, i;
  u64 iova = msg->iova;
+long pinned;
  int ret = 0;
if (vhost_iotlb_itree_first(iotlb, msg->iova,
  msg->iova + msg->size - 1))
  return -EEXIST;
  +/* Limit the use of memory for bookkeeping */
  page_list = (struct page **) __get_free_page(GFP_KERNEL);
  if (!page_list)
  return -ENOMEM;
@@ -607,52 +611,75 @@ static int 
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

  gup_flags |= FOLL_WRITE;
npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> 
PAGE_SHIFT;

-if (!npages)
-return -EINVAL;
+if (!npages) {
+ret = -EINVAL;
+goto free;
+}
mmap_read_lock(dev->mm);
  -locked = atomic64_add_return(npages, &dev->mm->pinned_vm);
  lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-if (locked > lock_limit) {
+if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
  ret = -ENOMEM;
-goto out;
+goto unlock;
  }
cur_base = msg->uaddr & PAGE_MASK;
  iova &= PAGE_MASK;
+nchunks = 0;
while (npages) {
-pinned = min_t(unsigned long, npages, list_size);
-ret = pin_user_pages(cur_base, pinned,
- gup_flags, page_list, NULL);
-if (ret != pinned)
+sz2pin = min_t(unsigned long, npages, list_size);
+pinned = pin_user_pages(cur_base, sz2pin,
+gup_flags, page_list, NULL);
+if (sz2pin != pinned) {
+if (pinned < 0) {
+ret = pinned;
+} else {
+unpin_user_pages(page_list, pinned);
+ret = -ENOMEM;
+}
  goto out;
+}
+nchunks++;
if (!last_pfn)
  map_pfn = page_to_pfn(page_list[0]);
  -for (i = 0; i < ret; i++) {
+for (i = 0; i < pinned; i++) {
  unsigned long this_pfn = page_to_pfn(page_list[i]);
  u64 csize;
if (last_pfn && (this_pfn != last_pfn + 1)) {
  /* Pin a contiguous chunk of memory */
  csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
-if (vhost_vdpa_map(v, iova, csize,
-   map_pfn << PAGE_SHIFT,
-   msg->perm))
+ret = vhost_vdpa_map(v, iova, csize,
+ map_pfn << PAGE_SHIFT,
+ msg->perm);
+if (ret) {
+/*
+ * Unpin the pages that are left unmapped
+ * from this point on in the current
+ * page_list. The remaining outstanding
+ * ones which may stride across several
+ * chunks will be covered in the common
+ * error path subsequently.
+ */
+unpin_user_pages(&page_list[i],
+ pinned - i);
  goto out;
+}
+
  map_pfn = this_pfn;
 

Re: [PATCH v2] vhost-vdpa: fix page pinning leakage in error path (rework)

2020-11-09 Thread si-wei liu



On 11/8/2020 7:21 PM, Jason Wang wrote:


On 2020/11/6 6:57 AM, si-wei liu wrote:


On 11/4/2020 7:26 PM, Jason Wang wrote:


On 2020/11/5 7:33 AM, Si-Wei Liu wrote:

Pinned pages are not properly accounted particularly when
mapping error occurs on IOTLB update. Clean up dangling
pinned pages for the error path.

The memory usage for bookkeeping pinned pages is reverted
to what it was before: only one single free page is needed.
This helps reduce the host memory demand for VM with a large
amount of memory, or in the situation where host is running
short of free memory.

Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend")
Signed-off-by: Si-Wei Liu 
---
Changes in v2:
- Drop the reversion patch
- Fix unhandled page leak towards the end of page_list

  drivers/vhost/vdpa.c | 79 


  1 file changed, 61 insertions(+), 18 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index b6d9016..e112854 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -560,6 +560,8 @@ static int vhost_vdpa_map(struct vhost_vdpa *v,
if (r)
  vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1);
+else
+atomic64_add(size >> PAGE_SHIFT, &dev->mm->pinned_vm);
return r;
  }
@@ -591,14 +593,16 @@ static int 
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

  unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
  unsigned int gup_flags = FOLL_LONGTERM;
  unsigned long npages, cur_base, map_pfn, last_pfn = 0;
-unsigned long locked, lock_limit, pinned, i;
+unsigned long lock_limit, sz2pin, nchunks, i;
  u64 iova = msg->iova;
+long pinned;
  int ret = 0;
if (vhost_iotlb_itree_first(iotlb, msg->iova,
  msg->iova + msg->size - 1))
  return -EEXIST;
  +/* Limit the use of memory for bookkeeping */
  page_list = (struct page **) __get_free_page(GFP_KERNEL);
  if (!page_list)
  return -ENOMEM;
@@ -607,52 +611,75 @@ static int 
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

  gup_flags |= FOLL_WRITE;
npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> 
PAGE_SHIFT;

-if (!npages)
-return -EINVAL;
+if (!npages) {
+ret = -EINVAL;
+goto free;
+}
mmap_read_lock(dev->mm);
  -locked = atomic64_add_return(npages, &dev->mm->pinned_vm);
  lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-if (locked > lock_limit) {
+if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
  ret = -ENOMEM;
-goto out;
+goto unlock;
  }
cur_base = msg->uaddr & PAGE_MASK;
  iova &= PAGE_MASK;
+nchunks = 0;
while (npages) {
-pinned = min_t(unsigned long, npages, list_size);
-ret = pin_user_pages(cur_base, pinned,
- gup_flags, page_list, NULL);
-if (ret != pinned)
+sz2pin = min_t(unsigned long, npages, list_size);
+pinned = pin_user_pages(cur_base, sz2pin,
+gup_flags, page_list, NULL);
+if (sz2pin != pinned) {
+if (pinned < 0) {
+ret = pinned;
+} else {
+unpin_user_pages(page_list, pinned);
+ret = -ENOMEM;
+}
  goto out;
+}
+nchunks++;
if (!last_pfn)
  map_pfn = page_to_pfn(page_list[0]);
  -for (i = 0; i < ret; i++) {
+for (i = 0; i < pinned; i++) {
  unsigned long this_pfn = page_to_pfn(page_list[i]);
  u64 csize;
if (last_pfn && (this_pfn != last_pfn + 1)) {
  /* Pin a contiguous chunk of memory */
  csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
-if (vhost_vdpa_map(v, iova, csize,
-   map_pfn << PAGE_SHIFT,
-   msg->perm))
+ret = vhost_vdpa_map(v, iova, csize,
+ map_pfn << PAGE_SHIFT,
+ msg->perm);
+if (ret) {
+/*
+ * Unpin the pages that are left unmapped
+ * from this point on in the current
+ * page_list. The remaining outstanding
+ * ones which may stride across several
+ * chunks will be covered in the common
+ * error path subsequently.
+ */
+unpin_user_pages(&page_list[i],
+ pinned - i);



Can we simply do last_pfn = this_pfn here?
Nope, they are not contiguous segments of memory. Note the conditional 
(this_pfn != last_pfn + 1) that is checked here.
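
A worked example of why that chunk boundary matters (made-up pfn values,
illustration only):

/*
 * Suppose pin_user_pages() filled page_list with pages whose pfns are:
 *
 *     100, 101, 102, 200, 201
 *
 * The check (this_pfn != last_pfn + 1) fires at pfn 200, so the range is
 * split into two contiguous chunks and two map calls:
 *
 *     vhost_vdpa_map(v, iova,                 3 * PAGE_SIZE, 100 << PAGE_SHIFT, perm);
 *     vhost_vdpa_map(v, iova + 3 * PAGE_SIZE, 2 * PAGE_SIZE, 200 << PAGE_SHIFT, perm);
 *
 * Treating the two runs as one contiguous chunk would map the wrong
 * physical pages.
 */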



Right.








   

Re: [PATCH v2] vhost-vdpa: fix page pinning leakage in error path (rework)

2020-11-09 Thread si-wei liu



On 11/9/2020 2:42 PM, Michael S. Tsirkin wrote:

On Mon, Nov 09, 2020 at 01:44:03PM -0800, si-wei liu wrote:

On 11/8/2020 7:21 PM, Jason Wang wrote:

On 2020/11/6 6:57 AM, si-wei liu wrote:

On 11/4/2020 7:26 PM, Jason Wang wrote:

On 2020/11/5 7:33 AM, Si-Wei Liu wrote:

Pinned pages are not properly accounted particularly when
mapping error occurs on IOTLB update. Clean up dangling
pinned pages for the error path.

The memory usage for bookkeeping pinned pages is reverted
to what it was before: only one single free page is needed.
This helps reduce the host memory demand for VM with a large
amount of memory, or in the situation where host is running
short of free memory.

Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend")
Signed-off-by: Si-Wei Liu 
---
Changes in v2:
- Drop the reversion patch
- Fix unhandled page leak towards the end of page_list

   drivers/vhost/vdpa.c | 79

   1 file changed, 61 insertions(+), 18 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index b6d9016..e112854 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -560,6 +560,8 @@ static int vhost_vdpa_map(struct vhost_vdpa *v,
 if (r)
   vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1);
+else
+atomic64_add(size >> PAGE_SHIFT, &dev->mm->pinned_vm);
 return r;
   }
@@ -591,14 +593,16 @@ static int
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,
   unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
   unsigned int gup_flags = FOLL_LONGTERM;
   unsigned long npages, cur_base, map_pfn, last_pfn = 0;
-unsigned long locked, lock_limit, pinned, i;
+unsigned long lock_limit, sz2pin, nchunks, i;
   u64 iova = msg->iova;
+long pinned;
   int ret = 0;
 if (vhost_iotlb_itree_first(iotlb, msg->iova,
   msg->iova + msg->size - 1))
   return -EEXIST;
   +/* Limit the use of memory for bookkeeping */
   page_list = (struct page **) __get_free_page(GFP_KERNEL);
   if (!page_list)
   return -ENOMEM;
@@ -607,52 +611,75 @@ static int
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,
   gup_flags |= FOLL_WRITE;
 npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK))

PAGE_SHIFT;

-if (!npages)
-return -EINVAL;
+if (!npages) {
+ret = -EINVAL;
+goto free;
+}
 mmap_read_lock(dev->mm);
   -locked = atomic64_add_return(npages, &dev->mm->pinned_vm);
   lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-if (locked > lock_limit) {
+if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
   ret = -ENOMEM;
-goto out;
+goto unlock;
   }
 cur_base = msg->uaddr & PAGE_MASK;
   iova &= PAGE_MASK;
+nchunks = 0;
 while (npages) {
-pinned = min_t(unsigned long, npages, list_size);
-ret = pin_user_pages(cur_base, pinned,
- gup_flags, page_list, NULL);
-if (ret != pinned)
+sz2pin = min_t(unsigned long, npages, list_size);
+pinned = pin_user_pages(cur_base, sz2pin,
+gup_flags, page_list, NULL);
+if (sz2pin != pinned) {
+if (pinned < 0) {
+ret = pinned;
+} else {
+unpin_user_pages(page_list, pinned);
+ret = -ENOMEM;
+}
   goto out;
+}
+nchunks++;
 if (!last_pfn)
   map_pfn = page_to_pfn(page_list[0]);
   -for (i = 0; i < ret; i++) {
+for (i = 0; i < pinned; i++) {
   unsigned long this_pfn = page_to_pfn(page_list[i]);
   u64 csize;
 if (last_pfn && (this_pfn != last_pfn + 1)) {
   /* Pin a contiguous chunk of memory */
   csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
-if (vhost_vdpa_map(v, iova, csize,
-   map_pfn << PAGE_SHIFT,
-   msg->perm))
+ret = vhost_vdpa_map(v, iova, csize,
+ map_pfn << PAGE_SHIFT,
+ msg->perm);
+if (ret) {
+/*
+ * Unpin the pages that are left unmapped
+ * from this point on in the current
+ * page_list. The remaining outstanding
+ * ones which may stride across several
+ * chunks will be covered in the common
+ * error path subsequently.
+ */
+unpin_user_pages(&page_list[i],
+ pinned - i);


Can we simply do last_pfn = this_pfn here?

Nope. They are not con

Re: [PATCH v3 2/2] vhost-vdpa: fix page pinning leakage in error path

2020-10-29 Thread si-wei liu



On 10/29/2020 2:53 PM, Michael S. Tsirkin wrote:

On Thu, Oct 15, 2020 at 01:17:14PM -0700, si-wei liu wrote:

On 10/15/2020 6:11 AM, Michael S. Tsirkin wrote:

On Thu, Oct 15, 2020 at 02:15:32PM +0800, Jason Wang wrote:

On 2020/10/14 7:42 AM, si-wei liu wrote:

So what I suggest is to fix the pinning leakage first and do the
possible optimization on top (which is still questionable to me).

OK. Unfortunately, this was picked up and got merged upstream. So I will
post a follow-up patch set to 1) revert the commit back to the original
__get_free_page() implementation, and 2) fix the accounting and leakage
on top. Will that be fine?

Fine.

Thanks

Fine by me too.


Thanks, Michael & Jason. I will post the fix shortly. Stay tuned.

-Siwei

did I miss the patch?

You didn't, sorry. I was looking into a way to speed up the boot time 
for large-memory guests by multi-threading the page pinning process, and 
it turns out I need more time on that. I will post the fix I have now 
soon, hopefully tomorrow.


-Siwei




[PATCH 1/2] Revert "vhost-vdpa: fix page pinning leakage in error path"

2020-10-30 Thread Si-Wei Liu
This reverts commit 7ed9e3d97c32d969caded2dfb6e67c1a2cc5a0b1.

Signed-off-by: Si-Wei Liu 
---
 drivers/vhost/vdpa.c | 119 +--
 1 file changed, 48 insertions(+), 71 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index a2dbc85..b6d9016 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -588,19 +588,21 @@ static int vhost_vdpa_process_iotlb_update(struct 
vhost_vdpa *v,
struct vhost_dev *dev = &v->vdev;
struct vhost_iotlb *iotlb = dev->iotlb;
struct page **page_list;
-   struct vm_area_struct **vmas;
+   unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
unsigned int gup_flags = FOLL_LONGTERM;
-   unsigned long map_pfn, last_pfn = 0;
-   unsigned long npages, lock_limit;
-   unsigned long i, nmap = 0;
+   unsigned long npages, cur_base, map_pfn, last_pfn = 0;
+   unsigned long locked, lock_limit, pinned, i;
u64 iova = msg->iova;
-   long pinned;
int ret = 0;
 
if (vhost_iotlb_itree_first(iotlb, msg->iova,
msg->iova + msg->size - 1))
return -EEXIST;
 
+   page_list = (struct page **) __get_free_page(GFP_KERNEL);
+   if (!page_list)
+   return -ENOMEM;
+
if (msg->perm & VHOST_ACCESS_WO)
gup_flags |= FOLL_WRITE;
 
@@ -608,86 +610,61 @@ static int vhost_vdpa_process_iotlb_update(struct 
vhost_vdpa *v,
if (!npages)
return -EINVAL;
 
-   page_list = kvmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
-   vmas = kvmalloc_array(npages, sizeof(struct vm_area_struct *),
- GFP_KERNEL);
-   if (!page_list || !vmas) {
-   ret = -ENOMEM;
-   goto free;
-   }
-
mmap_read_lock(dev->mm);
 
+   locked = atomic64_add_return(npages, &dev->mm->pinned_vm);
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-   if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
-   ret = -ENOMEM;
-   goto unlock;
-   }
 
-   pinned = pin_user_pages(msg->uaddr & PAGE_MASK, npages, gup_flags,
-   page_list, vmas);
-   if (npages != pinned) {
-   if (pinned < 0) {
-   ret = pinned;
-   } else {
-   unpin_user_pages(page_list, pinned);
-   ret = -ENOMEM;
-   }
-   goto unlock;
+   if (locked > lock_limit) {
+   ret = -ENOMEM;
+   goto out;
}
 
+   cur_base = msg->uaddr & PAGE_MASK;
iova &= PAGE_MASK;
-   map_pfn = page_to_pfn(page_list[0]);
-
-   /* One more iteration to avoid extra vdpa_map() call out of loop. */
-   for (i = 0; i <= npages; i++) {
-   unsigned long this_pfn;
-   u64 csize;
-
-   /* The last chunk may have no valid PFN next to it */
-   this_pfn = i < npages ? page_to_pfn(page_list[i]) : -1UL;
-
-   if (last_pfn && (this_pfn == -1UL ||
-this_pfn != last_pfn + 1)) {
-   /* Pin a contiguous chunk of memory */
-   csize = last_pfn - map_pfn + 1;
-   ret = vhost_vdpa_map(v, iova, csize << PAGE_SHIFT,
-map_pfn << PAGE_SHIFT,
-msg->perm);
-   if (ret) {
-   /*
-* Unpin the rest chunks of memory on the
-* flight with no corresponding vdpa_map()
-* calls having been made yet. On the other
-* hand, vdpa_unmap() in the failure path
-* is in charge of accounting the number of
-* pinned pages for its own.
-* This asymmetrical pattern of accounting
-* is for efficiency to pin all pages at
-* once, while there is no other callsite
-* of vdpa_map() than here above.
-*/
-   unpin_user_pages(&page_list[nmap],
-npages - nmap);
-   goto out;
+
+   while (npages) {
+   pinned = min_t(unsigned long, npages, list_size);
+   ret = pin_user_pages(cur_base, pinned,
+gup_flags, page_list, NULL);
+   if (ret != pinned)
+   goto out;
+
+   if (!last_pfn)
+

[PATCH 2/2] vhost-vdpa: fix page pinning leakage in error path (rework)

2020-10-30 Thread Si-Wei Liu
Pinned pages are not properly accounted particularly when
mapping error occurs on IOTLB update. Clean up dangling
pinned pages for the error path.

The memory usage for bookkeeping pinned pages is reverted
to what it was before: only one single free page is needed.
This helps reduce the host memory demand for VM with a large
amount of memory, or in the situation where host is running
short of free memory.

Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend")
Signed-off-by: Si-Wei Liu 
---
 drivers/vhost/vdpa.c | 64 +---
 1 file changed, 46 insertions(+), 18 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index b6d9016..8da8558 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -560,6 +560,8 @@ static int vhost_vdpa_map(struct vhost_vdpa *v,
 
if (r)
vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1);
+   else
+   atomic64_add(size >> PAGE_SHIFT, &dev->mm->pinned_vm);
 
return r;
 }
@@ -591,14 +593,16 @@ static int vhost_vdpa_process_iotlb_update(struct 
vhost_vdpa *v,
unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
unsigned int gup_flags = FOLL_LONGTERM;
unsigned long npages, cur_base, map_pfn, last_pfn = 0;
-   unsigned long locked, lock_limit, pinned, i;
+   unsigned long lock_limit, sz2pin, nchunks, i;
u64 iova = msg->iova;
+   long pinned;
int ret = 0;
 
if (vhost_iotlb_itree_first(iotlb, msg->iova,
msg->iova + msg->size - 1))
return -EEXIST;
 
+   /* Limit the use of memory for bookkeeping */
page_list = (struct page **) __get_free_page(GFP_KERNEL);
if (!page_list)
return -ENOMEM;
@@ -607,52 +611,64 @@ static int vhost_vdpa_process_iotlb_update(struct 
vhost_vdpa *v,
gup_flags |= FOLL_WRITE;
 
npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> PAGE_SHIFT;
-   if (!npages)
-   return -EINVAL;
+   if (!npages) {
+   ret = -EINVAL;
+   goto free;
+   }
 
mmap_read_lock(dev->mm);
 
-   locked = atomic64_add_return(npages, &dev->mm->pinned_vm);
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-   if (locked > lock_limit) {
+   if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
ret = -ENOMEM;
-   goto out;
+   goto unlock;
}
 
cur_base = msg->uaddr & PAGE_MASK;
iova &= PAGE_MASK;
+   nchunks = 0;
 
while (npages) {
-   pinned = min_t(unsigned long, npages, list_size);
-   ret = pin_user_pages(cur_base, pinned,
-gup_flags, page_list, NULL);
-   if (ret != pinned)
+   sz2pin = min_t(unsigned long, npages, list_size);
+   pinned = pin_user_pages(cur_base, sz2pin,
+   gup_flags, page_list, NULL);
+   if (sz2pin != pinned) {
+   if (pinned < 0) {
+   ret = pinned;
+   } else {
+   unpin_user_pages(page_list, pinned);
+   ret = -ENOMEM;
+   }
goto out;
+   }
+   nchunks++;
 
if (!last_pfn)
map_pfn = page_to_pfn(page_list[0]);
 
-   for (i = 0; i < ret; i++) {
+   for (i = 0; i < pinned; i++) {
unsigned long this_pfn = page_to_pfn(page_list[i]);
u64 csize;
 
if (last_pfn && (this_pfn != last_pfn + 1)) {
/* Pin a contiguous chunk of memory */
csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
-   if (vhost_vdpa_map(v, iova, csize,
-  map_pfn << PAGE_SHIFT,
-  msg->perm))
+   ret = vhost_vdpa_map(v, iova, csize,
+map_pfn << PAGE_SHIFT,
+msg->perm);
+   if (ret)
goto out;
+
map_pfn = this_pfn;
iova += csize;
+   nchunks = 0;
}
 
last_pfn = this_pfn;
}
 
-   cur_base += ret << PAGE_SHIFT;
-   npages -= ret;
+   cur_base += pinned << PAGE_SHIF
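
For reference, the single bookkeeping page referred to in the patch above
bounds each pinning iteration roughly as follows (assuming 4 KiB pages and
8-byte pointers):

/*
 * list_size = PAGE_SIZE / sizeof(struct page *)
 *           = 4096 / 8
 *           = 512 page pointers per pin_user_pages() call,
 * i.e. at most 2 MiB of guest memory is processed per loop iteration,
 * instead of allocating one pointer per page for the whole region up front.
 */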

Re: [PATCH 2/2] vhost-vdpa: fix page pinning leakage in error path (rework)

2020-11-03 Thread si-wei liu



On 11/3/2020 5:00 AM, Jason Wang wrote:


On 2020/10/30 3:45 PM, Si-Wei Liu wrote:

Pinned pages are not properly accounted particularly when
mapping error occurs on IOTLB update. Clean up dangling
pinned pages for the error path.

The memory usage for bookkeeping pinned pages is reverted
to what it was before: only one single free page is needed.
This helps reduce the host memory demand for VM with a large
amount of memory, or in the situation where host is running
short of free memory.

Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend")
Signed-off-by: Si-Wei Liu 
---
  drivers/vhost/vdpa.c | 64 
+---

  1 file changed, 46 insertions(+), 18 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index b6d9016..8da8558 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -560,6 +560,8 @@ static int vhost_vdpa_map(struct vhost_vdpa *v,
if (r)
  vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1);
+else
+atomic64_add(size >> PAGE_SHIFT, &dev->mm->pinned_vm);
return r;
  }
@@ -591,14 +593,16 @@ static int 
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

  unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
  unsigned int gup_flags = FOLL_LONGTERM;
  unsigned long npages, cur_base, map_pfn, last_pfn = 0;
-unsigned long locked, lock_limit, pinned, i;
+unsigned long lock_limit, sz2pin, nchunks, i;
  u64 iova = msg->iova;
+long pinned;
  int ret = 0;
if (vhost_iotlb_itree_first(iotlb, msg->iova,
  msg->iova + msg->size - 1))
  return -EEXIST;
  +/* Limit the use of memory for bookkeeping */
  page_list = (struct page **) __get_free_page(GFP_KERNEL);
  if (!page_list)
  return -ENOMEM;
@@ -607,52 +611,64 @@ static int 
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

  gup_flags |= FOLL_WRITE;
npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> 
PAGE_SHIFT;

-if (!npages)
-return -EINVAL;
+if (!npages) {
+ret = -EINVAL;
+goto free;
+}
mmap_read_lock(dev->mm);
  -locked = atomic64_add_return(npages, &dev->mm->pinned_vm);
  lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-if (locked > lock_limit) {
+if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
  ret = -ENOMEM;
-goto out;
+goto unlock;
  }
cur_base = msg->uaddr & PAGE_MASK;
  iova &= PAGE_MASK;
+nchunks = 0;
while (npages) {
-pinned = min_t(unsigned long, npages, list_size);
-ret = pin_user_pages(cur_base, pinned,
- gup_flags, page_list, NULL);
-if (ret != pinned)
+sz2pin = min_t(unsigned long, npages, list_size);
+pinned = pin_user_pages(cur_base, sz2pin,
+gup_flags, page_list, NULL);
+if (sz2pin != pinned) {
+if (pinned < 0) {
+ret = pinned;
+} else {
+unpin_user_pages(page_list, pinned);
+ret = -ENOMEM;
+}
  goto out;
+}
+nchunks++;
if (!last_pfn)
  map_pfn = page_to_pfn(page_list[0]);
  -for (i = 0; i < ret; i++) {
+for (i = 0; i < pinned; i++) {
  unsigned long this_pfn = page_to_pfn(page_list[i]);
  u64 csize;
if (last_pfn && (this_pfn != last_pfn + 1)) {
  /* Pin a contiguous chunk of memory */
  csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
-if (vhost_vdpa_map(v, iova, csize,
-   map_pfn << PAGE_SHIFT,
-   msg->perm))
+ret = vhost_vdpa_map(v, iova, csize,
+ map_pfn << PAGE_SHIFT,
+ msg->perm);
+if (ret)
  goto out;
+
  map_pfn = this_pfn;
  iova += csize;
+nchunks = 0;
  }
last_pfn = this_pfn;
  }
  -cur_base += ret << PAGE_SHIFT;
-npages -= ret;
+cur_base += pinned << PAGE_SHIFT;
+npages -= pinned;
  }
/* Pin the rest chunk */
@@ -660,10 +676,22 @@ static int 
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

   map_pfn << PAGE_SHIFT, msg->perm);
  out:
  if (ret) {
+if (nchunks && last_pfn) {
+unsigned long pfn;
+
+/*
+ * Unpin the outstanding pages which are unmapped.
+ * Mapped pages are accounted in vdpa_map(), thus
+ * will be handled by vdpa_unmap().
+ */
+for (pfn = map_pfn; pfn &

Re: [PATCH 2/2] vhost-vdpa: fix page pinning leakage in error path (rework)

2020-11-03 Thread si-wei liu



On 11/3/2020 5:06 PM, si-wei liu wrote:


On 11/3/2020 5:00 AM, Jason Wang wrote:


On 2020/10/30 3:45 PM, Si-Wei Liu wrote:

Pinned pages are not properly accounted particularly when
mapping error occurs on IOTLB update. Clean up dangling
pinned pages for the error path.

The memory usage for bookkeeping pinned pages is reverted
to what it was before: only one single free page is needed.
This helps reduce the host memory demand for VM with a large
amount of memory, or in the situation where host is running
short of free memory.

Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend")
Signed-off-by: Si-Wei Liu 
---
  drivers/vhost/vdpa.c | 64 
+---

  1 file changed, 46 insertions(+), 18 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index b6d9016..8da8558 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -560,6 +560,8 @@ static int vhost_vdpa_map(struct vhost_vdpa *v,
if (r)
  vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1);
+else
+atomic64_add(size >> PAGE_SHIFT, &dev->mm->pinned_vm);
return r;
  }
@@ -591,14 +593,16 @@ static int 
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

  unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
  unsigned int gup_flags = FOLL_LONGTERM;
  unsigned long npages, cur_base, map_pfn, last_pfn = 0;
-unsigned long locked, lock_limit, pinned, i;
+unsigned long lock_limit, sz2pin, nchunks, i;
  u64 iova = msg->iova;
+long pinned;
  int ret = 0;
if (vhost_iotlb_itree_first(iotlb, msg->iova,
  msg->iova + msg->size - 1))
  return -EEXIST;
  +/* Limit the use of memory for bookkeeping */
  page_list = (struct page **) __get_free_page(GFP_KERNEL);
  if (!page_list)
  return -ENOMEM;
@@ -607,52 +611,64 @@ static int 
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

  gup_flags |= FOLL_WRITE;
npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> 
PAGE_SHIFT;

-if (!npages)
-return -EINVAL;
+if (!npages) {
+ret = -EINVAL;
+goto free;
+}
mmap_read_lock(dev->mm);
  -locked = atomic64_add_return(npages, &dev->mm->pinned_vm);
  lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-if (locked > lock_limit) {
+if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
  ret = -ENOMEM;
-goto out;
+goto unlock;
  }
cur_base = msg->uaddr & PAGE_MASK;
  iova &= PAGE_MASK;
+nchunks = 0;
while (npages) {
-pinned = min_t(unsigned long, npages, list_size);
-ret = pin_user_pages(cur_base, pinned,
- gup_flags, page_list, NULL);
-if (ret != pinned)
+sz2pin = min_t(unsigned long, npages, list_size);
+pinned = pin_user_pages(cur_base, sz2pin,
+gup_flags, page_list, NULL);
+if (sz2pin != pinned) {
+if (pinned < 0) {
+ret = pinned;
+} else {
+unpin_user_pages(page_list, pinned);
+ret = -ENOMEM;
+}
  goto out;
+}
+nchunks++;
if (!last_pfn)
  map_pfn = page_to_pfn(page_list[0]);
  -for (i = 0; i < ret; i++) {
+for (i = 0; i < pinned; i++) {
  unsigned long this_pfn = page_to_pfn(page_list[i]);
  u64 csize;
if (last_pfn && (this_pfn != last_pfn + 1)) {
  /* Pin a contiguous chunk of memory */
  csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
-if (vhost_vdpa_map(v, iova, csize,
-   map_pfn << PAGE_SHIFT,
-   msg->perm))
+ret = vhost_vdpa_map(v, iova, csize,
+ map_pfn << PAGE_SHIFT,
+ msg->perm);
+if (ret)
  goto out;
+
  map_pfn = this_pfn;
  iova += csize;
+nchunks = 0;
  }
last_pfn = this_pfn;
  }
  -cur_base += ret << PAGE_SHIFT;
-npages -= ret;
+cur_base += pinned << PAGE_SHIFT;
+npages -= pinned;
  }
/* Pin the rest chunk */
@@ -660,10 +676,22 @@ static int 
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

   map_pfn << PAGE_SHIFT, msg->perm);
  out:
  if (ret) {
+if (nchunks && last_pfn) {
+unsigned long pfn;
+
+/*
+ * Unpin the outstanding pages which are unmapped.
+ * Mapped pages are accounted in vdpa_map(), thus
+ * will be handled by vdpa_unmap().
+  

Re: [PATCH 2/2] vhost-vdpa: fix page pinning leakage in error path (rework)

2020-11-03 Thread si-wei liu



On 11/3/2020 5:58 PM, Jason Wang wrote:


On 2020/11/4 9:08 AM, si-wei liu wrote:


On 11/3/2020 5:06 PM, si-wei liu wrote:


On 11/3/2020 5:00 AM, Jason Wang wrote:


On 2020/10/30 3:45 PM, Si-Wei Liu wrote:

Pinned pages are not properly accounted particularly when
mapping error occurs on IOTLB update. Clean up dangling
pinned pages for the error path.

The memory usage for bookkeeping pinned pages is reverted
to what it was before: only one single free page is needed.
This helps reduce the host memory demand for VM with a large
amount of memory, or in the situation where host is running
short of free memory.

Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend")
Signed-off-by: Si-Wei Liu 
---
  drivers/vhost/vdpa.c | 64 
+---

  1 file changed, 46 insertions(+), 18 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index b6d9016..8da8558 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -560,6 +560,8 @@ static int vhost_vdpa_map(struct vhost_vdpa *v,
if (r)
  vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1);
+else
+atomic64_add(size >> PAGE_SHIFT, &dev->mm->pinned_vm);
return r;
  }
@@ -591,14 +593,16 @@ static int 
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

  unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
  unsigned int gup_flags = FOLL_LONGTERM;
  unsigned long npages, cur_base, map_pfn, last_pfn = 0;
-unsigned long locked, lock_limit, pinned, i;
+unsigned long lock_limit, sz2pin, nchunks, i;
  u64 iova = msg->iova;
+long pinned;
  int ret = 0;
if (vhost_iotlb_itree_first(iotlb, msg->iova,
  msg->iova + msg->size - 1))
  return -EEXIST;
  +/* Limit the use of memory for bookkeeping */
  page_list = (struct page **) __get_free_page(GFP_KERNEL);
  if (!page_list)
  return -ENOMEM;
@@ -607,52 +611,64 @@ static int 
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

  gup_flags |= FOLL_WRITE;
npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> 
PAGE_SHIFT;

-if (!npages)
-return -EINVAL;
+if (!npages) {
+ret = -EINVAL;
+goto free;
+}
mmap_read_lock(dev->mm);
  -locked = atomic64_add_return(npages, &dev->mm->pinned_vm);
  lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-if (locked > lock_limit) {
+if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
  ret = -ENOMEM;
-goto out;
+goto unlock;
  }
cur_base = msg->uaddr & PAGE_MASK;
  iova &= PAGE_MASK;
+nchunks = 0;
while (npages) {
-pinned = min_t(unsigned long, npages, list_size);
-ret = pin_user_pages(cur_base, pinned,
- gup_flags, page_list, NULL);
-if (ret != pinned)
+sz2pin = min_t(unsigned long, npages, list_size);
+pinned = pin_user_pages(cur_base, sz2pin,
+gup_flags, page_list, NULL);
+if (sz2pin != pinned) {
+if (pinned < 0) {
+ret = pinned;
+} else {
+unpin_user_pages(page_list, pinned);
+ret = -ENOMEM;
+}
  goto out;
+}
+nchunks++;
if (!last_pfn)
  map_pfn = page_to_pfn(page_list[0]);
  -for (i = 0; i < ret; i++) {
+for (i = 0; i < pinned; i++) {
  unsigned long this_pfn = page_to_pfn(page_list[i]);
  u64 csize;
if (last_pfn && (this_pfn != last_pfn + 1)) {
  /* Pin a contiguous chunk of memory */
  csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
-if (vhost_vdpa_map(v, iova, csize,
-   map_pfn << PAGE_SHIFT,
-   msg->perm))
+ret = vhost_vdpa_map(v, iova, csize,
+ map_pfn << PAGE_SHIFT,
+ msg->perm);
+if (ret)
  goto out;
+
  map_pfn = this_pfn;
  iova += csize;
+nchunks = 0;
  }
last_pfn = this_pfn;
  }
  -cur_base += ret << PAGE_SHIFT;
-npages -= ret;
+cur_base += pinned << PAGE_SHIFT;
+npages -= pinned;
  }
/* Pin the rest chunk */
@@ -660,10 +676,22 @@ static int 
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

   map_pfn << PAGE_SHIFT, msg->perm);
  out:
  if (ret) {
+if (nchunks && last_pfn) {
+unsigned long pfn;
+
+/*
+ * Unpin the outstanding pages which are unmapped.
+   

Re: [PATCH v3 2/2] vhost-vdpa: fix page pinning leakage in error path

2020-10-13 Thread si-wei liu



On 10/9/2020 7:27 PM, Jason Wang wrote:


On 2020/10/3 1:02 PM, Si-Wei Liu wrote:

Pinned pages are not properly accounted particularly when
mapping error occurs on IOTLB update. Clean up dangling
pinned pages for the error path. As the inflight pinned
pages, specifically for memory region that strides across
multiple chunks, would need more than one free page for
book keeping and accounting. For simplicity, pin pages
for all memory in the IOVA range in one go rather than
have multiple pin_user_pages calls to make up the entire
region. This way it's easier to track and account the
pages already mapped, particularly for clean-up in the
error path.

Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend")
Signed-off-by: Si-Wei Liu 
---
Changes in v3:
- Factor out vhost_vdpa_map() change to a separate patch

Changes in v2:
- Fix incorrect target SHA1 referenced

  drivers/vhost/vdpa.c | 119 
++-

  1 file changed, 71 insertions(+), 48 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 0f27919..dad41dae 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -595,21 +595,19 @@ static int 
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

  struct vhost_dev *dev = &v->vdev;
  struct vhost_iotlb *iotlb = dev->iotlb;
  struct page **page_list;
-unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
+struct vm_area_struct **vmas;
  unsigned int gup_flags = FOLL_LONGTERM;
-unsigned long npages, cur_base, map_pfn, last_pfn = 0;
-unsigned long locked, lock_limit, pinned, i;
+unsigned long map_pfn, last_pfn = 0;
+unsigned long npages, lock_limit;
+unsigned long i, nmap = 0;
  u64 iova = msg->iova;
+long pinned;
  int ret = 0;
if (vhost_iotlb_itree_first(iotlb, msg->iova,
  msg->iova + msg->size - 1))
  return -EEXIST;
  -page_list = (struct page **) __get_free_page(GFP_KERNEL);
-if (!page_list)
-return -ENOMEM;
-
  if (msg->perm & VHOST_ACCESS_WO)
  gup_flags |= FOLL_WRITE;
  @@ -617,61 +615,86 @@ static int 
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

  if (!npages)
  return -EINVAL;
  +page_list = kvmalloc_array(npages, sizeof(struct page *), 
GFP_KERNEL);

+vmas = kvmalloc_array(npages, sizeof(struct vm_area_struct *),
+  GFP_KERNEL);



This will result in a high-order memory allocation, which is what the code 
originally tried to avoid.


Using an unlimited size will cause a lot of side effects, considering a VM 
or userspace may try to pin several TB of memory.

Hmmm, that's a good point. Indeed, if the guest memory demand is huge or 
the host system is running short of free pages, kvmalloc will be 
problematic and less efficient than the __get_free_page implementation.
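As an aside, here is a minimal sketch of the bounded-bookkeeping pattern this
thread converges on: one free page for the page_list, pinning in chunks. The
function name pin_range_chunked() and its parameters are made up for
illustration, and error handling is simplified; the real code in
vhost_vdpa_process_iotlb_update() also maps and accounts each chunk and unwinds
earlier chunks on failure.

/* Sketch only; assumes the usual mm/gup headers (<linux/mm.h>, <linux/gfp.h>). */
static int pin_range_chunked(unsigned long uaddr, unsigned long npages,
			     unsigned int gup_flags)
{
	unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
	unsigned long cur_base = uaddr & PAGE_MASK;
	struct page **page_list;
	int ret = 0;

	/* One page of bookkeeping, no matter how large the region is. */
	page_list = (struct page **)__get_free_page(GFP_KERNEL);
	if (!page_list)
		return -ENOMEM;

	while (npages) {
		unsigned long sz2pin = min_t(unsigned long, npages, list_size);
		long pinned = pin_user_pages(cur_base, sz2pin, gup_flags,
					     page_list, NULL);

		if (pinned != sz2pin) {
			if (pinned > 0)
				unpin_user_pages(page_list, pinned);
			ret = pinned < 0 ? pinned : -ENOMEM;
			break;
		}

		/* ... map and account the pinned chunk here ... */

		cur_base += pinned << PAGE_SHIFT;
		npages -= pinned;
	}

	free_page((unsigned long)page_list);
	return ret;
}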






+if (!page_list || !vmas) {
+ret = -ENOMEM;
+goto free;
+}



Any reason that you want to use vmas?

Without providing custom vmas, it's subject to high-order allocation 
failure, while page_list and vmas can now fall back to virtual memory 
allocation if need be.
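For context, a rough sketch of the one-shot bookkeeping allocation under
discussion here (the v3 approach, not what was eventually merged); npages,
page_list and vmas are the variables from the patch above. For a guest pinning
1 TiB, npages is about 268 million 4 KiB pages, so each array alone is roughly
2 GiB, which is why kvmalloc_array()'s vmalloc fallback matters.

	page_list = kvmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
	vmas = kvmalloc_array(npages, sizeof(struct vm_area_struct *),
			      GFP_KERNEL);
	if (!page_list || !vmas) {
		/* kvfree() accepts NULL, so freeing both unconditionally is safe. */
		kvfree(page_list);
		kvfree(vmas);
		return -ENOMEM;
	}

	/* ... later: pin the whole range in one call ...
	 * pinned = pin_user_pages(msg->uaddr & PAGE_MASK, npages, gup_flags,
	 *			   page_list, vmas);
	 */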






+
  mmap_read_lock(dev->mm);
  -locked = atomic64_add_return(npages, &dev->mm->pinned_vm);
  lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-if (locked > lock_limit) {
+if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
  ret = -ENOMEM;
-goto out;
+goto unlock;
  }
  -cur_base = msg->uaddr & PAGE_MASK;
-iova &= PAGE_MASK;
+pinned = pin_user_pages(msg->uaddr & PAGE_MASK, npages, gup_flags,
+page_list, vmas);
+if (npages != pinned) {
+if (pinned < 0) {
+ret = pinned;
+} else {
+unpin_user_pages(page_list, pinned);
+ret = -ENOMEM;
+}
+goto unlock;
+}
  -while (npages) {
-pinned = min_t(unsigned long, npages, list_size);
-ret = pin_user_pages(cur_base, pinned,
- gup_flags, page_list, NULL);
-if (ret != pinned)
-goto out;
-
-if (!last_pfn)
-map_pfn = page_to_pfn(page_list[0]);
-
-for (i = 0; i < ret; i++) {
-unsigned long this_pfn = page_to_pfn(page_list[i]);
-u64 csize;
-
-if (last_pfn && (this_pfn != last_pfn + 1)) {
-/* Pin a contiguous chunk of memory */
-csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
-if (vhost_vdpa_map(v, iova, csize,
-   map_pfn << PAGE_SHIFT,
-   msg->perm))
-goto out;
-map_pfn = this_pfn;
-iova += csize;
+iova &= PAGE_MASK;
+map_pfn = page_to_pfn(page_list[0]);
+
+/* One more iterati

Re: [PATCH 2/2] vhost-vdpa: fix page pinning leakage in error path (rework)

2020-11-04 Thread si-wei liu



On 11/3/2020 6:42 PM, Jason Wang wrote:


On 2020/10/30 3:45 PM, Si-Wei Liu wrote:

Pinned pages are not properly accounted particularly when
mapping error occurs on IOTLB update. Clean up dangling
pinned pages for the error path.

The memory usage for bookkeeping pinned pages is reverted
to what it was before: only one single free page is needed.
This helps reduce the host memory demand for VM with a large
amount of memory, or in the situation where host is running
short of free memory.

Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend")
Signed-off-by: Si-Wei Liu 
---
  drivers/vhost/vdpa.c | 64 
+---

  1 file changed, 46 insertions(+), 18 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index b6d9016..8da8558 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -560,6 +560,8 @@ static int vhost_vdpa_map(struct vhost_vdpa *v,
if (r)
  vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1);
+else
+atomic64_add(size >> PAGE_SHIFT, &dev->mm->pinned_vm);
return r;
  }
@@ -591,14 +593,16 @@ static int 
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

  unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
  unsigned int gup_flags = FOLL_LONGTERM;
  unsigned long npages, cur_base, map_pfn, last_pfn = 0;
-unsigned long locked, lock_limit, pinned, i;
+unsigned long lock_limit, sz2pin, nchunks, i;
  u64 iova = msg->iova;
+long pinned;
  int ret = 0;
if (vhost_iotlb_itree_first(iotlb, msg->iova,
  msg->iova + msg->size - 1))
  return -EEXIST;
  +/* Limit the use of memory for bookkeeping */
  page_list = (struct page **) __get_free_page(GFP_KERNEL);
  if (!page_list)
  return -ENOMEM;
@@ -607,52 +611,64 @@ static int 
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

  gup_flags |= FOLL_WRITE;
npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> 
PAGE_SHIFT;

-if (!npages)
-return -EINVAL;
+if (!npages) {
+ret = -EINVAL;
+goto free;
+}
mmap_read_lock(dev->mm);
  -locked = atomic64_add_return(npages, &dev->mm->pinned_vm);
  lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-if (locked > lock_limit) {
+if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
  ret = -ENOMEM;
-goto out;
+goto unlock;
  }
cur_base = msg->uaddr & PAGE_MASK;
  iova &= PAGE_MASK;
+nchunks = 0;
while (npages) {
-pinned = min_t(unsigned long, npages, list_size);
-ret = pin_user_pages(cur_base, pinned,
- gup_flags, page_list, NULL);
-if (ret != pinned)
+sz2pin = min_t(unsigned long, npages, list_size);
+pinned = pin_user_pages(cur_base, sz2pin,
+gup_flags, page_list, NULL);
+if (sz2pin != pinned) {
+if (pinned < 0) {
+ret = pinned;
+} else {
+unpin_user_pages(page_list, pinned);
+ret = -ENOMEM;
+}
  goto out;
+}
+nchunks++;
if (!last_pfn)
  map_pfn = page_to_pfn(page_list[0]);
  -for (i = 0; i < ret; i++) {
+for (i = 0; i < pinned; i++) {
  unsigned long this_pfn = page_to_pfn(page_list[i]);
  u64 csize;
if (last_pfn && (this_pfn != last_pfn + 1)) {
  /* Pin a contiguous chunk of memory */
  csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
-if (vhost_vdpa_map(v, iova, csize,
-   map_pfn << PAGE_SHIFT,
-   msg->perm))
+ret = vhost_vdpa_map(v, iova, csize,
+ map_pfn << PAGE_SHIFT,
+ msg->perm);
+if (ret)
  goto out;
+
  map_pfn = this_pfn;
  iova += csize;
+nchunks = 0;
  }
last_pfn = this_pfn;
  }
  -cur_base += ret << PAGE_SHIFT;
-npages -= ret;
+cur_base += pinned << PAGE_SHIFT;
+npages -= pinned;
  }
/* Pin the rest chunk */
@@ -660,10 +676,22 @@ static int 
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

   map_pfn << PAGE_SHIFT, msg->perm);
  out:
  if (ret) {
+if (nchunks && last_pfn) {



Can we decrease npages where you did "nchunks++" then we can check 
npages here instead?
Hmmm, I am not sure I get what you want... @nchunks gets reset to 0 
whenever a certain range of pinned pages is successfully mapped. The 
conditional (when nchunks is non-zero) 
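For readers following along, a compact sketch of what the cleanup at the out:
label amounts to; the hunk is cut off in this archive, so the loop body below
is an assumption (it walks the current, not-yet-mapped chunk from map_pfn to
last_pfn).

	if (ret && nchunks && last_pfn) {
		unsigned long pfn;

		/*
		 * nchunks is non-zero only while some pinned pages have not
		 * yet been handed to vhost_vdpa_map(), so only the current
		 * chunk needs an explicit unpin here.  Pages already mapped
		 * were accounted in vhost_vdpa_map() and are released through
		 * the unmap path instead.
		 */
		for (pfn = map_pfn; pfn <= last_pfn; pfn++)
			unpin_user_page(pfn_to_page(pfn));
	}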

[PATCH v2] vhost-vdpa: fix page pinning leakage in error path (rework)

2020-11-04 Thread Si-Wei Liu
Pinned pages are not properly accounted, particularly when a
mapping error occurs on IOTLB update. Clean up dangling
pinned pages in the error path.

The memory usage for bookkeeping pinned pages is reverted
to what it was before: only a single free page is needed.
This helps reduce the host memory demand for a VM with a
large amount of memory, or in situations where the host is
running short of free memory.

Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend")
Signed-off-by: Si-Wei Liu 
---
Changes in v2:
- Drop the reversion patch
- Fix unhandled page leak towards the end of page_list

 drivers/vhost/vdpa.c | 79 
 1 file changed, 61 insertions(+), 18 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index b6d9016..e112854 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -560,6 +560,8 @@ static int vhost_vdpa_map(struct vhost_vdpa *v,
 
if (r)
vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1);
+   else
+   atomic64_add(size >> PAGE_SHIFT, &dev->mm->pinned_vm);
 
return r;
 }
@@ -591,14 +593,16 @@ static int vhost_vdpa_process_iotlb_update(struct 
vhost_vdpa *v,
unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
unsigned int gup_flags = FOLL_LONGTERM;
unsigned long npages, cur_base, map_pfn, last_pfn = 0;
-   unsigned long locked, lock_limit, pinned, i;
+   unsigned long lock_limit, sz2pin, nchunks, i;
u64 iova = msg->iova;
+   long pinned;
int ret = 0;
 
if (vhost_iotlb_itree_first(iotlb, msg->iova,
msg->iova + msg->size - 1))
return -EEXIST;
 
+   /* Limit the use of memory for bookkeeping */
page_list = (struct page **) __get_free_page(GFP_KERNEL);
if (!page_list)
return -ENOMEM;
@@ -607,52 +611,75 @@ static int vhost_vdpa_process_iotlb_update(struct 
vhost_vdpa *v,
gup_flags |= FOLL_WRITE;
 
npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> PAGE_SHIFT;
-   if (!npages)
-   return -EINVAL;
+   if (!npages) {
+   ret = -EINVAL;
+   goto free;
+   }
 
mmap_read_lock(dev->mm);
 
-   locked = atomic64_add_return(npages, &dev->mm->pinned_vm);
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-   if (locked > lock_limit) {
+   if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
ret = -ENOMEM;
-   goto out;
+   goto unlock;
}
 
cur_base = msg->uaddr & PAGE_MASK;
iova &= PAGE_MASK;
+   nchunks = 0;
 
while (npages) {
-   pinned = min_t(unsigned long, npages, list_size);
-   ret = pin_user_pages(cur_base, pinned,
-gup_flags, page_list, NULL);
-   if (ret != pinned)
+   sz2pin = min_t(unsigned long, npages, list_size);
+   pinned = pin_user_pages(cur_base, sz2pin,
+   gup_flags, page_list, NULL);
+   if (sz2pin != pinned) {
+   if (pinned < 0) {
+   ret = pinned;
+   } else {
+   unpin_user_pages(page_list, pinned);
+   ret = -ENOMEM;
+   }
goto out;
+   }
+   nchunks++;
 
if (!last_pfn)
map_pfn = page_to_pfn(page_list[0]);
 
-   for (i = 0; i < ret; i++) {
+   for (i = 0; i < pinned; i++) {
unsigned long this_pfn = page_to_pfn(page_list[i]);
u64 csize;
 
if (last_pfn && (this_pfn != last_pfn + 1)) {
/* Pin a contiguous chunk of memory */
csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
-   if (vhost_vdpa_map(v, iova, csize,
-  map_pfn << PAGE_SHIFT,
-  msg->perm))
+   ret = vhost_vdpa_map(v, iova, csize,
+map_pfn << PAGE_SHIFT,
+msg->perm);
+   if (ret) {
+   /*
+* Unpin the pages that are left 
unmapped
+* from this point on in the current
+* page_list. The remaining outstanding
+ 

Re: [PATCH 2/2] vhost-vdpa: fix page pinning leakage in error path (rework)

2020-11-05 Thread si-wei liu



On 11/4/2020 7:12 PM, Jason Wang wrote:


On 2020/11/5 7:40 AM, si-wei liu wrote:


On 11/3/2020 6:42 PM, Jason Wang wrote:


On 2020/10/30 3:45 PM, Si-Wei Liu wrote:

Pinned pages are not properly accounted particularly when
mapping error occurs on IOTLB update. Clean up dangling
pinned pages for the error path.

The memory usage for bookkeeping pinned pages is reverted
to what it was before: only one single free page is needed.
This helps reduce the host memory demand for VM with a large
amount of memory, or in the situation where host is running
short of free memory.

Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend")
Signed-off-by: Si-Wei Liu 
---
  drivers/vhost/vdpa.c | 64 
+---

  1 file changed, 46 insertions(+), 18 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index b6d9016..8da8558 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -560,6 +560,8 @@ static int vhost_vdpa_map(struct vhost_vdpa *v,
if (r)
  vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1);
+else
+atomic64_add(size >> PAGE_SHIFT, &dev->mm->pinned_vm);
return r;
  }
@@ -591,14 +593,16 @@ static int 
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

  unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
  unsigned int gup_flags = FOLL_LONGTERM;
  unsigned long npages, cur_base, map_pfn, last_pfn = 0;
-unsigned long locked, lock_limit, pinned, i;
+unsigned long lock_limit, sz2pin, nchunks, i;
  u64 iova = msg->iova;
+long pinned;
  int ret = 0;
if (vhost_iotlb_itree_first(iotlb, msg->iova,
  msg->iova + msg->size - 1))
  return -EEXIST;
  +/* Limit the use of memory for bookkeeping */
  page_list = (struct page **) __get_free_page(GFP_KERNEL);
  if (!page_list)
  return -ENOMEM;
@@ -607,52 +611,64 @@ static int 
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

  gup_flags |= FOLL_WRITE;
npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> 
PAGE_SHIFT;

-if (!npages)
-return -EINVAL;
+if (!npages) {
+ret = -EINVAL;
+goto free;
+}
mmap_read_lock(dev->mm);
  -locked = atomic64_add_return(npages, &dev->mm->pinned_vm);
  lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-if (locked > lock_limit) {
+if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
  ret = -ENOMEM;
-goto out;
+goto unlock;
  }
cur_base = msg->uaddr & PAGE_MASK;
  iova &= PAGE_MASK;
+nchunks = 0;
while (npages) {
-pinned = min_t(unsigned long, npages, list_size);
-ret = pin_user_pages(cur_base, pinned,
- gup_flags, page_list, NULL);
-if (ret != pinned)
+sz2pin = min_t(unsigned long, npages, list_size);
+pinned = pin_user_pages(cur_base, sz2pin,
+gup_flags, page_list, NULL);
+if (sz2pin != pinned) {
+if (pinned < 0) {
+ret = pinned;
+} else {
+unpin_user_pages(page_list, pinned);
+ret = -ENOMEM;
+}
  goto out;
+}
+nchunks++;
if (!last_pfn)
  map_pfn = page_to_pfn(page_list[0]);
  -for (i = 0; i < ret; i++) {
+for (i = 0; i < pinned; i++) {
  unsigned long this_pfn = page_to_pfn(page_list[i]);
  u64 csize;
if (last_pfn && (this_pfn != last_pfn + 1)) {
  /* Pin a contiguous chunk of memory */
  csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
-if (vhost_vdpa_map(v, iova, csize,
-   map_pfn << PAGE_SHIFT,
-   msg->perm))
+ret = vhost_vdpa_map(v, iova, csize,
+ map_pfn << PAGE_SHIFT,
+ msg->perm);
+if (ret)
  goto out;
+
  map_pfn = this_pfn;
  iova += csize;
+nchunks = 0;
  }
last_pfn = this_pfn;
  }
  -cur_base += ret << PAGE_SHIFT;
-npages -= ret;
+cur_base += pinned << PAGE_SHIFT;
+npages -= pinned;
  }
/* Pin the rest chunk */
@@ -660,10 +676,22 @@ static int 
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

   map_pfn << PAGE_SHIFT, msg->perm);
  out:
  if (ret) {
+if (nchunks && last_pfn) {



Can we decrease npages where you did "nchunks++" then we can check 
npages here instead?
Hmmm, I am not sure I get what you want... @nchunks gets reset to 0 
whenever a certain range of 

Re: [PATCH v2] vhost-vdpa: fix page pinning leakage in error path (rework)

2020-11-05 Thread si-wei liu



On 11/4/2020 7:26 PM, Jason Wang wrote:


On 2020/11/5 7:33 AM, Si-Wei Liu wrote:

Pinned pages are not properly accounted particularly when
mapping error occurs on IOTLB update. Clean up dangling
pinned pages for the error path.

The memory usage for bookkeeping pinned pages is reverted
to what it was before: only one single free page is needed.
This helps reduce the host memory demand for VM with a large
amount of memory, or in the situation where host is running
short of free memory.

Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend")
Signed-off-by: Si-Wei Liu 
---
Changes in v2:
- Drop the reversion patch
- Fix unhandled page leak towards the end of page_list

  drivers/vhost/vdpa.c | 79 


  1 file changed, 61 insertions(+), 18 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index b6d9016..e112854 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -560,6 +560,8 @@ static int vhost_vdpa_map(struct vhost_vdpa *v,
if (r)
  vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1);
+else
+atomic64_add(size >> PAGE_SHIFT, &dev->mm->pinned_vm);
return r;
  }
@@ -591,14 +593,16 @@ static int 
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

  unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
  unsigned int gup_flags = FOLL_LONGTERM;
  unsigned long npages, cur_base, map_pfn, last_pfn = 0;
-unsigned long locked, lock_limit, pinned, i;
+unsigned long lock_limit, sz2pin, nchunks, i;
  u64 iova = msg->iova;
+long pinned;
  int ret = 0;
if (vhost_iotlb_itree_first(iotlb, msg->iova,
  msg->iova + msg->size - 1))
  return -EEXIST;
  +/* Limit the use of memory for bookkeeping */
  page_list = (struct page **) __get_free_page(GFP_KERNEL);
  if (!page_list)
  return -ENOMEM;
@@ -607,52 +611,75 @@ static int 
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

  gup_flags |= FOLL_WRITE;
npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> 
PAGE_SHIFT;

-if (!npages)
-return -EINVAL;
+if (!npages) {
+ret = -EINVAL;
+goto free;
+}
mmap_read_lock(dev->mm);
  -locked = atomic64_add_return(npages, &dev->mm->pinned_vm);
  lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-if (locked > lock_limit) {
+if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
  ret = -ENOMEM;
-goto out;
+goto unlock;
  }
cur_base = msg->uaddr & PAGE_MASK;
  iova &= PAGE_MASK;
+nchunks = 0;
while (npages) {
-pinned = min_t(unsigned long, npages, list_size);
-ret = pin_user_pages(cur_base, pinned,
- gup_flags, page_list, NULL);
-if (ret != pinned)
+sz2pin = min_t(unsigned long, npages, list_size);
+pinned = pin_user_pages(cur_base, sz2pin,
+gup_flags, page_list, NULL);
+if (sz2pin != pinned) {
+if (pinned < 0) {
+ret = pinned;
+} else {
+unpin_user_pages(page_list, pinned);
+ret = -ENOMEM;
+}
  goto out;
+}
+nchunks++;
if (!last_pfn)
  map_pfn = page_to_pfn(page_list[0]);
  -for (i = 0; i < ret; i++) {
+for (i = 0; i < pinned; i++) {
  unsigned long this_pfn = page_to_pfn(page_list[i]);
  u64 csize;
if (last_pfn && (this_pfn != last_pfn + 1)) {
  /* Pin a contiguous chunk of memory */
  csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
-if (vhost_vdpa_map(v, iova, csize,
-   map_pfn << PAGE_SHIFT,
-   msg->perm))
+ret = vhost_vdpa_map(v, iova, csize,
+ map_pfn << PAGE_SHIFT,
+ msg->perm);
+if (ret) {
+/*
+ * Unpin the pages that are left unmapped
+ * from this point on in the current
+ * page_list. The remaining outstanding
+ * ones which may stride across several
+ * chunks will be covered in the common
+ * error path subsequently.
+ */
+unpin_user_pages(&page_list[i],
+ pinned - i);



Can we simply do last_pfn = this_pfn here?
Nope. They are not contiguous segments of memory. Note the conditional 
(this_pfn != last_pfn + 1) being checked here.
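To make the non-contiguity point concrete, a toy user-space sketch
(illustration only, not driver code) of the same chunk-splitting test; the pfn
values are made up:

#include <stdio.h>

int main(void)
{
	/* Pretend pin_user_pages() returned pages with these pfns. */
	unsigned long pfns[] = { 100, 101, 102, 200, 201, 300 };
	size_t i, n = sizeof(pfns) / sizeof(pfns[0]);
	unsigned long map_pfn = pfns[0], last_pfn = 0;

	for (i = 0; i < n; i++) {
		unsigned long this_pfn = pfns[i];

		/* Same test as in the patch: a gap starts a new chunk. */
		if (last_pfn && this_pfn != last_pfn + 1) {
			printf("chunk: pfn %lu..%lu\n", map_pfn, last_pfn);
			map_pfn = this_pfn;
		}
		last_pfn = this_pfn;
	}
	printf("chunk: pfn %lu..%lu\n", map_pfn, last_pfn);
	return 0;
}

It prints three chunks (100..102, 200..201, 300..300); simply setting
last_pfn = this_pfn without first flushing the chunk would silently merge
pages that are not physically adjacent.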






  goto out;
+}
+
  map_pfn = this_pfn;
  iova 

[PATCH v3] vhost-vdpa: fix page pinning leakage in error path (rework)

2020-11-05 Thread Si-Wei Liu
Pinned pages are not properly accounted, particularly when a
mapping error occurs on IOTLB update. Clean up dangling
pinned pages in the error path.

The memory usage for bookkeeping pinned pages is reverted
to what it was before: only a single free page is needed.
This helps reduce the host memory demand for a VM with a
large amount of memory, or in situations where the host is
running short of free memory.

Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend")
Signed-off-by: Si-Wei Liu 
---
Changes in v3:
- Turn explicit last_pfn check to a WARN_ON() (Jason)

Changes in v2:
- Drop the reversion patch
- Fix unhandled page leak towards the end of page_list

 drivers/vhost/vdpa.c | 80 
 1 file changed, 62 insertions(+), 18 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index b6d9016..5b13dfd 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -560,6 +560,8 @@ static int vhost_vdpa_map(struct vhost_vdpa *v,
 
if (r)
vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1);
+   else
+   atomic64_add(size >> PAGE_SHIFT, &dev->mm->pinned_vm);
 
return r;
 }
@@ -591,14 +593,16 @@ static int vhost_vdpa_process_iotlb_update(struct 
vhost_vdpa *v,
unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
unsigned int gup_flags = FOLL_LONGTERM;
unsigned long npages, cur_base, map_pfn, last_pfn = 0;
-   unsigned long locked, lock_limit, pinned, i;
+   unsigned long lock_limit, sz2pin, nchunks, i;
u64 iova = msg->iova;
+   long pinned;
int ret = 0;
 
if (vhost_iotlb_itree_first(iotlb, msg->iova,
msg->iova + msg->size - 1))
return -EEXIST;
 
+   /* Limit the use of memory for bookkeeping */
page_list = (struct page **) __get_free_page(GFP_KERNEL);
if (!page_list)
return -ENOMEM;
@@ -607,52 +611,75 @@ static int vhost_vdpa_process_iotlb_update(struct 
vhost_vdpa *v,
gup_flags |= FOLL_WRITE;
 
npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> PAGE_SHIFT;
-   if (!npages)
-   return -EINVAL;
+   if (!npages) {
+   ret = -EINVAL;
+   goto free;
+   }
 
mmap_read_lock(dev->mm);
 
-   locked = atomic64_add_return(npages, &dev->mm->pinned_vm);
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-   if (locked > lock_limit) {
+   if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
ret = -ENOMEM;
-   goto out;
+   goto unlock;
}
 
cur_base = msg->uaddr & PAGE_MASK;
iova &= PAGE_MASK;
+   nchunks = 0;
 
while (npages) {
-   pinned = min_t(unsigned long, npages, list_size);
-   ret = pin_user_pages(cur_base, pinned,
-gup_flags, page_list, NULL);
-   if (ret != pinned)
+   sz2pin = min_t(unsigned long, npages, list_size);
+   pinned = pin_user_pages(cur_base, sz2pin,
+   gup_flags, page_list, NULL);
+   if (sz2pin != pinned) {
+   if (pinned < 0) {
+   ret = pinned;
+   } else {
+   unpin_user_pages(page_list, pinned);
+   ret = -ENOMEM;
+   }
goto out;
+   }
+   nchunks++;
 
if (!last_pfn)
map_pfn = page_to_pfn(page_list[0]);
 
-   for (i = 0; i < ret; i++) {
+   for (i = 0; i < pinned; i++) {
unsigned long this_pfn = page_to_pfn(page_list[i]);
u64 csize;
 
if (last_pfn && (this_pfn != last_pfn + 1)) {
/* Pin a contiguous chunk of memory */
csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
-   if (vhost_vdpa_map(v, iova, csize,
-  map_pfn << PAGE_SHIFT,
-  msg->perm))
+   ret = vhost_vdpa_map(v, iova, csize,
+map_pfn << PAGE_SHIFT,
+msg->perm);
+   if (ret) {
+   /*
+* Unpin the pages that are left 
unmapped
+* from this point on in the current
+ 

Re: [PATCH v3 2/2] vhost-vdpa: fix page pinning leakage in error path

2020-11-02 Thread si-wei liu



On 10/29/2020 2:53 PM, Michael S. Tsirkin wrote:

On Thu, Oct 15, 2020 at 01:17:14PM -0700, si-wei liu wrote:

On 10/15/2020 6:11 AM, Michael S. Tsirkin wrote:

On Thu, Oct 15, 2020 at 02:15:32PM +0800, Jason Wang wrote:

On 2020/10/14 上午7:42, si-wei liu wrote:

So what I suggest is to fix the pinning leakage first and do the
possible optimization on top (which is still questionable to me).

OK. Unfortunately, this was picked up and got merged upstream. So I will
post a follow-up patch set to 1) revert the commit to the original
__get_free_page() implementation, and 2) fix the accounting and leakage
on top. Will that be fine?

Fine.

Thanks

Fine by me too.


Thanks, Michael & Jason. I will post the fix shortly. Stay tuned.

-Siwei

did I miss the patch?


The patch had been posted last Friday. See this thread:

https://lore.kernel.org/virtualization/1604043944-4897-2-git-send-email-si-wei@oracle.com/

-Siwei