Re: [PATCH v2 08/17] vdpa_sim: add supported_features field in vdpasim_dev_attr

2020-11-29 Thread Jason Wang



On 2020/11/26 10:49 PM, Stefano Garzarella wrote:

Introduce a new VDPASIM_FEATURES macro with the generic features
supported by the vDPA simulator, and a VDPASIM_NET_FEATURES macro with
the vDPA-net features.

Add a 'supported_features' field to vdpasim_dev_attr, to allow devices
to specify their features.

Co-developed-by: Max Gurtovoy 
Signed-off-by: Max Gurtovoy 
Signed-off-by: Stefano Garzarella 
---
  drivers/vdpa/vdpa_sim/vdpa_sim.c | 29 ++---
  1 file changed, 18 insertions(+), 11 deletions(-)



Acked-by: Jason Wang 




diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index 393b54a9f0e4..36677fc3631b 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -49,12 +49,15 @@ struct vdpasim_virtqueue {
 #define VDPASIM_VQ_NUM 0x2
 #define VDPASIM_NAME "vdpasim-netdev"
 
-static u64 vdpasim_features = (1ULL << VIRTIO_F_ANY_LAYOUT) |
-			      (1ULL << VIRTIO_F_VERSION_1)  |
-			      (1ULL << VIRTIO_F_ACCESS_PLATFORM) |
-			      (1ULL << VIRTIO_NET_F_MAC);
+#define VDPASIM_FEATURES	((1ULL << VIRTIO_F_ANY_LAYOUT) | \
+				 (1ULL << VIRTIO_F_VERSION_1)  | \
+				 (1ULL << VIRTIO_F_ACCESS_PLATFORM))
+
+#define VDPASIM_NET_FEATURES	(VDPASIM_FEATURES | \
+				 (1ULL << VIRTIO_NET_F_MAC))
 
 struct vdpasim_dev_attr {
+	u64 supported_features;
 	int nvqs;
 	u32 id;
 };
@@ -112,7 +115,7 @@ static void vdpasim_queue_ready(struct vdpasim *vdpasim, unsigned int idx)
 {
 	struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
 
-	vringh_init_iotlb(&vq->vring, vdpasim_features,
+	vringh_init_iotlb(&vq->vring, vdpasim->dev_attr.supported_features,
 			  VDPASIM_QUEUE_MAX, false,
 			  (struct vring_desc *)(uintptr_t)vq->desc_addr,
 			  (struct vring_avail *)
@@ -121,7 +124,8 @@ static void vdpasim_queue_ready(struct vdpasim *vdpasim, unsigned int idx)
 			  (uintptr_t)vq->device_addr);
 }
 
-static void vdpasim_vq_reset(struct vdpasim_virtqueue *vq)
+static void vdpasim_vq_reset(struct vdpasim *vdpasim,
+			     struct vdpasim_virtqueue *vq)
 {
 	vq->ready = false;
 	vq->desc_addr = 0;
@@ -129,8 +133,8 @@ static void vdpasim_vq_reset(struct vdpasim_virtqueue *vq)
 	vq->device_addr = 0;
 	vq->cb = NULL;
 	vq->private = NULL;
-	vringh_init_iotlb(&vq->vring, vdpasim_features, VDPASIM_QUEUE_MAX,
-			  false, NULL, NULL, NULL);
+	vringh_init_iotlb(&vq->vring, vdpasim->dev_attr.supported_features,
+			  VDPASIM_QUEUE_MAX, false, NULL, NULL, NULL);
 }
 
 static void vdpasim_reset(struct vdpasim *vdpasim)
@@ -138,7 +142,7 @@ static void vdpasim_reset(struct vdpasim *vdpasim)
 	int i;
 
 	for (i = 0; i < vdpasim->dev_attr.nvqs; i++)
-		vdpasim_vq_reset(&vdpasim->vqs[i]);
+		vdpasim_vq_reset(vdpasim, &vdpasim->vqs[i]);
 
 	spin_lock(&vdpasim->iommu_lock);
 	vhost_iotlb_reset(vdpasim->iommu);
@@ -498,7 +502,9 @@ static u32 vdpasim_get_vq_align(struct vdpa_device *vdpa)
 
 static u64 vdpasim_get_features(struct vdpa_device *vdpa)
 {
-	return vdpasim_features;
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+	return vdpasim->dev_attr.supported_features;
 }
 
 static int vdpasim_set_features(struct vdpa_device *vdpa, u64 features)
@@ -510,7 +516,7 @@ static int vdpasim_set_features(struct vdpa_device *vdpa, u64 features)
 	if (!(features & (1ULL << VIRTIO_F_ACCESS_PLATFORM)))
 		return -EINVAL;
 
-	vdpasim->features = features & vdpasim_features;
+	vdpasim->features = features & vdpasim->dev_attr.supported_features;
 
 	/* We generally only know whether guest is using the legacy interface
 	 * here, so generally that's the earliest we can set config fields.
@@ -722,6 +728,7 @@ static int __init vdpasim_dev_init(void)
 	struct vdpasim_dev_attr dev_attr = {};
 
 	dev_attr.id = VIRTIO_ID_NET;
+	dev_attr.supported_features = VDPASIM_NET_FEATURES;
 	dev_attr.nvqs = VDPASIM_VQ_NUM;
 
 	vdpasim_dev = vdpasim_create(&dev_attr);

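For illustration, the attribute pattern introduced here generalizes to other
device types roughly as follows. This is a hypothetical sketch, not part of
the series: the block-device macro, id and queue count are invented for the
example.

/* Hypothetical second device type built on top of VDPASIM_FEATURES. */
#define VDPASIM_BLK_FEATURES	(VDPASIM_FEATURES | \
				 (1ULL << VIRTIO_BLK_F_SIZE_MAX))

static int __init vdpasim_blk_dev_init(void)
{
	struct vdpasim_dev_attr dev_attr = {};

	dev_attr.id = VIRTIO_ID_BLOCK;
	dev_attr.supported_features = VDPASIM_BLK_FEATURES;
	dev_attr.nvqs = 1;

	return PTR_ERR_OR_ZERO(vdpasim_create(&dev_attr));
}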



Re: [PATCH v2 09/17] vdpa_sim: add work_fn in vdpasim_dev_attr

2020-11-29 Thread Jason Wang



On 2020/11/26 10:49 PM, Stefano Garzarella wrote:

Rename vdpasim_work() to vdpasim_net_work() and add it to
the vdpasim_dev_attr structure.

Co-developed-by: Max Gurtovoy 
Signed-off-by: Max Gurtovoy 
Signed-off-by: Stefano Garzarella 
---
  drivers/vdpa/vdpa_sim/vdpa_sim.c | 7 +--
  1 file changed, 5 insertions(+), 2 deletions(-)



Acked-by: Jason Wang 




diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index 36677fc3631b..b84d9acd130c 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -60,6 +60,8 @@ struct vdpasim_dev_attr {
 	u64 supported_features;
 	int nvqs;
 	u32 id;
+
+	work_func_t work_fn;
 };
 
 /* State of each vdpasim device */
@@ -153,7 +155,7 @@ static void vdpasim_reset(struct vdpasim *vdpasim)
 	++vdpasim->generation;
 }
 
-static void vdpasim_work(struct work_struct *work)
+static void vdpasim_net_work(struct work_struct *work)
 {
 	struct vdpasim *vdpasim = container_of(work, struct
 					       vdpasim, work);
@@ -360,7 +362,7 @@ static struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr)
 		goto err_alloc;
 
 	vdpasim->dev_attr = *dev_attr;
-	INIT_WORK(&vdpasim->work, vdpasim_work);
+	INIT_WORK(&vdpasim->work, dev_attr->work_fn);
 	spin_lock_init(&vdpasim->lock);
 	spin_lock_init(&vdpasim->iommu_lock);
 
@@ -730,6 +732,7 @@ static int __init vdpasim_dev_init(void)
 	dev_attr.id = VIRTIO_ID_NET;
 	dev_attr.supported_features = VDPASIM_NET_FEATURES;
 	dev_attr.nvqs = VDPASIM_VQ_NUM;
+	dev_attr.work_fn = vdpasim_net_work;
 
 	vdpasim_dev = vdpasim_create(&dev_attr);
  

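To make the dispatch concrete: a future device type supplies a handler with
the standard work_func_t signature and recovers its device state via
container_of(), exactly as the renamed net handler does. A minimal
hypothetical sketch:

static void vdpasim_blk_work(struct work_struct *work)
{
	struct vdpasim *vdpasim = container_of(work, struct vdpasim, work);

	/* device-specific datapath processing would go here */
}

Registration is then just "dev_attr.work_fn = vdpasim_blk_work;" before
calling vdpasim_create().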



Re: [PATCH v2 07/17] vdpa_sim: add device id field in vdpasim_dev_attr

2020-11-29 Thread Jason Wang



On 2020/11/26 10:49 PM, Stefano Garzarella wrote:

Remove the VDPASIM_DEVICE_ID macro and add an 'id' field to
vdpasim_dev_attr, which will be returned by vdpasim_get_device_id().

Use VIRTIO_ID_NET as the vDPA-net simulator device id.

Co-developed-by: Max Gurtovoy 
Signed-off-by: Max Gurtovoy 
Signed-off-by: Stefano Garzarella 
---
  drivers/vdpa/vdpa_sim/vdpa_sim.c | 7 +--
  1 file changed, 5 insertions(+), 2 deletions(-)



Acked-by: Jason Wang 




diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index f98262add0e1..393b54a9f0e4 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -44,7 +44,6 @@ struct vdpasim_virtqueue {
 
 #define VDPASIM_QUEUE_ALIGN PAGE_SIZE
 #define VDPASIM_QUEUE_MAX 256
-#define VDPASIM_DEVICE_ID 0x1
 #define VDPASIM_VENDOR_ID 0
 #define VDPASIM_IOTLB_LIMIT 0 /* unlimited */
 #define VDPASIM_VQ_NUM 0x2
@@ -57,6 +56,7 @@ static u64 vdpasim_features = (1ULL << VIRTIO_F_ANY_LAYOUT) |
 
 struct vdpasim_dev_attr {
 	int nvqs;
+	u32 id;
 };
 
 /* State of each vdpasim device */
@@ -536,7 +536,9 @@ static u16 vdpasim_get_vq_num_max(struct vdpa_device *vdpa)
 
 static u32 vdpasim_get_device_id(struct vdpa_device *vdpa)
 {
-	return VDPASIM_DEVICE_ID;
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+	return vdpasim->dev_attr.id;
 }
 
 static u32 vdpasim_get_vendor_id(struct vdpa_device *vdpa)
@@ -719,6 +721,7 @@ static int __init vdpasim_dev_init(void)
 {
 	struct vdpasim_dev_attr dev_attr = {};
 
+	dev_attr.id = VIRTIO_ID_NET;
 	dev_attr.nvqs = VDPASIM_VQ_NUM;
 
 	vdpasim_dev = vdpasim_create(&dev_attr);

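For context, the id stored here is what a vDPA bus driver eventually hands
back to its upper layers; roughly (a sketch of the vdpa_config_ops contract,
not code from this patch):

static u32 example_query_device_id(struct vdpa_device *vdpa)
{
	const struct vdpa_config_ops *ops = vdpa->config;

	return ops->get_device_id(vdpa);	/* VIRTIO_ID_NET for vdpasim-net */
}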



Re: [PATCH v2 06/17] vdpa_sim: add struct vdpasim_dev_attr for device attributes

2020-11-29 Thread Jason Wang



On 2020/11/26 10:49 PM, Stefano Garzarella wrote:

vdpasim_dev_attr will contain device-specific attributes. We start by
moving the number of virtqueues (i.e. nvqs) into vdpasim_dev_attr.

vdpasim_create() now creates a new vDPA simulator following the device
attributes defined in the vdpasim_dev_attr parameter.

Co-developed-by: Max Gurtovoy 
Signed-off-by: Max Gurtovoy 
Signed-off-by: Stefano Garzarella 
---
  drivers/vdpa/vdpa_sim/vdpa_sim.c | 25 +
  1 file changed, 17 insertions(+), 8 deletions(-)



Acked-by: Jason Wang 




diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index 62204e064841..f98262add0e1 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -55,11 +55,16 @@ static u64 vdpasim_features = (1ULL << VIRTIO_F_ANY_LAYOUT) |
 			      (1ULL << VIRTIO_F_ACCESS_PLATFORM) |
 			      (1ULL << VIRTIO_NET_F_MAC);
 
+struct vdpasim_dev_attr {
+	int nvqs;
+};
+
 /* State of each vdpasim device */
 struct vdpasim {
 	struct vdpa_device vdpa;
 	struct vdpasim_virtqueue *vqs;
 	struct work_struct work;
+	struct vdpasim_dev_attr dev_attr;
 	/* spinlock to synchronize virtqueue state */
 	spinlock_t lock;
 	struct virtio_net_config config;
@@ -68,7 +73,6 @@ struct vdpasim {
 	u32 status;
 	u32 generation;
 	u64 features;
-	int nvqs;
 	/* spinlock to synchronize iommu table */
 	spinlock_t iommu_lock;
 };
@@ -133,7 +137,7 @@ static void vdpasim_reset(struct vdpasim *vdpasim)
 {
 	int i;
 
-	for (i = 0; i < vdpasim->nvqs; i++)
+	for (i = 0; i < vdpasim->dev_attr.nvqs; i++)
 		vdpasim_vq_reset(&vdpasim->vqs[i]);
 
 	spin_lock(&vdpasim->iommu_lock);
@@ -334,7 +338,7 @@ static const struct dma_map_ops vdpasim_dma_ops = {
 static const struct vdpa_config_ops vdpasim_config_ops;
 static const struct vdpa_config_ops vdpasim_batch_config_ops;
 
-static struct vdpasim *vdpasim_create(void)
+static struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr)
 {
 	const struct vdpa_config_ops *ops;
 	struct vdpasim *vdpasim;
@@ -346,11 +350,12 @@ static struct vdpasim *vdpasim_create(void)
 	else
 		ops = &vdpasim_config_ops;
 
-	vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops, VDPASIM_VQ_NUM);
+	vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops,
+				    dev_attr->nvqs);
 	if (!vdpasim)
 		goto err_alloc;
 
-	vdpasim->nvqs = VDPASIM_VQ_NUM;
+	vdpasim->dev_attr = *dev_attr;
 	INIT_WORK(&vdpasim->work, vdpasim_work);
 	spin_lock_init(&vdpasim->lock);
 	spin_lock_init(&vdpasim->iommu_lock);
@@ -361,7 +366,7 @@ static struct vdpasim *vdpasim_create(void)
 		goto err_iommu;
 	set_dma_ops(dev, &vdpasim_dma_ops);
 
-	vdpasim->vqs = kcalloc(vdpasim->nvqs, sizeof(struct vdpasim_virtqueue),
+	vdpasim->vqs = kcalloc(dev_attr->nvqs, sizeof(struct vdpasim_virtqueue),
 			       GFP_KERNEL);
 	if (!vdpasim->vqs)
 		goto err_iommu;
@@ -384,7 +389,7 @@ static struct vdpasim *vdpasim_create(void)
 		eth_random_addr(vdpasim->config.mac);
 	}
 
-	for (i = 0; i < vdpasim->nvqs; i++)
+	for (i = 0; i < dev_attr->nvqs; i++)
 		vringh_set_iotlb(&vdpasim->vqs[i].vring, vdpasim->iommu);
 
 	vdpasim->vdpa.dma_dev = dev;
@@ -712,7 +717,11 @@ static const struct vdpa_config_ops vdpasim_batch_config_ops = {
 
 static int __init vdpasim_dev_init(void)
 {
-	vdpasim_dev = vdpasim_create();
+	struct vdpasim_dev_attr dev_attr = {};
+
+	dev_attr.nvqs = VDPASIM_VQ_NUM;
+
+	vdpasim_dev = vdpasim_create(&dev_attr);
 
 	if (!IS_ERR(vdpasim_dev))
 		return 0;

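Taken together with the later patches in this series, the attribute
structure ends up carrying everything that differs between device types
(a summary assembled from patches 06-09, not a single hunk):

struct vdpasim_dev_attr {
	u64 supported_features;	/* patch 08 */
	int nvqs;		/* this patch */
	u32 id;			/* patch 07 */
	work_func_t work_fn;	/* patch 09 */
};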



Re: [PATCH v2 05/17] vdpa_sim: rename vdpasim_config_ops variables

2020-11-29 Thread Jason Wang



On 2020/11/26 10:49 PM, Stefano Garzarella wrote:

These variables store generic callbacks used by the vDPA simulator
core, so we can remove the 'net' word from their names.

Co-developed-by: Max Gurtovoy 
Signed-off-by: Max Gurtovoy 
Signed-off-by: Stefano Garzarella 
---
  drivers/vdpa/vdpa_sim/vdpa_sim.c | 12 ++--
  1 file changed, 6 insertions(+), 6 deletions(-)



Acked-by: Jason Wang 




diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index 40664d87f303..62204e064841 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -331,8 +331,8 @@ static const struct dma_map_ops vdpasim_dma_ops = {
 	.free = vdpasim_free_coherent,
 };
 
-static const struct vdpa_config_ops vdpasim_net_config_ops;
-static const struct vdpa_config_ops vdpasim_net_batch_config_ops;
+static const struct vdpa_config_ops vdpasim_config_ops;
+static const struct vdpa_config_ops vdpasim_batch_config_ops;
 
 static struct vdpasim *vdpasim_create(void)
 {
@@ -342,9 +342,9 @@ static struct vdpasim *vdpasim_create(void)
 	int i, ret = -ENOMEM;
 
 	if (batch_mapping)
-		ops = &vdpasim_net_batch_config_ops;
+		ops = &vdpasim_batch_config_ops;
 	else
-		ops = &vdpasim_net_config_ops;
+		ops = &vdpasim_config_ops;
 
 	vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops, VDPASIM_VQ_NUM);
 	if (!vdpasim)
@@ -657,7 +657,7 @@ static void vdpasim_free(struct vdpa_device *vdpa)
 	kfree(vdpasim->vqs);
 }
 
-static const struct vdpa_config_ops vdpasim_net_config_ops = {
+static const struct vdpa_config_ops vdpasim_config_ops = {
 	.set_vq_address         = vdpasim_set_vq_address,
 	.set_vq_num             = vdpasim_set_vq_num,
 	.kick_vq                = vdpasim_kick_vq,
@@ -684,7 +684,7 @@ static const struct vdpa_config_ops vdpasim_net_config_ops = {
 	.free                   = vdpasim_free,
 };
 
-static const struct vdpa_config_ops vdpasim_net_batch_config_ops = {
+static const struct vdpa_config_ops vdpasim_batch_config_ops = {
 	.set_vq_address         = vdpasim_set_vq_address,
 	.set_vq_num             = vdpasim_set_vq_num,
 	.kick_vq                = vdpasim_kick_vq,




Re: [PATCH v2 04/17] vdpa_sim: remove the limit of IOTLB entries

2020-11-29 Thread Jason Wang



On 2020/11/26 10:49 PM, Stefano Garzarella wrote:

The simulated devices can support multiple queues, so this limit
should be defined according to the number of queues supported by
the device.

Since we are in a simulator, let's simply remove that limit.

Suggested-by: Jason Wang 
Acked-by: Jason Wang 
Signed-off-by: Stefano Garzarella 
---
v2:
- added VDPASIM_IOTLB_LIMIT macro [Jason]



Sorry for being unclear. I meant adding a macro like
VHOST_IOTLB_UNLIMITED 0 in vhost_iotlb.h, and using that in vdpa_sim.

Thanks

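In code, the suggestion amounts to something like this (a sketch of the
proposed macro, not an applied patch):

/* include/linux/vhost_iotlb.h */
#define VHOST_IOTLB_UNLIMITED	0

/* drivers/vdpa/vdpa_sim/vdpa_sim.c */
vdpasim->iommu = vhost_iotlb_alloc(VHOST_IOTLB_UNLIMITED, 0);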


---
  drivers/vdpa/vdpa_sim/vdpa_sim.c | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index ad72f7b1a4eb..40664d87f303 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -46,6 +46,7 @@ struct vdpasim_virtqueue {
  #define VDPASIM_QUEUE_MAX 256
  #define VDPASIM_DEVICE_ID 0x1
  #define VDPASIM_VENDOR_ID 0
+#define VDPASIM_IOTLB_LIMIT 0 /* unlimited */
  #define VDPASIM_VQ_NUM 0x2
  #define VDPASIM_NAME "vdpasim-netdev"
  
@@ -365,7 +366,7 @@ static struct vdpasim *vdpasim_create(void)
 	if (!vdpasim->vqs)
 		goto err_iommu;
 
-	vdpasim->iommu = vhost_iotlb_alloc(2048, 0);
+	vdpasim->iommu = vhost_iotlb_alloc(VDPASIM_IOTLB_LIMIT, 0);
 	if (!vdpasim->iommu)
 		goto err_iommu;
  




Re: [PATCH v2 03/17] vdpa_sim: remove hard-coded virtq count

2020-11-29 Thread Jason Wang



On 2020/11/26 10:49 PM, Stefano Garzarella wrote:

From: Max Gurtovoy 

Add a new attribute that will define the number of virtqueues to be
created for the vdpasim device.

Signed-off-by: Max Gurtovoy 
[sgarzare: replace kmalloc_array() with kcalloc()]
Signed-off-by: Stefano Garzarella 



Acked-by: Jason Wang 



---
v1:
- use kcalloc() instead of kmalloc_array() since some functions expect
  variables initialized to zero (illustrated after the diff below)
---
  drivers/vdpa/vdpa_sim/vdpa_sim.c | 18 +-
  1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index c6eaf62df8ec..ad72f7b1a4eb 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -57,7 +57,7 @@ static u64 vdpasim_features = (1ULL << VIRTIO_F_ANY_LAYOUT) |
 /* State of each vdpasim device */
 struct vdpasim {
 	struct vdpa_device vdpa;
-	struct vdpasim_virtqueue vqs[VDPASIM_VQ_NUM];
+	struct vdpasim_virtqueue *vqs;
 	struct work_struct work;
 	/* spinlock to synchronize virtqueue state */
 	spinlock_t lock;
@@ -67,6 +67,7 @@ struct vdpasim {
 	u32 status;
 	u32 generation;
 	u64 features;
+	int nvqs;
 	/* spinlock to synchronize iommu table */
 	spinlock_t iommu_lock;
 };
@@ -131,7 +132,7 @@ static void vdpasim_reset(struct vdpasim *vdpasim)
 {
 	int i;
 
-	for (i = 0; i < VDPASIM_VQ_NUM; i++)
+	for (i = 0; i < vdpasim->nvqs; i++)
 		vdpasim_vq_reset(&vdpasim->vqs[i]);
 
 	spin_lock(&vdpasim->iommu_lock);
@@ -337,7 +338,7 @@ static struct vdpasim *vdpasim_create(void)
 	const struct vdpa_config_ops *ops;
 	struct vdpasim *vdpasim;
 	struct device *dev;
-	int ret = -ENOMEM;
+	int i, ret = -ENOMEM;
 
 	if (batch_mapping)
 		ops = &vdpasim_net_batch_config_ops;
@@ -348,6 +349,7 @@ static struct vdpasim *vdpasim_create(void)
 	if (!vdpasim)
 		goto err_alloc;
 
+	vdpasim->nvqs = VDPASIM_VQ_NUM;
 	INIT_WORK(&vdpasim->work, vdpasim_work);
 	spin_lock_init(&vdpasim->lock);
 	spin_lock_init(&vdpasim->iommu_lock);
@@ -358,6 +360,11 @@ static struct vdpasim *vdpasim_create(void)
 		goto err_iommu;
 	set_dma_ops(dev, &vdpasim_dma_ops);
 
+	vdpasim->vqs = kcalloc(vdpasim->nvqs, sizeof(struct vdpasim_virtqueue),
+			       GFP_KERNEL);
+	if (!vdpasim->vqs)
+		goto err_iommu;
+
 	vdpasim->iommu = vhost_iotlb_alloc(2048, 0);
 	if (!vdpasim->iommu)
 		goto err_iommu;
@@ -376,8 +383,8 @@ static struct vdpasim *vdpasim_create(void)
 		eth_random_addr(vdpasim->config.mac);
 	}
 
-	vringh_set_iotlb(&vdpasim->vqs[0].vring, vdpasim->iommu);
-	vringh_set_iotlb(&vdpasim->vqs[1].vring, vdpasim->iommu);
+	for (i = 0; i < vdpasim->nvqs; i++)
+		vringh_set_iotlb(&vdpasim->vqs[i].vring, vdpasim->iommu);
 
 	vdpasim->vdpa.dma_dev = dev;
 	ret = vdpa_register_device(&vdpasim->vdpa);
@@ -646,6 +653,7 @@ static void vdpasim_free(struct vdpa_device *vdpa)
 	kfree(vdpasim->buffer);
 	if (vdpasim->iommu)
 		vhost_iotlb_free(vdpasim->iommu);
+	kfree(vdpasim->vqs);
 }
 
 static const struct vdpa_config_ops vdpasim_net_config_ops = {


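On the kcalloc() note in the changelog: the reset and vring-setup paths read
fields such as vq->ready before anything writes them, so the array must
start out zeroed. Schematically:

/* kmalloc_array() returns uninitialized memory: */
vqs = kmalloc_array(nvqs, sizeof(*vqs), GFP_KERNEL);	/* contents undefined */

/* kcalloc() is kmalloc_array() plus __GFP_ZERO: */
vqs = kcalloc(nvqs, sizeof(*vqs), GFP_KERNEL);		/* zero-filled */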


Re: [PATCH v2 02/17] vdpa_sim: remove unnecessary headers inclusion

2020-11-29 Thread Jason Wang



On 2020/11/26 10:49 PM, Stefano Garzarella wrote:

Some headers are not necessary, so let's remove them to do
some cleaning.

Signed-off-by: Stefano Garzarella 
---
  drivers/vdpa/vdpa_sim/vdpa_sim.c | 13 -
  1 file changed, 13 deletions(-)

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index 6a90fdb9cbfc..c6eaf62df8ec 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -7,24 +7,11 @@
  *
  */
 
-#include <linux/init.h>
 #include <linux/module.h>
-#include <linux/device.h>



I think the rule is to make sure that e.g. a structure definition can be
used via direct inclusion. E.g. struct device {} is defined in this file.




-#include <linux/kernel.h>
-#include <linux/fs.h>
-#include <linux/poll.h>
-#include <linux/slab.h>
-#include <linux/sched.h>
-#include <linux/wait.h>
-#include <linux/uuid.h>
-#include <linux/iommu.h>
 #include <linux/dma-map-ops.h>
-#include <linux/sysfs.h>
-#include <linux/file.h>
 #include <linux/etherdevice.h>
 #include <linux/vringh.h>
 #include <linux/vdpa.h>
-#include <linux/virtio_byteorder.h>



And __cpu_to_virtio16() is defined in this file.

Thanks



 #include <linux/vhost_iotlb.h>
 #include <uapi/linux/virtio_config.h>
 #include <uapi/linux/virtio_net.h>




Re: [PATCH v2 01/17] vdpa: remove unnecessary 'default n' in Kconfig entries

2020-11-29 Thread Jason Wang



On 2020/11/26 10:49 PM, Stefano Garzarella wrote:

'default n' is not necessary since it is already the default when
nothing is specified.

Suggested-by: Jason Wang 
Signed-off-by: Stefano Garzarella 



Acked-by: Jason Wang 



---
  drivers/vdpa/Kconfig | 3 ---
  1 file changed, 3 deletions(-)

diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig
index 358f6048dd3c..4019ceb88181 100644
--- a/drivers/vdpa/Kconfig
+++ b/drivers/vdpa/Kconfig
@@ -14,7 +14,6 @@ config VDPA_SIM
select DMA_OPS
select VHOST_RING
select GENERIC_NET_UTILS
-   default n
help
  vDPA networking device simulator which loop TX traffic back
  to RX. This device is used for testing, prototyping and
@@ -23,7 +22,6 @@ config VDPA_SIM
  config IFCVF
tristate "Intel IFC VF vDPA driver"
depends on PCI_MSI
-   default n
help
  This kernel module can drive Intel IFC VF NIC to offload
  virtio dataplane traffic to hardware.
@@ -41,7 +39,6 @@ config MLX5_VDPA_NET
tristate "vDPA driver for ConnectX devices"
select MLX5_VDPA
depends on MLX5_CORE
-   default n
help
  VDPA network driver for ConnectX6 and newer. Provides offloading
  of virtio net datapath such that descriptors put on the ring will




Re: [PATCH] vdpa: ifcvf: Use dma_set_mask_and_coherent to simplify code

2020-11-29 Thread Jason Wang



On 2020/11/29 8:54 PM, Christophe JAILLET wrote:

'pci_set_dma_mask()' + 'pci_set_consistent_dma_mask()' can be replaced by
an equivalent 'dma_set_mask_and_coherent()' which is much less verbose.

While at it, fix a typo (s/confiugration/configuration)

Signed-off-by: Christophe JAILLET 
---



Acked-by: Jason Wang 



  drivers/vdpa/ifcvf/ifcvf_main.c | 11 ++-
  1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c
index 8b4028556cb6..fa1af301cf55 100644
--- a/drivers/vdpa/ifcvf/ifcvf_main.c
+++ b/drivers/vdpa/ifcvf/ifcvf_main.c
@@ -417,16 +417,9 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct 
pci_device_id *id)
return ret;
}
  
-	ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+	ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
 	if (ret) {
-		IFCVF_ERR(pdev, "No usable DMA confiugration\n");
-		return ret;
-	}
-
-	ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
-	if (ret) {
-		IFCVF_ERR(pdev,
-			  "No usable coherent DMA confiugration\n");
+		IFCVF_ERR(pdev, "No usable DMA configuration\n");
 		return ret;
 	}
  

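The equivalence being relied on comes from the generic DMA API; roughly
(paraphrasing include/linux/dma-mapping.h at the time of this posting):

static inline int dma_set_mask_and_coherent(struct device *dev, u64 mask)
{
	int rc = dma_set_mask(dev, mask);

	if (rc == 0)
		dma_set_coherent_mask(dev, mask);
	return rc;
}

pci_set_dma_mask() and pci_set_consistent_dma_mask() are thin wrappers
around dma_set_mask()/dma_set_coherent_mask() on &pdev->dev, so the two
call sequences end up doing the same thing.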



Re: [PATCH v4] vdpa: mlx5: fix vdpa/vhost dependencies

2020-11-29 Thread Jason Wang



On 2020/11/29 5:39 AM, Randy Dunlap wrote:

drivers/vdpa/mlx5/ uses the vhost_iotlb*() interfaces, so select
VHOST_IOTLB so that they get built.

However, if VHOST_IOTLB is the only VHOST symbol that is
set/enabled, the object file still won't be built because
drivers/Makefile won't descend into drivers/vhost/ to build it,
so make drivers/Makefile build the needed binary whenever
VHOST_IOTLB is set, like it does for VHOST_RING.

Fixes these build errors:
ERROR: modpost: "vhost_iotlb_itree_next" [drivers/vdpa/mlx5/mlx5_vdpa.ko] 
undefined!
ERROR: modpost: "vhost_iotlb_itree_first" [drivers/vdpa/mlx5/mlx5_vdpa.ko] 
undefined!

Fixes: 29064bfdabd5 ("vdpa/mlx5: Add support library for mlx5 VDPA 
implementation")
Fixes: aff90770e54c ("vdpa/mlx5: Fix dependency on MLX5_CORE")
Reported-by: kernel test robot 
Signed-off-by: Randy Dunlap 
Cc: Eli Cohen 
Cc: Parav Pandit 
Cc: "Michael S. Tsirkin" 
Cc: Jason Wang 
Cc: virtualizat...@lists.linux-foundation.org
Cc: Saeed Mahameed 
Cc: Leon Romanovsky 
Cc: net...@vger.kernel.org
---
v2: change from select to depends on VHOST (Saeed)
v3: change to depends on VHOST_IOTLB (Jason)
v4: use select VHOST_IOTLB (Michael); also add to drivers/Makefile

  drivers/Makefile |1 +
  drivers/vdpa/Kconfig |1 +
  2 files changed, 2 insertions(+)

--- linux-next-20201127.orig/drivers/vdpa/Kconfig
+++ linux-next-20201127/drivers/vdpa/Kconfig
@@ -32,6 +32,7 @@ config IFCVF
  
 config MLX5_VDPA
 	bool
+	select VHOST_IOTLB
 	help
  Support library for Mellanox VDPA drivers. Provides code that is
  common for all types of VDPA drivers. The following drivers are 
planned:
--- linux-next-20201127.orig/drivers/Makefile
+++ linux-next-20201127/drivers/Makefile
@@ -143,6 +143,7 @@ obj-$(CONFIG_OF)		+= of/
 obj-$(CONFIG_SSB)		+= ssb/
 obj-$(CONFIG_BCMA)		+= bcma/
 obj-$(CONFIG_VHOST_RING)	+= vhost/
+obj-$(CONFIG_VHOST_IOTLB)	+= vhost/
 obj-$(CONFIG_VHOST)		+= vhost/
 obj-$(CONFIG_VLYNQ)		+= vlynq/
 obj-$(CONFIG_GREYBUS)		+= greybus/



Acked-by: Jason Wang 

Thanks



Re: [PATCH V2 02/14] virtio-pci: switch to use devres for modern devices

2020-11-26 Thread Jason Wang



On 2020/11/26 9:57 PM, Michael S. Tsirkin wrote:

On Thu, Nov 26, 2020 at 05:25:52PM +0800, Jason Wang wrote:

This patch tries to convert the modern device to use devres to manage
its resources (iomaps). Before this patch the IO address is mapped
individually according to the capability. After this patch, we simply
map the whole BAR.

I think the point of mapping capability was e.g. for devices with
huge BARs. We don't want to waste virtual memory for e.g. 32 bit guests.

And in particular the spec says:

The drivers SHOULD only map part of configuration structure large 
enough for device operation. The drivers
MUST handle an unexpectedly large length, but MAY check that length is 
large enough for device operation.



Good point, so I will stick to devres but avoid the shortcut of
mapping the whole BAR.





I also wonder how would this interact with cases where device memory is
mapped for different reasons, such as for MSI table access, into userspace
as it has resources such as virtio mem, etc.



I think it depends on the driver. E.g. for virtio-pci and vDPA, the upper
layer driver (virtio bus or vDPA bus) knows nothing about transport
specific things. It should be OK.




E.g. don't e.g. intel CPUs disallow mapping the same address twice
with different attributes?



Do you mean it doesn't allow one VA to be mapped as UC while the other is
not? I don't know. But anyhow, my understanding is that
virtio-pci/vp_vdpa tries to hide the details, so we cannot have two
mappings here.


Thanks


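One way to reconcile devres with per-capability mapping, for the sake of
discussion (a sketch only; the reworked v3 may do this differently):

/* Map just the region a vendor capability describes, devres-managed,
 * instead of the whole BAR.
 */
void __iomem *map = devm_ioremap(&pci_dev->dev,
				 pci_resource_start(pci_dev, bar) + offset,
				 length);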






Re: [PATCH V2 01/14] virtio-pci: do not access iomem via virtio_pci_device directly

2020-11-26 Thread Jason Wang



On 2020/11/26 9:46 PM, Michael S. Tsirkin wrote:

On Thu, Nov 26, 2020 at 05:25:51PM +0800, Jason Wang wrote:

Instead of accessing iomem via virito_pci_device directly. Add an
indirect level

Well, this patch does not add any indirection, it's just refactoring,
which is OK of course; let's just say it as is.


to ease the life of splitting out modern virito-pci

typo



Will fix.

Thanks








[PATCH V2 06/14] virtio-pci-modern: introduce vp_modern_queue_address()

2020-11-26 Thread Jason Wang
This patch introduces a helper to set the virtqueue address for modern devices.

Signed-off-by: Jason Wang 
---
 drivers/virtio/virtio_pci_modern.c | 33 --
 1 file changed, 27 insertions(+), 6 deletions(-)

diff --git a/drivers/virtio/virtio_pci_modern.c 
b/drivers/virtio/virtio_pci_modern.c
index bacc05cbc762..3125987973d3 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -174,6 +174,30 @@ static u16 vp_modern_queue_vector(struct virtio_pci_modern_device *mdev,
 	return vp_ioread16(&cfg->queue_msix_vector);
 }
 
+/*
+ * vp_modern_queue_address - set the virtqueue address
+ * @mdev: the modern virtio-pci device
+ * @index: the queue index
+ * @desc_addr: address of the descriptor area
+ * @driver_addr: address of the driver area
+ * @device_addr: address of the device area
+ */
+static void vp_modern_queue_address(struct virtio_pci_modern_device *mdev,
+				    u16 index, u64 desc_addr, u64 driver_addr,
+				    u64 device_addr)
+{
+	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
+
+	vp_iowrite16(index, &cfg->queue_select);
+
+	vp_iowrite64_twopart(desc_addr, &cfg->queue_desc_lo,
+			     &cfg->queue_desc_hi);
+	vp_iowrite64_twopart(driver_addr, &cfg->queue_avail_lo,
+			     &cfg->queue_avail_hi);
+	vp_iowrite64_twopart(device_addr, &cfg->queue_used_lo,
+			     &cfg->queue_used_hi);
+}
+
 /* virtio config->finalize_features() implementation */
 static int vp_finalize_features(struct virtio_device *vdev)
 {
@@ -396,12 +420,9 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
 
 	/* activate the queue */
 	vp_iowrite16(virtqueue_get_vring_size(vq), &cfg->queue_size);
-	vp_iowrite64_twopart(virtqueue_get_desc_addr(vq),
-			     &cfg->queue_desc_lo, &cfg->queue_desc_hi);
-	vp_iowrite64_twopart(virtqueue_get_avail_addr(vq),
-			     &cfg->queue_avail_lo, &cfg->queue_avail_hi);
-	vp_iowrite64_twopart(virtqueue_get_used_addr(vq),
-			     &cfg->queue_used_lo, &cfg->queue_used_hi);
+	vp_modern_queue_address(mdev, index, virtqueue_get_desc_addr(vq),
+				virtqueue_get_avail_addr(vq),
+				virtqueue_get_used_addr(vq));
 
 	vq->priv = (void __force *)mdev->notify_base +
 			off * mdev->notify_offset_multiplier;
-- 
2.25.1



[PATCH V2 14/14] vdpa: introduce virtio pci driver

2020-11-26 Thread Jason Wang
This patch introduces a vDPA driver for the modern virtio-pci device. It
bridges virtio-pci control commands to the vDPA bus. This will be used for
feature prototyping and testing.

Note that getting/restoring the virtqueue state is not supported, since it
needs an extension to the virtio specification.

Signed-off-by: Jason Wang 
---
 drivers/vdpa/Kconfig  |   6 +
 drivers/vdpa/Makefile |   1 +
 drivers/vdpa/virtio_pci/Makefile  |   2 +
 drivers/vdpa/virtio_pci/vp_vdpa.c | 450 ++
 4 files changed, 459 insertions(+)
 create mode 100644 drivers/vdpa/virtio_pci/Makefile
 create mode 100644 drivers/vdpa/virtio_pci/vp_vdpa.c

diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig
index d7d32b656102..4cca53114cc4 100644
--- a/drivers/vdpa/Kconfig
+++ b/drivers/vdpa/Kconfig
@@ -47,4 +47,10 @@ config MLX5_VDPA_NET
  be executed by the hardware. It also supports a variety of stateless
  offloads depending on the actual device used and firmware version.
 
+config VP_VDPA
+   tristate "Virtio PCI bridge vDPA driver"
+   depends on PCI_MSI && VIRTIO_PCI_MODERN
+   help
+	  This kernel module bridges virtio PCI devices to the vDPA bus.
+
 endif # VDPA
diff --git a/drivers/vdpa/Makefile b/drivers/vdpa/Makefile
index d160e9b63a66..67fe7f3d6943 100644
--- a/drivers/vdpa/Makefile
+++ b/drivers/vdpa/Makefile
@@ -3,3 +3,4 @@ obj-$(CONFIG_VDPA) += vdpa.o
 obj-$(CONFIG_VDPA_SIM) += vdpa_sim/
 obj-$(CONFIG_IFCVF)+= ifcvf/
 obj-$(CONFIG_MLX5_VDPA) += mlx5/
+obj-$(CONFIG_VP_VDPA)+= virtio_pci/
diff --git a/drivers/vdpa/virtio_pci/Makefile b/drivers/vdpa/virtio_pci/Makefile
new file mode 100644
index ..231088d3af7d
--- /dev/null
+++ b/drivers/vdpa/virtio_pci/Makefile
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_VP_VDPA) += vp_vdpa.o
diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c 
b/drivers/vdpa/virtio_pci/vp_vdpa.c
new file mode 100644
index ..6458fa470566
--- /dev/null
+++ b/drivers/vdpa/virtio_pci/vp_vdpa.c
@@ -0,0 +1,450 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vDPA bridge driver for modern virtio-pci device
+ *
+ * Copyright (c) 2020, Red Hat Inc. All rights reserved.
+ * Author: Jason Wang 
+ *
+ * Based on virtio_pci_modern.c.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define VP_VDPA_QUEUE_MAX 256
+#define VP_VDPA_DRIVER_NAME "vp_vdpa"
+
+struct vp_vring {
+   void __iomem *notify;
+   char msix_name[256];
+   struct vdpa_callback cb;
+   int irq;
+};
+
+struct vp_vdpa {
+   struct vdpa_device vdpa;
+   struct virtio_pci_modern_device mdev;
+   struct vp_vring *vring;
+   struct vdpa_callback cb;
+   char msix_name[256];
+   int config_irq;
+   int queues;
+   int vectors;
+};
+
+static struct vp_vdpa *vdpa_to_vp(struct vdpa_device *vdpa)
+{
+   return container_of(vdpa, struct vp_vdpa, vdpa);
+}
+
+static struct virtio_pci_modern_device *vdpa_to_mdev(struct vdpa_device *vdpa)
+{
+   struct vp_vdpa *vp_vdpa = vdpa_to_vp(vdpa);
+
+	return &vp_vdpa->mdev;
+}
+
+static u64 vp_vdpa_get_features(struct vdpa_device *vdpa)
+{
+   struct virtio_pci_modern_device *mdev = vdpa_to_mdev(vdpa);
+
+   return vp_modern_get_features(mdev);
+}
+
+static int vp_vdpa_set_features(struct vdpa_device *vdpa, u64 features)
+{
+   struct virtio_pci_modern_device *mdev = vdpa_to_mdev(vdpa);
+
+   vp_modern_set_features(mdev, features);
+
+   return 0;
+}
+
+static u8 vp_vdpa_get_status(struct vdpa_device *vdpa)
+{
+   struct virtio_pci_modern_device *mdev = vdpa_to_mdev(vdpa);
+
+   return vp_modern_get_status(mdev);
+}
+
+static void vp_vdpa_free_irq(struct vp_vdpa *vp_vdpa)
+{
+	struct virtio_pci_modern_device *mdev = &vp_vdpa->mdev;
+   struct pci_dev *pdev = mdev->pci_dev;
+   int i;
+
+   for (i = 0; i < vp_vdpa->queues; i++) {
+   if (vp_vdpa->vring[i].irq != VIRTIO_MSI_NO_VECTOR) {
+   vp_modern_queue_vector(mdev, i, VIRTIO_MSI_NO_VECTOR);
+			devm_free_irq(&pdev->dev, vp_vdpa->vring[i].irq,
+				      &vp_vdpa->vring[i]);
+   vp_vdpa->vring[i].irq = VIRTIO_MSI_NO_VECTOR;
+   }
+   }
+
+   if (vp_vdpa->config_irq != VIRTIO_MSI_NO_VECTOR) {
+   vp_modern_config_vector(mdev, VIRTIO_MSI_NO_VECTOR);
+		devm_free_irq(&pdev->dev, vp_vdpa->config_irq, vp_vdpa);
+   vp_vdpa->config_irq = VIRTIO_MSI_NO_VECTOR;
+   }
+
+   if (vp_vdpa->vectors) {
+   pci_free_irq_vectors(pdev);
+   vp_vdpa->vectors = 0;
+   }
+}
+
+static irqreturn_t vp_vdpa_vq_handler(int irq, void *arg)
+{
+   struct vp_vring *vring = arg;
+
+   if (vring->cb.callback)
+		return vring->cb.callback(vring->cb.private);
+
+	return IRQ_HANDLED;
+}

[PATCH V2 10/14] virtio-pci-modern: introduce helper to get notification offset

2020-11-26 Thread Jason Wang
This patch introduces a helper to get the notification offset of a modern device.

Signed-off-by: Jason Wang 
---
 drivers/virtio/virtio_pci_modern.c | 21 -
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/drivers/virtio/virtio_pci_modern.c 
b/drivers/virtio/virtio_pci_modern.c
index 0b86a36998c8..8f1f274724be 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -267,6 +267,21 @@ static u16 vp_modern_get_num_queues(struct 
virtio_pci_modern_device *mdev)
	return vp_ioread16(&mdev->common->num_queues);
 }
 
+/*
+ * vp_modern_get_queue_notify_off - get notification offset for a virtqueue
+ * @mdev: the modern virtio-pci device
+ * @index: the queue index
+ *
+ * Returns the notification offset for a virtqueue
+ */
+static u16 vp_modern_get_queue_notify_off(struct virtio_pci_modern_device 
*mdev,
+ u16 index)
+{
+	vp_iowrite16(index, &mdev->common->queue_select);
+
+	return vp_ioread16(&mdev->common->queue_notify_off);
+}
+
 /* virtio config->finalize_features() implementation */
 static int vp_finalize_features(struct virtio_device *vdev)
 {
@@ -453,7 +468,6 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
 {
 
 	struct virtio_pci_modern_device *mdev = &vp_dev->mdev;
-	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
struct virtqueue *vq;
u16 num, off;
int err;
@@ -461,9 +475,6 @@ static struct virtqueue *setup_vq(struct virtio_pci_device 
*vp_dev,
if (index >= vp_modern_get_num_queues(mdev))
return ERR_PTR(-ENOENT);
 
-   /* Select the queue we're interested in */
-	vp_iowrite16(index, &cfg->queue_select);
-
/* Check if queue is either not available or already active. */
num = vp_modern_get_queue_size(mdev, index);
if (!num || vp_modern_get_queue_enable(mdev, index))
@@ -475,7 +486,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device 
*vp_dev,
}
 
/* get offset of notification word for this vq */
-	off = vp_ioread16(&cfg->queue_notify_off);
+   off = vp_modern_get_queue_notify_off(mdev, index);
 
info->msix_vector = msix_vec;
 
-- 
2.25.1

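For reference, the offset returned by the new helper feeds the doorbell
address computation that setup_vq() keeps (shown in patch 06/14 of this
series):

	vq->priv = (void __force *)mdev->notify_base +
			off * mdev->notify_offset_multiplier;

i.e. per the modern virtio-pci layout, each queue's kick location is
notify_base plus its queue_notify_off scaled by notify_offset_multiplier.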


[PATCH V2 11/14] virtio-pci: introduce modern device module

2020-11-26 Thread Jason Wang
This patch introduces a separate module that implements the low-level
device probe and access logic for modern devices. The goal is to let the
module be reused by other drivers (e.g. the vDPA driver that will be
introduced soon).

Note that the shared memory capability is not converted since it has no
users currently. We can do that in the future if necessary.

Signed-off-by: Jason Wang 
---
 drivers/virtio/Kconfig |  10 +-
 drivers/virtio/Makefile|   1 +
 drivers/virtio/virtio_pci_common.h |  28 +-
 drivers/virtio/virtio_pci_modern.c | 462 -
 drivers/virtio/virtio_pci_modern_dev.c | 462 +
 include/linux/virtio_pci_modern.h  | 107 ++
 6 files changed, 580 insertions(+), 490 deletions(-)
 create mode 100644 drivers/virtio/virtio_pci_modern_dev.c
 create mode 100644 include/linux/virtio_pci_modern.h

diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index e76e9b9ba93c..26491b6e7e10 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -6,6 +6,14 @@ config VIRTIO
  bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_RPMSG
  or CONFIG_S390_GUEST.
 
+config VIRTIO_PCI_MODERN
+   tristate "Modern Virtio PCI Device"
+   depends on PCI
+   help
+	  Modern PCI device implementation. This module implements the
+	  basic probe and control for devices which are based on modern
+	  PCI devices with possible vendor specific extensions.
+
 menuconfig VIRTIO_MENU
bool "Virtio drivers"
default y
@@ -14,7 +22,7 @@ if VIRTIO_MENU
 
 config VIRTIO_PCI
tristate "PCI driver for virtio devices"
-   depends on PCI
+   depends on PCI && VIRTIO_PCI_MODERN
select VIRTIO
help
  This driver provides support for virtio based paravirtual device
diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
index 591e6f72aa54..f097578aaa8f 100644
--- a/drivers/virtio/Makefile
+++ b/drivers/virtio/Makefile
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_VIRTIO) += virtio.o virtio_ring.o
+obj-$(CONFIG_VIRTIO_PCI_MODERN) += virtio_pci_modern_dev.o
 obj-$(CONFIG_VIRTIO_MMIO) += virtio_mmio.o
 obj-$(CONFIG_VIRTIO_PCI) += virtio_pci.o
 virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o
diff --git a/drivers/virtio/virtio_pci_common.h 
b/drivers/virtio/virtio_pci_common.h
index d32af8ff56f9..4025b940f74e 100644
--- a/drivers/virtio/virtio_pci_common.h
+++ b/drivers/virtio/virtio_pci_common.h
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -39,33 +40,6 @@ struct virtio_pci_vq_info {
unsigned msix_vector;
 };
 
-struct virtio_pci_modern_device {
-   struct pci_dev *pci_dev;
-
-   /* The IO mapping for the PCI BARs */
-   void __iomem * const *base;
-
-   /* The IO mapping for the PCI config space */
-   struct virtio_pci_common_cfg __iomem *common;
-   /* Device-specific data (non-legacy mode)  */
-   void __iomem *device;
-   /* Base of vq notifications (non-legacy mode). */
-   void __iomem *notify_base;
-   /* Where to read and clear interrupt */
-   u8 __iomem *isr;
-
-   /* So we can sanity-check accesses. */
-   size_t notify_len;
-   size_t device_len;
-
-   /* Multiply queue_notify_off by this value. (non-legacy mode). */
-   u32 notify_offset_multiplier;
-
-   int modern_bars;
-
-   struct virtio_device_id id;
-};
-
 /* Our device structure */
 struct virtio_pci_device {
struct virtio_device vdev;
diff --git a/drivers/virtio/virtio_pci_modern.c 
b/drivers/virtio/virtio_pci_modern.c
index 8f1f274724be..8dfdc3b57502 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -19,113 +19,6 @@
 #define VIRTIO_RING_NO_LEGACY
 #include "virtio_pci_common.h"
 
-/*
- * Type-safe wrappers for io accesses.
- * Use these to enforce at compile time the following spec requirement:
- *
- * The driver MUST access each field using the “natural” access
- * method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses
- * for 16-bit fields and 8-bit accesses for 8-bit fields.
- */
-static inline u8 vp_ioread8(const u8 __iomem *addr)
-{
-   return ioread8(addr);
-}
-static inline u16 vp_ioread16 (const __le16 __iomem *addr)
-{
-   return ioread16(addr);
-}
-
-static inline u32 vp_ioread32(const __le32 __iomem *addr)
-{
-   return ioread32(addr);
-}
-
-static inline void vp_iowrite8(u8 value, u8 __iomem *addr)
-{
-   iowrite8(value, addr);
-}
-
-static inline void vp_iowrite16(u16 value, __le16 __iomem *addr)
-{
-   iowrite16(value, addr);
-}
-
-static inline void vp_iowrite32(u32 value, __le32 __iomem *addr)
-{
-   iowrite32(value, addr);
-}
-
-static void vp_iowrite64_twopart(u64 val,
-__le32 __iomem *lo, __le32 __iomem *hi)
-{
-	vp_iowrite32((u32)val, lo);
-	vp_iowrite32(val >> 32, hi);
-}


[PATCH V2 12/14] vdpa: set the virtqueue num during register

2020-11-26 Thread Jason Wang
This patch delays the queue number setting until vDPA device
registration. This allows us to probe the virtqueue numbers between
device allocation and registration.

Signed-off-by: Jason Wang 
---
 drivers/vdpa/ifcvf/ifcvf_main.c   | 5 ++---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 5 ++---
 drivers/vdpa/vdpa.c   | 8 
 drivers/vdpa/vdpa_sim/vdpa_sim.c  | 4 ++--
 include/linux/vdpa.h  | 7 +++
 5 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c
index 8b4028556cb6..d65f3221d8ed 100644
--- a/drivers/vdpa/ifcvf/ifcvf_main.c
+++ b/drivers/vdpa/ifcvf/ifcvf_main.c
@@ -438,8 +438,7 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct 
pci_device_id *id)
}
 
adapter = vdpa_alloc_device(struct ifcvf_adapter, vdpa,
-				    dev, &ifc_vdpa_ops,
-				    IFCVF_MAX_QUEUE_PAIRS * 2);
+				    dev, &ifc_vdpa_ops);
if (adapter == NULL) {
IFCVF_ERR(pdev, "Failed to allocate vDPA structure");
return -ENOMEM;
@@ -463,7 +462,7 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct 
pci_device_id *id)
for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++)
vf->vring[i].irq = -EINVAL;
 
-	ret = vdpa_register_device(&adapter->vdpa);
+	ret = vdpa_register_device(&adapter->vdpa, IFCVF_MAX_QUEUE_PAIRS * 2);
if (ret) {
IFCVF_ERR(pdev, "Failed to register ifcvf to vdpa bus");
goto err;
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 74264e590695..baa6be16f3e5 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1932,8 +1932,7 @@ void *mlx5_vdpa_add_dev(struct mlx5_core_dev *mdev)
max_vqs = MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues);
max_vqs = min_t(u32, max_vqs, MLX5_MAX_SUPPORTED_VQS);
 
-	ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops,
-				 2 * mlx5_vdpa_max_qps(max_vqs));
+	ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops);
if (IS_ERR(ndev))
return ndev;
 
@@ -1960,7 +1959,7 @@ void *mlx5_vdpa_add_dev(struct mlx5_core_dev *mdev)
if (err)
goto err_res;
 
-	err = vdpa_register_device(&mvdev->vdev);
+	err = vdpa_register_device(&mvdev->vdev, 2 * mlx5_vdpa_max_qps(max_vqs));
if (err)
goto err_reg;
 
diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
index a69ffc991e13..ba89238f9898 100644
--- a/drivers/vdpa/vdpa.c
+++ b/drivers/vdpa/vdpa.c
@@ -61,7 +61,6 @@ static void vdpa_release_dev(struct device *d)
  * initialized but before registered.
  * @parent: the parent device
  * @config: the bus operations that is supported by this device
- * @nvqs: number of virtqueues supported by this device
  * @size: size of the parent structure that contains private data
  *
  * Driver should use vdpa_alloc_device() wrapper macro instead of
@@ -72,7 +71,6 @@ static void vdpa_release_dev(struct device *d)
  */
 struct vdpa_device *__vdpa_alloc_device(struct device *parent,
const struct vdpa_config_ops *config,
-   int nvqs,
size_t size)
 {
struct vdpa_device *vdev;
@@ -99,7 +97,6 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent,
vdev->index = err;
vdev->config = config;
vdev->features_valid = false;
-   vdev->nvqs = nvqs;
 
	err = dev_set_name(&vdev->dev, "vdpa%u", vdev->index);
if (err)
@@ -122,11 +119,14 @@ EXPORT_SYMBOL_GPL(__vdpa_alloc_device);
  * vdpa_register_device - register a vDPA device
  * Callers must have a succeed call of vdpa_alloc_device() before.
  * @vdev: the vdpa device to be registered to vDPA bus
+ * @nvqs: number of virtqueues supported by this device
  *
  * Returns an error when fail to add to vDPA bus
  */
-int vdpa_register_device(struct vdpa_device *vdev)
+int vdpa_register_device(struct vdpa_device *vdev, int nvqs)
 {
+   vdev->nvqs = nvqs;
+
	return device_add(&vdev->dev);
 }
 EXPORT_SYMBOL_GPL(vdpa_register_device);
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index fb3e7d46870f..e3108bd77610 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -352,7 +352,7 @@ static struct vdpasim *vdpasim_create(void)
else
ops = _net_config_ops;
 
-	vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops, VDPASIM_VQ_NUM);
+	vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops);
if (!vdpasim)
goto err_alloc;
 
@@ -378,7 +378,7 @@ static struct vdpasim *vdpasim_create(void)
 	vdpasim->vdpa.dma_dev = dev;
-	ret = vdpa_register_device(&vdpasim->vdpa);
+	ret = vdpa_register_device(&vdpasim->vdpa, VDPASIM_VQ_NUM);

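The resulting calling convention for drivers, in outline (an illustrative
sketch; my_vdpa, my_config_ops and probe_num_queues() are hypothetical):

	struct my_vdpa *my;
	int nvqs, err;

	my = vdpa_alloc_device(struct my_vdpa, vdpa, dev, &my_config_ops);
	if (!my)
		return -ENOMEM;

	nvqs = probe_num_queues(my);	/* e.g. read num_queues off the device */
	err = vdpa_register_device(&my->vdpa, nvqs);
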
[PATCH V2 13/14] virtio_vdpa: don't warn when fail to disable vq

2020-11-26 Thread Jason Wang
There's no guarantee that the device can disable a specific virtqueue
through set_vq_ready(). One example is the modern virtio-pci
device. So this patch removes the warning.

Signed-off-by: Jason Wang 
---
 drivers/virtio/virtio_vdpa.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
index 4a9ddb44b2a7..e28acf482e0c 100644
--- a/drivers/virtio/virtio_vdpa.c
+++ b/drivers/virtio/virtio_vdpa.c
@@ -225,9 +225,8 @@ static void virtio_vdpa_del_vq(struct virtqueue *vq)
 	list_del(&info->node);
 	spin_unlock_irqrestore(&vd_dev->lock, flags);
 
-   /* Select and deactivate the queue */
+   /* Select and deactivate the queue (best effort) */
ops->set_vq_ready(vdpa, index, 0);
-   WARN_ON(ops->get_vq_ready(vdpa, index));
 
vring_del_virtqueue(vq);
 
-- 
2.25.1


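Background for the removed WARN_ON, paraphrasing the virtio 1.x spec's
driver requirements for the common configuration structure: the driver MUST
NOT write 0 to queue_enable, i.e. the register offers no way to disable a
queue once it has been enabled. A transport backed by a real modern
virtio-pci device therefore cannot honor set_vq_ready(vdpa, idx, 0), and
the read-back check could never be guaranteed to pass.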

[PATCH V2 09/14] virtio-pci-modern: introduce helper for getting queue nums

2020-11-26 Thread Jason Wang
This patch introduces a helper for getting the queue number of a modern device.

Signed-off-by: Jason Wang 
---
 drivers/virtio/virtio_pci_modern.c | 13 -
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/drivers/virtio/virtio_pci_modern.c 
b/drivers/virtio/virtio_pci_modern.c
index f85216ccc6df..0b86a36998c8 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -256,6 +256,17 @@ static u16 vp_modern_get_queue_size(struct 
virtio_pci_modern_device *mdev,
 
 }
 
+/*
+ * vp_modern_get_num_queues - get the number of virtqueues
+ * @mdev: the modern virtio-pci device
+ *
+ * Returns the number of virtqueues
+ */
+static u16 vp_modern_get_num_queues(struct virtio_pci_modern_device *mdev)
+{
+	return vp_ioread16(&mdev->common->num_queues);
+}
+
 /* virtio config->finalize_features() implementation */
 static int vp_finalize_features(struct virtio_device *vdev)
 {
@@ -447,7 +458,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device 
*vp_dev,
u16 num, off;
int err;
 
-	if (index >= vp_ioread16(&cfg->num_queues))
+   if (index >= vp_modern_get_num_queues(mdev))
return ERR_PTR(-ENOENT);
 
/* Select the queue we're interested in */
-- 
2.25.1



[PATCH V2 08/14] virtio-pci-modern: introduce helper for setting/getting queue size

2020-11-26 Thread Jason Wang
This patch introduces helpers for setting/getting the queue size of a
modern device.

Signed-off-by: Jason Wang 
---
 drivers/virtio/virtio_pci_modern.c | 34 --
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/drivers/virtio/virtio_pci_modern.c 
b/drivers/virtio/virtio_pci_modern.c
index dcdda32b6182..f85216ccc6df 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -226,6 +226,36 @@ static bool vp_modern_get_queue_enable(struct virtio_pci_modern_device *mdev,
 	return vp_ioread16(&mdev->common->queue_enable);
 }
 
+/*
+ * vp_modern_set_queue_size - set size for a virtqueue
+ * @mdev: the modern virtio-pci device
+ * @index: the queue index
+ * @size: the size of the virtqueue
+ */
+static void vp_modern_set_queue_size(struct virtio_pci_modern_device *mdev,
+u16 index, u16 size)
+{
+	vp_iowrite16(index, &mdev->common->queue_select);
+	vp_iowrite16(size, &mdev->common->queue_size);
+
+}
+
+/*
+ * vp_modern_get_queue_size - get size for a virtqueue
+ * @mdev: the modern virtio-pci device
+ * @index: the queue index
+ *
+ * Returns the size of the virtqueue
+ */
+static u16 vp_modern_get_queue_size(struct virtio_pci_modern_device *mdev,
+   u16 index)
+{
+	vp_iowrite16(index, &mdev->common->queue_select);
+
+	return vp_ioread16(&mdev->common->queue_size);
+
+}
+
 /* virtio config->finalize_features() implementation */
 static int vp_finalize_features(struct virtio_device *vdev)
 {
@@ -424,7 +454,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device 
*vp_dev,
	vp_iowrite16(index, &cfg->queue_select);
 
/* Check if queue is either not available or already active. */
-	num = vp_ioread16(&cfg->queue_size);
+   num = vp_modern_get_queue_size(mdev, index);
if (!num || vp_modern_get_queue_enable(mdev, index))
return ERR_PTR(-ENOENT);
 
@@ -447,7 +477,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device 
*vp_dev,
return ERR_PTR(-ENOMEM);
 
/* activate the queue */
-	vp_iowrite16(virtqueue_get_vring_size(vq), &cfg->queue_size);
+   vp_modern_set_queue_size(mdev, index, virtqueue_get_vring_size(vq));
vp_modern_queue_address(mdev, index, virtqueue_get_desc_addr(vq),
virtqueue_get_avail_addr(vq),
virtqueue_get_used_addr(vq));
-- 
2.25.1



[PATCH V2 04/14] virtio-pci: move the notification sanity check to vp_modern_probe()

2020-11-26 Thread Jason Wang
This patch moves the notification sanity check to
vp_modern_probe(). This makes sure the logic can be reused by
modules other than virtio-pci.

Signed-off-by: Jason Wang 
---
 drivers/virtio/virtio_pci_modern.c | 34 +++---
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/drivers/virtio/virtio_pci_modern.c 
b/drivers/virtio/virtio_pci_modern.c
index 02688c3b3fbd..d001c74beefe 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -384,17 +384,6 @@ static struct virtqueue *setup_vq(struct virtio_pci_device 
*vp_dev,
vp_iowrite64_twopart(virtqueue_get_used_addr(vq),
			     &cfg->queue_used_lo, &cfg->queue_used_hi);
 
-   /* offset should not wrap */
-   if ((u64)off * mdev->notify_offset_multiplier + 2
-   > mdev->notify_len) {
-		dev_warn(&vp_dev->pci_dev->dev,
-"bad notification offset %u (x %u) "
-"for queue %u > %zd",
-off, mdev->notify_offset_multiplier,
-index, mdev->notify_len);
-   err = -EINVAL;
-   goto err_map_notify;
-   }
vq->priv = (void __force *)mdev->notify_base +
off * mdev->notify_offset_multiplier;
 
@@ -695,9 +684,11 @@ static inline void check_offsets(void)
 static int vp_modern_probe(struct virtio_pci_modern_device *mdev)
 {
struct pci_dev *pci_dev = mdev->pci_dev;
-   int err, common, isr, notify, device;
+   int err, common, isr, notify, device, i;
+   unsigned int num_queues;
u32 notify_length;
u32 notify_offset;
+   u16 off;
 
/* We only own devices >= 0x1000 and <= 0x107f: leave the rest. */
if (pci_dev->device < 0x1000 || pci_dev->device > 0x107f)
@@ -796,6 +787,25 @@ static int vp_modern_probe(struct virtio_pci_modern_device 
*mdev)
if (!mdev->notify_base)
goto err;
 
+	num_queues = vp_ioread16(&mdev->common->num_queues);
+
+   /* offset should not wrap */
+   for (i = 0; i < num_queues; i++) {
+		vp_iowrite16(i, &mdev->common->queue_select);
+		off = vp_ioread16(&mdev->common->queue_notify_off);
+
+   if ((u64)off * mdev->notify_offset_multiplier + 2
+   > mdev->notify_len) {
+			dev_warn(&pci_dev->dev,
+"bad notification offset %u (x %u) "
+"for queue %u > %zd",
+off, mdev->notify_offset_multiplier,
+i, mdev->notify_len);
+   err = -EINVAL;
+   goto err;
+   }
+   }
+
/* We don't know how much we should map, but PAGE_SIZE
 * is more than enough for all existing devices.
 */
-- 
2.25.1



[PATCH V2 07/14] virtio-pci-modern: introduce helper to set/get queue_enable

2020-11-26 Thread Jason Wang
This patch introduces a helper to set/get queue_enable for modern devices.

Signed-off-by: Jason Wang 
---
 drivers/virtio/virtio_pci_modern.c | 37 +-
 1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/drivers/virtio/virtio_pci_modern.c 
b/drivers/virtio/virtio_pci_modern.c
index 3125987973d3..dcdda32b6182 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -198,6 +198,34 @@ static void vp_modern_queue_address(struct 
virtio_pci_modern_device *mdev,
 >queue_used_hi);
 }
 
+/*
+ * vp_modern_set_queue_enable - enable a virtqueue
+ * @mdev: the modern virtio-pci device
+ * @index: the queue index
+ * @enable: whether the virtqueue is enable or not
+ */
+static void vp_modern_set_queue_enable(struct virtio_pci_modern_device *mdev,
+  u16 index, bool enable)
+{
+	vp_iowrite16(index, &mdev->common->queue_select);
+	vp_iowrite16(enable, &mdev->common->queue_enable);
+}
+
+/*
+ * vp_modern_get_queue_enable - query whether a virtqueue is enabled
+ * @mdev: the modern virtio-pci device
+ * @index: the queue index
+ *
+ * Returns whether a virtqueue is enabled or not
+ */
+static bool vp_modern_get_queue_enable(struct virtio_pci_modern_device *mdev,
+  u16 index)
+{
+	vp_iowrite16(index, &mdev->common->queue_select);
+
+	return vp_ioread16(&mdev->common->queue_enable);
+}
+
 /* virtio config->finalize_features() implementation */
 static int vp_finalize_features(struct virtio_device *vdev)
 {
@@ -397,7 +425,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device 
*vp_dev,
 
/* Check if queue is either not available or already active. */
num = vp_ioread16(>queue_size);
-	if (!num || vp_ioread16(&cfg->queue_enable))
+   if (!num || vp_modern_get_queue_enable(mdev, index))
return ERR_PTR(-ENOENT);
 
if (num & (num - 1)) {
@@ -454,7 +482,6 @@ static int vp_modern_find_vqs(struct virtio_device *vdev, 
unsigned nvqs,
  struct irq_affinity *desc)
 {
struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-   struct virtio_pci_common_cfg __iomem *cfg = vp_dev->mdev.common;
struct virtqueue *vq;
int rc = vp_find_vqs(vdev, nvqs, vqs, callbacks, names, ctx, desc);
 
@@ -464,10 +491,8 @@ static int vp_modern_find_vqs(struct virtio_device *vdev, 
unsigned nvqs,
/* Select and activate all queues. Has to be done last: once we do
 * this, there's no way to go back except reset.
 */
-	list_for_each_entry(vq, &vdev->vqs, list) {
-		vp_iowrite16(vq->index, &cfg->queue_select);
-		vp_iowrite16(1, &cfg->queue_enable);
-	}
+	list_for_each_entry(vq, &vdev->vqs, list)
+		vp_modern_set_queue_enable(&vp_dev->mdev, vq->index, true);
 
return 0;
 }
-- 
2.25.1



[PATCH V2 05/14] virtio-pci-modern: introduce vp_modern_set_queue_vector()

2020-11-26 Thread Jason Wang
This patch introduces a helper to set virtqueue MSI vector.

Signed-off-by: Jason Wang 
---
 drivers/virtio/virtio_pci_modern.c | 35 --
 1 file changed, 24 insertions(+), 11 deletions(-)

diff --git a/drivers/virtio/virtio_pci_modern.c 
b/drivers/virtio/virtio_pci_modern.c
index d001c74beefe..bacc05cbc762 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -155,6 +155,25 @@ static void vp_modern_set_features(struct 
virtio_pci_modern_device *mdev,
vp_iowrite32(features >> 32, >guest_feature);
 }
 
+/*
+ * vp_modern_queue_vector - set the MSIX vector for a specific virtqueue
+ * @mdev: the modern virtio-pci device
+ * @index: queue index
+ * @vector: the config vector
+ *
+ * Returns the config vector read from the device
+ */
+static u16 vp_modern_queue_vector(struct virtio_pci_modern_device *mdev,
+ u16 index, u16 vector)
+{
+   struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
+
+	vp_iowrite16(index, &cfg->queue_select);
+	vp_iowrite16(vector, &cfg->queue_msix_vector);
+	/* Flush the write out to device */
+	return vp_ioread16(&cfg->queue_msix_vector);
+}
+
 /* virtio config->finalize_features() implementation */
 static int vp_finalize_features(struct virtio_device *vdev)
 {
@@ -393,8 +412,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device 
*vp_dev,
}
 
if (msix_vec != VIRTIO_MSI_NO_VECTOR) {
-		vp_iowrite16(msix_vec, &cfg->queue_msix_vector);
-		msix_vec = vp_ioread16(&cfg->queue_msix_vector);
+   msix_vec = vp_modern_queue_vector(mdev, index, msix_vec);
if (msix_vec == VIRTIO_MSI_NO_VECTOR) {
err = -EBUSY;
goto err_map_notify;
@@ -437,16 +455,11 @@ static void del_vq(struct virtio_pci_vq_info *info)
 {
struct virtqueue *vq = info->vq;
struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
-	struct virtio_pci_common_cfg __iomem *cfg = vp_dev->mdev.common;
-
-	vp_iowrite16(vq->index, &cfg->queue_select);
+	struct virtio_pci_modern_device *mdev = &vp_dev->mdev;
 
-   if (vp_dev->msix_enabled) {
-		vp_iowrite16(VIRTIO_MSI_NO_VECTOR,
-			     &cfg->queue_msix_vector);
-		/* Flush the write out to device */
-		vp_ioread16(&cfg->queue_msix_vector);
-   }
+   if (vp_dev->msix_enabled)
+   vp_modern_queue_vector(mdev, vq->index,
+  VIRTIO_MSI_NO_VECTOR);
 
vring_del_virtqueue(vq);
 }
-- 
2.25.1

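The read-back in vp_modern_queue_vector() doubles as the usual posted-write
flush and as an acknowledgment check: the device reports
VIRTIO_MSI_NO_VECTOR in queue_msix_vector if it could not associate the
vector. That is exactly what the setup_vq() hunk above tests for:

	msix_vec = vp_modern_queue_vector(mdev, index, msix_vec);
	if (msix_vec == VIRTIO_MSI_NO_VECTOR) {
		err = -EBUSY;
		goto err_map_notify;
	}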


[PATCH V2 03/14] virtio-pci: split out modern device

2020-11-26 Thread Jason Wang
This patch splits out the virtio-pci modern-device-only attributes
into another structure. While at it, a dedicated probe method for the
modern-only attributes is introduced. This may help to split the
logic into a dedicated module.

Signed-off-by: Jason Wang 
---
 drivers/virtio/virtio_pci_common.h |  33 +++--
 drivers/virtio/virtio_pci_modern.c | 224 ++---
 2 files changed, 158 insertions(+), 99 deletions(-)

diff --git a/drivers/virtio/virtio_pci_common.h 
b/drivers/virtio/virtio_pci_common.h
index 1d23420f7ed6..d32af8ff56f9 100644
--- a/drivers/virtio/virtio_pci_common.h
+++ b/drivers/virtio/virtio_pci_common.h
@@ -39,37 +39,43 @@ struct virtio_pci_vq_info {
unsigned msix_vector;
 };
 
-/* Our device structure */
-struct virtio_pci_device {
-   struct virtio_device vdev;
+struct virtio_pci_modern_device {
struct pci_dev *pci_dev;
 
-   /* In legacy mode, these two point to within ->legacy. */
-   /* Where to read and clear interrupt */
-   u8 __iomem *isr;
-
-   /* Modern only fields */
-   /* The IO mapping for the BARs */
+   /* The IO mapping for the PCI BARs */
void __iomem * const *base;
-   /* The IO mapping for the PCI config space (non-legacy mode) */
+
+   /* The IO mapping for the PCI config space */
struct virtio_pci_common_cfg __iomem *common;
/* Device-specific data (non-legacy mode)  */
void __iomem *device;
/* Base of vq notifications (non-legacy mode). */
void __iomem *notify_base;
+   /* Where to read and clear interrupt */
+   u8 __iomem *isr;
 
/* So we can sanity-check accesses. */
size_t notify_len;
size_t device_len;
 
-   /* Capability for when we need to map notifications per-vq. */
-   int notify_map_cap;
-
/* Multiply queue_notify_off by this value. (non-legacy mode). */
u32 notify_offset_multiplier;
 
int modern_bars;
 
+   struct virtio_device_id id;
+};
+
+/* Our device structure */
+struct virtio_pci_device {
+   struct virtio_device vdev;
+   struct pci_dev *pci_dev;
+   struct virtio_pci_modern_device mdev;
+
+   /* In legacy mode, these two point to within ->legacy. */
+   /* Where to read and clear interrupt */
+   u8 __iomem *isr;
+
/* Legacy only field */
/* the IO mapping for the PCI config space */
void __iomem *ioaddr;
@@ -157,6 +163,5 @@ static inline void virtio_pci_legacy_remove(struct 
virtio_pci_device *vp_dev)
 }
 #endif
 int virtio_pci_modern_probe(struct virtio_pci_device *);
-void virtio_pci_modern_remove(struct virtio_pci_device *);
 
 #endif
diff --git a/drivers/virtio/virtio_pci_modern.c 
b/drivers/virtio/virtio_pci_modern.c
index 33cc21b818de..02688c3b3fbd 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -63,13 +63,11 @@ static void vp_iowrite64_twopart(u64 val,
vp_iowrite32(val >> 32, hi);
 }
 
-static void __iomem *map_capability(struct virtio_pci_device *vp_dev, int off,
-   size_t minlen,
-   u32 align,
-   u32 size,
-   size_t *len)
+static void __iomem *map_capability(struct virtio_pci_modern_device *mdev,
+   int off, size_t minlen, u32 align,
+   u32 size, size_t *len)
 {
-   struct pci_dev *dev = vp_dev->pci_dev;
+   struct pci_dev *dev = mdev->pci_dev;
u8 bar;
u32 offset, length;
 
@@ -111,14 +109,13 @@ static void __iomem *map_capability(struct 
virtio_pci_device *vp_dev, int off,
return NULL;
}
 
-   return vp_dev->base[bar] + offset;
+   return mdev->base[bar] + offset;
 }
 
-/* virtio config->get_features() implementation */
-static u64 vp_get_features(struct virtio_device *vdev)
+static u64 vp_modern_get_features(struct virtio_pci_modern_device *mdev)
 {
-   struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-   struct virtio_pci_common_cfg __iomem *cfg = vp_dev->common;
+   struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
+
u64 features;
 
	vp_iowrite32(0, &cfg->device_feature_select);
@@ -129,6 +126,14 @@ static u64 vp_get_features(struct virtio_device *vdev)
return features;
 }
 
+/* virtio config->get_features() implementation */
+static u64 vp_get_features(struct virtio_device *vdev)
+{
+   struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+
+   return vp_modern_get_features(&vp_dev->mdev);
+}
+
 static void vp_transport_features(struct virtio_device *vdev, u64 features)
 {
struct virtio_pci_device *vp_dev = to_vp_device(vdev);
@@ -139,11 +144,21 @@ static void vp_transport_features(struct virtio_device 
*vdev, u64 features)
__virtio_set_bit(vdev, VIRTIO_F_SR_IOV);
 }
 

[PATCH V2 02/14] virtio-pci: switch to use devres for modern devices

2020-11-26 Thread Jason Wang
This patch tries to convert the modern device to use devres to manage
its resources (iomaps). Before this patch, the IO addresses were mapped
individually according to each capability. After this patch, we simply
map the whole BARs.

This simplifies the work of splitting the modern device logic into a
separate module.
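
For illustration, the devres pattern this moves toward looks roughly
like the sketch below; pcim_enable_device(), pcim_iomap_regions() and
pcim_iomap_table() are the standard managed PCI helpers, but the exact
calls used by this patch are an assumption, not verbatim from the diff.

#include <linux/pci.h>

static int vp_modern_map_bars_sketch(struct pci_dev *pci_dev, int bars)
{
	int rc;

	/* Managed: the device is disabled again on driver detach */
	rc = pcim_enable_device(pci_dev);
	if (rc)
		return rc;

	/* Managed: requests and ioremaps the selected BARs in one go */
	rc = pcim_iomap_regions(pci_dev, bars, "virtio-pci-modern");
	if (rc)
		return rc;

	/* The per-BAR mapping table is released together with the
	 * device: mdev->base = pcim_iomap_table(pci_dev);
	 */
	return 0;
}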

Signed-off-by: Jason Wang 
---
 drivers/virtio/virtio_pci_common.c |  10 --
 drivers/virtio/virtio_pci_common.h |   2 +
 drivers/virtio/virtio_pci_legacy.c |  13 ++-
 drivers/virtio/virtio_pci_modern.c | 141 +
 4 files changed, 54 insertions(+), 112 deletions(-)

diff --git a/drivers/virtio/virtio_pci_common.c 
b/drivers/virtio/virtio_pci_common.c
index 222d630c41fc..e786701fa1b4 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -527,11 +527,6 @@ static int virtio_pci_probe(struct pci_dev *pci_dev,
	INIT_LIST_HEAD(&vp_dev->virtqueues);
	spin_lock_init(&vp_dev->lock);
 
-   /* enable the device */
-   rc = pci_enable_device(pci_dev);
-   if (rc)
-   goto err_enable_device;
-
if (force_legacy) {
rc = virtio_pci_legacy_probe(vp_dev);
/* Also try modern mode if we can't map BAR0 (no IO space). */
@@ -559,11 +554,8 @@ static int virtio_pci_probe(struct pci_dev *pci_dev,
 err_register:
if (vp_dev->ioaddr)
 virtio_pci_legacy_remove(vp_dev);
-   else
-virtio_pci_modern_remove(vp_dev);
 err_probe:
pci_disable_device(pci_dev);
-err_enable_device:
if (reg_dev)
	put_device(&vp_dev->vdev.dev);
else
@@ -582,8 +574,6 @@ static void virtio_pci_remove(struct pci_dev *pci_dev)
 
if (vp_dev->ioaddr)
virtio_pci_legacy_remove(vp_dev);
-   else
-   virtio_pci_modern_remove(vp_dev);
 
pci_disable_device(pci_dev);
put_device(dev);
diff --git a/drivers/virtio/virtio_pci_common.h 
b/drivers/virtio/virtio_pci_common.h
index b2f0eb4067cb..1d23420f7ed6 100644
--- a/drivers/virtio/virtio_pci_common.h
+++ b/drivers/virtio/virtio_pci_common.h
@@ -49,6 +49,8 @@ struct virtio_pci_device {
u8 __iomem *isr;
 
/* Modern only fields */
+   /* The IO mapping for the BARs */
+   void __iomem * const *base;
/* The IO mapping for the PCI config space (non-legacy mode) */
struct virtio_pci_common_cfg __iomem *common;
/* Device-specific data (non-legacy mode)  */
diff --git a/drivers/virtio/virtio_pci_legacy.c 
b/drivers/virtio/virtio_pci_legacy.c
index d62e9835aeec..890f155ff48c 100644
--- a/drivers/virtio/virtio_pci_legacy.c
+++ b/drivers/virtio/virtio_pci_legacy.c
@@ -214,14 +214,19 @@ int virtio_pci_legacy_probe(struct virtio_pci_device 
*vp_dev)
struct pci_dev *pci_dev = vp_dev->pci_dev;
int rc;
 
+   rc = pci_enable_device(pci_dev);
+   if (rc)
+   return rc;
+
+   rc = -ENODEV;
/* We only own devices >= 0x1000 and <= 0x103f: leave the rest. */
if (pci_dev->device < 0x1000 || pci_dev->device > 0x103f)
-   return -ENODEV;
+   goto err_id;
 
if (pci_dev->revision != VIRTIO_PCI_ABI_VERSION) {
printk(KERN_ERR "virtio_pci: expected ABI version %d, got %d\n",
   VIRTIO_PCI_ABI_VERSION, pci_dev->revision);
-   return -ENODEV;
+   goto err_id;
}
 
	rc = dma_set_mask(&pci_dev->dev, DMA_BIT_MASK(64));
@@ -241,7 +246,7 @@ int virtio_pci_legacy_probe(struct virtio_pci_device 
*vp_dev)
 
rc = pci_request_region(pci_dev, 0, "virtio-pci-legacy");
if (rc)
-   return rc;
+   goto err_id;
 
rc = -ENOMEM;
vp_dev->ioaddr = pci_iomap(pci_dev, 0, 0);
@@ -267,6 +272,8 @@ int virtio_pci_legacy_probe(struct virtio_pci_device 
*vp_dev)
 
 err_iomap:
pci_release_region(pci_dev, 0);
+err_id:
+   pci_disable_device(pci_dev);
return rc;
 }
 
diff --git a/drivers/virtio/virtio_pci_modern.c 
b/drivers/virtio/virtio_pci_modern.c
index df1481fd400c..33cc21b818de 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -63,15 +63,15 @@ static void vp_iowrite64_twopart(u64 val,
vp_iowrite32(val >> 32, hi);
 }
 
-static void __iomem *map_capability(struct pci_dev *dev, int off,
+static void __iomem *map_capability(struct virtio_pci_device *vp_dev, int off,
size_t minlen,
u32 align,
-   u32 start, u32 size,
+   u32 size,
size_t *len)
 {
+   struct pci_dev *dev = vp_dev->pci_dev;
u8 bar;
u32 offset, length;
-   void __iomem *p;
 
pci_read_config_byte(dev, off + offsetof(struct virtio_pci_cap,
 

[PATCH V2 01/14] virtio-pci: do not access iomem via virtio_pci_device directly

2020-11-26 Thread Jason Wang
Instead of accessing iomem via virtio_pci_device directly, add an
indirection level to ease the job of splitting out the modern
virtio-pci logic.

Signed-off-by: Jason Wang 
---
 drivers/virtio/virtio_pci_modern.c | 76 ++
 1 file changed, 46 insertions(+), 30 deletions(-)

diff --git a/drivers/virtio/virtio_pci_modern.c 
b/drivers/virtio/virtio_pci_modern.c
index 3d6ae5a5e252..df1481fd400c 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -141,12 +141,13 @@ static void __iomem *map_capability(struct pci_dev *dev, 
int off,
 static u64 vp_get_features(struct virtio_device *vdev)
 {
struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+   struct virtio_pci_common_cfg __iomem *cfg = vp_dev->common;
u64 features;
 
-   vp_iowrite32(0, &vp_dev->common->device_feature_select);
-   features = vp_ioread32(&vp_dev->common->device_feature);
-   vp_iowrite32(1, &vp_dev->common->device_feature_select);
-   features |= ((u64)vp_ioread32(&vp_dev->common->device_feature) << 32);
+   vp_iowrite32(0, &cfg->device_feature_select);
+   features = vp_ioread32(&cfg->device_feature);
+   vp_iowrite32(1, &cfg->device_feature_select);
+   features |= ((u64)vp_ioread32(&cfg->device_feature) << 32);
 
return features;
 }
@@ -165,6 +166,7 @@ static void vp_transport_features(struct virtio_device 
*vdev, u64 features)
 static int vp_finalize_features(struct virtio_device *vdev)
 {
struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+   struct virtio_pci_common_cfg __iomem *cfg = vp_dev->common;
u64 features = vdev->features;
 
/* Give virtio_ring a chance to accept features. */
@@ -179,10 +181,10 @@ static int vp_finalize_features(struct virtio_device 
*vdev)
return -EINVAL;
}
 
-   vp_iowrite32(0, &vp_dev->common->guest_feature_select);
-   vp_iowrite32((u32)vdev->features, &vp_dev->common->guest_feature);
-   vp_iowrite32(1, &vp_dev->common->guest_feature_select);
-   vp_iowrite32(vdev->features >> 32, &vp_dev->common->guest_feature);
+   vp_iowrite32(0, &cfg->guest_feature_select);
+   vp_iowrite32((u32)vdev->features, &cfg->guest_feature);
+   vp_iowrite32(1, &cfg->guest_feature_select);
+   vp_iowrite32(vdev->features >> 32, &cfg->guest_feature);
 
return 0;
 }
@@ -192,6 +194,7 @@ static void vp_get(struct virtio_device *vdev, unsigned 
offset,
   void *buf, unsigned len)
 {
struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+   void __iomem *device = vp_dev->device;
u8 b;
__le16 w;
__le32 l;
@@ -200,21 +203,21 @@ static void vp_get(struct virtio_device *vdev, unsigned 
offset,
 
switch (len) {
case 1:
-   b = ioread8(vp_dev->device + offset);
+   b = ioread8(device + offset);
	memcpy(buf, &b, sizeof b);
break;
case 2:
-   w = cpu_to_le16(ioread16(vp_dev->device + offset));
+   w = cpu_to_le16(ioread16(device + offset));
	memcpy(buf, &w, sizeof w);
break;
case 4:
-   l = cpu_to_le32(ioread32(vp_dev->device + offset));
+   l = cpu_to_le32(ioread32(device + offset));
	memcpy(buf, &l, sizeof l);
break;
case 8:
-   l = cpu_to_le32(ioread32(vp_dev->device + offset));
+   l = cpu_to_le32(ioread32(device + offset));
	memcpy(buf, &l, sizeof l);
-   l = cpu_to_le32(ioread32(vp_dev->device + offset + sizeof l));
+   l = cpu_to_le32(ioread32(device + offset + sizeof l));
	memcpy(buf + sizeof l, &l, sizeof l);
break;
default:
@@ -228,6 +231,7 @@ static void vp_set(struct virtio_device *vdev, unsigned 
offset,
   const void *buf, unsigned len)
 {
struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+   void __iomem *device = vp_dev->device;
u8 b;
__le16 w;
__le32 l;
@@ -237,21 +241,21 @@ static void vp_set(struct virtio_device *vdev, unsigned 
offset,
switch (len) {
case 1:
	memcpy(&b, buf, sizeof b);
-   iowrite8(b, vp_dev->device + offset);
+   iowrite8(b, device + offset);
break;
case 2:
	memcpy(&w, buf, sizeof w);
-   iowrite16(le16_to_cpu(w), vp_dev->device + offset);
+   iowrite16(le16_to_cpu(w), device + offset);
break;
case 4:
	memcpy(&l, buf, sizeof l);
-   iowrite32(le32_to_cpu(l), vp_dev->device + offset);
+   iowrite32(le32_to_cpu(l), device + offset);
break;
case 8:
	memcpy(&l, buf, sizeof l);
-   iowrite32(le

[PATCH V2 00/14] vDPA driver for virtio-pci device

2020-11-26 Thread Jason Wang
Hi all:

This series tries to implement a vDPA driver for virtio-pci device
which will bridge between vDPA bus and virtio-pci device.

This could be used for future feature prototyping and testing.

Please review

Changes from V1:

- Split the common code out of virtio-pci and share it with the vDPA driver
- Use a dynamic id in order to avoid confusion with the virtio-pci driver
- No feature whitelist, supporting any features (mq, config etc.)

Thanks

Jason Wang (14):
  virtio-pci: do not access iomem via virtio_pci_device directly
  virtio-pci: switch to use devres for modern devices
  virtio-pci: split out modern device
  virtio-pci: move the notification sanity check to vp_modern_probe()
  virtio-pci-modern: introduce vp_modern_set_queue_vector()
  virtio-pci-modern: introduce vp_modern_queue_address()
  virtio-pci-modern: introduce helper to set/get queue_enable
  virtio-pci-modern: introduce helper for setting/geting queue size
  virtio-pci-modern: introduce helper for getting queue nums
  virtio-pci-modern: introduce helper to get notification offset
  virtio-pci: introduce modern device module
  vdpa: set the virtqueue num during register
  virtio_vdpa: don't warn when fail to disable vq
  vdpa: introduce virtio pci driver

 drivers/vdpa/Kconfig   |   6 +
 drivers/vdpa/Makefile  |   1 +
 drivers/vdpa/ifcvf/ifcvf_main.c|   5 +-
 drivers/vdpa/mlx5/net/mlx5_vnet.c  |   5 +-
 drivers/vdpa/vdpa.c|   8 +-
 drivers/vdpa/vdpa_sim/vdpa_sim.c   |   4 +-
 drivers/vdpa/virtio_pci/Makefile   |   2 +
 drivers/vdpa/virtio_pci/vp_vdpa.c  | 450 
 drivers/virtio/Kconfig |  10 +-
 drivers/virtio/Makefile|   1 +
 drivers/virtio/virtio_pci_common.c |  10 -
 drivers/virtio/virtio_pci_common.h |  23 +-
 drivers/virtio/virtio_pci_legacy.c |  13 +-
 drivers/virtio/virtio_pci_modern.c | 442 +++
 drivers/virtio/virtio_pci_modern_dev.c | 462 +
 drivers/virtio/virtio_vdpa.c   |   3 +-
 include/linux/vdpa.h   |   7 +-
 include/linux/virtio_pci_modern.h  | 107 ++
 18 files changed, 1121 insertions(+), 438 deletions(-)
 create mode 100644 drivers/vdpa/virtio_pci/Makefile
 create mode 100644 drivers/vdpa/virtio_pci/vp_vdpa.c
 create mode 100644 drivers/virtio/virtio_pci_modern_dev.c
 create mode 100644 include/linux/virtio_pci_modern.h

-- 
2.25.1



Re: [PATCH RFC 02/12] vdpa: split vdpasim to core and net modules

2020-11-18 Thread Jason Wang



On 2020/11/18 下午9:14, Stefano Garzarella wrote:

Hi Jason,
I just discovered that I missed the other questions in this email,
sorry for that!



No problem :)




On Mon, Nov 16, 2020 at 12:00:11PM +0800, Jason Wang wrote:


On 2020/11/13 下午9:47, Stefano Garzarella wrote:

From: Max Gurtovoy 

Introduce new vdpa_sim_net and vdpa_sim (core) drivers. This is a
preparation for adding a vdpa simulator module for block devices.

Signed-off-by: Max Gurtovoy 
[sgarzare: various cleanups/fixes]
Signed-off-by: Stefano Garzarella 
---
v1:
- Removed unused headers
- Removed empty module_init() module_exit()
- Moved vdpasim_is_little_endian() in vdpa_sim.h
- Moved vdpasim16_to_cpu/cpu_to_vdpasim16() in vdpa_sim.h
- Added vdpasim*_to_cpu/cpu_to_vdpasim*() also for 32 and 64
- Replaced 'select VDPA_SIM' with 'depends on VDPA_SIM' since selected
  option can not depend on other [Jason]



If possible, I would suggest to split this patch further:

1) convert to use void *config, and an attribute for setting config 
size during allocation

2) introduce supported_features
3) other attributes (#vqs)
4) rename config ops (more generic one)
5) introduce ops for set|get_config, set_get_features
6) real split




[...]


-static const struct vdpa_config_ops vdpasim_net_config_ops;
-static const struct vdpa_config_ops vdpasim_net_batch_config_ops;
+static const struct vdpa_config_ops vdpasim_config_ops;
+static const struct vdpa_config_ops vdpasim_batch_config_ops;
-static struct vdpasim *vdpasim_create(void)
+struct vdpasim *vdpasim_create(struct vdpasim_init_attr *attr)
 {
 const struct vdpa_config_ops *ops;
 struct vdpasim *vdpasim;
+    u32 device_id;
 struct device *dev;
-    int ret = -ENOMEM;
+    int i, size, ret = -ENOMEM;
-    if (batch_mapping)
-    ops = &vdpasim_net_batch_config_ops;
+    device_id = attr->device_id;
+    /* Currently, we only accept the network and block devices. */
+    if (device_id != VIRTIO_ID_NET && device_id != VIRTIO_ID_BLOCK)
+    return ERR_PTR(-EOPNOTSUPP);
+
+    if (attr->batch_mapping)
+    ops = &vdpasim_batch_config_ops;
 else
-    ops = &vdpasim_net_config_ops;
+    ops = &vdpasim_config_ops;
 vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops, 
VDPASIM_VQ_NUM);

 if (!vdpasim)
 goto err_alloc;
-    INIT_WORK(&vdpasim->work, vdpasim_work);
+    if (device_id == VIRTIO_ID_NET)
+    size = sizeof(struct virtio_net_config);
+    else
+    size = sizeof(struct virtio_blk_config);



It's better to avoid such if/else consider we may introduce more type 
of devices.


Can we have an attribute of config size instead?


Yes, I'll move the patch 7 before this.

About config size and set/get_config ops, I'm not sure if it is better 
to hidden everything under the new set/get_config ops, allocating the 
config structure in each device, or leave the allocation in the core 
and update it like now.



I think we'd better to avoid having any type specific codes in generic 
sim codes.



[...]



+config VDPA_SIM_NET
+    tristate "vDPA simulator for networking device"
+    depends on VDPA_SIM
+    default n



I remember somebody told me that if we don't enable a module it was 
disabled by default.


So, should I remove "default n" from vdpa_sim* entries?



Yes, but please do that in another patch.

Thanks




Thanks,
Stefano





Re: [PATCH] vringh: fix vringh_iov_push_*() documentation

2020-11-16 Thread Jason Wang



On 2020/11/17 上午12:16, Stefano Garzarella wrote:

vringh_iov_push_*() functions don't have 'dst' parameter, but have
the 'src' parameter.

Replace 'dst' description with 'src' description.

Signed-off-by: Stefano Garzarella 



Acked-by: Jason Wang 



---
  drivers/vhost/vringh.c | 6 +++---
  1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c
index 8bd8b403f087..b7403ba8e7f7 100644
--- a/drivers/vhost/vringh.c
+++ b/drivers/vhost/vringh.c
@@ -730,7 +730,7 @@ EXPORT_SYMBOL(vringh_iov_pull_user);
  /**
   * vringh_iov_push_user - copy bytes into vring_iov.
   * @wiov: the wiov as passed to vringh_getdesc_user() (updated as we consume)
- * @dst: the place to copy.
+ * @src: the place to copy from.
   * @len: the maximum length to copy.
   *
   * Returns the bytes copied <= len or a negative errno.
@@ -976,7 +976,7 @@ EXPORT_SYMBOL(vringh_iov_pull_kern);
  /**
   * vringh_iov_push_kern - copy bytes into vring_iov.
   * @wiov: the wiov as passed to vringh_getdesc_kern() (updated as we consume)
- * @dst: the place to copy.
+ * @src: the place to copy from.
   * @len: the maximum length to copy.
   *
   * Returns the bytes copied <= len or a negative errno.
@@ -1333,7 +1333,7 @@ EXPORT_SYMBOL(vringh_iov_pull_iotlb);
   * vringh_iov_push_iotlb - copy bytes into vring_iov.
   * @vrh: the vring.
   * @wiov: the wiov as passed to vringh_getdesc_iotlb() (updated as we consume)
- * @dst: the place to copy.
+ * @src: the place to copy from.
   * @len: the maximum length to copy.
   *
   * Returns the bytes copied <= len or a negative errno.




Re: [PATCH RFC 12/12] vdpa_sim_blk: implement ramdisk behaviour

2020-11-15 Thread Jason Wang



On 2020/11/13 下午9:47, Stefano Garzarella wrote:

The previous implementation wrote only the status of each request.
This patch implements a more accurate block device simulator,
providing a ramdisk-like behavior.

Also handle the VIRTIO_BLK_T_GET_ID request, always answering with the
"vdpa_blk_sim" string.



Let's use a separate patch for this.




Signed-off-by: Stefano Garzarella 
---
  drivers/vdpa/vdpa_sim/vdpa_sim_blk.c | 151 +++
  1 file changed, 133 insertions(+), 18 deletions(-)

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c 
b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c
index 8e41b3ab98d5..68e74383322f 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c
@@ -7,6 +7,7 @@
   */
  
  #include 

+#include 
  #include 
  
  #include "vdpa_sim.h"

@@ -24,10 +25,137 @@
  
  static struct vdpasim *vdpasim_blk_dev;
  
+static int vdpasim_blk_handle_req(struct vdpasim *vdpasim,

+ struct vdpasim_virtqueue *vq)
+{
+   size_t wrote = 0, to_read = 0, to_write = 0;
+   struct virtio_blk_outhdr hdr;
+   uint8_t status;
+   uint32_t type;
+   ssize_t bytes;
+   loff_t offset;
+   int i, ret;
+
+   vringh_kiov_cleanup(&vq->riov);
+   vringh_kiov_cleanup(&vq->wiov);



It looks to me like we should do those after vringh_getdesc_iotlb()? See 
the comment above vringh_getdesc_kern().




+
+   ret = vringh_getdesc_iotlb(&vq->vring, &vq->riov, &vq->wiov,
+  &vq->head, GFP_ATOMIC);
+   if (ret != 1)
+   return ret;
+
+   for (i = 0; i < vq->wiov.used; i++)
+   to_write += vq->wiov.iov[i].iov_len;



It's better to introduce a helper for this (or consider using an iov 
iterator); one possible shape is sketched below.
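
A sketch (the name vringh_kiov_length() is made up here):

static inline size_t vringh_kiov_length(const struct vringh_kiov *iov)
{
	size_t len = 0;
	unsigned int i;

	/* Sum the residual length of every segment in the kiov */
	for (i = 0; i < iov->used; i++)
		len += iov->iov[i].iov_len;

	return len;
}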




+   to_write -= 1; /* last byte is the status */
+
+   for (i = 0; i < vq->riov.used; i++)
+   to_read += vq->riov.iov[i].iov_len;
+
+   bytes = vringh_iov_pull_iotlb(&vq->vring, &vq->riov, &hdr, sizeof(hdr));
+   if (bytes != sizeof(hdr))
+   return 0;
+
+   to_read -= bytes;
+
+   type = le32_to_cpu(hdr.type);
+   offset = le64_to_cpu(hdr.sector) << SECTOR_SHIFT;
+   status = VIRTIO_BLK_S_OK;
+
+   switch (type) {
+   case VIRTIO_BLK_T_IN:
+   if (offset + to_write > VDPASIM_BLK_CAPACITY << SECTOR_SHIFT) {
+   dev_err(&vdpasim->vdpa.dev,
+   "reading over the capacity - offset: 0x%llx len: 
0x%lx\n",
+   offset, to_write);
+   status = VIRTIO_BLK_S_IOERR;
+   break;
+   }
+
+   bytes = vringh_iov_push_iotlb(&vq->vring, &vq->wiov,
+ vdpasim->buffer + offset,
+ to_write);
+   if (bytes < 0) {
+   dev_err(&vdpasim->vdpa.dev,
+   "vringh_iov_push_iotlb() error: %ld offset: 0x%llx 
len: 0x%lx\n",
+   bytes, offset, to_write);
+   status = VIRTIO_BLK_S_IOERR;
+   break;
+   }
+
+   wrote += bytes;
+   break;
+
+   case VIRTIO_BLK_T_OUT:
+   if (offset + to_read > VDPASIM_BLK_CAPACITY << SECTOR_SHIFT) {
+   dev_err(&vdpasim->vdpa.dev,
+   "writing over the capacity - offset: 0x%llx len: 
0x%lx\n",
+   offset, to_read);
+   status = VIRTIO_BLK_S_IOERR;
+   break;
+   }
+
+   bytes = vringh_iov_pull_iotlb(&vq->vring, &vq->riov,
+ vdpasim->buffer + offset,
+ to_read);
+   if (bytes < 0) {
+   dev_err(&vdpasim->vdpa.dev,
+   "vringh_iov_pull_iotlb() error: %ld offset: 0x%llx 
len: 0x%lx\n",
+   bytes, offset, to_read);
+   status = VIRTIO_BLK_S_IOERR;
+   break;
+   }
+   break;
+
+   case VIRTIO_BLK_T_GET_ID: {
+   char id[VIRTIO_BLK_ID_BYTES] = "vdpa_blk_sim";



Let's use a global static one?
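
i.e. something along these lines (sketch; the identifier name is made up):

static const char vdpasim_blk_id[VIRTIO_BLK_ID_BYTES] = "vdpa_blk_sim";

	/* ... and in the VIRTIO_BLK_T_GET_ID arm: */
	bytes = vringh_iov_push_iotlb(&vq->vring, &vq->wiov,
				      vdpasim_blk_id, VIRTIO_BLK_ID_BYTES);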



+
+   bytes = vringh_iov_push_iotlb(&vq->vring,
+ &vq->wiov, id,
+ VIRTIO_BLK_ID_BYTES);
+   if (bytes < 0) {
+   dev_err(&vdpasim->vdpa.dev,
+   "vringh_iov_push_iotlb() error: %ld\n", bytes);
+   status = VIRTIO_BLK_S_IOERR;
+   break;
+   }
+
+   wrote += bytes;
+   break;
+   }
+
+   default:
+   dev_warn(&vdpasim->vdpa.dev,
+"Unsupported request type %d\n", type);
+   status = VIRTIO_BLK_S_IOERR;
+   break;
+   }
+
+  

Re: [PATCH RFC 11/12] vringh: allow vringh_iov_xfer() to skip bytes when ptr is NULL

2020-11-15 Thread Jason Wang



On 2020/11/13 下午9:47, Stefano Garzarella wrote:

In some cases, it may be useful to provide a way to skip a number
of bytes in a vringh_iov.

In order to keep vringh_iov consistent, let's reuse vringh_iov_xfer()
logic and skip bytes when the ptr is NULL.

Signed-off-by: Stefano Garzarella 
---

I'm not sure if this is the best option; maybe we can add a new
function vringh_iov_skip().

Suggestions?
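
One possible shape, as a sketch only: since vringh_iov_xfer() is
file-local, a helper like this would have to live in vringh.c (the
name vringh_kiov_skip() is made up):

static inline ssize_t vringh_kiov_skip(struct vringh *vrh,
				       struct vringh_kiov *iov,
				       size_t len)
{
	/* With the change below, a NULL ptr means "consume without
	 * copying", so the xfer callback is never invoked and may be
	 * NULL as well.
	 */
	return vringh_iov_xfer(vrh, iov, NULL, len, NULL);
}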



It might be worth checking whether we can convert vringh_iov to use the iov 
iterator; then we could use iov_iter_advance() here.


Thanks



---
  drivers/vhost/vringh.c | 16 +++-
  1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c
index 8bd8b403f087..ed3290946ad7 100644
--- a/drivers/vhost/vringh.c
+++ b/drivers/vhost/vringh.c
@@ -75,7 +75,9 @@ static inline int __vringh_get_head(const struct vringh *vrh,
return head;
  }
  
-/* Copy some bytes to/from the iovec.  Returns num copied. */

+/* Copy some bytes to/from the iovec.  Returns num copied.
+ * If ptr is NULL, skips at most len bytes.
+ */
  static inline ssize_t vringh_iov_xfer(struct vringh *vrh,
  struct vringh_kiov *iov,
  void *ptr, size_t len,
@@ -89,12 +91,16 @@ static inline ssize_t vringh_iov_xfer(struct vringh *vrh,
size_t partlen;
  
  		partlen = min(iov->iov[iov->i].iov_len, len);

-   err = xfer(vrh, iov->iov[iov->i].iov_base, ptr, partlen);
-   if (err)
-   return err;
+
+   if (ptr) {
+   err = xfer(vrh, iov->iov[iov->i].iov_base, ptr, 
partlen);
+   if (err)
+   return err;
+   ptr += partlen;
+   }
+
done += partlen;
len -= partlen;
-   ptr += partlen;
iov->consumed += partlen;
iov->iov[iov->i].iov_len -= partlen;
iov->iov[iov->i].iov_base += partlen;




Re: [PATCH RFC 10/12] vdpa_sim: split vdpasim_virtqueue's iov field in riov and wiov

2020-11-15 Thread Jason Wang



On 2020/11/13 下午9:47, Stefano Garzarella wrote:

vringh_getdesc_iotlb() manages 2 iovs for writable and readable
descriptors. This is very useful for the block device, where for
each request we have both types of descriptor.

Let's split the vdpasim_virtqueue's iov field in riov and wiov
to use them with vringh_getdesc_iotlb().
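
To make the motivation concrete, this is roughly how a block request
ends up using both directions (a sketch based on the RFC code later in
this series; error handling omitted):

static int vdpasim_blk_one_req_sketch(struct vdpasim_virtqueue *vq)
{
	struct virtio_blk_outhdr hdr;
	u8 status = VIRTIO_BLK_S_OK;
	int ret;

	/* riov: device-readable descriptors (header + write payload)
	 * wiov: device-writable descriptors (read payload + status byte)
	 */
	ret = vringh_getdesc_iotlb(&vq->vring, &vq->riov, &vq->wiov,
				   &vq->head, GFP_ATOMIC);
	if (ret != 1)
		return ret;

	vringh_iov_pull_iotlb(&vq->vring, &vq->riov, &hdr, sizeof(hdr));
	/* ... perform the I/O described by hdr ... */
	vringh_iov_push_iotlb(&vq->vring, &vq->wiov, &status, 1);

	return 0;
}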

Signed-off-by: Stefano Garzarella 



Acked-by: Jason Wang 



---
  drivers/vdpa/vdpa_sim/vdpa_sim.h | 3 ++-
  drivers/vdpa/vdpa_sim/vdpa_sim_blk.c | 6 +++---
  drivers/vdpa/vdpa_sim/vdpa_sim_net.c | 8 
  3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.h b/drivers/vdpa/vdpa_sim/vdpa_sim.h
index cc21e07aa2f7..0d4629675e4b 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.h
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.h
@@ -27,7 +27,8 @@ struct vdpasim;
  
  struct vdpasim_virtqueue {

struct vringh vring;
-   struct vringh_kiov iov;
+   struct vringh_kiov riov;
+   struct vringh_kiov wiov;
unsigned short head;
bool ready;
u64 desc_addr;
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c 
b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c
index 122a3c039507..8e41b3ab98d5 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c
@@ -41,13 +41,13 @@ static void vdpasim_blk_work(struct work_struct *work)
if (!vq->ready)
continue;
  
-		while (vringh_getdesc_iotlb(>vring, >iov, >iov,

+   while (vringh_getdesc_iotlb(>vring, >riov, >wiov,
>head, GFP_ATOMIC) > 0) {
  
  			int write;
  
-			vq->iov.i = vq->iov.used - 1;

-   write = vringh_iov_push_iotlb(>vring, >iov, 
, 1);
+   vq->wiov.i = vq->wiov.used - 1;
+   write = vringh_iov_push_iotlb(>vring, >wiov, 
, 1);
if (write <= 0)
break;
  
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c

index d0a1403f64b2..783b1e85b09c 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c
@@ -47,12 +47,12 @@ static void vdpasim_net_work(struct work_struct *work)
  
  	while (true) {

total_write = 0;
-   err = vringh_getdesc_iotlb(>vring, >iov, NULL,
+   err = vringh_getdesc_iotlb(>vring, >riov, NULL,
   >head, GFP_ATOMIC);
if (err <= 0)
break;
  
-		err = vringh_getdesc_iotlb(>vring, NULL, >iov,

+   err = vringh_getdesc_iotlb(>vring, NULL, >wiov,
   >head, GFP_ATOMIC);
if (err <= 0) {
vringh_complete_iotlb(>vring, txq->head, 0);
@@ -60,13 +60,13 @@ static void vdpasim_net_work(struct work_struct *work)
}
  
  		while (true) {

-   read = vringh_iov_pull_iotlb(>vring, >iov,
+   read = vringh_iov_pull_iotlb(>vring, >riov,
 vdpasim->buffer,
 PAGE_SIZE);
if (read <= 0)
break;
  
-			write = vringh_iov_push_iotlb(>vring, >iov,

+   write = vringh_iov_push_iotlb(>vring, >wiov,
  vdpasim->buffer, read);
if (write <= 0)
break;




Re: [PATCH RFC 09/12] vdpa_sim: make vdpasim->buffer size configurable

2020-11-15 Thread Jason Wang



On 2020/11/13 下午9:47, Stefano Garzarella wrote:

Allow each device to specify the size of the buffer allocated
in vdpa_sim.

Signed-off-by: Stefano Garzarella 



Acked-by: Jason Wang 



---
  drivers/vdpa/vdpa_sim/vdpa_sim.h | 1 +
  drivers/vdpa/vdpa_sim/vdpa_sim.c | 2 +-
  drivers/vdpa/vdpa_sim/vdpa_sim_blk.c | 1 +
  drivers/vdpa/vdpa_sim/vdpa_sim_net.c | 1 +
  4 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.h b/drivers/vdpa/vdpa_sim/vdpa_sim.h
index f7e1fe0a88d3..cc21e07aa2f7 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.h
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.h
@@ -49,6 +49,7 @@ struct vdpasim_device {
  
  struct vdpasim_init_attr {

struct vdpasim_device device;
+   size_t buffer_size;
int batch_mapping;
  
  	work_func_t	work_fn;

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index bd034fbf4683..3863d49e0d6d 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -223,7 +223,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_init_attr 
*attr)
if (!vdpasim->iommu)
goto err_iommu;
  
-	vdpasim->buffer = kvmalloc(PAGE_SIZE, GFP_KERNEL);

+   vdpasim->buffer = kvmalloc(attr->buffer_size, GFP_KERNEL);
if (!vdpasim->buffer)
goto err_iommu;
  
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c

index f456a0e4e097..122a3c039507 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c
@@ -100,6 +100,7 @@ static int __init vdpasim_blk_init(void)
attr.device.update_config = vdpasim_blk_update_config;
  
  	attr.work_fn = vdpasim_blk_work;

+   attr.buffer_size = PAGE_SIZE;
  
	vdpasim_blk_dev = vdpasim_create(&attr);

if (IS_ERR(vdpasim_blk_dev)) {
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c 
b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c
index b9372fdf2415..d0a1403f64b2 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c
@@ -124,6 +124,7 @@ static int __init vdpasim_net_init(void)
  
  	attr.work_fn = vdpasim_net_work;

attr.batch_mapping = batch_mapping;
+   attr.buffer_size = PAGE_SIZE;
  
	vdpasim_net_dev = vdpasim_create(&attr);

if (IS_ERR(vdpasim_net_dev)) {




Re: [PATCH RFC 07/12] vdpa_sim: move config management outside of the core

2020-11-15 Thread Jason Wang



On 2020/11/13 下午9:47, Stefano Garzarella wrote:

In order to simplify the code of the vdpa_sim core, we move the
config management in each device simulator.

The device must provide the size of config structure and a callback
to update this structure called during the vdpasim_set_features().



Similarly, I suggest doing this before patch 2; then there's no need for 
the conversion of the blk device.





Signed-off-by: Stefano Garzarella 
---
  drivers/vdpa/vdpa_sim/vdpa_sim.h |  5 +++--
  drivers/vdpa/vdpa_sim/vdpa_sim.c | 29 +---
  drivers/vdpa/vdpa_sim/vdpa_sim_blk.c | 27 --
  drivers/vdpa/vdpa_sim/vdpa_sim_net.c | 12 
  4 files changed, 37 insertions(+), 36 deletions(-)

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.h b/drivers/vdpa/vdpa_sim/vdpa_sim.h
index 76e642042eb0..f7e1fe0a88d3 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.h
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.h
@@ -10,8 +10,6 @@
  #include 
  #include 
  #include 
-#include 
-#include 
  
  #define DRV_VERSION  "0.1"

  #define DRV_AUTHOR   "Jason Wang "
@@ -42,8 +40,11 @@ struct vdpasim_virtqueue {
  
  struct vdpasim_device {

u64 supported_features;
+   size_t config_size;
u32 id;
int nvqs;
+
+   void (*update_config)(struct vdpasim *vdpasim);



Let's use set_config/get_config to align with virtio/vhost.
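
i.e. roughly (sketch):

struct vdpasim_device {
	u64	supported_features;
	size_t	config_size;
	u32	id;
	int	nvqs;

	/* aligned with the virtio/vhost naming */
	void	(*get_config)(struct vdpasim *vdpasim, void *config);
	void	(*set_config)(struct vdpasim *vdpasim, const void *config);
};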

Other looks good.

Thanks




Re: [PATCH RFC 08/12] vdpa_sim: use kvmalloc to allocate vdpasim->buffer

2020-11-15 Thread Jason Wang



On 2020/11/13 下午9:47, Stefano Garzarella wrote:

The next patch will make the buffer size configurable from each
device.
Since the buffer could be larger than a page, we use kvmalloc()
instead of kmalloc().

Signed-off-by: Stefano Garzarella 



Acked-by: Jason Wang 

Thanks



---
  drivers/vdpa/vdpa_sim/vdpa_sim.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index 9c29c2013661..bd034fbf4683 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -223,7 +223,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_init_attr 
*attr)
if (!vdpasim->iommu)
goto err_iommu;
  
-	vdpasim->buffer = kmalloc(PAGE_SIZE, GFP_KERNEL);

+   vdpasim->buffer = kvmalloc(PAGE_SIZE, GFP_KERNEL);
if (!vdpasim->buffer)
goto err_iommu;
  
@@ -495,7 +495,7 @@ static void vdpasim_free(struct vdpa_device *vdpa)

struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
  
	cancel_work_sync(&vdpasim->work);

-   kfree(vdpasim->buffer);
+   kvfree(vdpasim->buffer);
if (vdpasim->iommu)
vhost_iotlb_free(vdpasim->iommu);
kfree(vdpasim->vqs);




Re: [PATCH RFC 05/12] vdpa_sim: remove the limit of IOTLB entries

2020-11-15 Thread Jason Wang



On 2020/11/13 下午9:47, Stefano Garzarella wrote:

The simulated devices can support multiple queues, so this limit
should be defined according to the number of queues supported by
the device.

Since we are in a simulator, let's simply remove that limit.

Suggested-by: Jason Wang 
Signed-off-by: Stefano Garzarella 



Acked-by: Jason Wang 

It would be good to introduce a macro instead of using the magic 0 here.
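
e.g. (sketch, name made up):

/* 0 means "no limit" for vhost_iotlb_alloc() */
#define VDPASIM_IOTLB_LIMIT	0

	vdpasim->iommu = vhost_iotlb_alloc(VDPASIM_IOTLB_LIMIT, 0);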

Thanks



---
  drivers/vdpa/vdpa_sim/vdpa_sim.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index 2b4fea354413..9c9717441bbe 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -230,7 +230,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_init_attr 
*attr)
goto err_iommu;
	set_dma_ops(dev, &vdpasim_dma_ops);
  
-	vdpasim->iommu = vhost_iotlb_alloc(2048, 0);

+   vdpasim->iommu = vhost_iotlb_alloc(0, 0);
if (!vdpasim->iommu)
goto err_iommu;
  




Re: [PATCH RFC 06/12] vdpa_sim: add struct vdpasim_device to store device properties

2020-11-15 Thread Jason Wang



On 2020/11/13 下午9:47, Stefano Garzarella wrote:

Move device properties used during the entire life cycle into a new
structure to simplify the copy of these fields during the vdpasim
initialization.

Signed-off-by: Stefano Garzarella 



It would be better to do it before patch 2.



---
  drivers/vdpa/vdpa_sim/vdpa_sim.h | 17 --
  drivers/vdpa/vdpa_sim/vdpa_sim.c | 33 ++--
  drivers/vdpa/vdpa_sim/vdpa_sim_blk.c |  8 +--
  drivers/vdpa/vdpa_sim/vdpa_sim_net.c |  9 +---
  4 files changed, 38 insertions(+), 29 deletions(-)

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.h b/drivers/vdpa/vdpa_sim/vdpa_sim.h
index 6a1267c40d5e..76e642042eb0 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.h
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.h
@@ -40,12 +40,17 @@ struct vdpasim_virtqueue {
irqreturn_t (*cb)(void *data);
  };
  
+struct vdpasim_device {

+   u64 supported_features;
+   u32 id;
+   int nvqs;
+};
+
  struct vdpasim_init_attr {
-   u32 device_id;
-   u64 features;
+   struct vdpasim_device device;
+   int batch_mapping;
+
work_func_t work_fn;
-   int batch_mapping;
-   int nvqs;
  };
  
  /* State of each vdpasim device */

@@ -53,18 +58,16 @@ struct vdpasim {
struct vdpa_device vdpa;
struct vdpasim_virtqueue *vqs;
struct work_struct work;
+   struct vdpasim_device device;
/* spinlock to synchronize virtqueue state */
spinlock_t lock;
/* virtio config according to device type */
void *config;
struct vhost_iotlb *iommu;
void *buffer;
-   u32 device_id;
u32 status;
u32 generation;
u64 features;
-   u64 supported_features;
-   int nvqs;
/* spinlock to synchronize iommu table */
spinlock_t iommu_lock;
  };
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index 9c9717441bbe..d053bd14b3f8 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -28,7 +28,7 @@ static void vdpasim_queue_ready(struct vdpasim *vdpasim, 
unsigned int idx)
  {
	struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
  
-	vringh_init_iotlb(&vq->vring, vdpasim->supported_features,

+   vringh_init_iotlb(&vq->vring, vdpasim->device.supported_features,
  VDPASIM_QUEUE_MAX, false,
  (struct vring_desc *)(uintptr_t)vq->desc_addr,
  (struct vring_avail *)
@@ -46,7 +46,7 @@ static void vdpasim_vq_reset(struct vdpasim *vdpasim,
vq->device_addr = 0;
vq->cb = NULL;
vq->private = NULL;
-   vringh_init_iotlb(&vq->vring, vdpasim->supported_features,
+   vringh_init_iotlb(&vq->vring, vdpasim->device.supported_features,
  VDPASIM_QUEUE_MAX, false, NULL, NULL, NULL);
  }
  
@@ -54,7 +54,7 @@ static void vdpasim_reset(struct vdpasim *vdpasim)

  {
int i;
  
-	for (i = 0; i < vdpasim->nvqs; i++)

+   for (i = 0; i < vdpasim->device.nvqs; i++)
	vdpasim_vq_reset(vdpasim, &vdpasim->vqs[i]);
  
	spin_lock(&vdpasim->iommu_lock);

@@ -189,7 +189,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_init_attr 
*attr)
struct device *dev;
int i, size, ret = -ENOMEM;
  
-	device_id = attr->device_id;

+   device_id = attr->device.id;
/* Currently, we only accept the network and block devices. */
if (device_id != VIRTIO_ID_NET && device_id != VIRTIO_ID_BLOCK)
return ERR_PTR(-EOPNOTSUPP);
@@ -200,10 +200,12 @@ struct vdpasim *vdpasim_create(struct vdpasim_init_attr 
*attr)
ops = _config_ops;
  
  	vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops,

-   attr->nvqs);
+   attr->device.nvqs);
if (!vdpasim)
goto err_alloc;
  
+	vdpasim->device = attr->device;

+
if (device_id == VIRTIO_ID_NET)
size = sizeof(struct virtio_net_config);
else
@@ -212,14 +214,11 @@ struct vdpasim *vdpasim_create(struct vdpasim_init_attr 
*attr)
if (!vdpasim->config)
goto err_iommu;
  
-	vdpasim->vqs = kcalloc(attr->nvqs, sizeof(struct vdpasim_virtqueue),

-  GFP_KERNEL);
+   vdpasim->vqs = kcalloc(vdpasim->device.nvqs,
+  sizeof(struct vdpasim_virtqueue), GFP_KERNEL);
if (!vdpasim->vqs)
goto err_iommu;
  
-	vdpasim->device_id = device_id;

-   vdpasim->supported_features = attr->features;
-   vdpasim->nvqs = attr->nvqs;
	INIT_WORK(&vdpasim->work, attr->work_fn);
	spin_lock_init(&vdpasim->lock);
	spin_lock_init(&vdpasim->iommu_lock);
@@ -238,7 +237,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_init_attr 
*attr)
if (!vdpasim->buffer)
goto err_iommu;
  
-	for (i = 0; i < vdpasim->nvqs; i++)

+   for (i = 0; i < 

Re: [PATCH RFC 04/12] vdpa: add vdpa simulator for block device

2020-11-15 Thread Jason Wang



On 2020/11/13 下午9:47, Stefano Garzarella wrote:

From: Max Gurtovoy 

This will allow running vDPA for virtio block protocol.

Signed-off-by: Max Gurtovoy 
[sgarzare: various cleanups/fixes]
Signed-off-by: Stefano Garzarella 
---
v1:
- Removed unused headers
- Used cpu_to_vdpasim*() to store config fields
- Replaced 'select VDPA_SIM' with 'depends on VDPA_SIM' since selected
   option can not depend on other [Jason]
- Start with a single queue for now [Jason]
- Add comments to memory barriers
---
  drivers/vdpa/vdpa_sim/vdpa_sim_blk.c | 124 +++
  drivers/vdpa/Kconfig |   9 ++
  drivers/vdpa/vdpa_sim/Makefile   |   1 +
  3 files changed, 134 insertions(+)
  create mode 100644 drivers/vdpa/vdpa_sim/vdpa_sim_blk.c

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c 
b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c
new file mode 100644
index ..386dbb2f7138
--- /dev/null
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VDPA simulator for block device.
+ *
+ * Copyright (c) 2020, Mellanox Technologies. All rights reserved.
+ *
+ */
+
+#include 
+
+#include "vdpa_sim.h"
+
+#define VDPASIM_BLK_FEATURES   ((1ULL << VIRTIO_BLK_F_SIZE_MAX) | \
+(1ULL << VIRTIO_BLK_F_SEG_MAX)  | \
+(1ULL << VIRTIO_BLK_F_BLK_SIZE) | \
+(1ULL << VIRTIO_BLK_F_TOPOLOGY) | \
+(1ULL << VIRTIO_BLK_F_MQ))
+
+#define VDPASIM_BLK_CAPACITY 0x4
+#define VDPASIM_BLK_SIZE_MAX 0x1000
+#define VDPASIM_BLK_SEG_MAX 32
+#define VDPASIM_BLK_VQ_NUM 1
+
+static struct vdpasim *vdpasim_blk_dev;
+
+static void vdpasim_blk_work(struct work_struct *work)
+{
+   struct vdpasim *vdpasim = container_of(work, struct vdpasim, work);
+   u8 status = VIRTIO_BLK_S_OK;
+   int i;
+
+   spin_lock(&vdpasim->lock);
+
+   if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK))
+   goto out;
+
+   for (i = 0; i < VDPASIM_BLK_VQ_NUM; i++) {
+   struct vdpasim_virtqueue *vq = &vdpasim->vqs[i];
+
+   if (!vq->ready)
+   continue;
+
+   while (vringh_getdesc_iotlb(&vq->vring, &vq->iov, &vq->iov,
+   &vq->head, GFP_ATOMIC) > 0) {
+
+   int write;
+
+   vq->iov.i = vq->iov.used - 1;
+   write = vringh_iov_push_iotlb(&vq->vring, &vq->iov, 
&status, 1);
+   if (write <= 0)
+   break;
+
+   /* Make sure data is wrote before advancing index */
+   smp_wmb();
+
+   vringh_complete_iotlb(>vring, vq->head, write);
+
+   /* Make sure used is visible before rasing the 
interrupt. */
+   smp_wmb();
+
+   if (vringh_need_notify_iotlb(&vq->vring) > 0)
+   vringh_notify(&vq->vring);



Do we initialize vrh->notify anywhere? And this seems duplicated with 
the following vq->cb.


I think the correct way is to initialize vrh->notify and use 
vringh_need_notify_iotlb()/vringh_notify() instead of the vq->cb here.


And while at it, it's better to convert net simulator to do the same.
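
A sketch of what that wiring could look like (the callback name is
made up):

static void vdpasim_vq_notify(struct vringh *vring)
{
	struct vdpasim_virtqueue *vq =
		container_of(vring, struct vdpasim_virtqueue, vring);

	if (vq->cb)
		vq->cb(vq->private);
}

	/* at queue setup: */
	vq->vring.notify = vdpasim_vq_notify;

	/* in the work function, after completing buffers: */
	if (vringh_need_notify_iotlb(&vq->vring) > 0)
		vringh_notify(&vq->vring);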

Thanks



+
+   local_bh_disable();
+   if (vq->cb)
+   vq->cb(vq->private);
+   local_bh_enable();
+   }
+   }
+out:
+   spin_unlock(&vdpasim->lock);
+
+}
+
+static int __init vdpasim_blk_init(void)
+{
+   struct vdpasim_init_attr attr = {};
+   struct virtio_blk_config *config;
+   int ret;
+
+   attr.device_id = VIRTIO_ID_BLOCK;
+   attr.features = VDPASIM_FEATURES | VDPASIM_BLK_FEATURES;
+   attr.work_fn = vdpasim_blk_work;
+   vdpasim_blk_dev = vdpasim_create(&attr);
+   if (IS_ERR(vdpasim_blk_dev)) {
+   ret = PTR_ERR(vdpasim_blk_dev);
+   goto out;
+   }
+
+   config = (struct virtio_blk_config *)vdpasim_blk_dev->config;
+   config->capacity = cpu_to_vdpasim64(vdpasim_blk_dev, 
VDPASIM_BLK_CAPACITY);
+   config->size_max = cpu_to_vdpasim32(vdpasim_blk_dev, 
VDPASIM_BLK_SIZE_MAX);
+   config->seg_max = cpu_to_vdpasim32(vdpasim_blk_dev, 
VDPASIM_BLK_SEG_MAX);
+   config->num_queues = cpu_to_vdpasim16(vdpasim_blk_dev, 
VDPASIM_BLK_VQ_NUM);
+   config->min_io_size = cpu_to_vdpasim16(vdpasim_blk_dev, 1);
+   config->opt_io_size = cpu_to_vdpasim32(vdpasim_blk_dev, 1);
+   config->blk_size = cpu_to_vdpasim32(vdpasim_blk_dev, 512);
+
+   ret = vdpa_register_device(&vdpasim_blk_dev->vdpa);
+   if (ret)
+   goto put_dev;
+
+   return 0;
+
+put_dev:
+   put_device(&vdpasim_blk_dev->vdpa.dev);
+out:
+   return ret;
+}
+
+static void __exit vdpasim_blk_exit(void)
+{
+   struct vdpa_device *vdpa = &vdpasim_blk_dev->vdpa;
+
+   vdpa_unregister_device(vdpa);
+}
+

Re: [PATCH RFC 03/12] vdpa_sim: remove hard-coded virtq count

2020-11-15 Thread Jason Wang



On 2020/11/13 下午9:47, Stefano Garzarella wrote:

From: Max Gurtovoy 

Add a new attribute that will define the number of virt queues to be
created for the vdpasim device.

Signed-off-by: Max Gurtovoy 
[sgarzare: replace kmalloc_array() with kcalloc()]
Signed-off-by: Stefano Garzarella 
---
v1:
- use kcalloc() instead of kmalloc_array() since some function expects
   variables initialized to zero



Looks good, one nit, I prefer to do this before patch 2.

Thanks




Re: [PATCH RFC 02/12] vdpa: split vdpasim to core and net modules

2020-11-15 Thread Jason Wang



On 2020/11/13 下午9:47, Stefano Garzarella wrote:

From: Max Gurtovoy 

Introduce new vdpa_sim_net and vdpa_sim (core) drivers. This is a
preparation for adding a vdpa simulator module for block devices.

Signed-off-by: Max Gurtovoy 
[sgarzare: various cleanups/fixes]
Signed-off-by: Stefano Garzarella 
---
v1:
- Removed unused headers
- Removed empty module_init() module_exit()
- Moved vdpasim_is_little_endian() in vdpa_sim.h
- Moved vdpasim16_to_cpu/cpu_to_vdpasim16() in vdpa_sim.h
- Added vdpasim*_to_cpu/cpu_to_vdpasim*() also for 32 and 64
- Replaced 'select VDPA_SIM' with 'depends on VDPA_SIM' since selected
   option can not depend on other [Jason]



If possible, I would suggest to split this patch further:

1) convert to use void *config, and an attribute for setting config size 
during allocation

2) introduce supported_features
3) other attributes (#vqs)
4) rename config ops (more generic one)
5) introduce ops for set|get_config, set_get_features
6) real split



---
  drivers/vdpa/vdpa_sim/vdpa_sim.h | 110 +++
  drivers/vdpa/vdpa_sim/vdpa_sim.c | 285 ++-
  drivers/vdpa/vdpa_sim/vdpa_sim_net.c | 153 ++
  drivers/vdpa/Kconfig |   7 +-
  drivers/vdpa/vdpa_sim/Makefile   |   1 +
  5 files changed, 329 insertions(+), 227 deletions(-)
  create mode 100644 drivers/vdpa/vdpa_sim/vdpa_sim.h
  create mode 100644 drivers/vdpa/vdpa_sim/vdpa_sim_net.c

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.h b/drivers/vdpa/vdpa_sim/vdpa_sim.h
new file mode 100644
index ..33613c49888c
--- /dev/null
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2020, Red Hat Inc. All rights reserved.
+ */
+
+#ifndef _VDPA_SIM_H
+#define _VDPA_SIM_H
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define DRV_VERSION  "0.1"
+#define DRV_AUTHOR   "Jason Wang "
+#define DRV_LICENSE  "GPL v2"
+
+#define VDPASIM_QUEUE_ALIGN PAGE_SIZE
+#define VDPASIM_QUEUE_MAX 256
+#define VDPASIM_VENDOR_ID 0
+#define VDPASIM_VQ_NUM 0x2
+
+#define VDPASIM_FEATURES   ((1ULL << VIRTIO_F_ANY_LAYOUT) | \
+(1ULL << VIRTIO_F_VERSION_1)  | \
+(1ULL << VIRTIO_F_ACCESS_PLATFORM))
+
+struct vdpasim;
+
+struct vdpasim_virtqueue {
+   struct vringh vring;
+   struct vringh_kiov iov;
+   unsigned short head;
+   bool ready;
+   u64 desc_addr;
+   u64 device_addr;
+   u64 driver_addr;
+   u32 num;
+   void *private;
+   irqreturn_t (*cb)(void *data);
+};
+
+struct vdpasim_init_attr {
+   u32 device_id;
+   u64 features;
+   work_func_t work_fn;
+   int batch_mapping;
+};
+
+/* State of each vdpasim device */
+struct vdpasim {
+   struct vdpa_device vdpa;
+   struct vdpasim_virtqueue vqs[VDPASIM_VQ_NUM];
+   struct work_struct work;
+   /* spinlock to synchronize virtqueue state */
+   spinlock_t lock;
+   /* virtio config according to device type */
+   void *config;
+   struct vhost_iotlb *iommu;
+   void *buffer;
+   u32 device_id;
+   u32 status;
+   u32 generation;
+   u64 features;
+   u64 supported_features;
+   /* spinlock to synchronize iommu table */
+   spinlock_t iommu_lock;
+};
+
+struct vdpasim *vdpasim_create(struct vdpasim_init_attr *attr);
+
+/* TODO: cross-endian support */
+static inline bool vdpasim_is_little_endian(struct vdpasim *vdpasim)
+{
+   return virtio_legacy_is_little_endian() ||
+   (vdpasim->features & (1ULL << VIRTIO_F_VERSION_1));
+}
+
+static inline u16 vdpasim16_to_cpu(struct vdpasim *vdpasim, __virtio16 val)
+{
+   return __virtio16_to_cpu(vdpasim_is_little_endian(vdpasim), val);
+}
+
+static inline __virtio16 cpu_to_vdpasim16(struct vdpasim *vdpasim, u16 val)
+{
+   return __cpu_to_virtio16(vdpasim_is_little_endian(vdpasim), val);
+}
+
+static inline u32 vdpasim32_to_cpu(struct vdpasim *vdpasim, __virtio32 val)
+{
+   return __virtio32_to_cpu(vdpasim_is_little_endian(vdpasim), val);
+}
+
+static inline __virtio32 cpu_to_vdpasim32(struct vdpasim *vdpasim, u32 val)
+{
+   return __cpu_to_virtio32(vdpasim_is_little_endian(vdpasim), val);
+}
+
+static inline u64 vdpasim64_to_cpu(struct vdpasim *vdpasim, __virtio64 val)
+{
+   return __virtio64_to_cpu(vdpasim_is_little_endian(vdpasim), val);
+}
+
+static inline __virtio64 cpu_to_vdpasim64(struct vdpasim *vdpasim, u64 val)
+{
+   return __cpu_to_virtio64(vdpasim_is_little_endian(vdpasim), val);
+}
+
+#endif
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index 6a90fdb9cbfc..04f9dc9ce8c8 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -1,107 +1,16 @@
  // SPDX-License-Identifier: GPL-2.0-only
  /*
- * VDPA networking devi

Re: [PATCH RFC 00/12] vdpa: generalize vdpa simulator and add block device

2020-11-15 Thread Jason Wang



On 2020/11/13 下午9:47, Stefano Garzarella wrote:

Thanks to Max that started this work!
I took his patches, and extended the block simulator a bit.

This series moves the network device simulator into a new module
(vdpa_sim_net) and leaves the generic functions in the vdpa_sim core
module, allowing the possibility to add new vDPA device simulators.
Then we added a new vdpa_sim_blk module to simulate a block device.

I'm not sure about patch 11 ("vringh: allow vringh_iov_xfer() to skip
bytes when ptr is NULL"); maybe we can add a new function instead of
modifying vringh_iov_xfer().

As Max reported, I'm also seeing errors with vdpa_sim_blk related to
the iotlb and vringh when there is high load; these are some of the error
messages I can see randomly:

   vringh: Failed to access avail idx at e8deb2cc
   vringh: Failed to read head: idx 6289 address e1ad1d50
   vringh: Failed to get flags at 6635d7a3

   virtio_vdpa vdpa0: vringh_iov_push_iotlb() error: -14 offset: 0x284 len: 
0x2
   virtio_vdpa vdpa0: vringh_iov_pull_iotlb() error: -14 offset: 0x58ee000 len: 
0x3000

These errors should all be related to the fact that iotlb_translate()
fails with -EINVAL, so it seems that we miss some mapping.



Is this only reproducible when there are multiple concurrent accesses to the 
IOTLB? If yes, it's probably a hint that some kind of synchronization is 
still missing somewhere.


It might be useful to log the dma_map/unmap in both virtio_ring and 
vringh to see who is missing the map.


Thanks




I'll debug more carefully, in the meantime can you give a first review?

Thanks,
Stefano

Max Gurtovoy (4):
   vhost-vdpa: add support for vDPA blk devices
   vdpa: split vdpasim to core and net modules
   vdpa_sim: remove hard-coded virtq count
   vdpa: add vdpa simulator for block device

Stefano Garzarella (8):
   vdpa_sim: remove the limit of IOTLB entries
   vdpa_sim: add struct vdpasim_device to store device properties
   vdpa_sim: move config management outside of the core
   vdpa_sim: use kvmalloc to allocate vdpasim->buffer
   vdpa_sim: make vdpasim->buffer size configurable
   vdpa_sim: split vdpasim_virtqueue's iov field in riov and wiov
   vringh: allow vringh_iov_xfer() to skip bytes when ptr is NULL
   vdpa_sim_blk: implement ramdisk behaviour

  drivers/vdpa/vdpa_sim/vdpa_sim.h | 117 +++
  drivers/vdpa/vdpa_sim/vdpa_sim.c | 283 +--
  drivers/vdpa/vdpa_sim/vdpa_sim_blk.c | 251 
  drivers/vdpa/vdpa_sim/vdpa_sim_net.c | 172 
  drivers/vhost/vdpa.c |  11 +-
  drivers/vhost/vringh.c   |  16 +-
  drivers/vdpa/Kconfig |  16 +-
  drivers/vdpa/vdpa_sim/Makefile   |   2 +
  8 files changed, 628 insertions(+), 240 deletions(-)
  create mode 100644 drivers/vdpa/vdpa_sim/vdpa_sim.h
  create mode 100644 drivers/vdpa/vdpa_sim/vdpa_sim_blk.c
  create mode 100644 drivers/vdpa/vdpa_sim/vdpa_sim_net.c





Re: [PATCH v3] vhost-vdpa: fix page pinning leakage in error path (rework)

2020-11-09 Thread Jason Wang



On 2020/11/6 上午7:26, Si-Wei Liu wrote:

Pinned pages are not properly accounted, particularly when a
mapping error occurs on IOTLB update. Clean up dangling
pinned pages in the error path.

The memory usage for bookkeeping pinned pages is reverted
to what it was before: only one single free page is needed.
This helps reduce the host memory demand for VM with a large
amount of memory, or in the situation where host is running
short of free memory.

Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend")
Signed-off-by: Si-Wei Liu 
---
Changes in v3:
- Turn explicit last_pfn check to a WARN_ON() (Jason)

Changes in v2:
- Drop the reversion patch
- Fix unhandled page leak towards the end of page_list



Acked-by: Jason Wang 

Thanks




  drivers/vhost/vdpa.c | 80 
  1 file changed, 62 insertions(+), 18 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index b6d9016..5b13dfd 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -560,6 +560,8 @@ static int vhost_vdpa_map(struct vhost_vdpa *v,
  
  	if (r)

vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1);
+   else
+   atomic64_add(size >> PAGE_SHIFT, &dev->mm->pinned_vm);
  
  	return r;

  }
@@ -591,14 +593,16 @@ static int vhost_vdpa_process_iotlb_update(struct 
vhost_vdpa *v,
unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
unsigned int gup_flags = FOLL_LONGTERM;
unsigned long npages, cur_base, map_pfn, last_pfn = 0;
-   unsigned long locked, lock_limit, pinned, i;
+   unsigned long lock_limit, sz2pin, nchunks, i;
u64 iova = msg->iova;
+   long pinned;
int ret = 0;
  
  	if (vhost_iotlb_itree_first(iotlb, msg->iova,

msg->iova + msg->size - 1))
return -EEXIST;
  
+	/* Limit the use of memory for bookkeeping */

page_list = (struct page **) __get_free_page(GFP_KERNEL);
if (!page_list)
return -ENOMEM;
@@ -607,52 +611,75 @@ static int vhost_vdpa_process_iotlb_update(struct 
vhost_vdpa *v,
gup_flags |= FOLL_WRITE;
  
  	npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> PAGE_SHIFT;

-   if (!npages)
-   return -EINVAL;
+   if (!npages) {
+   ret = -EINVAL;
+   goto free;
+   }
  
  	mmap_read_lock(dev->mm);
  
-	locked = atomic64_add_return(npages, &dev->mm->pinned_vm);

lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-   if (locked > lock_limit) {
+   if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
ret = -ENOMEM;
-   goto out;
+   goto unlock;
}
  
  	cur_base = msg->uaddr & PAGE_MASK;

iova &= PAGE_MASK;
+   nchunks = 0;
  
  	while (npages) {

-   pinned = min_t(unsigned long, npages, list_size);
-   ret = pin_user_pages(cur_base, pinned,
-gup_flags, page_list, NULL);
-   if (ret != pinned)
+   sz2pin = min_t(unsigned long, npages, list_size);
+   pinned = pin_user_pages(cur_base, sz2pin,
+   gup_flags, page_list, NULL);
+   if (sz2pin != pinned) {
+   if (pinned < 0) {
+   ret = pinned;
+   } else {
+   unpin_user_pages(page_list, pinned);
+   ret = -ENOMEM;
+   }
goto out;
+   }
+   nchunks++;
  
  		if (!last_pfn)

map_pfn = page_to_pfn(page_list[0]);
  
-		for (i = 0; i < ret; i++) {

+   for (i = 0; i < pinned; i++) {
unsigned long this_pfn = page_to_pfn(page_list[i]);
u64 csize;
  
  			if (last_pfn && (this_pfn != last_pfn + 1)) {

/* Pin a contiguous chunk of memory */
csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
-   if (vhost_vdpa_map(v, iova, csize,
-  map_pfn << PAGE_SHIFT,
-  msg->perm))
+   ret = vhost_vdpa_map(v, iova, csize,
+map_pfn << PAGE_SHIFT,
+msg->perm);
+   if (ret) {
+   /*
+* Unpin the pages that are left 
unmapped
+* from this point on in the current
+ 

Re: [PATCH v2] vhost-vdpa: fix page pinning leakage in error path (rework)

2020-11-09 Thread Jason Wang



On 2020/11/10 上午7:56, si-wei liu wrote:


On 11/9/2020 2:42 PM, Michael S. Tsirkin wrote:

On Mon, Nov 09, 2020 at 01:44:03PM -0800, si-wei liu wrote:

On 11/8/2020 7:21 PM, Jason Wang wrote:

On 2020/11/6 上午6:57, si-wei liu wrote:

On 11/4/2020 7:26 PM, Jason Wang wrote:

On 2020/11/5 上午7:33, Si-Wei Liu wrote:

Pinned pages are not properly accounted particularly when
mapping error occurs on IOTLB update. Clean up dangling
pinned pages for the error path.

The memory usage for bookkeeping pinned pages is reverted
to what it was before: only one single free page is needed.
This helps reduce the host memory demand for VM with a large
amount of memory, or in the situation where host is running
short of free memory.

Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend")
Signed-off-by: Si-Wei Liu 
---
Changes in v2:
- Drop the reversion patch
- Fix unhandled page leak towards the end of page_list

   drivers/vhost/vdpa.c | 79

   1 file changed, 61 insertions(+), 18 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index b6d9016..e112854 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -560,6 +560,8 @@ static int vhost_vdpa_map(struct vhost_vdpa *v,
 if (r)
   vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 
1);

+    else
+    atomic64_add(size >> PAGE_SHIFT, >mm->pinned_vm);
 return r;
   }
@@ -591,14 +593,16 @@ static int
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,
   unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
   unsigned int gup_flags = FOLL_LONGTERM;
   unsigned long npages, cur_base, map_pfn, last_pfn = 0;
-    unsigned long locked, lock_limit, pinned, i;
+    unsigned long lock_limit, sz2pin, nchunks, i;
   u64 iova = msg->iova;
+    long pinned;
   int ret = 0;
 if (vhost_iotlb_itree_first(iotlb, msg->iova,
   msg->iova + msg->size - 1))
   return -EEXIST;
   +    /* Limit the use of memory for bookkeeping */
   page_list = (struct page **) __get_free_page(GFP_KERNEL);
   if (!page_list)
   return -ENOMEM;
@@ -607,52 +611,75 @@ static int
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,
   gup_flags |= FOLL_WRITE;
 npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK))

PAGE_SHIFT;

-    if (!npages)
-    return -EINVAL;
+    if (!npages) {
+    ret = -EINVAL;
+    goto free;
+    }
 mmap_read_lock(dev->mm);
   -    locked = atomic64_add_return(npages, >mm->pinned_vm);
   lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-    if (locked > lock_limit) {
+    if (npages + atomic64_read(>mm->pinned_vm) > 
lock_limit) {

   ret = -ENOMEM;
-    goto out;
+    goto unlock;
   }
 cur_base = msg->uaddr & PAGE_MASK;
   iova &= PAGE_MASK;
+    nchunks = 0;
 while (npages) {
-    pinned = min_t(unsigned long, npages, list_size);
-    ret = pin_user_pages(cur_base, pinned,
- gup_flags, page_list, NULL);
-    if (ret != pinned)
+    sz2pin = min_t(unsigned long, npages, list_size);
+    pinned = pin_user_pages(cur_base, sz2pin,
+    gup_flags, page_list, NULL);
+    if (sz2pin != pinned) {
+    if (pinned < 0) {
+    ret = pinned;
+    } else {
+    unpin_user_pages(page_list, pinned);
+    ret = -ENOMEM;
+    }
   goto out;
+    }
+    nchunks++;
 if (!last_pfn)
   map_pfn = page_to_pfn(page_list[0]);
   -    for (i = 0; i < ret; i++) {
+    for (i = 0; i < pinned; i++) {
   unsigned long this_pfn = page_to_pfn(page_list[i]);
   u64 csize;
 if (last_pfn && (this_pfn != last_pfn + 1)) {
   /* Pin a contiguous chunk of memory */
   csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
-    if (vhost_vdpa_map(v, iova, csize,
-   map_pfn << PAGE_SHIFT,
-   msg->perm))
+    ret = vhost_vdpa_map(v, iova, csize,
+ map_pfn << PAGE_SHIFT,
+ msg->perm);
+    if (ret) {
+    /*
+ * Unpin the pages that are left unmapped
+ * from this point on in the current
+ * page_list. The remaining outstanding
+ * ones which may stride across several
+ * chunks will be covered in the common
+ * error path subsequently.
+ */
+ unpin_user_pages(&page_list[i],
+ pinned - i);


Can we simply do last_pfn = this_pfn here?

Nope. They are not contiguous segments of memory.

Re: [PATCH virtio] virtio: virtio_console: fix DMA memory allocation for rproc serial

2020-11-08 Thread Jason Wang



On 2020/11/5 8:22 PM, Alexander Lobakin wrote:

From: Jason Wang 
Date: Thu, 5 Nov 2020 11:10:24 +0800

Hi Jason,


On 2020/11/4 11:31 PM, Alexander Lobakin wrote:

Since commit 086d08725d34 ("remoteproc: create vdev subdevice with
specific dma memory pool"), every remoteproc has a DMA subdevice
("remoteprocX#vdevYbuffer") for each virtio device, which inherits
DMA capabilities from the corresponding platform device. This allowed
to associate different DMA pools with each vdev, and required from
virtio drivers to perform DMA operations with the parent device
(vdev->dev.parent) instead of grandparent (vdev->dev.parent->parent).

virtio_rpmsg_bus was already changed in the same merge cycle with
commit d999b622fcfb ("rpmsg: virtio: allocate buffer from parent"),
but virtio_console did not. In fact, operations using the grandparent
worked fine while the grandparent was the platform device, but since
commit c774ad010873 ("remoteproc: Fix and restore the parenting
hierarchy for vdev") this was changed, and now the grandparent device
is the remoteproc device without any DMA capabilities.
So, starting v5.8-rc1 the following warning is observed:

[2.483925] [ cut here ]
[2.489148] WARNING: CPU: 3 PID: 101 at kernel/dma/mapping.c:427 0x80e7eee8
[2.489152] Modules linked in: virtio_console(+)
[2.503737]  virtio_rpmsg_bus rpmsg_core
[2.508903]
[2.528898] 
[2.913043]
[2.914907] ---[ end trace 93ac8746beab612c ]---
[2.920102] virtio-ports vport1p0: Error allocating inbufs

kernel/dma/mapping.c:427 is:

WARN_ON_ONCE(!dev->coherent_dma_mask);

obviously because the grandparent now is remoteproc dev without any
DMA caps:

[3.104943] Parent: remoteproc0#vdev1buffer, grandparent: remoteproc0

Fix this the same way as it was for virtio_rpmsg_bus, using just the
parent device (vdev->dev.parent, "remoteprocX#vdevYbuffer") for DMA
operations.
This also allows now to reserve DMA pools/buffers for rproc serial
via Device Tree.

Fixes: c774ad010873 ("remoteproc: Fix and restore the parenting hierarchy for 
vdev")
Cc: stable@vger.kernel.org # 5.1+
Signed-off-by: Alexander Lobakin 
---
   drivers/char/virtio_console.c | 8 
   1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index a2da8f768b94..1836cc56e357 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -435,12 +435,12 @@ static struct port_buffer *alloc_buf(struct virtio_device 
*vdev, size_t buf_size
/*
 * Allocate DMA memory from ancestor. When a virtio
 * device is created by remoteproc, the DMA memory is
-* associated with the grandparent device:
-* vdev => rproc => platform-dev.
+* associated with the parent device:
+* virtioY => remoteprocX#vdevYbuffer.
 */
-   if (!vdev->dev.parent || !vdev->dev.parent->parent)
+   buf->dev = vdev->dev.parent;
+   if (!buf->dev)
goto free_buf;
-   buf->dev = vdev->dev.parent->parent;


I wonder if it could be the right time to introduce a dma_dev for virtio
instead of depending on something magic via the parent.

This patch is meant to hit the RC window and stable trees as a fix for
a bug that has been present since v5.8-rc1. So any new features are out
of scope for this particular fix.



Right.




The idea of DMAing through "dev->parent" is that "virtioX" itself is a
logical dev, not the real one, but its parent *is*. This logic is used
across the whole tree -- every subsystem creates its own logical device,
but drivers should always use the backing PCI/platform/etc. devices for
DMA operations, which represent the real hardware.



Yes, so what I meant is to use different variables for DMA and
hierarchy. So it's the responsibility of the lower layer to pass a
correct "dma_dev" to the upper layer instead of depending on the
parent. (A rough sketch follows at the end of this message.)


Anyway for this patch.

Acked-by: Jason Wang 

Thanks





(Btw, I didn't even notice that there's transport-specific code in
virtio_console; it's better to avoid it.)

Thanks

Thanks,
Al


/* Increase device refcnt to avoid freeing it */
get_device(buf->dev);
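
A rough sketch of that "dma_dev" idea (hypothetical field and setup, not an
existing virtio API at the time of this thread):

    #include <linux/device.h>

    /*
     * Hypothetical sketch: the transport that creates the virtio device
     * records which struct device must back DMA operations, so drivers
     * stop guessing via dev.parent or dev.parent->parent.
     */
    struct virtio_device_sketch {
            struct device dev;      /* the logical "virtioY" device */
            struct device *dma_dev; /* set by the transport at creation */
    };

    /*
     * A driver such as virtio_console would then simply do
     *     buf->dev = vdev->dma_dev;
     * with remoteproc pointing dma_dev at "remoteprocX#vdevYbuffer".
     */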




Re: [PATCH v2] vhost-vdpa: fix page pinning leakage in error path (rework)

2020-11-08 Thread Jason Wang



On 2020/11/6 6:57 AM, si-wei liu wrote:


On 11/4/2020 7:26 PM, Jason Wang wrote:


On 2020/11/5 7:33 AM, Si-Wei Liu wrote:

Pinned pages are not properly accounted particularly when
mapping error occurs on IOTLB update. Clean up dangling
pinned pages for the error path.

The memory usage for bookkeeping pinned pages is reverted
to what it was before: only one single free page is needed.
This helps reduce the host memory demand for VM with a large
amount of memory, or in the situation where host is running
short of free memory.

Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend")
Signed-off-by: Si-Wei Liu 
---
Changes in v2:
- Drop the reversion patch
- Fix unhandled page leak towards the end of page_list

  drivers/vhost/vdpa.c | 79 


  1 file changed, 61 insertions(+), 18 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index b6d9016..e112854 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -560,6 +560,8 @@ static int vhost_vdpa_map(struct vhost_vdpa *v,
    if (r)
  vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1);
+    else
+    atomic64_add(size >> PAGE_SHIFT, &dev->mm->pinned_vm);
    return r;
  }
@@ -591,14 +593,16 @@ static int 
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

  unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
  unsigned int gup_flags = FOLL_LONGTERM;
  unsigned long npages, cur_base, map_pfn, last_pfn = 0;
-    unsigned long locked, lock_limit, pinned, i;
+    unsigned long lock_limit, sz2pin, nchunks, i;
  u64 iova = msg->iova;
+    long pinned;
  int ret = 0;
    if (vhost_iotlb_itree_first(iotlb, msg->iova,
  msg->iova + msg->size - 1))
  return -EEXIST;
  +    /* Limit the use of memory for bookkeeping */
  page_list = (struct page **) __get_free_page(GFP_KERNEL);
  if (!page_list)
  return -ENOMEM;
@@ -607,52 +611,75 @@ static int 
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

  gup_flags |= FOLL_WRITE;
    npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> 
PAGE_SHIFT;

-    if (!npages)
-    return -EINVAL;
+    if (!npages) {
+    ret = -EINVAL;
+    goto free;
+    }
    mmap_read_lock(dev->mm);
  -    locked = atomic64_add_return(npages, &dev->mm->pinned_vm);
  lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-    if (locked > lock_limit) {
+    if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
  ret = -ENOMEM;
-    goto out;
+    goto unlock;
  }
    cur_base = msg->uaddr & PAGE_MASK;
  iova &= PAGE_MASK;
+    nchunks = 0;
    while (npages) {
-    pinned = min_t(unsigned long, npages, list_size);
-    ret = pin_user_pages(cur_base, pinned,
- gup_flags, page_list, NULL);
-    if (ret != pinned)
+    sz2pin = min_t(unsigned long, npages, list_size);
+    pinned = pin_user_pages(cur_base, sz2pin,
+    gup_flags, page_list, NULL);
+    if (sz2pin != pinned) {
+    if (pinned < 0) {
+    ret = pinned;
+    } else {
+    unpin_user_pages(page_list, pinned);
+    ret = -ENOMEM;
+    }
  goto out;
+    }
+    nchunks++;
    if (!last_pfn)
  map_pfn = page_to_pfn(page_list[0]);
  -    for (i = 0; i < ret; i++) {
+    for (i = 0; i < pinned; i++) {
  unsigned long this_pfn = page_to_pfn(page_list[i]);
  u64 csize;
    if (last_pfn && (this_pfn != last_pfn + 1)) {
  /* Pin a contiguous chunk of memory */
  csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
-    if (vhost_vdpa_map(v, iova, csize,
-   map_pfn << PAGE_SHIFT,
-   msg->perm))
+    ret = vhost_vdpa_map(v, iova, csize,
+ map_pfn << PAGE_SHIFT,
+ msg->perm);
+    if (ret) {
+    /*
+ * Unpin the pages that are left unmapped
+ * from this point on in the current
+ * page_list. The remaining outstanding
+ * ones which may stride across several
+ * chunks will be covered in the common
+ * error path subsequently.
+ */
+    unpin_user_pages(&page_list[i],
+ pinned - i);



Can we simply do last_pfn = this_pfn here?
Nope. They are not contiguous segments of memory. Note the
conditional (this_pfn != last_pfn + 1) being checked here. (A toy
sketch of the chunking follows at the end of this message.)



Right.








  goto out;
+    }
+
  map_p
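
A minimal userspace sketch of the chunking logic under discussion (toy PFN
values, not the kernel code): each maximal run of physically contiguous pages
becomes one vhost_vdpa_map() call, and because the runs are disjoint, a bare
"last_pfn = this_pfn" cannot stand in for unpinning the failed run.

    #include <stdio.h>

    /* Toy PFN list containing three physically contiguous runs. */
    static unsigned long pfns[] = { 100, 101, 102, 200, 201, 300 };

    int main(void)
    {
            size_t i, n = sizeof(pfns) / sizeof(pfns[0]);
            unsigned long map_pfn = pfns[0], last_pfn = pfns[0];

            for (i = 1; i < n; i++) {
                    if (pfns[i] != last_pfn + 1) {
                            /* End of a run: this is where one
                             * vhost_vdpa_map() call would cover
                             * [map_pfn .. last_pfn]. */
                            printf("map run: %lu..%lu\n", map_pfn, last_pfn);
                            map_pfn = pfns[i];
                    }
                    last_pfn = pfns[i];
            }
            printf("map run: %lu..%lu\n", map_pfn, last_pfn);
            return 0;
    }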

Re: [PATCH v2] vhost-vdpa: fix page pinning leakage in error path (rework)

2020-11-04 Thread Jason Wang



On 2020/11/5 7:33 AM, Si-Wei Liu wrote:

Pinned pages are not properly accounted particularly when
mapping error occurs on IOTLB update. Clean up dangling
pinned pages for the error path.

The memory usage for bookkeeping pinned pages is reverted
to what it was before: only one single free page is needed.
This helps reduce the host memory demand for VM with a large
amount of memory, or in the situation where host is running
short of free memory.

Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend")
Signed-off-by: Si-Wei Liu 
---
Changes in v2:
- Drop the reversion patch
- Fix unhandled page leak towards the end of page_list

  drivers/vhost/vdpa.c | 79 
  1 file changed, 61 insertions(+), 18 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index b6d9016..e112854 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -560,6 +560,8 @@ static int vhost_vdpa_map(struct vhost_vdpa *v,
  
  	if (r)

vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1);
+   else
+   atomic64_add(size >> PAGE_SHIFT, &dev->mm->pinned_vm);
  
  	return r;

  }
@@ -591,14 +593,16 @@ static int vhost_vdpa_process_iotlb_update(struct 
vhost_vdpa *v,
unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
unsigned int gup_flags = FOLL_LONGTERM;
unsigned long npages, cur_base, map_pfn, last_pfn = 0;
-   unsigned long locked, lock_limit, pinned, i;
+   unsigned long lock_limit, sz2pin, nchunks, i;
u64 iova = msg->iova;
+   long pinned;
int ret = 0;
  
  	if (vhost_iotlb_itree_first(iotlb, msg->iova,

msg->iova + msg->size - 1))
return -EEXIST;
  
+	/* Limit the use of memory for bookkeeping */

page_list = (struct page **) __get_free_page(GFP_KERNEL);
if (!page_list)
return -ENOMEM;
@@ -607,52 +611,75 @@ static int vhost_vdpa_process_iotlb_update(struct 
vhost_vdpa *v,
gup_flags |= FOLL_WRITE;
  
  	npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> PAGE_SHIFT;

-   if (!npages)
-   return -EINVAL;
+   if (!npages) {
+   ret = -EINVAL;
+   goto free;
+   }
  
  	mmap_read_lock(dev->mm);
  
-	locked = atomic64_add_return(npages, &dev->mm->pinned_vm);

lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-   if (locked > lock_limit) {
+   if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
ret = -ENOMEM;
-   goto out;
+   goto unlock;
}
  
  	cur_base = msg->uaddr & PAGE_MASK;

iova &= PAGE_MASK;
+   nchunks = 0;
  
  	while (npages) {

-   pinned = min_t(unsigned long, npages, list_size);
-   ret = pin_user_pages(cur_base, pinned,
-gup_flags, page_list, NULL);
-   if (ret != pinned)
+   sz2pin = min_t(unsigned long, npages, list_size);
+   pinned = pin_user_pages(cur_base, sz2pin,
+   gup_flags, page_list, NULL);
+   if (sz2pin != pinned) {
+   if (pinned < 0) {
+   ret = pinned;
+   } else {
+   unpin_user_pages(page_list, pinned);
+   ret = -ENOMEM;
+   }
goto out;
+   }
+   nchunks++;
  
  		if (!last_pfn)

map_pfn = page_to_pfn(page_list[0]);
  
-		for (i = 0; i < ret; i++) {

+   for (i = 0; i < pinned; i++) {
unsigned long this_pfn = page_to_pfn(page_list[i]);
u64 csize;
  
  			if (last_pfn && (this_pfn != last_pfn + 1)) {

/* Pin a contiguous chunk of memory */
csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
-   if (vhost_vdpa_map(v, iova, csize,
-  map_pfn << PAGE_SHIFT,
-  msg->perm))
+   ret = vhost_vdpa_map(v, iova, csize,
+map_pfn << PAGE_SHIFT,
+msg->perm);
+   if (ret) {
+   /*
+* Unpin the pages that are left 
unmapped
+* from this point on in the current
+* page_list. The remaining outstanding
+* ones which may stride across several
+* chunks will be covered in the common
+* error path 

Re: [PATCH 2/2] vhost-vdpa: fix page pinning leakage in error path (rework)

2020-11-04 Thread Jason Wang



On 2020/11/5 7:40 AM, si-wei liu wrote:


On 11/3/2020 6:42 PM, Jason Wang wrote:


On 2020/10/30 3:45 PM, Si-Wei Liu wrote:

Pinned pages are not properly accounted particularly when
mapping error occurs on IOTLB update. Clean up dangling
pinned pages for the error path.

The memory usage for bookkeeping pinned pages is reverted
to what it was before: only one single free page is needed.
This helps reduce the host memory demand for VM with a large
amount of memory, or in the situation where host is running
short of free memory.

Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend")
Signed-off-by: Si-Wei Liu 
---
  drivers/vhost/vdpa.c | 64 
+---

  1 file changed, 46 insertions(+), 18 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index b6d9016..8da8558 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -560,6 +560,8 @@ static int vhost_vdpa_map(struct vhost_vdpa *v,
    if (r)
  vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1);
+    else
+    atomic64_add(size >> PAGE_SHIFT, &dev->mm->pinned_vm);
    return r;
  }
@@ -591,14 +593,16 @@ static int 
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

  unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
  unsigned int gup_flags = FOLL_LONGTERM;
  unsigned long npages, cur_base, map_pfn, last_pfn = 0;
-    unsigned long locked, lock_limit, pinned, i;
+    unsigned long lock_limit, sz2pin, nchunks, i;
  u64 iova = msg->iova;
+    long pinned;
  int ret = 0;
    if (vhost_iotlb_itree_first(iotlb, msg->iova,
  msg->iova + msg->size - 1))
  return -EEXIST;
  +    /* Limit the use of memory for bookkeeping */
  page_list = (struct page **) __get_free_page(GFP_KERNEL);
  if (!page_list)
  return -ENOMEM;
@@ -607,52 +611,64 @@ static int 
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

  gup_flags |= FOLL_WRITE;
    npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> 
PAGE_SHIFT;

-    if (!npages)
-    return -EINVAL;
+    if (!npages) {
+    ret = -EINVAL;
+    goto free;
+    }
    mmap_read_lock(dev->mm);
  -    locked = atomic64_add_return(npages, &dev->mm->pinned_vm);
  lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-    if (locked > lock_limit) {
+    if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
  ret = -ENOMEM;
-    goto out;
+    goto unlock;
  }
    cur_base = msg->uaddr & PAGE_MASK;
  iova &= PAGE_MASK;
+    nchunks = 0;
    while (npages) {
-    pinned = min_t(unsigned long, npages, list_size);
-    ret = pin_user_pages(cur_base, pinned,
- gup_flags, page_list, NULL);
-    if (ret != pinned)
+    sz2pin = min_t(unsigned long, npages, list_size);
+    pinned = pin_user_pages(cur_base, sz2pin,
+    gup_flags, page_list, NULL);
+    if (sz2pin != pinned) {
+    if (pinned < 0) {
+    ret = pinned;
+    } else {
+    unpin_user_pages(page_list, pinned);
+    ret = -ENOMEM;
+    }
  goto out;
+    }
+    nchunks++;
    if (!last_pfn)
  map_pfn = page_to_pfn(page_list[0]);
  -    for (i = 0; i < ret; i++) {
+    for (i = 0; i < pinned; i++) {
  unsigned long this_pfn = page_to_pfn(page_list[i]);
  u64 csize;
    if (last_pfn && (this_pfn != last_pfn + 1)) {
  /* Pin a contiguous chunk of memory */
  csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
-    if (vhost_vdpa_map(v, iova, csize,
-   map_pfn << PAGE_SHIFT,
-   msg->perm))
+    ret = vhost_vdpa_map(v, iova, csize,
+ map_pfn << PAGE_SHIFT,
+ msg->perm);
+    if (ret)
  goto out;
+
  map_pfn = this_pfn;
  iova += csize;
+    nchunks = 0;
  }
    last_pfn = this_pfn;
  }
  -    cur_base += ret << PAGE_SHIFT;
-    npages -= ret;
+    cur_base += pinned << PAGE_SHIFT;
+    npages -= pinned;
  }
    /* Pin the rest chunk */
@@ -660,10 +676,22 @@ static int 
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

   map_pfn << PAGE_SHIFT, msg->perm);
  out:
  if (ret) {
+    if (nchunks && last_pfn) {



Can we decrease npages where you did "nchunks++", then check
npages here instead? (A toy sketch of this idea follows at the end of
this message.)
Hmmm, I am not sure I get what you want... @nchunks gets reset to 0 
whenever a certain range of pinned pages is successfully mapped. The 
conditional (when nchunks i
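
A toy model of the suggestion above (an assumption about its intent, not
kernel code): record progress by decrementing the remaining-page counter, so
the error path can test it instead of keeping a separate nchunks counter.

    #include <stdio.h>

    int main(void)
    {
            unsigned long total = 10, npages = total, chunk = 4;
            int err = 0;

            while (npages) {
                    unsigned long batch = npages < chunk ? npages : chunk;

                    if (npages == 2) {      /* simulate a mid-way failure */
                            err = -1;
                            break;
                    }
                    npages -= batch;        /* progress recorded here */
            }

            /* "npages != total" now plays the role of "nchunks != 0". */
            if (err && npages != total)
                    printf("unpin %lu already-handled pages\n",
                           total - npages);
            return 0;
    }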

Re: [PATCH virtio] virtio: virtio_console: fix DMA memory allocation for rproc serial

2020-11-04 Thread Jason Wang



On 2020/11/4 11:31 PM, Alexander Lobakin wrote:

Since commit 086d08725d34 ("remoteproc: create vdev subdevice with
specific dma memory pool"), every remoteproc has a DMA subdevice
("remoteprocX#vdevYbuffer") for each virtio device, which inherits
DMA capabilities from the corresponding platform device. This allowed
to associate different DMA pools with each vdev, and required from
virtio drivers to perform DMA operations with the parent device
(vdev->dev.parent) instead of grandparent (vdev->dev.parent->parent).

virtio_rpmsg_bus was already changed in the same merge cycle with
commit d999b622fcfb ("rpmsg: virtio: allocate buffer from parent"),
but virtio_console did not. In fact, operations using the grandparent
worked fine while the grandparent was the platform device, but since
commit c774ad010873 ("remoteproc: Fix and restore the parenting
hierarchy for vdev") this was changed, and now the grandparent device
is the remoteproc device without any DMA capabilities.
So, starting v5.8-rc1 the following warning is observed:

[2.483925] [ cut here ]
[2.489148] WARNING: CPU: 3 PID: 101 at kernel/dma/mapping.c:427 0x80e7eee8
[2.489152] Modules linked in: virtio_console(+)
[2.503737]  virtio_rpmsg_bus rpmsg_core
[2.508903]
[2.528898] 
[2.913043]
[2.914907] ---[ end trace 93ac8746beab612c ]---
[2.920102] virtio-ports vport1p0: Error allocating inbufs

kernel/dma/mapping.c:427 is:

WARN_ON_ONCE(!dev->coherent_dma_mask);

obviously because the grandparent now is remoteproc dev without any
DMA caps:

[3.104943] Parent: remoteproc0#vdev1buffer, grandparent: remoteproc0

Fix this the same way as it was for virtio_rpmsg_bus, using just the
parent device (vdev->dev.parent, "remoteprocX#vdevYbuffer") for DMA
operations.
This also allows now to reserve DMA pools/buffers for rproc serial
via Device Tree.

Fixes: c774ad010873 ("remoteproc: Fix and restore the parenting hierarchy for 
vdev")
Cc: stable@vger.kernel.org # 5.1+
Signed-off-by: Alexander Lobakin 
---
  drivers/char/virtio_console.c | 8 
  1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index a2da8f768b94..1836cc56e357 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -435,12 +435,12 @@ static struct port_buffer *alloc_buf(struct virtio_device 
*vdev, size_t buf_size
/*
 * Allocate DMA memory from ancestor. When a virtio
 * device is created by remoteproc, the DMA memory is
-* associated with the grandparent device:
-* vdev => rproc => platform-dev.
+* associated with the parent device:
+* virtioY => remoteprocX#vdevYbuffer.
 */
-   if (!vdev->dev.parent || !vdev->dev.parent->parent)
+   buf->dev = vdev->dev.parent;
+   if (!buf->dev)
goto free_buf;
-   buf->dev = vdev->dev.parent->parent;



I wonder if it could be the right time to introduce a dma_dev for virtio
instead of depending on something magic via the parent.


(Btw, I didn't even notice that there's transport-specific code in
virtio_console; it's better to avoid it.)


Thanks


  
  		/* Increase device refcnt to avoid freeing it */

get_device(buf->dev);




Re: [PATCH 2/2] vhost-vdpa: fix page pinning leakage in error path (rework)

2020-11-03 Thread Jason Wang



On 2020/10/30 3:45 PM, Si-Wei Liu wrote:

Pinned pages are not properly accounted particularly when
mapping error occurs on IOTLB update. Clean up dangling
pinned pages for the error path.

The memory usage for bookkeeping pinned pages is reverted
to what it was before: only one single free page is needed.
This helps reduce the host memory demand for VM with a large
amount of memory, or in the situation where host is running
short of free memory.

Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend")
Signed-off-by: Si-Wei Liu 
---
  drivers/vhost/vdpa.c | 64 +---
  1 file changed, 46 insertions(+), 18 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index b6d9016..8da8558 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -560,6 +560,8 @@ static int vhost_vdpa_map(struct vhost_vdpa *v,
  
  	if (r)

vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1);
+   else
+    atomic64_add(size >> PAGE_SHIFT, &dev->mm->pinned_vm);
  
  	return r;

  }
@@ -591,14 +593,16 @@ static int vhost_vdpa_process_iotlb_update(struct 
vhost_vdpa *v,
unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
unsigned int gup_flags = FOLL_LONGTERM;
unsigned long npages, cur_base, map_pfn, last_pfn = 0;
-   unsigned long locked, lock_limit, pinned, i;
+   unsigned long lock_limit, sz2pin, nchunks, i;
u64 iova = msg->iova;
+   long pinned;
int ret = 0;
  
  	if (vhost_iotlb_itree_first(iotlb, msg->iova,

msg->iova + msg->size - 1))
return -EEXIST;
  
+	/* Limit the use of memory for bookkeeping */

page_list = (struct page **) __get_free_page(GFP_KERNEL);
if (!page_list)
return -ENOMEM;
@@ -607,52 +611,64 @@ static int vhost_vdpa_process_iotlb_update(struct 
vhost_vdpa *v,
gup_flags |= FOLL_WRITE;
  
  	npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> PAGE_SHIFT;

-   if (!npages)
-   return -EINVAL;
+   if (!npages) {
+   ret = -EINVAL;
+   goto free;
+   }
  
  	mmap_read_lock(dev->mm);
  
-	locked = atomic64_add_return(npages, &dev->mm->pinned_vm);

lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-   if (locked > lock_limit) {
+   if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
ret = -ENOMEM;
-   goto out;
+   goto unlock;
}
  
  	cur_base = msg->uaddr & PAGE_MASK;

iova &= PAGE_MASK;
+   nchunks = 0;
  
  	while (npages) {

-   pinned = min_t(unsigned long, npages, list_size);
-   ret = pin_user_pages(cur_base, pinned,
-gup_flags, page_list, NULL);
-   if (ret != pinned)
+   sz2pin = min_t(unsigned long, npages, list_size);
+   pinned = pin_user_pages(cur_base, sz2pin,
+   gup_flags, page_list, NULL);
+   if (sz2pin != pinned) {
+   if (pinned < 0) {
+   ret = pinned;
+   } else {
+   unpin_user_pages(page_list, pinned);
+   ret = -ENOMEM;
+   }
goto out;
+   }
+   nchunks++;
  
  		if (!last_pfn)

map_pfn = page_to_pfn(page_list[0]);
  
-		for (i = 0; i < ret; i++) {

+   for (i = 0; i < pinned; i++) {
unsigned long this_pfn = page_to_pfn(page_list[i]);
u64 csize;
  
  			if (last_pfn && (this_pfn != last_pfn + 1)) {

/* Pin a contiguous chunk of memory */
csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
-   if (vhost_vdpa_map(v, iova, csize,
-  map_pfn << PAGE_SHIFT,
-  msg->perm))
+   ret = vhost_vdpa_map(v, iova, csize,
+map_pfn << PAGE_SHIFT,
+msg->perm);
+   if (ret)
goto out;
+
map_pfn = this_pfn;
iova += csize;
+   nchunks = 0;
}
  
  			last_pfn = this_pfn;

}
  
-		cur_base += ret << PAGE_SHIFT;

-   npages -= ret;
+   cur_base += pinned << PAGE_SHIFT;
+   npages -= pinned;
}
  
  	/* Pin the rest chunk */

@@ -660,10 +676,22 @@ static int vhost_vdpa_process_iotlb_update(struct 
vhost_vdpa *v,
 map_pfn << PAGE_SHIFT, 

Re: [PATCH 2/2] vhost-vdpa: fix page pinning leakage in error path (rework)

2020-11-03 Thread Jason Wang



On 2020/11/4 9:08 AM, si-wei liu wrote:


On 11/3/2020 5:06 PM, si-wei liu wrote:


On 11/3/2020 5:00 AM, Jason Wang wrote:


On 2020/10/30 3:45 PM, Si-Wei Liu wrote:

Pinned pages are not properly accounted particularly when
mapping error occurs on IOTLB update. Clean up dangling
pinned pages for the error path.

The memory usage for bookkeeping pinned pages is reverted
to what it was before: only one single free page is needed.
This helps reduce the host memory demand for VM with a large
amount of memory, or in the situation where host is running
short of free memory.

Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend")
Signed-off-by: Si-Wei Liu 
---
  drivers/vhost/vdpa.c | 64 
+---

  1 file changed, 46 insertions(+), 18 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index b6d9016..8da8558 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -560,6 +560,8 @@ static int vhost_vdpa_map(struct vhost_vdpa *v,
    if (r)
  vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1);
+    else
+    atomic64_add(size >> PAGE_SHIFT, &dev->mm->pinned_vm);
    return r;
  }
@@ -591,14 +593,16 @@ static int 
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

  unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
  unsigned int gup_flags = FOLL_LONGTERM;
  unsigned long npages, cur_base, map_pfn, last_pfn = 0;
-    unsigned long locked, lock_limit, pinned, i;
+    unsigned long lock_limit, sz2pin, nchunks, i;
  u64 iova = msg->iova;
+    long pinned;
  int ret = 0;
    if (vhost_iotlb_itree_first(iotlb, msg->iova,
  msg->iova + msg->size - 1))
  return -EEXIST;
  +    /* Limit the use of memory for bookkeeping */
  page_list = (struct page **) __get_free_page(GFP_KERNEL);
  if (!page_list)
  return -ENOMEM;
@@ -607,52 +611,64 @@ static int 
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

  gup_flags |= FOLL_WRITE;
    npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> 
PAGE_SHIFT;

-    if (!npages)
-    return -EINVAL;
+    if (!npages) {
+    ret = -EINVAL;
+    goto free;
+    }
    mmap_read_lock(dev->mm);
  -    locked = atomic64_add_return(npages, &dev->mm->pinned_vm);
  lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-    if (locked > lock_limit) {
+    if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
  ret = -ENOMEM;
-    goto out;
+    goto unlock;
  }
    cur_base = msg->uaddr & PAGE_MASK;
  iova &= PAGE_MASK;
+    nchunks = 0;
    while (npages) {
-    pinned = min_t(unsigned long, npages, list_size);
-    ret = pin_user_pages(cur_base, pinned,
- gup_flags, page_list, NULL);
-    if (ret != pinned)
+    sz2pin = min_t(unsigned long, npages, list_size);
+    pinned = pin_user_pages(cur_base, sz2pin,
+    gup_flags, page_list, NULL);
+    if (sz2pin != pinned) {
+    if (pinned < 0) {
+    ret = pinned;
+    } else {
+    unpin_user_pages(page_list, pinned);
+    ret = -ENOMEM;
+    }
  goto out;
+    }
+    nchunks++;
    if (!last_pfn)
  map_pfn = page_to_pfn(page_list[0]);
  -    for (i = 0; i < ret; i++) {
+    for (i = 0; i < pinned; i++) {
  unsigned long this_pfn = page_to_pfn(page_list[i]);
  u64 csize;
    if (last_pfn && (this_pfn != last_pfn + 1)) {
  /* Pin a contiguous chunk of memory */
  csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
-    if (vhost_vdpa_map(v, iova, csize,
-   map_pfn << PAGE_SHIFT,
-   msg->perm))
+    ret = vhost_vdpa_map(v, iova, csize,
+ map_pfn << PAGE_SHIFT,
+ msg->perm);
+    if (ret)
  goto out;
+
  map_pfn = this_pfn;
  iova += csize;
+    nchunks = 0;
  }
    last_pfn = this_pfn;
  }
  -    cur_base += ret << PAGE_SHIFT;
-    npages -= ret;
+    cur_base += pinned << PAGE_SHIFT;
+    npages -= pinned;
  }
    /* Pin the rest chunk */
@@ -660,10 +676,22 @@ static int 
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

   map_pfn << PAGE_SHIFT, msg->perm);
  out:
  if (ret) {
+    if (nchunks && last_pfn) {
+    unsigned long pfn;
+
+    /*
+ * Unpin the outstanding pages which are unmapped.
+ * Mapped pages are accounted in vdpa_map(), thus
+
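
The accounting change above boils down to "check first, account on success";
a userspace toy of the new scheme (limit plays lock_limit, pinned_vm models
mm->pinned_vm):

    #include <stdio.h>

    static long pinned_vm;

    static int account(long npages, long limit)
    {
            if (npages + pinned_vm > limit)
                    return -1;      /* reject before touching the counter */
            /* pinning and mapping would happen here */
            pinned_vm += npages;    /* account only after success */
            return 0;
    }

    int main(void)
    {
            printf("%d\n", account(10, 8)); /* -1: over the limit */
            printf("%d\n", account(5, 8));  /* 0: within the limit */
            printf("pinned_vm=%ld\n", pinned_vm);
            return 0;
    }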

Re: [PATCH 2/2] vhost-vdpa: fix page pinning leakage in error path (rework)

2020-11-03 Thread Jason Wang



On 2020/10/30 3:45 PM, Si-Wei Liu wrote:

Pinned pages are not properly accounted particularly when
mapping error occurs on IOTLB update. Clean up dangling
pinned pages for the error path.

The memory usage for bookkeeping pinned pages is reverted
to what it was before: only one single free page is needed.
This helps reduce the host memory demand for VM with a large
amount of memory, or in the situation where host is running
short of free memory.

Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend")
Signed-off-by: Si-Wei Liu 
---
  drivers/vhost/vdpa.c | 64 +---
  1 file changed, 46 insertions(+), 18 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index b6d9016..8da8558 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -560,6 +560,8 @@ static int vhost_vdpa_map(struct vhost_vdpa *v,
  
  	if (r)

vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1);
+   else
+   atomic64_add(size >> PAGE_SHIFT, &dev->mm->pinned_vm);
  
  	return r;

  }
@@ -591,14 +593,16 @@ static int vhost_vdpa_process_iotlb_update(struct 
vhost_vdpa *v,
unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
unsigned int gup_flags = FOLL_LONGTERM;
unsigned long npages, cur_base, map_pfn, last_pfn = 0;
-   unsigned long locked, lock_limit, pinned, i;
+   unsigned long lock_limit, sz2pin, nchunks, i;
u64 iova = msg->iova;
+   long pinned;
int ret = 0;
  
  	if (vhost_iotlb_itree_first(iotlb, msg->iova,

msg->iova + msg->size - 1))
return -EEXIST;
  
+	/* Limit the use of memory for bookkeeping */

page_list = (struct page **) __get_free_page(GFP_KERNEL);
if (!page_list)
return -ENOMEM;
@@ -607,52 +611,64 @@ static int vhost_vdpa_process_iotlb_update(struct 
vhost_vdpa *v,
gup_flags |= FOLL_WRITE;
  
  	npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> PAGE_SHIFT;

-   if (!npages)
-   return -EINVAL;
+   if (!npages) {
+   ret = -EINVAL;
+   goto free;
+   }
  
  	mmap_read_lock(dev->mm);
  
-	locked = atomic64_add_return(npages, &dev->mm->pinned_vm);

lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-   if (locked > lock_limit) {
+   if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
ret = -ENOMEM;
-   goto out;
+   goto unlock;
}
  
  	cur_base = msg->uaddr & PAGE_MASK;

iova &= PAGE_MASK;
+   nchunks = 0;
  
  	while (npages) {

-   pinned = min_t(unsigned long, npages, list_size);
-   ret = pin_user_pages(cur_base, pinned,
-gup_flags, page_list, NULL);
-   if (ret != pinned)
+   sz2pin = min_t(unsigned long, npages, list_size);
+   pinned = pin_user_pages(cur_base, sz2pin,
+   gup_flags, page_list, NULL);
+   if (sz2pin != pinned) {
+   if (pinned < 0) {
+   ret = pinned;
+   } else {
+   unpin_user_pages(page_list, pinned);
+   ret = -ENOMEM;
+   }
goto out;
+   }
+   nchunks++;
  
  		if (!last_pfn)

map_pfn = page_to_pfn(page_list[0]);
  
-		for (i = 0; i < ret; i++) {

+   for (i = 0; i < pinned; i++) {
unsigned long this_pfn = page_to_pfn(page_list[i]);
u64 csize;
  
  			if (last_pfn && (this_pfn != last_pfn + 1)) {

/* Pin a contiguous chunk of memory */
csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
-   if (vhost_vdpa_map(v, iova, csize,
-  map_pfn << PAGE_SHIFT,
-  msg->perm))
+   ret = vhost_vdpa_map(v, iova, csize,
+map_pfn << PAGE_SHIFT,
+msg->perm);
+   if (ret)
goto out;
+
map_pfn = this_pfn;
iova += csize;
+   nchunks = 0;
}
  
  			last_pfn = this_pfn;

}
  
-		cur_base += ret << PAGE_SHIFT;

-   npages -= ret;
+   cur_base += pinned << PAGE_SHIFT;
+   npages -= pinned;
}
  
  	/* Pin the rest chunk */

@@ -660,10 +676,22 @@ static int vhost_vdpa_process_iotlb_update(struct 
vhost_vdpa *v,
 map_pfn << PAGE_SHIFT, 

Re: [PATCH] vhost/vsock: add IOTLB API support

2020-11-03 Thread Jason Wang



On 2020/11/3 1:11 AM, Stefano Garzarella wrote:

On Fri, Oct 30, 2020 at 07:44:43PM +0800, Jason Wang wrote:


On 2020/10/30 6:54 PM, Stefano Garzarella wrote:

On Fri, Oct 30, 2020 at 06:02:18PM +0800, Jason Wang wrote:


On 2020/10/30 1:43 AM, Stefano Garzarella wrote:

This patch enables the IOTLB API support for vhost-vsock devices,
allowing the userspace to emulate an IOMMU for the guest.

These changes were made following vhost-net, in details this patch:
- exposes VIRTIO_F_ACCESS_PLATFORM feature and inits the iotlb
  device if the feature is acked
- implements VHOST_GET_BACKEND_FEATURES and
  VHOST_SET_BACKEND_FEATURES ioctls
- calls vq_meta_prefetch() before vq processing to prefetch vq
  metadata address in IOTLB
- provides .read_iter, .write_iter, and .poll callbacks for the
  chardev; they are used by the userspace to exchange IOTLB messages

This patch was tested with QEMU and a patch applied [1] to fix a
simple issue:
    $ qemu -M q35,accel=kvm,kernel-irqchip=split \
   -drive file=fedora.qcow2,format=qcow2,if=virtio \
   -device intel-iommu,intremap=on \
   -device vhost-vsock-pci,guest-cid=3,iommu_platform=on



Patch looks good, but a question:

It looks to me you don't enable ATS which means vhost won't get any 
invalidation request or did I miss anything?




You're right, I didn't see invalidation requests, only misses and
updates.
Now I have tried to enable 'ats' and 'device-iotlb', but I still
don't see any invalidations.


How can I test it? (Sorry but I don't have much experience yet with 
vIOMMU)



I guess it's because of the batched unmap. Maybe you can try to use 
"intel_iommu=strict" on the guest kernel command line to see if it works.


Btw, make sure the QEMU binary contains the patch [1]. Otherwise ATS won't 
be enabled for recent Linux kernels in the guest.


The problem was my kernel; it was built with a tiny configuration.
Using the Fedora stock kernel I can see the 'invalidate' requests, but I 
also hit the following issues.


Do they ring any bells?

$ ./qemu -m 4G -smp 4 -M q35,accel=kvm,kernel-irqchip=split \
    -drive file=fedora.qcow2,format=qcow2,if=virtio \
    -device intel-iommu,intremap=on,device-iotlb=on \
    -device vhost-vsock-pci,guest-cid=6,iommu_platform=on,ats=on,id=v1

    qemu-system-x86_64: vtd_iova_to_slpte: detected IOVA overflow     
(iova=0x1d4030c0)



It's a hint that the IOVA exceeds the address width (AW). It might be worth 
checking whether the missed IOVA reported via IOTLB is legal.


Thanks


qemu-system-x86_64: vtd_iommu_translate: detected translation failure 
(dev=00:03:00, iova=0x1d4030c0)
    qemu-system-x86_64: New fault is not recorded due to compression 
of     faults


Guest kernel messages:
    [   44.940872] DMAR: DRHD: handling fault status reg 2
    [   44.941989] DMAR: [DMA Read] Request device [00:03.0] PASID     
 fault addr 88W

    [   49.785884] DMAR: DRHD: handling fault status reg 2
    [   49.788874] DMAR: [DMA Read] Request device [00:03.0] PASID     
 fault addr 88W



QEMU: b149dea55c Merge remote-tracking branch 
'remotes/cschoenebeck/tags/pull-9p-20201102' into staging


Linux guest: 5.8.16-200.fc32.x86_64


Thanks,
Stefano
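
For anyone reproducing this, a minimal sketch of the userspace side of the
IOTLB flow the patch enables (assumptions: vhost_fd is an open vhost-vsock
descriptor with VIRTIO_F_ACCESS_PLATFORM negotiated, and resolve_iova() is a
hypothetical helper that returns the host address backing a guest IOVA):

    #include <linux/vhost.h>
    #include <linux/vhost_types.h>
    #include <unistd.h>

    extern __u64 resolve_iova(__u64 iova);  /* hypothetical helper */

    static int handle_one_iotlb_miss(int vhost_fd)
    {
            struct vhost_msg_v2 msg;

            /* .read_iter delivers IOTLB miss messages to userspace. */
            if (read(vhost_fd, &msg, sizeof(msg)) != (ssize_t)sizeof(msg))
                    return -1;
            if (msg.type != VHOST_IOTLB_MSG_V2 ||
                msg.iotlb.type != VHOST_IOTLB_MISS)
                    return 0;

            /* .write_iter accepts the corresponding mapping update. */
            msg.iotlb.type = VHOST_IOTLB_UPDATE;
            msg.iotlb.uaddr = resolve_iova(msg.iotlb.iova);
            msg.iotlb.size = 4096;          /* assumption: one page */
            msg.iotlb.perm = VHOST_ACCESS_RW;
            return write(vhost_fd, &msg, sizeof(msg)) ==
                   (ssize_t)sizeof(msg) ? 0 : -1;
    }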





Re: [PATCH 1/2] Revert "vhost-vdpa: fix page pinning leakage in error path"

2020-11-03 Thread Jason Wang



On 2020/10/30 3:45 PM, Si-Wei Liu wrote:

This reverts commit 7ed9e3d97c32d969caded2dfb6e67c1a2cc5a0b1.

Signed-off-by: Si-Wei Liu 
---
  drivers/vhost/vdpa.c | 119 +--
  1 file changed, 48 insertions(+), 71 deletions(-)



I saw this has already been reverted here: 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/drivers/vhost?id=5e1a3149eec8675c2767cc465903f5e4829de5b0.


:)

Thanks




diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index a2dbc85..b6d9016 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -588,19 +588,21 @@ static int vhost_vdpa_process_iotlb_update(struct 
vhost_vdpa *v,
	struct vhost_dev *dev = &v->vdev;
struct vhost_iotlb *iotlb = dev->iotlb;
struct page **page_list;
-   struct vm_area_struct **vmas;
+   unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
unsigned int gup_flags = FOLL_LONGTERM;
-   unsigned long map_pfn, last_pfn = 0;
-   unsigned long npages, lock_limit;
-   unsigned long i, nmap = 0;
+   unsigned long npages, cur_base, map_pfn, last_pfn = 0;
+   unsigned long locked, lock_limit, pinned, i;
u64 iova = msg->iova;
-   long pinned;
int ret = 0;
  
  	if (vhost_iotlb_itree_first(iotlb, msg->iova,

msg->iova + msg->size - 1))
return -EEXIST;
  
+	page_list = (struct page **) __get_free_page(GFP_KERNEL);

+   if (!page_list)
+   return -ENOMEM;
+
if (msg->perm & VHOST_ACCESS_WO)
gup_flags |= FOLL_WRITE;
  
@@ -608,86 +610,61 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

if (!npages)
return -EINVAL;
  
-	page_list = kvmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);

-   vmas = kvmalloc_array(npages, sizeof(struct vm_area_struct *),
- GFP_KERNEL);
-   if (!page_list || !vmas) {
-   ret = -ENOMEM;
-   goto free;
-   }
-
mmap_read_lock(dev->mm);
  
+	locked = atomic64_add_return(npages, &dev->mm->pinned_vm);

lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-   if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
-   ret = -ENOMEM;
-   goto unlock;
-   }
  
-	pinned = pin_user_pages(msg->uaddr & PAGE_MASK, npages, gup_flags,

-   page_list, vmas);
-   if (npages != pinned) {
-   if (pinned < 0) {
-   ret = pinned;
-   } else {
-   unpin_user_pages(page_list, pinned);
-   ret = -ENOMEM;
-   }
-   goto unlock;
+   if (locked > lock_limit) {
+   ret = -ENOMEM;
+   goto out;
}
  
+	cur_base = msg->uaddr & PAGE_MASK;

iova &= PAGE_MASK;
-   map_pfn = page_to_pfn(page_list[0]);
-
-   /* One more iteration to avoid extra vdpa_map() call out of loop. */
-   for (i = 0; i <= npages; i++) {
-   unsigned long this_pfn;
-   u64 csize;
-
-   /* The last chunk may have no valid PFN next to it */
-   this_pfn = i < npages ? page_to_pfn(page_list[i]) : -1UL;
-
-   if (last_pfn && (this_pfn == -1UL ||
-this_pfn != last_pfn + 1)) {
-   /* Pin a contiguous chunk of memory */
-   csize = last_pfn - map_pfn + 1;
-   ret = vhost_vdpa_map(v, iova, csize << PAGE_SHIFT,
-map_pfn << PAGE_SHIFT,
-msg->perm);
-   if (ret) {
-   /*
-* Unpin the rest chunks of memory on the
-* flight with no corresponding vdpa_map()
-* calls having been made yet. On the other
-* hand, vdpa_unmap() in the failure path
-* is in charge of accounting the number of
-* pinned pages for its own.
-* This asymmetrical pattern of accounting
-* is for efficiency to pin all pages at
-* once, while there is no other callsite
-* of vdpa_map() than here above.
-*/
-   unpin_user_pages(&page_list[nmap],
-npages - nmap);
-   goto out;
+
+   while (npages) {
+   pinned = min_t(unsigned long, npages, list_size);
+   ret = pin_user_pages(cur_base, pinned,
+gup_flags, page_list, NULL);
+   if (ret != pinned)
+

Re: [PATCH] vhost/vsock: add IOTLB API support

2020-10-30 Thread Jason Wang



On 2020/10/30 6:54 PM, Stefano Garzarella wrote:

On Fri, Oct 30, 2020 at 06:02:18PM +0800, Jason Wang wrote:


On 2020/10/30 1:43 AM, Stefano Garzarella wrote:

This patch enables the IOTLB API support for vhost-vsock devices,
allowing the userspace to emulate an IOMMU for the guest.

These changes were made following vhost-net, in details this patch:
- exposes VIRTIO_F_ACCESS_PLATFORM feature and inits the iotlb
  device if the feature is acked
- implements VHOST_GET_BACKEND_FEATURES and
  VHOST_SET_BACKEND_FEATURES ioctls
- calls vq_meta_prefetch() before vq processing to prefetch vq
  metadata address in IOTLB
- provides .read_iter, .write_iter, and .poll callbacks for the
  chardev; they are used by the userspace to exchange IOTLB messages

This patch was tested with QEMU and a patch applied [1] to fix a
simple issue:
    $ qemu -M q35,accel=kvm,kernel-irqchip=split \
   -drive file=fedora.qcow2,format=qcow2,if=virtio \
   -device intel-iommu,intremap=on \
   -device vhost-vsock-pci,guest-cid=3,iommu_platform=on



Patch looks good, but a question:

It looks to me you don't enable ATS which means vhost won't get any 
invalidation request or did I miss anything?




You're right, I didn't see invalidation requests, only misses and updates.
Now I have tried to enable 'ats' and 'device-iotlb', but I still don't 
see any invalidations.


How can I test it? (Sorry but I don't have much experience yet with 
vIOMMU)



I guess it's because of the batched unmap. Maybe you can try to use 
"intel_iommu=strict" on the guest kernel command line to see if it works.


Btw, make sure the QEMU binary contains the patch [1]. Otherwise ATS won't be 
enabled for recent Linux kernels in the guest.


Thanks

[1] https://patchew.org/QEMU/20200909081731.24688-1-jasowang@redhat.com/



Thanks,
Stefano





Re: [PATCH] vhost/vsock: add IOTLB API support

2020-10-30 Thread Jason Wang



On 2020/10/30 1:43 AM, Stefano Garzarella wrote:

This patch enables the IOTLB API support for vhost-vsock devices,
allowing the userspace to emulate an IOMMU for the guest.

These changes were made following vhost-net, in details this patch:
- exposes VIRTIO_F_ACCESS_PLATFORM feature and inits the iotlb
   device if the feature is acked
- implements VHOST_GET_BACKEND_FEATURES and
   VHOST_SET_BACKEND_FEATURES ioctls
- calls vq_meta_prefetch() before vq processing to prefetch vq
   metadata address in IOTLB
- provides .read_iter, .write_iter, and .poll callbacks for the
   chardev; they are used by the userspace to exchange IOTLB messages

This patch was tested with QEMU and a patch applied [1] to fix a
simple issue:
 $ qemu -M q35,accel=kvm,kernel-irqchip=split \
-drive file=fedora.qcow2,format=qcow2,if=virtio \
-device intel-iommu,intremap=on \
-device vhost-vsock-pci,guest-cid=3,iommu_platform=on



Patch looks good, but a question:

It looks to me you don't enable ATS which means vhost won't get any 
invalidation request or did I miss anything?


Thanks




[1] https://lists.gnu.org/archive/html/qemu-devel/2020-10/msg09077.html

Signed-off-by: Stefano Garzarella 
---
  drivers/vhost/vsock.c | 68 +--
  1 file changed, 65 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index a483cec31d5c..5e78fb719602 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -30,7 +30,12 @@
  #define VHOST_VSOCK_PKT_WEIGHT 256
  
  enum {

-   VHOST_VSOCK_FEATURES = VHOST_FEATURES,
+   VHOST_VSOCK_FEATURES = VHOST_FEATURES |
+  (1ULL << VIRTIO_F_ACCESS_PLATFORM)
+};
+
+enum {
+   VHOST_VSOCK_BACKEND_FEATURES = (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2)
  };
  
  /* Used to track all the vhost_vsock instances on the system. */

@@ -94,6 +99,9 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
if (!vhost_vq_get_backend(vq))
goto out;
  
+	if (!vq_meta_prefetch(vq))

+   goto out;
+
/* Avoid further vmexits, we're already processing the virtqueue */
	vhost_disable_notify(&vsock->dev, vq);
  
@@ -449,6 +457,9 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)

if (!vhost_vq_get_backend(vq))
goto out;
  
+	if (!vq_meta_prefetch(vq))

+   goto out;
+
	vhost_disable_notify(&vsock->dev, vq);
do {
u32 len;
@@ -766,8 +777,12 @@ static int vhost_vsock_set_features(struct vhost_vsock 
*vsock, u64 features)
	mutex_lock(&vsock->dev.mutex);
if ((features & (1 << VHOST_F_LOG_ALL)) &&
!vhost_log_access_ok(>dev)) {
-   mutex_unlock(&vsock->dev.mutex);
-   return -EFAULT;
+   goto err;
+   }
+
+   if ((features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) {
+   if (vhost_init_device_iotlb(>dev, true))
+   goto err;
}
  
  	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {

@@ -778,6 +793,10 @@ static int vhost_vsock_set_features(struct vhost_vsock 
*vsock, u64 features)
}
	mutex_unlock(&vsock->dev.mutex);
return 0;
+
+err:
+   mutex_unlock(&vsock->dev.mutex);
+   return -EFAULT;
  }
  
  static long vhost_vsock_dev_ioctl(struct file *f, unsigned int ioctl,

@@ -811,6 +830,18 @@ static long vhost_vsock_dev_ioctl(struct file *f, unsigned 
int ioctl,
	if (copy_from_user(&features, argp, sizeof(features)))
return -EFAULT;
return vhost_vsock_set_features(vsock, features);
+   case VHOST_GET_BACKEND_FEATURES:
+   features = VHOST_VSOCK_BACKEND_FEATURES;
+   if (copy_to_user(argp, &features, sizeof(features)))
+   return -EFAULT;
+   return 0;
+   case VHOST_SET_BACKEND_FEATURES:
+   if (copy_from_user(&features, argp, sizeof(features)))
+   return -EFAULT;
+   if (features & ~VHOST_VSOCK_BACKEND_FEATURES)
+   return -EOPNOTSUPP;
+   vhost_set_backend_features(&vsock->dev, features);
+   return 0;
default:
	mutex_lock(&vsock->dev.mutex);
	r = vhost_dev_ioctl(&vsock->dev, ioctl, argp);
@@ -823,6 +854,34 @@ static long vhost_vsock_dev_ioctl(struct file *f, unsigned 
int ioctl,
}
  }
  
+static ssize_t vhost_vsock_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)

+{
+   struct file *file = iocb->ki_filp;
+   struct vhost_vsock *vsock = file->private_data;
+   struct vhost_dev *dev = >dev;
+   int noblock = file->f_flags & O_NONBLOCK;
+
+   return vhost_chr_read_iter(dev, to, noblock);
+}
+
+static ssize_t vhost_vsock_chr_write_iter(struct kiocb *iocb,
+   struct iov_iter *from)
+{
+   struct file *file = iocb->ki_filp;
+   struct vhost_vsock *vsock = file->private_data;
+   struct 

Re: [PATCH 0/2] vdpasim: allow to set MAC address

2020-10-29 Thread Jason Wang



On 2020/10/29 8:20 PM, Laurent Vivier wrote:

This series starts by fixing a bug:
vdpa_sim generates a MAC address that is never shown to the
upper layer, and thus virtio-net generates another random
MAC address that changes each time virtio-net is loaded
(even if vdpa_sim is not unloaded).

Then it adds a parameter to the vdpa_sim module to allow the user to
set the MAC address. With that we can use vdpa_sim with a stable
MAC address that doesn't change between reboots.

Laurent Vivier (2):
   vdpasim: fix MAC address configuration
   vdpasim: allow to assign a MAC address

  drivers/vdpa/vdpa_sim/vdpa_sim.c | 17 +++--
  1 file changed, 15 insertions(+), 2 deletions(-)



Acked-by: Jason Wang 
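
For reference, the second patch boils down to a classic module parameter
plus parse; a sketch under the assumption that the parameter is named
"macaddr" (treat the names as illustrative, not a quote of the series):

    #include <linux/module.h>
    #include <linux/kernel.h>
    #include <linux/etherdevice.h>

    static char *macaddr;
    module_param(macaddr, charp, 0);
    MODULE_PARM_DESC(macaddr, "Ethernet MAC address (default: random)");

    static u8 macaddr_buf[ETH_ALEN];

    static void vdpasim_pick_mac(void)
    {
            /* Use the user-supplied MAC if it parses; otherwise fall
             * back to a random one, as before the series. */
            if (!macaddr || !mac_pton(macaddr, macaddr_buf))
                    eth_random_addr(macaddr_buf);
    }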




Re: [PATCH] vdpa_sim: Fix DMA mask

2020-10-28 Thread Jason Wang
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -361,7 +361,9 @@ static struct vdpasim *vdpasim_create(void)
	spin_lock_init(&vdpasim->iommu_lock);
  
  	dev = &vdpasim->vdpa.dev;

-   dev->coherent_dma_mask = DMA_BIT_MASK(64);
+   dev->dma_mask = &dev->coherent_dma_mask;
+   if (dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64)))
+   goto err_iommu;
	set_dma_ops(dev, &vdpasim_dma_ops);
  
  	vdpasim->iommu = vhost_iotlb_alloc(2048, 0);



Acked-by: Jason Wang 
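
The fix works because dma_set_mask() (general DMA API behavior) fails when
dev->dma_mask is NULL; pointing dma_mask at the adjacent coherent_dma_mask
storage first lets the mask be validated and set. Annotated:

    dev->dma_mask = &dev->coherent_dma_mask; /* give dma_set_mask() storage */
    if (dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64)))
            goto err_iommu;                  /* mask rejected by the core */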





Re: [PATCH] vdpa/mlx5: Fix error return in map_direct_mr()

2020-10-26 Thread Jason Wang



On 2020/10/26 3:06 PM, Jing Xiangfeng wrote:

Fix to return the variable "err" from the error handling case instead
of "ret".

Fixes: 94abbccdf291 ("vdpa/mlx5: Add shared memory registration code")
Signed-off-by: Jing Xiangfeng 
---
  drivers/vdpa/mlx5/core/mr.c | 5 ++---
  1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c
index ef1c550f8266..4b6195666c58 100644
--- a/drivers/vdpa/mlx5/core/mr.c
+++ b/drivers/vdpa/mlx5/core/mr.c
@@ -239,7 +239,6 @@ static int map_direct_mr(struct mlx5_vdpa_dev *mvdev, 
struct mlx5_vdpa_direct_mr
u64 paend;
struct scatterlist *sg;
struct device *dma = mvdev->mdev->device;
-   int ret;
  
  	for (map = vhost_iotlb_itree_first(iotlb, mr->start, mr->end - 1);

 map; map = vhost_iotlb_itree_next(map, start, mr->end - 1)) {
@@ -277,8 +276,8 @@ static int map_direct_mr(struct mlx5_vdpa_dev *mvdev, 
struct mlx5_vdpa_direct_mr
  done:
mr->log_size = log_entity_size;
mr->nsg = nsg;
-   ret = dma_map_sg_attrs(dma, mr->sg_head.sgl, mr->nsg, 
DMA_BIDIRECTIONAL, 0);
-   if (!ret)
+   err = dma_map_sg_attrs(dma, mr->sg_head.sgl, mr->nsg, 
DMA_BIDIRECTIONAL, 0);
+   if (!err)
goto err_map;
  
  	err = create_direct_mr(mvdev, mr);



Acked-by: Jason Wang 
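
Context for the fix (general DMA API behavior, not from the patch itself):
dma_map_sg_attrs() returns the number of mapped segments and 0 on failure,
never a negative errno, so the zero case still has to be turned into an
error code by the caller, e.g.:

    err = dma_map_sg_attrs(dma, mr->sg_head.sgl, mr->nsg,
                           DMA_BIDIRECTIONAL, 0);
    if (!err) {
            err = -ENOMEM;  /* assumption: errno choice is the caller's */
            goto err_map;
    }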




[PATCH V4 3/3] vdpa_sim: implement get_iova_range()

2020-10-23 Thread Jason Wang
This implements a sample get_iova_range() for the simulator which
advertise [0, ULLONG_MAX] as the valid range.

Signed-off-by: Jason Wang 
---
 drivers/vdpa/vdpa_sim/vdpa_sim.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index 62d640327145..ff6c9fd8d879 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -574,6 +574,16 @@ static u32 vdpasim_get_generation(struct vdpa_device *vdpa)
return vdpasim->generation;
 }
 
+static struct vdpa_iova_range vdpasim_get_iova_range(struct vdpa_device *vdpa)
+{
+   struct vdpa_iova_range range = {
+   .first = 0ULL,
+   .last = ULLONG_MAX,
+   };
+
+   return range;
+}
+
 static int vdpasim_set_map(struct vdpa_device *vdpa,
   struct vhost_iotlb *iotlb)
 {
@@ -657,6 +667,7 @@ static const struct vdpa_config_ops vdpasim_net_config_ops 
= {
.get_config = vdpasim_get_config,
.set_config = vdpasim_set_config,
.get_generation = vdpasim_get_generation,
+   .get_iova_range = vdpasim_get_iova_range,
.dma_map= vdpasim_dma_map,
.dma_unmap  = vdpasim_dma_unmap,
.free   = vdpasim_free,
@@ -683,6 +694,7 @@ static const struct vdpa_config_ops 
vdpasim_net_batch_config_ops = {
.get_config = vdpasim_get_config,
.set_config = vdpasim_set_config,
.get_generation = vdpasim_get_generation,
+   .get_iova_range = vdpasim_get_iova_range,
.set_map= vdpasim_set_map,
.free   = vdpasim_free,
 };
-- 
2.20.1
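
A device with a real translation aperture would advertise a narrower window
through the same op; a hypothetical variant assuming a 39-bit IOVA space:

    static struct vdpa_iova_range my_get_iova_range(struct vdpa_device *vdpa)
    {
            struct vdpa_iova_range range = {
                    .first = 0ULL,
                    .last  = (1ULL << 39) - 1,  /* 39-bit on-chip IOMMU */
            };

            return range;
    }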



[PATCH V4 0/3] vDPA: API for reporting IOVA range

2020-10-23 Thread Jason Wang
Hi All:

This series introduces an API for reporting the IOVA range. This is a must
for userspace to work correctly:

- for the process that uses vhost-vDPA directly, the IOVA must be
  allocated from this range.
- for VM(qemu), when vIOMMU is not enabled, fail early if GPA is out
  of range
- for VM(qemu), when vIOMMU is enabled, determine a valid guest
  address width so that the guest IOVA allocator can behave correctly.

Please review.

Changes from V3:

- really silent build warnings

Changes from V2:

- silent build warnings

Changes from V1:

- do not mandate get_iova_range() for device with its own DMA
  translation logic and assume a [0, ULLONG_MAX] range
- mandate the IOVA range only for an IOMMU that forces the aperture
- forbid the map which is out of the IOVA range in vhost-vDPA

Jason Wang (3):
  vdpa: introduce config op to get valid iova range
  vhost: vdpa: report iova range
  vdpa_sim: implement get_iova_range()

 drivers/vdpa/vdpa_sim/vdpa_sim.c | 12 ++
 drivers/vhost/vdpa.c | 41 
 include/linux/vdpa.h | 15 
 include/uapi/linux/vhost.h   |  4 
 include/uapi/linux/vhost_types.h |  9 +++
 5 files changed, 81 insertions(+)

-- 
2.20.1



[PATCH V4 2/3] vhost: vdpa: report iova range

2020-10-23 Thread Jason Wang
This patch introduces a new ioctl for vhost-vdpa device that can
report the iova range by the device.

For a device that implements the get_iova_range() method, we fetch it from
the vDPA device. If the device doesn't implement get_iova_range() but
depends on the platform IOMMU, we query via DOMAIN_ATTR_GEOMETRY;
otherwise [0, ULLONG_MAX] is assumed.

For safety, this patch also rules out the map request which is not in
the valid range.

Signed-off-by: Jason Wang 
---
 drivers/vhost/vdpa.c | 41 
 include/uapi/linux/vhost.h   |  4 
 include/uapi/linux/vhost_types.h |  9 +++
 3 files changed, 54 insertions(+)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index a2dbc85e0b0d..846de69d9c01 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -47,6 +47,7 @@ struct vhost_vdpa {
int minor;
struct eventfd_ctx *config_ctx;
int in_batch;
+   struct vdpa_iova_range range;
 };
 
 static DEFINE_IDA(vhost_vdpa_ida);
@@ -337,6 +338,16 @@ static long vhost_vdpa_set_config_call(struct vhost_vdpa 
*v, u32 __user *argp)
return 0;
 }
 
+static long vhost_vdpa_get_iova_range(struct vhost_vdpa *v, u32 __user *argp)
+{
+   struct vhost_vdpa_iova_range range = {
+   .first = v->range.first,
+   .last = v->range.last,
+   };
+
+   return copy_to_user(argp, &range, sizeof(range));
+}
+
 static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
   void __user *argp)
 {
@@ -471,6 +482,9 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep,
features = VHOST_VDPA_BACKEND_FEATURES;
	r = copy_to_user(featurep, &features, sizeof(features));
break;
+   case VHOST_VDPA_GET_IOVA_RANGE:
+   r = vhost_vdpa_get_iova_range(v, argp);
+   break;
default:
r = vhost_dev_ioctl(>vdev, cmd, argp);
if (r == -ENOIOCTLCMD)
@@ -597,6 +611,10 @@ static int vhost_vdpa_process_iotlb_update(struct 
vhost_vdpa *v,
long pinned;
int ret = 0;
 
+   if (msg->iova < v->range.first ||
+   msg->iova + msg->size - 1 > v->range.last)
+   return -EINVAL;
+
if (vhost_iotlb_itree_first(iotlb, msg->iova,
msg->iova + msg->size - 1))
return -EEXIST;
@@ -783,6 +801,27 @@ static void vhost_vdpa_free_domain(struct vhost_vdpa *v)
v->domain = NULL;
 }
 
+static void vhost_vdpa_set_iova_range(struct vhost_vdpa *v)
+{
+   struct vdpa_iova_range *range = &v->range;
+   struct iommu_domain_geometry geo;
+   struct vdpa_device *vdpa = v->vdpa;
+   const struct vdpa_config_ops *ops = vdpa->config;
+
+   if (ops->get_iova_range) {
+   *range = ops->get_iova_range(vdpa);
+   } else if (v->domain &&
+  !iommu_domain_get_attr(v->domain,
+  DOMAIN_ATTR_GEOMETRY, &geo) &&
+  geo.force_aperture) {
+   range->first = geo.aperture_start;
+   range->last = geo.aperture_end;
+   } else {
+   range->first = 0;
+   range->last = ULLONG_MAX;
+   }
+}
+
 static int vhost_vdpa_open(struct inode *inode, struct file *filep)
 {
struct vhost_vdpa *v;
@@ -823,6 +862,8 @@ static int vhost_vdpa_open(struct inode *inode, struct file 
*filep)
if (r)
goto err_init_iotlb;
 
+   vhost_vdpa_set_iova_range(v);
+
filep->private_data = v;
 
return 0;
diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
index 75232185324a..c998860d7bbc 100644
--- a/include/uapi/linux/vhost.h
+++ b/include/uapi/linux/vhost.h
@@ -146,4 +146,8 @@
 
 /* Set event fd for config interrupt*/
 #define VHOST_VDPA_SET_CONFIG_CALL _IOW(VHOST_VIRTIO, 0x77, int)
+
+/* Get the valid iova range */
+#define VHOST_VDPA_GET_IOVA_RANGE  _IOR(VHOST_VIRTIO, 0x78, \
+struct vhost_vdpa_iova_range)
 #endif
diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h
index 9a269a88a6ff..f7f6a3a28977 100644
--- a/include/uapi/linux/vhost_types.h
+++ b/include/uapi/linux/vhost_types.h
@@ -138,6 +138,15 @@ struct vhost_vdpa_config {
__u8 buf[0];
 };
 
+/* vhost vdpa IOVA range
+ * @first: First address that can be mapped by vhost-vDPA
+ * @last: Last address that can be mapped by vhost-vDPA
+ */
+struct vhost_vdpa_iova_range {
+   __u64 first;
+   __u64 last;
+};
+
 /* Feature bits */
 /* Log all write descriptors. Can be changed while device is active. */
 #define VHOST_F_LOG_ALL 26
-- 
2.20.1
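
A minimal userspace sketch of consuming the new ioctl (assumes kernel
headers that carry this patch; the device node path is illustrative):

    #include <stdio.h>
    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <linux/vhost.h>
    #include <linux/vhost_types.h>

    int main(void)
    {
            struct vhost_vdpa_iova_range range;
            int fd = open("/dev/vhost-vdpa-0", O_RDWR);

            if (fd < 0 || ioctl(fd, VHOST_VDPA_GET_IOVA_RANGE, &range) < 0)
                    return 1;
            /* All IOTLB updates must fall inside [first, last]. */
            printf("IOVA range: [0x%llx, 0x%llx]\n",
                   (unsigned long long)range.first,
                   (unsigned long long)range.last);
            return 0;
    }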



[PATCH V4 1/3] vdpa: introduce config op to get valid iova range

2020-10-23 Thread Jason Wang
This patch introduce a config op to get valid iova range from the vDPA
device.

Signed-off-by: Jason Wang 
---
 include/linux/vdpa.h | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index eae0bfd87d91..30bc7a7223bb 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -52,6 +52,16 @@ struct vdpa_device {
int nvqs;
 };
 
+/**
+ * vDPA IOVA range - the IOVA range support by the device
+ * @first: start of the IOVA range
+ * @last: end of the IOVA range
+ */
+struct vdpa_iova_range {
+   u64 first;
+   u64 last;
+};
+
 /**
  * vDPA_config_ops - operations for configuring a vDPA device.
  * Note: vDPA device drivers are required to implement all of the
@@ -151,6 +161,10 @@ struct vdpa_device {
  * @get_generation:Get device config generation (optional)
  * @vdev: vdpa device
  * Returns u32: device generation
+ * @get_iova_range:Get supported iova range (optional)
+ * @vdev: vdpa device
+ * Returns the iova range supported by
+ * the device.
  * @set_map:   Set device memory mapping (optional)
  * Needed for device that using device
  * specific DMA translation (on-chip IOMMU)
@@ -216,6 +230,7 @@ struct vdpa_config_ops {
void (*set_config)(struct vdpa_device *vdev, unsigned int offset,
   const void *buf, unsigned int len);
u32 (*get_generation)(struct vdpa_device *vdev);
+   struct vdpa_iova_range (*get_iova_range)(struct vdpa_device *vdev);
 
/* DMA ops */
int (*set_map)(struct vdpa_device *vdev, struct vhost_iotlb *iotlb);
-- 
2.20.1



Re: [PATCH V3 2/3] vhost: vdpa: report iova range

2020-10-23 Thread Jason Wang



On 2020/10/23 1:28 PM, kernel test robot wrote:

Hi Jason,

I love your patch! Perhaps something to improve:

[auto build test WARNING on vhost/linux-next]
[also build test WARNING on linus/master v5.9 next-20201023]
[cannot apply to linux/master]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:
https://github.com/0day-ci/linux/commits/Jason-Wang/vDPA-API-for-reporting-IOVA-range/20201023-102708
base:   https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git linux-next
config: m68k-randconfig-r034-20201022 (attached as .config)
compiler: m68k-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
 wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
 chmod +x ~/bin/make.cross
 # https://github.com/0day-ci/linux/commit/446e7b97838ebf87f1acd61580137716fdad104a
 git remote add linux-review https://github.com/0day-ci/linux
 git fetch --no-tags linux-review Jason-Wang/vDPA-API-for-reporting-IOVA-range/20201023-102708
 git checkout 446e7b97838ebf87f1acd61580137716fdad104a
 # save the attached .config to linux build tree
 COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=m68k

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot 

All warnings (new ones prefixed by >>):

drivers/vhost/vdpa.c: In function 'vhost_vdpa_setup_vq_irq':
drivers/vhost/vdpa.c:94:6: warning: variable 'ret' set but not used [-Wunused-but-set-variable]
   94 |  int ret, irq;
      |      ^~~
drivers/vhost/vdpa.c: In function 'vhost_vdpa_unlocked_ioctl':



This looks like another issue that needs to be fixed.



drivers/vhost/vdpa.c:483:5: warning: this statement may fall through [-Wimplicit-fallthrough=]
  483 |   r = copy_to_user(featurep, &features, sizeof(features));
      |   ~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
drivers/vhost/vdpa.c:484:2: note: here
  484 |  case VHOST_VDPA_GET_IOVA_RANGE:
      |  ^~~~

vim +483 drivers/vhost/vdpa.c



My bad. V4 is on the road.

Thanks
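
The warning points at a missing break: the VHOST_GET_BACKEND_FEATURES arm falls straight into the new case. A sketch of the obvious fix (the actual V4 hunk may differ):

	case VHOST_GET_BACKEND_FEATURES:
		features = VHOST_VDPA_BACKEND_FEATURES;
		r = copy_to_user(featurep, &features, sizeof(features));
		break;	/* previously missing, causing the fallthrough */
	case VHOST_VDPA_GET_IOVA_RANGE:
		r = vhost_vdpa_get_iova_range(v, argp);
		break;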




4c8cf31885f69e8 Tiwei Bie  2020-03-26  426
4c8cf31885f69e8 Tiwei Bie  2020-03-26  427  static long vhost_vdpa_unlocked_ioctl(struct file *filep,
4c8cf31885f69e8 Tiwei Bie  2020-03-26  428  				      unsigned int cmd, unsigned long arg)
4c8cf31885f69e8 Tiwei Bie  2020-03-26  429  {
4c8cf31885f69e8 Tiwei Bie  2020-03-26  430  	struct vhost_vdpa *v = filep->private_data;
4c8cf31885f69e8 Tiwei Bie  2020-03-26  431  	struct vhost_dev *d = &v->vdev;
4c8cf31885f69e8 Tiwei Bie  2020-03-26  432  	void __user *argp = (void __user *)arg;
a127c5bbb6a8eee Jason Wang 2020-09-07  433  	u64 __user *featurep = argp;
a127c5bbb6a8eee Jason Wang 2020-09-07  434  	u64 features;
4c8cf31885f69e8 Tiwei Bie  2020-03-26  435  	long r;
4c8cf31885f69e8 Tiwei Bie  2020-03-26  436
a127c5bbb6a8eee Jason Wang 2020-09-07  437  	if (cmd == VHOST_SET_BACKEND_FEATURES) {
a127c5bbb6a8eee Jason Wang 2020-09-07  438  		r = copy_from_user(&features, featurep, sizeof(features));
a127c5bbb6a8eee Jason Wang 2020-09-07  439  		if (r)
a127c5bbb6a8eee Jason Wang 2020-09-07  440  			return r;
a127c5bbb6a8eee Jason Wang 2020-09-07  441  		if (features & ~VHOST_VDPA_BACKEND_FEATURES)
a127c5bbb6a8eee Jason Wang 2020-09-07  442  			return -EOPNOTSUPP;
a127c5bbb6a8eee Jason Wang 2020-09-07  443  		vhost_set_backend_features(&v->vdev, features);
a127c5bbb6a8eee Jason Wang 2020-09-07  444  		return 0;
a127c5bbb6a8eee Jason Wang 2020-09-07  445  	}
a127c5bbb6a8eee Jason Wang 2020-09-07  446
4c8cf31885f69e8 Tiwei Bie  2020-03-26  447  	mutex_lock(&d->mutex);
4c8cf31885f69e8 Tiwei Bie  2020-03-26  448
4c8cf31885f69e8 Tiwei Bie  2020-03-26  449  	switch (cmd) {
4c8cf31885f69e8 Tiwei Bie  2020-03-26  450  	case VHOST_VDPA_GET_DEVICE_ID:
4c8cf31885f69e8 Tiwei Bie  2020-03-26  451  		r = vhost_vdpa_get_device_id(v, argp);
4c8cf31885f69e8 Tiwei Bie  2020-03-26  452  		break;
4c8cf31885f69e8 Tiwei Bie  2020-03-26  453  	case VHOST_VDPA_GET_STATUS:
4c8cf31885f69e8 Tiwei Bie  2020-03-26  454  		r = vhost_vdpa_get_status(v, argp);
4c8cf31885f69e8 Tiwei Bie  2020-03-26  455  		break;
4c8cf31885f69e8 Tiwei Bie  2020-03-26  456  	case VHOST_VDPA_SET_STATUS:
4c8cf31885f69e8 Tiwei Bie  2020-03-26  457  		r = vhost_vdpa_set_status(v, argp);
4c8cf31885f69e8 Tiwei Bie  2020-03-26  458  		break;
4c8cf31885f69e8 Tiwei Bie  2020-03-26  459  	case VHOST_VDPA_GET_CONFIG:
4c8cf31885f69e8 Tiwei Bie  2020-03-26  460  		r = vhost_vdpa_get_config(v, argp);
4c8cf31885f69e8 Tiwei Bie  2020-03-26  461  		break;
4c8

[PATCH V3 2/3] vhost: vdpa: report iova range

2020-10-22 Thread Jason Wang
This patch introduces a new ioctl for the vhost-vdpa device that can
report the iova range supported by the device.

For devices that implement the get_iova_range() method, we fetch the
range from the vDPA device. If a device doesn't implement
get_iova_range() but depends on the platform IOMMU, we query it via
DOMAIN_ATTR_GEOMETRY; otherwise [0, ULLONG_MAX] is assumed.

For safety, this patch also rejects map requests that fall outside
the valid range.

Signed-off-by: Jason Wang 
---
 drivers/vhost/vdpa.c | 40 
 include/uapi/linux/vhost.h   |  4 
 include/uapi/linux/vhost_types.h |  9 +++
 3 files changed, 53 insertions(+)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index a2dbc85e0b0d..562ed99116d1 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -47,6 +47,7 @@ struct vhost_vdpa {
int minor;
struct eventfd_ctx *config_ctx;
int in_batch;
+   struct vdpa_iova_range range;
 };
 
 static DEFINE_IDA(vhost_vdpa_ida);
@@ -337,6 +338,16 @@ static long vhost_vdpa_set_config_call(struct vhost_vdpa 
*v, u32 __user *argp)
return 0;
 }
 
+static long vhost_vdpa_get_iova_range(struct vhost_vdpa *v, u32 __user *argp)
+{
+   struct vhost_vdpa_iova_range range = {
+   .first = v->range.first,
+   .last = v->range.last,
+   };
+
+   return copy_to_user(argp, &range, sizeof(range));
+}
+
 static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
   void __user *argp)
 {
@@ -470,6 +481,8 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep,
case VHOST_GET_BACKEND_FEATURES:
features = VHOST_VDPA_BACKEND_FEATURES;
r = copy_to_user(featurep, &features, sizeof(features));
+   case VHOST_VDPA_GET_IOVA_RANGE:
+   r = vhost_vdpa_get_iova_range(v, argp);
break;
default:
r = vhost_dev_ioctl(&v->vdev, cmd, argp);
@@ -597,6 +610,10 @@ static int vhost_vdpa_process_iotlb_update(struct 
vhost_vdpa *v,
long pinned;
int ret = 0;
 
+   if (msg->iova < v->range.first ||
+   msg->iova + msg->size - 1 > v->range.last)
+   return -EINVAL;
+
if (vhost_iotlb_itree_first(iotlb, msg->iova,
msg->iova + msg->size - 1))
return -EEXIST;
@@ -783,6 +800,27 @@ static void vhost_vdpa_free_domain(struct vhost_vdpa *v)
v->domain = NULL;
 }
 
+static void vhost_vdpa_set_iova_range(struct vhost_vdpa *v)
+{
+   struct vdpa_iova_range *range = &v->range;
+   struct iommu_domain_geometry geo;
+   struct vdpa_device *vdpa = v->vdpa;
+   const struct vdpa_config_ops *ops = vdpa->config;
+
+   if (ops->get_iova_range) {
+   *range = ops->get_iova_range(vdpa);
+   } else if (v->domain &&
+  !iommu_domain_get_attr(v->domain,
+  DOMAIN_ATTR_GEOMETRY, &geo) &&
+  geo.force_aperture) {
+   range->first = geo.aperture_start;
+   range->last = geo.aperture_end;
+   } else {
+   range->first = 0;
+   range->last = ULLONG_MAX;
+   }
+}
+
 static int vhost_vdpa_open(struct inode *inode, struct file *filep)
 {
struct vhost_vdpa *v;
@@ -823,6 +861,8 @@ static int vhost_vdpa_open(struct inode *inode, struct file 
*filep)
if (r)
goto err_init_iotlb;
 
+   vhost_vdpa_set_iova_range(v);
+
filep->private_data = v;
 
return 0;
diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
index 75232185324a..c998860d7bbc 100644
--- a/include/uapi/linux/vhost.h
+++ b/include/uapi/linux/vhost.h
@@ -146,4 +146,8 @@
 
 /* Set event fd for config interrupt*/
 #define VHOST_VDPA_SET_CONFIG_CALL _IOW(VHOST_VIRTIO, 0x77, int)
+
+/* Get the valid iova range */
+#define VHOST_VDPA_GET_IOVA_RANGE  _IOR(VHOST_VIRTIO, 0x78, \
+struct vhost_vdpa_iova_range)
 #endif
diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h
index 9a269a88a6ff..f7f6a3a28977 100644
--- a/include/uapi/linux/vhost_types.h
+++ b/include/uapi/linux/vhost_types.h
@@ -138,6 +138,15 @@ struct vhost_vdpa_config {
__u8 buf[0];
 };
 
+/* vhost vdpa IOVA range
+ * @first: First address that can be mapped by vhost-vDPA
+ * @last: Last address that can be mapped by vhost-vDPA
+ */
+struct vhost_vdpa_iova_range {
+   __u64 first;
+   __u64 last;
+};
+
 /* Feature bits */
 /* Log all write descriptors. Can be changed while device is active. */
 #define VHOST_F_LOG_ALL 26
-- 
2.20.1



[PATCH V3 3/3] vdpa_sim: implement get_iova_range()

2020-10-22 Thread Jason Wang
This implements a sample get_iova_range() for the simulator which
advertises [0, ULLONG_MAX] as the valid range.

Signed-off-by: Jason Wang 
---
 drivers/vdpa/vdpa_sim/vdpa_sim.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index 62d640327145..ff6c9fd8d879 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -574,6 +574,16 @@ static u32 vdpasim_get_generation(struct vdpa_device *vdpa)
return vdpasim->generation;
 }
 
+static struct vdpa_iova_range vdpasim_get_iova_range(struct vdpa_device *vdpa)
+{
+   struct vdpa_iova_range range = {
+   .first = 0ULL,
+   .last = ULLONG_MAX,
+   };
+
+   return range;
+}
+
 static int vdpasim_set_map(struct vdpa_device *vdpa,
   struct vhost_iotlb *iotlb)
 {
@@ -657,6 +667,7 @@ static const struct vdpa_config_ops vdpasim_net_config_ops 
= {
.get_config = vdpasim_get_config,
.set_config = vdpasim_set_config,
.get_generation = vdpasim_get_generation,
+   .get_iova_range = vdpasim_get_iova_range,
.dma_map= vdpasim_dma_map,
.dma_unmap  = vdpasim_dma_unmap,
.free   = vdpasim_free,
@@ -683,6 +694,7 @@ static const struct vdpa_config_ops 
vdpasim_net_batch_config_ops = {
.get_config = vdpasim_get_config,
.set_config = vdpasim_set_config,
.get_generation = vdpasim_get_generation,
+   .get_iova_range = vdpasim_get_iova_range,
.set_map= vdpasim_set_map,
.free   = vdpasim_free,
 };
-- 
2.20.1



[PATCH V3 0/3] vDPA: API for reporting IOVA range

2020-10-22 Thread Jason Wang
Hi All:

This series introduces an API for reporting the IOVA range. This is a must for
userspace to work correctly:

- for the process that uses vhost-vDPA directly, the IOVA must be
  allocated from this range.
- for VM(qemu), when vIOMMU is not enabled, fail early if GPA is out
  of range
- for VM(qemu), when vIOMMU is enabled, determine a valid guest
  address width and then guest IOVA allocator can behave correctly.

Please review.
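
As an illustration of the second bullet above, a VMM could validate guest memory against the advertised range roughly like this (a sketch only; the helper name is made up and this is not QEMU code):

	/* range comes from VHOST_VDPA_GET_IOVA_RANGE */
	static int check_gpa(const struct vhost_vdpa_iova_range *range,
			     uint64_t gpa, uint64_t size)
	{
		if (gpa < range->first || gpa + size - 1 > range->last)
			return -EINVAL;	/* fail early: GPA not mappable */
		return 0;
	}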

Changes from V2:
- silence build warnings

Changes from V1:

- do not mandate get_iova_range() for device with its own DMA
  translation logic and assume a [0, ULLONG_MAX] range
- mandate IOVA range only for IOMMU that forcing aperture
- forbid the map which is out of the IOVA range in vhost-vDPA

Jason Wang (3):
  vdpa: introduce config op to get valid iova range
  vhost: vdpa: report iova range
  vdpa_sim: implement get_iova_range()

 drivers/vdpa/vdpa_sim/vdpa_sim.c | 12 ++
 drivers/vhost/vdpa.c | 40 
 include/linux/vdpa.h | 15 
 include/uapi/linux/vhost.h   |  4 
 include/uapi/linux/vhost_types.h |  9 +++
 5 files changed, 80 insertions(+)

-- 
2.20.1



[PATCH V3 1/3] vdpa: introduce config op to get valid iova range

2020-10-22 Thread Jason Wang
This patch introduces a config op to get the valid iova range from the vDPA
device.

Signed-off-by: Jason Wang 
---
 include/linux/vdpa.h | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index eae0bfd87d91..30bc7a7223bb 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -52,6 +52,16 @@ struct vdpa_device {
int nvqs;
 };
 
+/**
+ * vDPA IOVA range - the IOVA range supported by the device
+ * @first: start of the IOVA range
+ * @last: end of the IOVA range
+ */
+struct vdpa_iova_range {
+   u64 first;
+   u64 last;
+};
+
 /**
  * vDPA_config_ops - operations for configuring a vDPA device.
  * Note: vDPA device drivers are required to implement all of the
@@ -151,6 +161,10 @@ struct vdpa_device {
  * @get_generation:Get device config generation (optional)
  * @vdev: vdpa device
  * Returns u32: device generation
+ * @get_iova_range:Get supported iova range (optional)
+ * @vdev: vdpa device
+ * Returns the iova range supported by
+ * the device.
  * @set_map:   Set device memory mapping (optional)
  * Needed for device that using device
  * specific DMA translation (on-chip IOMMU)
@@ -216,6 +230,7 @@ struct vdpa_config_ops {
void (*set_config)(struct vdpa_device *vdev, unsigned int offset,
   const void *buf, unsigned int len);
u32 (*get_generation)(struct vdpa_device *vdev);
+   struct vdpa_iova_range (*get_iova_range)(struct vdpa_device *vdev);
 
/* DMA ops */
int (*set_map)(struct vdpa_device *vdev, struct vhost_iotlb *iotlb);
-- 
2.20.1



Re: [PATCH 0/4] vDPA: API for reporting IOVA range

2020-10-21 Thread Jason Wang



On 2020/10/21 10:45 PM, Michael S. Tsirkin wrote:

On Wed, Jun 17, 2020 at 11:29:43AM +0800, Jason Wang wrote:

Hi All:

This series introduces an API for reporting the IOVA range. This is a must for
userspace to work correctly:

- for the process that uses vhost-vDPA directly to properly allocate
   IOVA
- for VM(qemu), when vIOMMU is not enabled, fail early if GPA is out
   of range
- for VM(qemu), when vIOMMU is enabled, determine a valid guest
   address width

Please review.

Thanks

OK so what is the plan here? Change begin-end->first-last and repost?



I've posted V2 with this change, but it got some warnings from the buildbot.

Will post a V3.

Thanks





Jason Wang (4):
   vdpa: introduce config op to get valid iova range
   vdpa_sim: implement get_iova_range bus operation
   vdpa: get_iova_range() is mandatory for device specific DMA
 translation
   vhost: vdpa: report iova range

  drivers/vdpa/vdpa.c  |  4 
  drivers/vdpa/vdpa_sim/vdpa_sim.c | 11 +++
  drivers/vhost/vdpa.c | 27 +++
  include/linux/vdpa.h | 14 ++
  include/uapi/linux/vhost.h   |  4 
  include/uapi/linux/vhost_types.h |  5 +
  6 files changed, 65 insertions(+)

--
2.20.1




Re: [PATCH v4] Revert "virtio-net: ethtool configurable RXCSUM"

2020-10-21 Thread Jason Wang



On 2020/10/21 10:30 PM, Michael S. Tsirkin wrote:

This reverts commit 3618ad2a7c0e78e4258386394d5d5f92a3dbccf8.

When control vq is not negotiated, that commit causes a crash:

[   72.229171] kernel BUG at drivers/net/virtio_net.c:1667!
[   72.230266] invalid opcode:  [#1] PREEMPT SMP
[   72.231172] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 
5.9.0-rc8-02934-g3618ad2a7c0e7 #1
[   72.231172] EIP: virtnet_send_command+0x120/0x140
[   72.231172] Code: 00 0f 94 c0 8b 7d f0 65 33 3d 14 00 00 00 75 1c 8d 65 f4 
5b 5e 5f 5d c3 66 90 be 01 00 00 00 e9 6e ff ff ff 8d b6 00
+00 00 00 <0f> 0b e8 d9 bb 82 00 eb 17 8d b4 26 00 00 00 00 8d b4 26 00 00 00
[   72.231172] EAX: 000d EBX: f72895c0 ECX: 0017 EDX: 0011
[   72.231172] ESI: f7197800 EDI: ed69bd00 EBP: ed69bcf4 ESP: ed69bc98
[   72.231172] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00010246
[   72.231172] CR0: 80050033 CR2:  CR3: 02c84000 CR4: 000406f0
[   72.231172] Call Trace:
[   72.231172]  ? __virt_addr_valid+0x45/0x60
[   72.231172]  ? ___cache_free+0x51f/0x760
[   72.231172]  ? kobject_uevent_env+0xf4/0x560
[   72.231172]  virtnet_set_guest_offloads+0x4d/0x80
[   72.231172]  virtnet_set_features+0x85/0x120
[   72.231172]  ? virtnet_set_guest_offloads+0x80/0x80
[   72.231172]  __netdev_update_features+0x27a/0x8e0
[   72.231172]  ? kobject_uevent+0xa/0x20
[   72.231172]  ? netdev_register_kobject+0x12c/0x160
[   72.231172]  register_netdevice+0x4fe/0x740
[   72.231172]  register_netdev+0x1c/0x40
[   72.231172]  virtnet_probe+0x728/0xb60
[   72.231172]  ? _raw_spin_unlock+0x1d/0x40
[   72.231172]  ? virtio_vdpa_get_status+0x1c/0x20
[   72.231172]  virtio_dev_probe+0x1c6/0x271
[   72.231172]  really_probe+0x195/0x2e0
[   72.231172]  driver_probe_device+0x26/0x60
[   72.231172]  device_driver_attach+0x49/0x60
[   72.231172]  __driver_attach+0x46/0xc0
[   72.231172]  ? device_driver_attach+0x60/0x60
[   72.231172]  bus_add_driver+0x197/0x1c0
[   72.231172]  driver_register+0x66/0xc0
[   72.231172]  register_virtio_driver+0x1b/0x40
[   72.231172]  virtio_net_driver_init+0x61/0x86
[   72.231172]  ? veth_init+0x14/0x14
[   72.231172]  do_one_initcall+0x76/0x2e4
[   72.231172]  ? rdinit_setup+0x2a/0x2a
[   72.231172]  do_initcalls+0xb2/0xd5
[   72.231172]  kernel_init_freeable+0x14f/0x179
[   72.231172]  ? rest_init+0x100/0x100
[   72.231172]  kernel_init+0xd/0xe0
[   72.231172]  ret_from_fork+0x1c/0x30
[   72.231172] Modules linked in:
[   72.269563] ---[ end trace a6ebc4afea0e6cb1 ]---

The reason is that virtnet_set_features now calls virtnet_set_guest_offloads
unconditionally, it used to only call it when there is something
to configure.

If device does not have a control vq, everything breaks.

Revert the original commit for now.

Cc: Tonghao Zhang 
Cc: Willem de Bruijn 
Fixes: 3618ad2a7c0e7 ("virtio-net: ethtool configurable RXCSUM")
Reported-by: kernel test robot 
Signed-off-by: Michael S. Tsirkin 
---



Acked-by: Jason Wang 





Same patch as all of v1-v3, just tweaking the commit log.

  drivers/net/virtio_net.c | 50 +++-
  1 file changed, 13 insertions(+), 37 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index d2d2c4a53cf2..21b71148c532 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -68,8 +68,6 @@ static const unsigned long guest_offloads[] = {
(1ULL << VIRTIO_NET_F_GUEST_ECN)  | \
(1ULL << VIRTIO_NET_F_GUEST_UFO))
  
-#define GUEST_OFFLOAD_CSUM_MASK (1ULL << VIRTIO_NET_F_GUEST_CSUM)

-
  struct virtnet_stat_desc {
char desc[ETH_GSTRING_LEN];
size_t offset;
@@ -2524,48 +2522,29 @@ static int virtnet_get_phys_port_name(struct net_device 
*dev, char *buf,
return 0;
  }
  
-static netdev_features_t virtnet_fix_features(struct net_device *netdev,

- netdev_features_t features)
-{
-   /* If Rx checksum is disabled, LRO should also be disabled. */
-   if (!(features & NETIF_F_RXCSUM))
-   features &= ~NETIF_F_LRO;
-
-   return features;
-}
-
  static int virtnet_set_features(struct net_device *dev,
netdev_features_t features)
  {
struct virtnet_info *vi = netdev_priv(dev);
-   u64 offloads = vi->guest_offloads;
+   u64 offloads;
int err;
  
-	/* Don't allow configuration while XDP is active. */

-   if (vi->xdp_queue_pairs)
-   return -EBUSY;
-
if ((dev->features ^ features) & NETIF_F_LRO) {
+   if (vi->xdp_queue_pairs)
+   return -EBUSY;
+
if (features & NETIF_F_LRO)
-   offloads |= GUEST_OFFLOAD_LRO_MASK &
-   vi->guest_offloads_capable;
+   offloads = vi->guest_offloads_capable;

Re: [PATCH 2/2] KVM: not link irqfd with a fake IRQ bypass producer

2020-10-20 Thread Jason Wang



On 2020/10/19 5:06 PM, Zhenzhong Duan wrote:

In case of failure to set up posted interrupts for an IRQ, it makes no
sense to assign irqfd->producer to the producer.

This change makes code more robust.



It's better to describe what issue we will get without this patch.

Thanks




Signed-off-by: Zhenzhong Duan 
---
  arch/x86/kvm/x86.c | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ce856e0..277e961 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10683,13 +10683,14 @@ int kvm_arch_irq_bypass_add_producer(struct 
irq_bypass_consumer *cons,
container_of(cons, struct kvm_kernel_irqfd, consumer);
int ret;
  
-	irqfd->producer = prod;

kvm_arch_start_assignment(irqfd->kvm);
ret = kvm_x86_ops.update_pi_irte(irqfd->kvm,
 prod->irq, irqfd->gsi, 1);
  
  	if (ret)

kvm_arch_end_assignment(irqfd->kvm);
+   else
+   irqfd->producer = prod;
  
  	return ret;

  }




Re: [PATCH 1/2] KVM: not register a IRQ bypass producer if unsupported or disabled

2020-10-20 Thread Jason Wang



On 2020/10/19 5:06 PM, Zhenzhong Duan wrote:

If posted interrupts are disabled due to a hardware limit or forcibly
disabled by the "intremap=nopost" parameter, return -EINVAL so that the
legacy-mode IRQ isn't registered as an IRQ bypass producer.



Is there any side effect if it was still registered?




With this change, below message is printed:
"vfio-pci :db:00.0: irq bypass producer (token 60c8cda5) registration 
fails: -22"



I may miss something, but the patch only touches vhost-vDPA instead of VFIO?

Thanks




...which also hints at whether a vfio or vdpa device works in PI mode or
legacy remapping mode.

Add a print to vdpa code just like what vfio_msi_set_vector_signal() does.

Signed-off-by: Zhenzhong Duan 
---
  arch/x86/kvm/svm/avic.c | 3 +--
  arch/x86/kvm/vmx/vmx.c  | 5 ++---
  drivers/vhost/vdpa.c| 5 +
  3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index ac830cd..316142a 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -814,7 +814,7 @@ int svm_update_pi_irte(struct kvm *kvm, unsigned int 
host_irq,
  
  	if (!kvm_arch_has_assigned_device(kvm) ||

!irq_remapping_cap(IRQ_POSTING_CAP))
-   return 0;
+   return ret;
  
  	pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",

 __func__, host_irq, guest_irq, set);
@@ -899,7 +899,6 @@ int svm_update_pi_irte(struct kvm *kvm, unsigned int 
host_irq,
}
}
  
-	ret = 0;

  out:
srcu_read_unlock(&kvm->irq_srcu, idx);
return ret;
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index f0a9954..1fed6d6 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7716,12 +7716,12 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned 
int host_irq,
struct kvm_lapic_irq irq;
struct kvm_vcpu *vcpu;
struct vcpu_data vcpu_info;
-   int idx, ret = 0;
+   int idx, ret = -EINVAL;
  
  	if (!kvm_arch_has_assigned_device(kvm) ||

!irq_remapping_cap(IRQ_POSTING_CAP) ||
!kvm_vcpu_apicv_active(kvm->vcpus[0]))
-   return 0;
+   return ret;
  
	idx = srcu_read_lock(&kvm->irq_srcu);

irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
@@ -7787,7 +7787,6 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned 
int host_irq,
}
}
  
-	ret = 0;

  out:
srcu_read_unlock(&kvm->irq_srcu, idx);
return ret;
diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 62a9bb0..b20060a 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -107,6 +107,11 @@ static void vhost_vdpa_setup_vq_irq(struct vhost_vdpa *v, 
u16 qid)
vq->call_ctx.producer.token = vq->call_ctx.ctx;
vq->call_ctx.producer.irq = irq;
ret = irq_bypass_register_producer(&vq->call_ctx.producer);
+   if (unlikely(ret))
+   dev_info(&v->dev,
+   "irq bypass producer (token %p) registration fails: %d\n",
+   vq->call_ctx.producer.token, ret);
+
spin_unlock(&vq->call_ctx.ctx_lock);
  }
  




Re: [PATCH net v2] Revert "virtio-net: ethtool configurable RXCSUM"

2020-10-20 Thread Jason Wang



On 2020/10/20 1:32 AM, Michael S. Tsirkin wrote:

This reverts commit 3618ad2a7c0e78e4258386394d5d5f92a3dbccf8.

When the device does not have a control vq (e.g. when using a
version of QEMU based on upstream v0.10 or older, or when specifying
ctrl_vq=off,ctrl_rx=off,ctrl_vlan=off,ctrl_rx_extra=off,ctrl_mac_addr=off
for the device on the QEMU command line), that commit causes a crash:

[   72.229171] kernel BUG at drivers/net/virtio_net.c:1667!
[   72.230266] invalid opcode:  [#1] PREEMPT SMP
[   72.231172] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 
5.9.0-rc8-02934-g3618ad2a7c0e7 #1
[   72.231172] EIP: virtnet_send_command+0x120/0x140
[   72.231172] Code: 00 0f 94 c0 8b 7d f0 65 33 3d 14 00 00 00 75 1c 8d 65 f4 
5b 5e 5f 5d c3 66 90 be 01 00 00 00 e9 6e ff ff ff 8d b6 00
+00 00 00 <0f> 0b e8 d9 bb 82 00 eb 17 8d b4 26 00 00 00 00 8d b4 26 00 00 00
[   72.231172] EAX: 000d EBX: f72895c0 ECX: 0017 EDX: 0011
[   72.231172] ESI: f7197800 EDI: ed69bd00 EBP: ed69bcf4 ESP: ed69bc98
[   72.231172] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00010246
[   72.231172] CR0: 80050033 CR2:  CR3: 02c84000 CR4: 000406f0
[   72.231172] Call Trace:
[   72.231172]  ? __virt_addr_valid+0x45/0x60
[   72.231172]  ? ___cache_free+0x51f/0x760
[   72.231172]  ? kobject_uevent_env+0xf4/0x560
[   72.231172]  virtnet_set_guest_offloads+0x4d/0x80
[   72.231172]  virtnet_set_features+0x85/0x120
[   72.231172]  ? virtnet_set_guest_offloads+0x80/0x80
[   72.231172]  __netdev_update_features+0x27a/0x8e0
[   72.231172]  ? kobject_uevent+0xa/0x20
[   72.231172]  ? netdev_register_kobject+0x12c/0x160
[   72.231172]  register_netdevice+0x4fe/0x740
[   72.231172]  register_netdev+0x1c/0x40
[   72.231172]  virtnet_probe+0x728/0xb60
[   72.231172]  ? _raw_spin_unlock+0x1d/0x40
[   72.231172]  ? virtio_vdpa_get_status+0x1c/0x20
[   72.231172]  virtio_dev_probe+0x1c6/0x271
[   72.231172]  really_probe+0x195/0x2e0
[   72.231172]  driver_probe_device+0x26/0x60
[   72.231172]  device_driver_attach+0x49/0x60
[   72.231172]  __driver_attach+0x46/0xc0
[   72.231172]  ? device_driver_attach+0x60/0x60
[   72.231172]  bus_add_driver+0x197/0x1c0
[   72.231172]  driver_register+0x66/0xc0
[   72.231172]  register_virtio_driver+0x1b/0x40
[   72.231172]  virtio_net_driver_init+0x61/0x86
[   72.231172]  ? veth_init+0x14/0x14
[   72.231172]  do_one_initcall+0x76/0x2e4
[   72.231172]  ? rdinit_setup+0x2a/0x2a
[   72.231172]  do_initcalls+0xb2/0xd5
[   72.231172]  kernel_init_freeable+0x14f/0x179
[   72.231172]  ? rest_init+0x100/0x100
[   72.231172]  kernel_init+0xd/0xe0
[   72.231172]  ret_from_fork+0x1c/0x30
[   72.231172] Modules linked in:
[   72.269563] ---[ end trace a6ebc4afea0e6cb1 ]---

The reason is that virtnet_set_features now calls virtnet_set_guest_offloads
unconditionally, it used to only call it when there is something
to configure.

If device does not have a control vq, everything breaks.

Looking at this some more, I noticed that it's not really checking the
hardware too much. E.g.

 if ((dev->features ^ features) & NETIF_F_LRO) {
 if (features & NETIF_F_LRO)
 offloads |= GUEST_OFFLOAD_LRO_MASK &
 vi->guest_offloads_capable;
 else
 offloads &= ~GUEST_OFFLOAD_LRO_MASK;
 }

and

 (1ULL << VIRTIO_NET_F_GUEST_TSO6) | \
 (1ULL << VIRTIO_NET_F_GUEST_ECN)  | \
 (1ULL << VIRTIO_NET_F_GUEST_UFO))

But there's no guarantee that e.g. VIRTIO_NET_F_GUEST_TSO6 is set.

If it isn't command should not send it.

Further

static int virtnet_set_features(struct net_device *dev,
 netdev_features_t features)
{
 struct virtnet_info *vi = netdev_priv(dev);
 u64 offloads = vi->guest_offloads;

seems wrong since guest_offloads is zero initialized,



I'm not sure I get this part.

Did you mean vi->guest_offloads?

We initialize it during probe

    for (i = 0; i < ARRAY_SIZE(guest_offloads); i++)
        if (virtio_has_feature(vi->vdev, guest_offloads[i]))
            set_bit(guest_offloads[i], &vi->guest_offloads);



it does not reflect the state after reset which comes from
the features.

Revert the original commit for now.

Cc: Tonghao Zhang 
Cc: Willem de Bruijn 
Fixes: 3618ad2a7c0e7 ("virtio-net: ethtool configurable RXCSUM")
Reported-by: kernel test robot 
Signed-off-by: Michael S. Tsirkin 
---

changes from v1:
- clarify how to reproduce the bug in the log


  drivers/net/virtio_net.c | 50 +++-
  1 file changed, 13 insertions(+), 37 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index d2d2c4a53cf2..21b71148c532 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -68,8 +68,6 @@ static const unsigned long guest_offloads[] = {
(1ULL << 

Re: [PATCH v3] i2c: virtio: add a virtio i2c frontend driver

2020-10-15 Thread Jason Wang



On 2020/10/14 4:37 PM, Jie Deng wrote:


On 2020/10/13 16:00, Jason Wang wrote:



+
+    virtqueue_kick(vq);
+
+    time_left = wait_for_completion_timeout(&vi->completion, adap->timeout);
+    if (!time_left) {
+    dev_err(&adap->dev, "msg[%d]: addr=0x%x timeout.\n", i, msgs[i].addr);

+    break;
+    }



You don't set error number here. Is this intended?

And using a timeout here is not good, and if the request is 
finished just after the timeout, in the next xfer you may hit the 
following check.


It's better to use an interrupt here.

Could you check the I2C drivers in the kernel? The "wait_for_completion_timeout" mechanism
is commonly used by I2C bus drivers in their i2c_algorithm.master_xfer.



There's a major difference between virtio-i2c and other drivers. In 
the case of virtio, the device could be a software device emulated 
by a remote process. This means the timeout might not be rare.


I don't see how timeout is properly handled in this patch (e.g did 
you notice that you don't set any error when timeout? or is this 
intended?)


The backend software may operate the physical device. The timeout 
depends on how the backend is designed.
Here if the timeout happens, it will return the actual number of 
messages successfully processed to the I2C core.

Let the I2C core decide what to do next.



So let's consider the following case:

1) driver:virtio_i2c_add_msg(msgA)
2) driver:timeout, and return to I2C core
3) driver:virtio_i2c_add_msg(msgB)
4) device: complete msgA
5) driver: virtqueue_get_buf() returns msgA, since the token is 
always vi->vmsg, the driver may think msgB has been completed.



If this case does happen, it is exactly a case where the condition
"((!vmsg) || (vmsg != &vi->vmsg))" is met.



I may be missing something, but you always use vi->vmsg as the token, so
vmsg is equal to &vi->vmsg here.



Currently, the timeout value is hard-coded in the driver. Generally 
speaking, timeout rarely happens.



Well, it's better not to have such an assumption, considering the device
could be an emulated one.




It can also be designed as a device configuration if needed.



In any case, the timeout should be handled correctly regardless of its 
frequency.





Thanks.



Thanks.








+
+    vmsg = (struct virtio_i2c_msg *)virtqueue_get_buf(vq, &len);

+    /* vmsg should point to the same address as &vi->vmsg */
+    if ((!vmsg) || (vmsg != &vi->vmsg)) {
+    dev_err(&adap->dev, "msg[%d]: addr=0x%x virtqueue error.\n",
+    i, msgs[i].addr);
+    i, msgs[i].addr);
+    break;
+    }



So I think we can remove this check. Considering only one descriptor
will be used at most, unless there's a bug in the device (and no
other driver does a similar check), we should not hit this.


Btw, as I replied in the previous version, the device should be
capable of dealing with a batch of requests through the virtqueue,
otherwise it's meaningless to use a queue here.


We should not assume there is no bug in the device. I don't think 
we can remove this check if we want our code to be robust.



Can you tell in which case you may hit !vmsg or vmsg != vi->vmsg?


Normally, it won't hit here. But the API "virtqueue_get_buf" tells me
"It *may *return NULL or the "data" token handed to virtqueue_add_*()."



Note that we had the following check already in
virtqueue_get_buf_ctx(), so the virtio core already has the
ability to figure out a wrong head.


    if (unlikely(id >= vq->packed.vring.num)) {
        BAD_RING(vq, "id %u out of range\n", id);
        return NULL;
    }
    if (unlikely(!vq->packed.desc_state[id].data)) {
        BAD_RING(vq, "id %u is not a head!\n", id);
        return NULL;
    }

And when it returns a NULL, it's not necessarily an error of the 
device, it might just require more time to finish the processing.




That's why we just returned the actual number of messages successfully 
processed in this case,

and let the I2C core try one more time.

Actually we have no idea if this is a device error or not. Trying one
more time can also fail if it is a backend error.
Of course, there is another option. We can return an error for a timeout,
no matter the reason.


Thanks.






From the perspective of a caller, I just don't care when it happens.
To make the code robust, what I care about is what I should do if
this is not our case, since the doc says it *may* happen.

If you insist on removing this check, I will remove "vmsg != 
vi->vmsg" and keep the check for !vmsg.

As Dan reported in v2, we should at least check here for NULL.

Thanks.




As I said, currently, we are using the virtqueue to send the msg 
one by one to the backend. The mechanism is described in the spec. 



Which part of the spec describes such a "one by one" mechanism? If
there is one, I'd happily give a NACK since it doesn't require a queue
to work, which conflicts with the concept of the virtqueue.

Re: [PATCH v3 2/2] vhost-vdpa: fix page pinning leakage in error path

2020-10-15 Thread Jason Wang



On 2020/10/14 7:42 AM, si-wei liu wrote:



So what I suggest is to fix the pinning leakage first and do the 
possible optimization on top (which is still questionable to me).
OK. Unfortunately, this was picked and got merged in upstream. So I 
will post a follow up patch set to 1) revert the commit to the 
original __get_free_page() implementation, and 2) fix the accounting 
and leakage on top. Will it be fine?



Fine.

Thanks



Re: [PATCH v3 2/2] vhost-vdpa: fix page pinning leakage in error path

2020-10-14 Thread Jason Wang



On 2020/10/14 2:52 PM, Michael S. Tsirkin wrote:

On Tue, Oct 13, 2020 at 04:42:59PM -0700, si-wei liu wrote:

On 10/9/2020 7:27 PM, Jason Wang wrote:

On 2020/10/3 1:02 PM, Si-Wei Liu wrote:

Pinned pages are not properly accounted particularly when
mapping error occurs on IOTLB update. Clean up dangling
pinned pages for the error path. As the inflight pinned
pages, specifically for memory region that strides across
multiple chunks, would need more than one free page for
book keeping and accounting. For simplicity, pin pages
for all memory in the IOVA range in one go rather than
have multiple pin_user_pages calls to make up the entire
region. This way it's easier to track and account the
pages already mapped, particularly for clean-up in the
error path.

Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend")
Signed-off-by: Si-Wei Liu
---
Changes in v3:
- Factor out vhost_vdpa_map() change to a separate patch

Changes in v2:
- Fix incorrect target SHA1 referenced

   drivers/vhost/vdpa.c | 119
++-
   1 file changed, 71 insertions(+), 48 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 0f27919..dad41dae 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -595,21 +595,19 @@ static int
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,
   struct vhost_dev *dev = &v->vdev;
   struct vhost_iotlb *iotlb = dev->iotlb;
   struct page **page_list;
-unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
+struct vm_area_struct **vmas;
   unsigned int gup_flags = FOLL_LONGTERM;
-unsigned long npages, cur_base, map_pfn, last_pfn = 0;
-unsigned long locked, lock_limit, pinned, i;
+unsigned long map_pfn, last_pfn = 0;
+unsigned long npages, lock_limit;
+unsigned long i, nmap = 0;
   u64 iova = msg->iova;
+long pinned;
   int ret = 0;
 if (vhost_iotlb_itree_first(iotlb, msg->iova,
   msg->iova + msg->size - 1))
   return -EEXIST;
   -page_list = (struct page **) __get_free_page(GFP_KERNEL);
-if (!page_list)
-return -ENOMEM;
-
   if (msg->perm & VHOST_ACCESS_WO)
   gup_flags |= FOLL_WRITE;
   @@ -617,61 +615,86 @@ static int
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,
   if (!npages)
   return -EINVAL;
   +page_list = kvmalloc_array(npages, sizeof(struct page *),
GFP_KERNEL);
+vmas = kvmalloc_array(npages, sizeof(struct vm_area_struct *),
+  GFP_KERNEL);

This will result in a high-order memory allocation, which is what the code
tried to avoid originally.

Using an unlimited size will cause a lot of side effects, considering a VM or
userspace may try to pin several TB of memory.

Hmmm, that's a good point. Indeed, if the guest memory demand is huge or the
host system is running short of free pages, kvmalloc will be problematic and
less efficient than the __get_free_page implementation.

OK so ... Jason, what's the plan?

How about you send a patchset with
1. revert this change
2. fix error handling leak



Works for me, but it looks like Si-Wei wants to do this.

So it's better for him to send the patchset.

Thanks









Re: [PATCH v3] i2c: virtio: add a virtio i2c frontend driver

2020-10-13 Thread Jason Wang



On 2020/10/13 3:16 PM, Jie Deng wrote:


On 2020/10/12 11:43, Jason Wang wrote:


On 2020/10/12 10:45 AM, Jie Deng wrote:



On 2020/10/10 11:14, Jason Wang wrote:



+
+    virtqueue_kick(vq);
+
+    time_left = wait_for_completion_timeout(&vi->completion, adap->timeout);
+    if (!time_left) {
+    dev_err(&adap->dev, "msg[%d]: addr=0x%x timeout.\n", i, msgs[i].addr);

+    break;
+    }



You don't set error number here. Is this intended?

And using a timeout here is not good, and if the request is 
finished just after the timeout, in the next xfer you may hit the 
following check.


It's better to use an interrupt here.

Could you check the I2C drivers in the kernel? The "wait_for_completion_timeout" mechanism

is commonly used by I2C bus drivers in their i2c_algorithm.master_xfer.



There's a major difference between virtio-i2c and other drivers. In 
the case of virtio, the device could be a software device emulated by 
a remote process. This means the timeout might not be rare.


I don't see how timeout is properly handled in this patch (e.g did 
you notice that you don't set any error when timeout? or is this 
intended?)


The backend software may operate the physical device. The timeout 
depends on how the backend is designed.
Here if the timeout happens, it will return the actual number of 
messages successfully processed to the I2C core.

Let the I2C core decide what to do next.



So let's consider the following case:

1) driver:virtio_i2c_add_msg(msgA)
2) driver:timeout, and return to I2C core
3) driver:virtio_i2c_add_msg(msgB)
4) device: complete msgA
5) driver: virtqueue_get_buf() returns msgA, since the token is always 
vi->vmsg, the driver may think msgB has been completed.





Thanks.








+
+    vmsg = (struct virtio_i2c_msg *)virtqueue_get_buf(vq, &len);
+    /* vmsg should point to the same address as &vi->vmsg */
+    if ((!vmsg) || (vmsg != &vi->vmsg)) {
+    dev_err(&adap->dev, "msg[%d]: addr=0x%x virtqueue error.\n",
+    i, msgs[i].addr);
+    i, msgs[i].addr);
+    break;
+    }



So I think we can remove this check. Considering only one descriptor
will be used at most, unless there's a bug in the device (and no
other driver does a similar check), we should not hit this.


Btw, as I replied in the previous version, the device should be
capable of dealing with a batch of requests through the virtqueue,
otherwise it's meaningless to use a queue here.


We should not assume there is no bug in the device. I don't think we 
can remove this check if we want our code to be robust.



Can you tell in which case you may hit !vmsg or vmsg != vi->vmsg?


Normally, it won't hit here. But the API "virtqueue_get_buf" tells me
"It *may *return NULL or the "data" token handed to virtqueue_add_*()."



Note that we had the following check already in virtqueue_get_buf_ctx(),
so the virtio core already has the ability to figure out a wrong head.


    if (unlikely(id >= vq->packed.vring.num)) {
        BAD_RING(vq, "id %u out of range\n", id);
        return NULL;
    }
    if (unlikely(!vq->packed.desc_state[id].data)) {
        BAD_RING(vq, "id %u is not a head!\n", id);
        return NULL;
    }

And when it returns a NULL, it's not necessarily an error of the device, 
it might just require more time to finish the processing.





From the perspective of a caller, I just don't care when it happens.
To make the code robust, what I care about is what I should do if this
is not our case, since the doc says it *may* happen.

If you insist on removing this check, I will remove "vmsg != vi->vmsg" 
and keep the check for !vmsg.

As Dan reported in v2, we should at least check here for NULL.

Thanks.




As I said, currently, we are using the virtqueue to send the msg one 
by one to the backend. The mechanism is described in the spec. 



Which part of the spec describes such a "one by one" mechanism? If
there is one, I'd happily give a NACK since it doesn't require a
queue to work, which conflicts with the concept of the virtqueue.



What's the concept of the virtqueue? Why do you want to restrict how
users use the virtqueue?



So I think there's some misunderstanding here. The point is not to 
restrict how to use virtqueue.


What I meant is:

- we should not invent a device with a virtqueue that can only accept
one buffer at a time
- I don't see any mechanism like "one by one" described in the spec, so
it's OK, but if it happens to have one, I will NACK





It's like you provide a water glass to a user. The user can fill a full
glass of water and drink once, or fill half a glass of water and drink
twice. It is user behavior and should not be restricted by the glass
provider.



That's my point as well, we should not describe the "once" behavior in 
the spec.





Thanks.
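
To make the batching point concrete, a sketch of submitting a whole i2c_msg array before a single kick, assuming a per-message virtio_i2c_msg array rather than the patch's single vi->vmsg (entirely illustrative):

	/* queue every message first ... */
	for (i = 0; i < num; i++) {
		ret = virtio_i2c_add_msg(vq, &vmsgs[i], &msgs[i]);
		if (ret)
			break;
	}
	/* ... then notify the device once for the whole batch */
	virtqueue_kick(vq);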

Re: [RFC PATCH 10/24] vdpa: introduce config operations for associating ASID to a virtqueue group

2020-10-12 Thread Jason Wang



On 2020/10/12 4:17 PM, Eli Cohen wrote:

On Mon, Oct 12, 2020 at 03:45:10PM +0800, Jason Wang wrote:

So in theory we can have several asid's (for different virtqueues), each
one should be followed by a specific set_map call. If this is so, how do
I know if I met all the conditions run my driver? Maybe we need another
callback to let the driver know it should not expect more set_maps().


This should work similarly to the past. Two parts of the work are expected
to be done by the driver:

1) store the mapping somewhere (e.g hardware) during set_map()
2) associating mapping with a specific virtqueue

The only difference is that more than one mapping is used now.

ok, so like today, I will always get DRIVER_OK after I got all the
set_maps(), right?



Yes.

Thanks





For the issue of more set_map() calls, the driver should always be ready for
a new set_map() call instead of not expecting one, since the guest memory
topology could change for several reasons.

QEMU or vhost-vDPA will try their best to reduce the frequency of set_map()
calls for better performance (e.g. through batched IOTLB updating). E.g.
there should be at most one set_map() during a single guest boot.






Re: [RFC PATCH 10/24] vdpa: introduce config operations for associating ASID to a virtqueue group

2020-10-12 Thread Jason Wang



On 2020/10/12 2:59 PM, Eli Cohen wrote:

On Fri, Oct 09, 2020 at 11:56:45AM +0800, Jason Wang wrote:

On 2020/10/1 9:29 PM, Eli Cohen wrote:

On Thu, Sep 24, 2020 at 11:21:11AM +0800, Jason Wang wrote:

This patch introduces a new bus operation to allow the vDPA bus driver
to associate an ASID to a virtqueue group.


So in case of virtio_net, I would expect that all the data virtqueues
will be associated with the same address space identifier.


Right.

I will add the code to do this in the next version. It should be more
explicit than having this assumption by default.



Moreover,
this assignment should be provided before the set_map call that provides
the iotlb for the address space, correct?


I think it's better not to have this limitation; note that set_map() now
takes an asid argument.

So for hardware, if the associated address space is changed, the driver
needs to program the hardware to switch to the new mapping.

Does this work for mlx5?


So in theory we can have several asids (for different virtqueues); each
one should be followed by a specific set_map call. If this is so, how do
I know if I have met all the conditions to run my driver? Maybe we need
another callback to let the driver know it should not expect more
set_map() calls.



This should work similarly to the past. Two parts of the work are
expected to be done by the driver:


1) store the mapping somewhere (e.g hardware) during set_map()
2) associating mapping with a specific virtqueue

The only difference is that more than one mapping is used now.

For the issue of more set_map() calls, the driver should always be ready
for a new set_map() call instead of not expecting one, since the guest
memory topology could change for several reasons.


QEMU or vhost-vDPA will try their best to reduce the frequency of
set_map() calls for better performance (e.g. through batched IOTLB
updating). E.g. there should be at most one set_map() during a single
guest boot.


Thanks
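
A sketch of the two driver-side steps described above, using the asid-aware op signatures from the RFC; the helper names and bodies are purely illustrative:

	/* 1) store the mapping for this address space when set_map() runs */
	static int my_vdpa_set_map(struct vdpa_device *vdev, unsigned int asid,
				   struct vhost_iotlb *iotlb)
	{
		/* program the on-chip IOMMU tables for @asid from @iotlb */
		return my_hw_program_iommu(vdev, asid, iotlb);
	}

	/* 2) associate a virtqueue group with the address space */
	static int my_vdpa_set_group_asid(struct vdpa_device *vdev,
					  unsigned int group, unsigned int asid)
	{
		return my_hw_bind_group(vdev, group, asid);
	}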








Re: [PATCH v3] i2c: virtio: add a virtio i2c frontend driver

2020-10-11 Thread Jason Wang



On 2020/10/12 10:45 AM, Jie Deng wrote:



On 2020/10/10 11:14, Jason Wang wrote:



+
+    virtqueue_kick(vq);
+
+    time_left = wait_for_completion_timeout(&vi->completion, adap->timeout);
+    if (!time_left) {
+    dev_err(&adap->dev, "msg[%d]: addr=0x%x timeout.\n", i, msgs[i].addr);

+    break;
+    }



You don't set error number here. Is this intended?

And using a timeout here is not good, and if the request is finished 
just after the timeout, in the next xfer you may hit the following 
check.


It's better to use an interrupt here.

Could you check the I2C drivers in the kernel? The "wait_for_completion_timeout" mechanism

is commonly used by I2C bus drivers in their i2c_algorithm.master_xfer.



There's a major difference between virtio-i2c and other drivers. In the 
case of virtio, the device could be a software device emulated by a 
remote process. This means the timeout might not be rare.


I don't see how timeout is properly handled in this patch (e.g did you 
notice that you don't set any error when timeout? or is this intended?)
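
For reference, a sketch of a timeout path that reports an explicit error instead of silently breaking out (the names follow the patch; choosing -ETIMEDOUT here is illustrative):

	time_left = wait_for_completion_timeout(&vi->completion, adap->timeout);
	if (!time_left) {
		dev_err(&adap->dev, "msg[%d]: addr=0x%x timeout.\n",
			i, msgs[i].addr);
		ret = -ETIMEDOUT;	/* surface the failure to the I2C core */
		break;
	}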








+
+    vmsg = (struct virtio_i2c_msg *)virtqueue_get_buf(vq, &len);
+    /* vmsg should point to the same address as &vi->vmsg */
+    if ((!vmsg) || (vmsg != &vi->vmsg)) {
+    dev_err(&adap->dev, "msg[%d]: addr=0x%x virtqueue error.\n",
+    i, msgs[i].addr);
+    i, msgs[i].addr);
+    break;
+    }



So I think we can remove this check. Considering only one descriptor
will be used at most, unless there's a bug in the device (and no
other driver does a similar check), we should not hit this.


Btw, as I replied in the previous version, the device should be
capable of dealing with a batch of requests through the virtqueue,
otherwise it's meaningless to use a queue here.


We should not assume there is no bug in the device. I don't think we 
can remove this check if we want our code to be robust.



Can you tell in which case you may hit !vmsg or vmsg != vi->vmsg?



As I said, currently, we are using the virtqueue to send the msg one 
by one to the backend. The mechanism is described in the spec. 



Which part of the spec describes such a "one by one" mechanism? If there
is one, I'd happily give a NACK since it doesn't require a queue to work,
which conflicts with the concept of the virtqueue.




Thanks.





+

+
+#ifndef _UAPI_LINUX_VIRTIO_I2C_H
+#define _UAPI_LINUX_VIRTIO_I2C_H
+
+#include 
+#include 
+#include 
+
+/**
+ * struct virtio_i2c_hdr - the virtio I2C message header structure
+ * @addr: i2c_msg addr, the slave address
+ * @flags: i2c_msg flags
+ * @len: i2c_msg len
+ */
+struct virtio_i2c_hdr {
+    __le16 addr;
+    __le16 flags;
+    __le16 len;
+};



I'm afraid this is not complete. E.g the status is missed.

I suspect what virtio-scsi use is better. Which split the in from the 
out instead of reusing the same buffer. And it can ease the uAPI 
header export.


Thanks




I think the following definition in the uAPI for the status is enough.
There is no need to provide a "u8" status in the structure.

/* The final status written by the device */
#define VIRTIO_I2C_MSG_OK    0
#define VIRTIO_I2C_MSG_ERR    1

You can see an example in virtio_blk.

In the spec:

struct virtio_blk_req {
le32 type;
le32 reserved;
le64 sector;
u8 data[];
u8 status;
};

In virtio_blk.h, there is only following definitions.

#define VIRTIO_BLK_S_OK        0
#define VIRTIO_BLK_S_IOERR    1
#define VIRTIO_BLK_S_UNSUPP    2



virtio-blk is a bad example; it's just too late to fix. Any newly
introduced uAPI should be a complete one.


Thanks



Thanks.







Re: [PATCH] vdpa/mlx5: should keep avail_index despite device status

2020-10-10 Thread Jason Wang



On 2020/10/2 4:18 AM, Si-Wei Liu wrote:

A VM with mlx5 vDPA has below warnings while being reset:

vhost VQ 0 ring restore failed: -1: Resource temporarily unavailable (11)
vhost VQ 1 ring restore failed: -1: Resource temporarily unavailable (11)

We should allow userspace emulating the virtio device to be
able to get the vq's avail_index, regardless of vDPA device
status. Save the index that was last seen when the virtq was
stopped, so that userspace doesn't complain.

Signed-off-by: Si-Wei Liu 



Acked-by: Jason Wang 



---
  drivers/vdpa/mlx5/net/mlx5_vnet.c | 20 ++--
  1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 70676a6..74264e59 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1133,15 +1133,17 @@ static void suspend_vq(struct mlx5_vdpa_net *ndev, 
struct mlx5_vdpa_virtqueue *m
if (!mvq->initialized)
return;
  
-	if (query_virtqueue(ndev, mvq, &attr)) {

-   mlx5_vdpa_warn(&ndev->mvdev, "failed to query virtqueue\n");
-   return;
-   }
if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY)
return;
  
  	if (modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND))

mlx5_vdpa_warn(&ndev->mvdev, "modify to suspend failed\n");
+
+   if (query_virtqueue(ndev, mvq, &attr)) {
+   mlx5_vdpa_warn(&ndev->mvdev, "failed to query virtqueue\n");
+   return;
+   }
+   mvq->avail_idx = attr.available_index;
  }
  
  static void suspend_vqs(struct mlx5_vdpa_net *ndev)

@@ -1411,8 +1413,14 @@ static int mlx5_vdpa_get_vq_state(struct vdpa_device 
*vdev, u16 idx, struct vdpa
struct mlx5_virtq_attr attr;
int err;
  
-	if (!mvq->initialized)

-   return -EAGAIN;
+   /* If the virtq object was destroyed, use the value saved at
+* the last minute of suspend_vq. This caters for userspace
+* that cares about emulating the index after vq is stopped.
+*/
+   if (!mvq->initialized) {
+   state->avail_index = mvq->avail_idx;
+   return 0;
+   }
  
err = query_virtqueue(ndev, mvq, &attr);

if (err) {




Re: [PATCH v3] i2c: virtio: add a virtio i2c frontend driver

2020-10-09 Thread Jason Wang



On 2020/9/22 10:58 AM, Jie Deng wrote:

Add an I2C bus driver for virtio para-virtualization.

The controller can be emulated by the backend driver in
any device model software by following the virtio protocol.

This driver communicates with the backend driver through a
virtio I2C message structure which includes following parts:

- Header: i2c_msg addr, flags, len.
- Data buffer: the pointer to the I2C msg data.
- Status: the processing result from the backend.

People may implement different backend drivers to emulate
different controllers according to their needs. A backend
example can be found in the device model of the open source
project ACRN. For more information, please refer to
https://projectacrn.org.

The virtio device ID 34 is used for this I2C adapter since IDs
before 34 have been reserved by other virtio devices.

Co-developed-by: Conghui Chen 
Signed-off-by: Conghui Chen 
Signed-off-by: Jie Deng 
Reviewed-by: Shuo Liu 
Reviewed-by: Andy Shevchenko 
---
The device ID request:
 https://github.com/oasis-tcs/virtio-spec/issues/85

The specification:

https://lists.oasis-open.org/archives/virtio-comment/202009/msg00021.html

Changes in v3:
 - Move the interface into uAPI according to Jason.
 - Fix issues reported by Dan Carpenter.
- Fix typo reported by Randy.

Changes in v2:
 - Addressed comments received from Michael, Andy and Jason.

  drivers/i2c/busses/Kconfig  |  11 ++
  drivers/i2c/busses/Makefile |   3 +
  drivers/i2c/busses/i2c-virtio.c | 256 
  include/uapi/linux/virtio_i2c.h |  31 +
  include/uapi/linux/virtio_ids.h |   1 +
  5 files changed, 302 insertions(+)
  create mode 100644 drivers/i2c/busses/i2c-virtio.c
  create mode 100644 include/uapi/linux/virtio_i2c.h

diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig
index 293e7a0..f2f6543 100644
--- a/drivers/i2c/busses/Kconfig
+++ b/drivers/i2c/busses/Kconfig
@@ -21,6 +21,17 @@ config I2C_ALI1535
  This driver can also be built as a module.  If so, the module
  will be called i2c-ali1535.
  
+config I2C_VIRTIO

+   tristate "Virtio I2C Adapter"
+   depends on VIRTIO
+   help
+ If you say yes to this option, support will be included for the virtio
+ I2C adapter driver. The hardware can be emulated by any device model
+ software according to the virtio protocol.
+
+ This driver can also be built as a module. If so, the module
+ will be called i2c-virtio.
+
  config I2C_ALI1563
tristate "ALI 1563"
depends on PCI
diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile
index 19aff0e..821acfa 100644
--- a/drivers/i2c/busses/Makefile
+++ b/drivers/i2c/busses/Makefile
@@ -6,6 +6,9 @@
  # ACPI drivers
  obj-$(CONFIG_I2C_SCMI)+= i2c-scmi.o
  
+# VIRTIO I2C host controller driver

+obj-$(CONFIG_I2C_VIRTIO)   += i2c-virtio.o
+
  # PC SMBus host controller drivers
  obj-$(CONFIG_I2C_ALI1535) += i2c-ali1535.o
  obj-$(CONFIG_I2C_ALI1563) += i2c-ali1563.o
diff --git a/drivers/i2c/busses/i2c-virtio.c b/drivers/i2c/busses/i2c-virtio.c
new file mode 100644
index 000..48fd780
--- /dev/null
+++ b/drivers/i2c/busses/i2c-virtio.c
@@ -0,0 +1,256 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Virtio I2C Bus Driver
+ *
+ * Copyright (c) 2020 Intel Corporation. All rights reserved.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+/**
+ * struct virtio_i2c_msg - the virtio I2C message structure
+ * @hdr: the virtio I2C message header
+ * @buf: virtio I2C message data buffer
+ * @status: the processing result from the backend
+ */
+struct virtio_i2c_msg {
+   struct virtio_i2c_hdr hdr;
+   u8 *buf;
+   u8 status;
+};
+
+/**
+ * struct virtio_i2c - virtio I2C data
+ * @vdev: virtio device for this controller
+ * @completion: completion of virtio I2C message
+ * @vmsg: the virtio I2C message for communication
+ * @adap: I2C adapter for this controller
+ * @i2c_lock: lock for virtqueue processing
+ * @vq: the virtio virtqueue for communication
+ */
+struct virtio_i2c {
+   struct virtio_device *vdev;
+   struct completion completion;
+   struct virtio_i2c_msg vmsg;
+   struct i2c_adapter adap;
+   struct mutex i2c_lock;
+   struct virtqueue *vq;
+};
+
+static void virtio_i2c_msg_done(struct virtqueue *vq)
+{
+   struct virtio_i2c *vi = vq->vdev->priv;
+
+   complete(&vi->completion);
+}
+
+static int virtio_i2c_add_msg(struct virtqueue *vq,
+ struct virtio_i2c_msg *vmsg,
+ struct i2c_msg *msg)
+{
+   struct scatterlist *sgs[3], hdr, bout, bin, status;
+   int outcnt = 0, incnt = 0;
+
+   if (!msg->len)
+   return -EINVAL;
+
+   vmsg->hdr.addr = cpu_to_le16(msg->addr);
+   vmsg->hdr.flags = 

Re: [PATCH v3 3/3] vhost: Don't call log_access_ok() when using IOTLB

2020-10-09 Thread Jason Wang



On 2020/10/3 6:02 PM, Greg Kurz wrote:

When the IOTLB device is enabled, the log_guest_addr that is passed by
userspace to the VHOST_SET_VRING_ADDR ioctl, and which is then written
to vq->log_addr, is a GIOVA. All writes to this address are translated
by log_user() to writes to an HVA, and then ultimately logged through
the corresponding GPAs in log_write_hva(). No logging will ever occur
with vq->log_addr in this case. It is thus wrong to pass vq->log_addr
and log_guest_addr to log_access_vq() which assumes they are actual
GPAs.

Introduce a new vq_log_used_access_ok() helper that only checks accesses
to the log for the used structure when there isn't an IOTLB device around.

Signed-off-by: Greg Kurz 



Acked-by: Jason Wang 

In the future, we may consider deprecating log_guest_addr since in any
case, regardless of IOTLB enablement, we can get the GPA from either the
IOTLB or the MEM table.


Thanks



---
  drivers/vhost/vhost.c |   23 ++-
  1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 9d2c225fb518..9ad45e1d27f0 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1370,6 +1370,20 @@ bool vhost_log_access_ok(struct vhost_dev *dev)
  }
  EXPORT_SYMBOL_GPL(vhost_log_access_ok);
  
+static bool vq_log_used_access_ok(struct vhost_virtqueue *vq,

+ void __user *log_base,
+ bool log_used,
+ u64 log_addr)
+{
+   /* If an IOTLB device is present, log_addr is a GIOVA that
+* will never be logged by log_used(). */
+   if (vq->iotlb)
+   return true;
+
+   return !log_used || log_access_ok(log_base, log_addr,
+ vhost_get_used_size(vq, vq->num));
+}
+
  /* Verify access for write logging. */
  /* Caller should have vq mutex and device mutex */
  static bool vq_log_access_ok(struct vhost_virtqueue *vq,
@@ -1377,8 +1391,7 @@ static bool vq_log_access_ok(struct vhost_virtqueue *vq,
  {
return vq_memory_access_ok(log_base, vq->umem,
   vhost_has_feature(vq, VHOST_F_LOG_ALL)) &&
-   (!vq->log_used || log_access_ok(log_base, vq->log_addr,
- vhost_get_used_size(vq, vq->num)));
+   vq_log_used_access_ok(vq, log_base, vq->log_used, vq->log_addr);
  }
  
  /* Can we start vq? */

@@ -1517,9 +1530,9 @@ static long vhost_vring_set_addr(struct vhost_dev *d,
return -EINVAL;
  
  		/* Also validate log access for used ring if enabled. */

-   if ((a.flags & (0x1 << VHOST_VRING_F_LOG)) &&
-   !log_access_ok(vq->log_base, a.log_guest_addr,
-  vhost_get_used_size(vq, vq->num)))
+   if (!vq_log_used_access_ok(vq, vq->log_base,
+   a.flags & (0x1 << VHOST_VRING_F_LOG),
+   a.log_guest_addr))
return -EINVAL;
}
  







Re: [PATCH v3 2/3] vhost: Use vhost_get_used_size() in vhost_vring_set_addr()

2020-10-09 Thread Jason Wang



On 2020/10/3 6:02 PM, Greg Kurz wrote:

The open-coded computation of the used size doesn't take the event
into account when the VIRTIO_RING_F_EVENT_IDX feature is present.
Fix that by using vhost_get_used_size().

Signed-off-by: Greg Kurz 
---
  drivers/vhost/vhost.c |3 +--
  1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index c3b49975dc28..9d2c225fb518 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1519,8 +1519,7 @@ static long vhost_vring_set_addr(struct vhost_dev *d,
/* Also validate log access for used ring if enabled. */
if ((a.flags & (0x1 << VHOST_VRING_F_LOG)) &&
!log_access_ok(vq->log_base, a.log_guest_addr,
-   sizeof *vq->used +
-   vq->num * sizeof *vq->used->ring))
+  vhost_get_used_size(vq, vq->num)))
return -EINVAL;
    }
  





Acked-by: Jason Wang 
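
A side note for readers: the used-ring size that vhost_get_used_size()
accounts for follows directly from the virtio ring layout. A standalone
sketch of the computation (illustrative, not the kernel's code):

#include <stddef.h>

/* The used ring is: __virtio16 flags, __virtio16 idx, then num
 * 8-byte used elements (__virtio32 id + __virtio32 len), plus a
 * trailing __virtio16 avail_event when VIRTIO_RING_F_EVENT_IDX is
 * negotiated -- the part the open-coded version missed. */
static size_t used_ring_size(unsigned int num, int event_idx)
{
	return 2 + 2 + num * 8 + (event_idx ? 2 : 0);
}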




Re: [PATCH v3 2/2] vhost-vdpa: fix page pinning leakage in error path

2020-10-09 Thread Jason Wang



On 2020/10/3 下午1:02, Si-Wei Liu wrote:

Pinned pages are not properly accounted, particularly when a mapping
error occurs on IOTLB update. Clean up dangling pinned pages in the
error path. Since inflight pinned pages, specifically for a memory
region that strides across multiple chunks, would need more than one
free page for bookkeeping and accounting, for simplicity pin the pages
for all memory in the IOVA range in one go rather than make multiple
pin_user_pages() calls for the region. This way it's easier to track
and account for the pages already mapped, particularly for clean-up in
the error path.

Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend")
Signed-off-by: Si-Wei Liu 
---
Changes in v3:
- Factor out vhost_vdpa_map() change to a separate patch

Changes in v2:
- Fix incorrect target SHA1 referenced

  drivers/vhost/vdpa.c | 119 ++-
  1 file changed, 71 insertions(+), 48 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 0f27919..dad41dae 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -595,21 +595,19 @@ static int vhost_vdpa_process_iotlb_update(struct 
vhost_vdpa *v,
	struct vhost_dev *dev = &v->vdev;
struct vhost_iotlb *iotlb = dev->iotlb;
struct page **page_list;
-   unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
+   struct vm_area_struct **vmas;
unsigned int gup_flags = FOLL_LONGTERM;
-   unsigned long npages, cur_base, map_pfn, last_pfn = 0;
-   unsigned long locked, lock_limit, pinned, i;
+   unsigned long map_pfn, last_pfn = 0;
+   unsigned long npages, lock_limit;
+   unsigned long i, nmap = 0;
u64 iova = msg->iova;
+   long pinned;
int ret = 0;
  
  	if (vhost_iotlb_itree_first(iotlb, msg->iova,

msg->iova + msg->size - 1))
return -EEXIST;
  
-	page_list = (struct page **) __get_free_page(GFP_KERNEL);

-   if (!page_list)
-   return -ENOMEM;
-
if (msg->perm & VHOST_ACCESS_WO)
gup_flags |= FOLL_WRITE;
  
@@ -617,61 +615,86 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

if (!npages)
return -EINVAL;
  
+	page_list = kvmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);

+   vmas = kvmalloc_array(npages, sizeof(struct vm_area_struct *),
+ GFP_KERNEL);



This will result in a high-order memory allocation, which is what the 
code originally tried to avoid.


Using an unlimited size will have a lot of side effects, considering a 
VM or userspace may try to pin several TB of memory.
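
To put that in numbers (back-of-the-envelope, assuming 4 KiB pages and
8-byte pointers):

#include <stdio.h>

int main(void)
{
	unsigned long long region = 1ULL << 40;		/* 1 TiB mapping */
	unsigned long long npages = region / 4096;	/* ~268M pages */
	unsigned long long bytes = npages * 8;		/* struct page *[] */

	/* ~2 GiB of page pointers for a single kvmalloc_array() call,
	 * versus the one page the original bounded loop allocated. */
	printf("%llu pages -> %llu bytes of page pointers\n", npages, bytes);
	return 0;
}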




+   if (!page_list || !vmas) {
+   ret = -ENOMEM;
+   goto free;
+   }



Any reason that you want to use vmas?



+
mmap_read_lock(dev->mm);
  
-	locked = atomic64_add_return(npages, &dev->mm->pinned_vm);

lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-   if (locked > lock_limit) {
+   if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
ret = -ENOMEM;
-   goto out;
+   goto unlock;
}
  
-	cur_base = msg->uaddr & PAGE_MASK;

-   iova &= PAGE_MASK;
+   pinned = pin_user_pages(msg->uaddr & PAGE_MASK, npages, gup_flags,
+   page_list, vmas);
+   if (npages != pinned) {
+   if (pinned < 0) {
+   ret = pinned;
+   } else {
+   unpin_user_pages(page_list, pinned);
+   ret = -ENOMEM;
+   }
+   goto unlock;
+   }
  
-	while (npages) {

-   pinned = min_t(unsigned long, npages, list_size);
-   ret = pin_user_pages(cur_base, pinned,
-gup_flags, page_list, NULL);
-   if (ret != pinned)
-   goto out;
-
-   if (!last_pfn)
-   map_pfn = page_to_pfn(page_list[0]);
-
-   for (i = 0; i < ret; i++) {
-   unsigned long this_pfn = page_to_pfn(page_list[i]);
-   u64 csize;
-
-   if (last_pfn && (this_pfn != last_pfn + 1)) {
-   /* Pin a contiguous chunk of memory */
-   csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
-   if (vhost_vdpa_map(v, iova, csize,
-  map_pfn << PAGE_SHIFT,
-  msg->perm))
-   goto out;
-   map_pfn = this_pfn;
-   iova += csize;
+   iova &= PAGE_MASK;
+   map_pfn = page_to_pfn(page_list[0]);
+
+   /* One more iteration to avoid extra vdpa_map() call out of loop. */
+   for (i = 0; i <= npages; i++) {
+   

Re: [PATCH v3 1/2] vhost-vdpa: fix vhost_vdpa_map() on error condition

2020-10-09 Thread Jason Wang



On 2020/10/3 下午1:02, Si-Wei Liu wrote:

vhost_vdpa_map() should remove the iotlb entry just added
if the corresponding mapping fails to set up properly.

Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend")
Signed-off-by: Si-Wei Liu 
---
  drivers/vhost/vdpa.c | 3 +++
  1 file changed, 3 insertions(+)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 796fe97..0f27919 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -565,6 +565,9 @@ static int vhost_vdpa_map(struct vhost_vdpa *v,
  perm_to_iommu_flags(perm));
}
  
+	if (r)

+   vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1);
+
return r;
  }
  



Acked-by: Jason Wang 




Re: [PATCH v2] vringh: fix __vringh_iov() when riov and wiov are different

2020-10-08 Thread Jason Wang



On 2020/10/9 上午4:42, Stefano Garzarella wrote:

If riov and wiov are both defined and they point to different
objects, only riov is initialized. If the wiov is not initialized
by the caller, the function fails, returning -EINVAL and printing the
"Readable desc 0x... after writable" error message.

This issue happens when descriptors have both readable and writable
buffers (e.g. virtio-blk devices have the virtio_blk_outhdr in the
readable buffer and the status as the last byte of the writable
buffer) and we call __vringh_iov() to get both types of buffers in
two different iovecs.

Let's replace the 'else if' clause with 'if' to initialize both
riov and wiov if they are not NULL.

As checkpatch pointed out, we also avoid crashing the kernel
when riov and wiov are both NULL, replacing BUG() with WARN_ON()
and returning -EINVAL.



It looks like I hit a very similar issue when developing ctrl vq 
support (which requires both READ and WRITE descriptors).


While I was trying to fix the issue I found the following comment:

 * Note that you may need to clean up riov and wiov, even on error!
 */
int vringh_getdesc_iotlb(struct vringh *vrh,

I saw some drivers call vringh_kiov_cleanup(), so I just followed that 
approach.

I'm not quite sure which one is better.

Thanks




Fixes: f87d0fbb5798 ("vringh: host-side implementation of virtio rings.")
Cc: sta...@vger.kernel.org
Signed-off-by: Stefano Garzarella 
---
  drivers/vhost/vringh.c | 9 +++++----
  1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c
index e059a9a47cdf..8bd8b403f087 100644
--- a/drivers/vhost/vringh.c
+++ b/drivers/vhost/vringh.c
@@ -284,13 +284,14 @@ __vringh_iov(struct vringh *vrh, u16 i,
desc_max = vrh->vring.num;
up_next = -1;
  
+	/* You must want something! */

+   if (WARN_ON(!riov && !wiov))
+   return -EINVAL;
+
if (riov)
riov->i = riov->used = 0;
-   else if (wiov)
+   if (wiov)
wiov->i = wiov->used = 0;
-   else
-   /* You must want something! */
-   BUG();
  
  	for (;;) {

void *addr;
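
To illustrate the failure mode Stefano describes, here is a toy,
self-contained version of the readable/writable split (the structures
below are made up for the example and are not the vringh API).
Readable descriptors must precede writable ones in a chain; before the
fix, a stale wiov->used left over from a previous call could make the
first readable descriptor look like a violation.

#include <stddef.h>
#include <stdint.h>

#define DESC_F_WRITE 2	/* same value as VRING_DESC_F_WRITE */

struct desc { uint64_t addr; uint32_t len; uint16_t flags; };
struct iov { struct desc *v; size_t used, cap; };

static int split_chain(const struct desc *chain, size_t n,
		       struct iov *riov, struct iov *wiov)
{
	size_t i;

	/* You must want something! */
	if (!riov && !wiov)
		return -1;
	if (riov)
		riov->used = 0;
	if (wiov)		/* the fix: 'if', not 'else if' */
		wiov->used = 0;

	for (i = 0; i < n; i++) {
		int writable = chain[i].flags & DESC_F_WRITE;
		struct iov *dst = writable ? wiov : riov;

		if (!writable && wiov && wiov->used)
			return -1;	/* readable desc after writable */
		if (!dst || dst->used >= dst->cap)
			return -1;
		dst->v[dst->used++] = chain[i];
	}
	return 0;
}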




Re: [RFC PATCH 18/24] vhost-vdpa: support ASID based IOTLB API

2020-10-08 Thread Jason Wang



On 2020/9/28 下午11:44, Eugenio Perez Martin wrote:

-u64 iova, u64 size)
+static int vhost_vdpa_unmap(struct vhost_vdpa *v,
+   struct vhost_iotlb *iotlb,
+   u64 iova, u64 size)
  {
 struct vdpa_device *vdpa = v->vdpa;
 const struct vdpa_config_ops *ops = vdpa->config;
+   u32 asid = iotlb_to_asid(iotlb);
+
+   if (!iotlb)
+   return -EINVAL;

This should be reordered to check for (!iotlb) before its use at `asid =
iotlb_to_asid()`, shouldn't it?

Thanks!



Yes, will fix in the next version.

Thanks
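
For clarity, the reordering being agreed on is simply (a sketch):

	u32 asid;

	if (!iotlb)
		return -EINVAL;

	asid = iotlb_to_asid(iotlb);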



Re: [RFC PATCH 10/24] vdpa: introduce config operations for associating ASID to a virtqueue group

2020-10-08 Thread Jason Wang



On 2020/10/1 下午9:29, Eli Cohen wrote:

On Thu, Sep 24, 2020 at 11:21:11AM +0800, Jason Wang wrote:

This patch introduces a new bus operation to allow the vDPA bus driver
to associate an ASID to a virtqueue group.


So in case of virtio_net, I would expect that all the data virtqueues
will be associated with the same address space identifier.



Right.

I will add the code to do this in the next version. It should be more 
explicit than having this assumption by default.




Moreover,
this assignment should be provided before the set_map call that provides
the iotlb for the address space, correct?



I think it's better not to have this limitation; note that set_map() now 
takes an asid argument.


So for hardware, if the associated address space is changed, the driver 
needs to program the hardware to switch to the new mapping.


Does this work for mlx5?
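
A kernel-side sketch of the flow under discussion, using the two ops as
they appear in the quoted patches (the example_* name is made up; error
handling and locking are elided):

static int example_attach_cvq_asid(struct vdpa_device *vdev,
				   unsigned int cvq_group,
				   unsigned int asid,
				   struct vhost_iotlb *cvq_iotlb)
{
	const struct vdpa_config_ops *ops = vdev->config;
	int r;

	/* Associate the control-vq group with its own address space. */
	r = ops->set_group_asid(vdev, cvq_group, asid);
	if (r)
		return r;

	/* Provide the mappings for that address space. Per the
	 * discussion above, this may also happen before the
	 * association, in which case the driver must switch to the
	 * new mapping when the association changes. */
	return ops->set_map(vdev, asid, cvq_iotlb);
}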



Signed-off-by: Jason Wang 
---
  include/linux/vdpa.h | 10 ++++++++++
  1 file changed, 10 insertions(+)

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 1e1163daa352..e2394995a3cd 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -160,6 +160,12 @@ struct vdpa_device {
   * @get_generation:   Get device config generation (optional)
   *@vdev: vdpa device
   *Returns u32: device generation
+ * @set_group_asid:Set address space identifier for a
+ * virtqueue group
+ * @vdev: vdpa device
+ * @group: virtqueue group
+ * @asid: address space id for this group
+ * Returns integer: success (0) or error (< 0)
   * @set_map:  Set device memory mapping (optional)
   *Needed for device that using device
   *specific DMA translation (on-chip IOMMU)
@@ -237,6 +243,10 @@ struct vdpa_config_ops {
   u64 iova, u64 size, u64 pa, u32 perm);
int (*dma_unmap)(struct vdpa_device *vdev, unsigned int asid,
 u64 iova, u64 size);
+   int (*set_group_asid)(struct vdpa_device *vdev, unsigned int group,
+ unsigned int asid);
+
+

Extra space



Will fix.

Thanks


  
  	/* Free device resources */

void (*free)(struct vdpa_device *vdev);
--
2.20.1





Re: [RFC PATCH 09/24] vdpa: multiple address spaces support

2020-10-08 Thread Jason Wang



On 2020/10/1 下午9:23, Eli Cohen wrote:
  
+	/* Only support 1 address space */

+   if (vdpa->ngroups != 1)
+   return -ENOTSUPP;

Checkpatch warning:  prefer EOPNOTSUPP



Will fix.

Thanks



Re: [RFC PATCH 09/24] vdpa: multiple address spaces support

2020-10-08 Thread Jason Wang



On 2020/10/1 下午9:21, Eli Cohen wrote:

On Thu, Sep 24, 2020 at 11:21:10AM +0800, Jason Wang wrote:

This patch introduces multiple address spaces support for vDPA
devices. The idea is to identify a specific address space via a
dedicated identifier - ASID.

During vDPA device allocation, the vDPA device driver needs to report
the number of address spaces supported by the device, then the DMA
mapping ops of the vDPA device need to be extended to support ASID.

This helps to isolate the DMA among the virtqueues. E.g in the case of
virtio-net, the control virtqueue will not be assigned directly to
guest.

This RFC patch only converts the devices that want their own
IOMMU/DMA translation logic. So it rejects devices with more than 1
address space that depend on the platform IOMMU. The plan is to
This is not apparent from the code. Instead you enforce the number of
groups to be 1.



Yes, will fix.





move all the DMA mapping logic to the vDPA device driver instead of
doing it in vhost-vDPA (otherwise it could result in very complicated
APIs; vhost-vDPA doesn't actually care about how the
composition/emulation is done in the device driver).

Signed-off-by: Jason Wang 
---
  drivers/vdpa/ifcvf/ifcvf_main.c   |  2 +-
  drivers/vdpa/mlx5/net/mlx5_vnet.c |  5 +++--
  drivers/vdpa/vdpa.c   |  4 +++-
  drivers/vdpa/vdpa_sim/vdpa_sim.c  | 10 ++
  drivers/vhost/vdpa.c  | 14 +-
  include/linux/vdpa.h  | 23 ---
  6 files changed, 38 insertions(+), 20 deletions(-)

diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c
index e6a0be374e51..86cdf5f8bcae 100644
--- a/drivers/vdpa/ifcvf/ifcvf_main.c
+++ b/drivers/vdpa/ifcvf/ifcvf_main.c
@@ -440,7 +440,7 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct 
pci_device_id *id)
  
  	adapter = vdpa_alloc_device(struct ifcvf_adapter, vdpa,

dev, _vdpa_ops,
-   IFCVF_MAX_QUEUE_PAIRS * 2, 1);
+   IFCVF_MAX_QUEUE_PAIRS * 2, 1, 1);
  
  	if (adapter == NULL) {

IFCVF_ERR(pdev, "Failed to allocate vDPA structure");
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 4e480f4f754e..db7404e121bf 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1788,7 +1788,8 @@ static u32 mlx5_vdpa_get_generation(struct vdpa_device 
*vdev)
return mvdev->generation;
  }
  
-static int mlx5_vdpa_set_map(struct vdpa_device *vdev, struct vhost_iotlb *iotlb)

+static int mlx5_vdpa_set_map(struct vdpa_device *vdev, unsigned int asid,
+struct vhost_iotlb *iotlb)
  {
struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
@@ -1931,7 +1932,7 @@ void *mlx5_vdpa_add_dev(struct mlx5_core_dev *mdev)
max_vqs = min_t(u32, max_vqs, MLX5_MAX_SUPPORTED_VQS);
  
  	ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, _vdpa_ops,

-2 * mlx5_vdpa_max_qps(max_vqs), 1);
+2 * mlx5_vdpa_max_qps(max_vqs), 1, 1);
if (IS_ERR(ndev))
return ndev;
  
diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c

index 46399746ec7c..05195fa7865d 100644
--- a/drivers/vdpa/vdpa.c
+++ b/drivers/vdpa/vdpa.c
@@ -63,6 +63,7 @@ static void vdpa_release_dev(struct device *d)
   * @config: the bus operations that is supported by this device
   * @nvqs: number of virtqueues supported by this device
   * @ngroups: number of groups supported by this device
+ * @nas: number of address spaces supported by this device
   * @size: size of the parent structure that contains private data
   *
   * Driver should use vdpa_alloc_device() wrapper macro instead of
@@ -74,7 +75,7 @@ static void vdpa_release_dev(struct device *d)
  struct vdpa_device *__vdpa_alloc_device(struct device *parent,
const struct vdpa_config_ops *config,
int nvqs, unsigned int ngroups,
-   size_t size)
+   unsigned int nas, size_t size)
  {
struct vdpa_device *vdev;
int err = -EINVAL;
@@ -102,6 +103,7 @@ struct vdpa_device *__vdpa_alloc_device(struct device 
*parent,
vdev->features_valid = false;
vdev->nvqs = nvqs;
vdev->ngroups = ngroups;
+   vdev->nas = nas;
  
  	err = dev_set_name(&vdev->dev, "vdpa%u", vdev->index);

if (err)
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index 6669c561bc6e..5dc04ec271bb 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -354,7 +354,7 @@ static struct vdpasim *vdpasim_create(void)
ops = _net_config_o

Re: [RFC PATCH 08/24] vdpa: introduce virtqueue groups

2020-10-08 Thread Jason Wang



On 2020/9/28 下午11:44, Eugenio Perez Martin wrote:

On Thu, Sep 24, 2020 at 5:23 AM Jason Wang  wrote:

This patch introduces virtqueue groups to vDPA device. The virtqueue
group is the minimal set of virtqueues that must share an address
space. And the address space identifier could only be attached to
a specific virtqueue group.

A new mandated bus operation is introduced to get the virtqueue group
ID for a specific virtqueue.

All the vDPA device drivers were converted to simply support a single
virtqueue group.

Signed-off-by: Jason Wang
---
  drivers/vdpa/ifcvf/ifcvf_main.c   |  9 -
  drivers/vdpa/mlx5/net/mlx5_vnet.c |  8 +++-
  drivers/vdpa/vdpa.c   |  4 +++-
  drivers/vdpa/vdpa_sim/vdpa_sim.c  | 11 ++-
  include/linux/vdpa.h  | 12 +---
  5 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c
index 076d7ac5e723..e6a0be374e51 100644
--- a/drivers/vdpa/ifcvf/ifcvf_main.c
+++ b/drivers/vdpa/ifcvf/ifcvf_main.c
@@ -327,6 +327,11 @@ static u32 ifcvf_vdpa_get_vq_align(struct vdpa_device 
*vdpa_dev)
 return IFCVF_QUEUE_ALIGNMENT;
  }

+static u32 ifcvf_vdpa_get_vq_group(struct vdpa_device *vdpa, u16 idx)
+{
+   return 0;
+}
+
  static void ifcvf_vdpa_get_config(struct vdpa_device *vdpa_dev,
   unsigned int offset,
   void *buf, unsigned int len)
@@ -387,6 +392,7 @@ static const struct vdpa_config_ops ifc_vdpa_ops = {
 .get_device_id  = ifcvf_vdpa_get_device_id,
 .get_vendor_id  = ifcvf_vdpa_get_vendor_id,
 .get_vq_align   = ifcvf_vdpa_get_vq_align,
+   .get_vq_group   = ifcvf_vdpa_get_vq_group,
 .get_config = ifcvf_vdpa_get_config,
 .set_config = ifcvf_vdpa_set_config,
 .set_config_cb  = ifcvf_vdpa_set_config_cb,
@@ -434,7 +440,8 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct 
pci_device_id *id)

 adapter = vdpa_alloc_device(struct ifcvf_adapter, vdpa,
 dev, _vdpa_ops,
-   IFCVF_MAX_QUEUE_PAIRS * 2);
+   IFCVF_MAX_QUEUE_PAIRS * 2, 1);
+
 if (adapter == NULL) {
 IFCVF_ERR(pdev, "Failed to allocate vDPA structure");
 return -ENOMEM;
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 9df69d5efe8c..4e480f4f754e 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1428,6 +1428,11 @@ static u32 mlx5_vdpa_get_vq_align(struct vdpa_device 
*vdev)
 return PAGE_SIZE;
  }

+static u32 mlx5_vdpa_get_vq_group(struct vdpa_device *vdpa, u16 idx)
+{
+   return 0;
+}
+
  enum { MLX5_VIRTIO_NET_F_GUEST_CSUM = 1 << 9,
 MLX5_VIRTIO_NET_F_CSUM = 1 << 10,
 MLX5_VIRTIO_NET_F_HOST_TSO6 = 1 << 11,
@@ -1838,6 +1843,7 @@ static const struct vdpa_config_ops mlx5_vdpa_ops = {
 .get_vq_notification = mlx5_get_vq_notification,
 .get_vq_irq = mlx5_get_vq_irq,
 .get_vq_align = mlx5_vdpa_get_vq_align,
+   .get_vq_group = mlx5_vdpa_get_vq_group,
 .get_features = mlx5_vdpa_get_features,
 .set_features = mlx5_vdpa_set_features,
 .set_config_cb = mlx5_vdpa_set_config_cb,
@@ -1925,7 +1931,7 @@ void *mlx5_vdpa_add_dev(struct mlx5_core_dev *mdev)
 max_vqs = min_t(u32, max_vqs, MLX5_MAX_SUPPORTED_VQS);

 ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, 
_vdpa_ops,
-2 * mlx5_vdpa_max_qps(max_vqs));
+2 * mlx5_vdpa_max_qps(max_vqs), 1);
 if (IS_ERR(ndev))
 return ndev;

diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
index a69ffc991e13..46399746ec7c 100644
--- a/drivers/vdpa/vdpa.c
+++ b/drivers/vdpa/vdpa.c
@@ -62,6 +62,7 @@ static void vdpa_release_dev(struct device *d)
   * @parent: the parent device
   * @config: the bus operations that is supported by this device
   * @nvqs: number of virtqueues supported by this device
+ * @ngroups: number of groups supported by this device

Hi!

Maybe the description of "ngroups" could be "number of *virtqueue*
groups supported by this device"? I think it could help when reading
the code in some contexts.



Exactly.

Will fix.

Thanks




Thanks!
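
As an illustration of the grouping concept (hypothetical: this series
itself converts every driver to a single group), a net device that
isolates its control virtqueue might implement the new op like this,
where example_ctrl_vq_idx() is a made-up helper returning the control
vq index:

/* Data virtqueues share group 0; the control virtqueue gets its own
 * group so it can later be attached to a separate address space. */
static u32 example_net_get_vq_group(struct vdpa_device *vdev, u16 idx)
{
	return idx == example_ctrl_vq_idx(vdev) ? 1 : 0;
}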





Re: [RFC PATCH 06/24] vhost-vdpa: switch to use vhost-vdpa specific IOTLB

2020-10-08 Thread Jason Wang



On 2020/9/30 下午8:02, Eli Cohen wrote:

On Thu, Sep 24, 2020 at 11:21:07AM +0800, Jason Wang wrote:

To ease the implementation of per group ASID support for vDPA
device. This patch switches to use a vhost-vdpa specific IOTLB to
avoid the unnecessary refactoring of the vhost core.

Signed-off-by: Jason Wang 
---
  drivers/vhost/vdpa.c | 14 ++++++++------
  1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 74bef1c15a70..ec3c94f706c1 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -40,6 +40,7 @@ struct vhost_vdpa {
struct vhost_virtqueue *vqs;
struct completion completion;
struct vdpa_device *vdpa;
+   struct vhost_iotlb *iotlb;
struct device dev;
struct cdev cdev;
atomic_t opened;
@@ -514,12 +515,11 @@ static void vhost_vdpa_iotlb_unmap(struct vhost_vdpa *v,
  
  static void vhost_vdpa_iotlb_free(struct vhost_vdpa *v)

  {
-   struct vhost_dev *dev = &v->vdev;
-   struct vhost_iotlb *iotlb = dev->iotlb;
+   struct vhost_iotlb *iotlb = v->iotlb;
  
  	vhost_vdpa_iotlb_unmap(v, iotlb, 0ULL, 0ULL - 1);

-   kfree(dev->iotlb);
-   dev->iotlb = NULL;
+   kfree(v->iotlb);
+   v->iotlb = NULL;
  }
  
  static int perm_to_iommu_flags(u32 perm)

@@ -681,7 +681,7 @@ static int vhost_vdpa_process_iotlb_msg(struct vhost_dev 
*dev,
struct vhost_vdpa *v = container_of(dev, struct vhost_vdpa, vdev);
struct vdpa_device *vdpa = v->vdpa;
const struct vdpa_config_ops *ops = vdpa->config;
-   struct vhost_iotlb *iotlb = dev->iotlb;
+   struct vhost_iotlb *iotlb = v->iotlb;
int r = 0;
  
  	r = vhost_dev_check_owner(dev);

@@ -812,12 +812,14 @@ static int vhost_vdpa_open(struct inode *inode, struct 
file *filep)
  
  	r = vhost_vdpa_alloc_domain(v);

if (r)
-   goto err_init_iotlb;
+   goto err_alloc_domain;

You're still using this:
dev->iotlb = vhost_iotlb_alloc(0, 0);

Shouldn't you use
v->iotlb = vhost_iotlb_alloc(0, 0);

to set the vdpa device iotlb field?



Yes, you're right.

Will fix.

Thanks




  
  	filep->private_data = v;
  
  	return 0;
  
+err_alloc_domain:

+   vhost_vdpa_iotlb_free(v);
  err_init_iotlb:
vhost_vdpa_cleanup(v);
  err:
--
2.20.1





Re: [RFC PATCH 05/24] vhost-vdpa: passing iotlb to IOMMU mapping helpers

2020-10-08 Thread Jason Wang



On 2020/9/30 下午7:26, Eli Cohen wrote:

On Thu, Sep 24, 2020 at 11:21:06AM +0800, Jason Wang wrote:

To prepare for the ASID support for vhost-vdpa, try to pass IOTLB
object to dma helpers.

Maybe it's worth mentioning here that this patch does not change any
functionality and is presented as preparation for passing different
iotlbs instead of using dev->iotlb.



Right, let me add them in the next version.

Thanks





Signed-off-by: Jason Wang 
---
  drivers/vhost/vdpa.c | 40 ++++++++++++++++++++++------------------
  1 file changed, 22 insertions(+), 18 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 9c641274b9f3..74bef1c15a70 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -489,10 +489,11 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep,
return r;
  }
  
-static void vhost_vdpa_iotlb_unmap(struct vhost_vdpa *v, u64 start, u64 last)

+static void vhost_vdpa_iotlb_unmap(struct vhost_vdpa *v,
+  struct vhost_iotlb *iotlb,
+  u64 start, u64 last)
  {
	struct vhost_dev *dev = &v->vdev;
-   struct vhost_iotlb *iotlb = dev->iotlb;
struct vhost_iotlb_map *map;
struct page *page;
unsigned long pfn, pinned;
@@ -514,8 +515,9 @@ static void vhost_vdpa_iotlb_unmap(struct vhost_vdpa *v, 
u64 start, u64 last)
  static void vhost_vdpa_iotlb_free(struct vhost_vdpa *v)
  {
	struct vhost_dev *dev = &v->vdev;
+   struct vhost_iotlb *iotlb = dev->iotlb;
  
-	vhost_vdpa_iotlb_unmap(v, 0ULL, 0ULL - 1);

+   vhost_vdpa_iotlb_unmap(v, iotlb, 0ULL, 0ULL - 1);
kfree(dev->iotlb);
dev->iotlb = NULL;
  }
@@ -542,15 +544,14 @@ static int perm_to_iommu_flags(u32 perm)
return flags | IOMMU_CACHE;
  }
  
-static int vhost_vdpa_map(struct vhost_vdpa *v,

+static int vhost_vdpa_map(struct vhost_vdpa *v, struct vhost_iotlb *iotlb,
  u64 iova, u64 size, u64 pa, u32 perm)
  {
-   struct vhost_dev *dev = &v->vdev;
struct vdpa_device *vdpa = v->vdpa;
const struct vdpa_config_ops *ops = vdpa->config;
int r = 0;
  
-	r = vhost_iotlb_add_range(dev->iotlb, iova, iova + size - 1,

+   r = vhost_iotlb_add_range(iotlb, iova, iova + size - 1,
  pa, perm);
if (r)
return r;
@@ -559,7 +560,7 @@ static int vhost_vdpa_map(struct vhost_vdpa *v,
r = ops->dma_map(vdpa, iova, size, pa, perm);
} else if (ops->set_map) {
if (!v->in_batch)
-   r = ops->set_map(vdpa, dev->iotlb);
+   r = ops->set_map(vdpa, iotlb);
} else {
r = iommu_map(v->domain, iova, pa, size,
  perm_to_iommu_flags(perm));
@@ -568,29 +569,30 @@ static int vhost_vdpa_map(struct vhost_vdpa *v,
return r;
  }
  
-static void vhost_vdpa_unmap(struct vhost_vdpa *v, u64 iova, u64 size)

+static void vhost_vdpa_unmap(struct vhost_vdpa *v,
+struct vhost_iotlb *iotlb,
+u64 iova, u64 size)
  {
-   struct vhost_dev *dev = &v->vdev;
struct vdpa_device *vdpa = v->vdpa;
const struct vdpa_config_ops *ops = vdpa->config;
  
-	vhost_vdpa_iotlb_unmap(v, iova, iova + size - 1);

+   vhost_vdpa_iotlb_unmap(v, iotlb, iova, iova + size - 1);
  
  	if (ops->dma_map) {

ops->dma_unmap(vdpa, iova, size);
} else if (ops->set_map) {
if (!v->in_batch)
-   ops->set_map(vdpa, dev->iotlb);
+   ops->set_map(vdpa, iotlb);
} else {
iommu_unmap(v->domain, iova, size);
}
  }
  
  static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

+  struct vhost_iotlb *iotlb,
   struct vhost_iotlb_msg *msg)
  {
	struct vhost_dev *dev = &v->vdev;
-   struct vhost_iotlb *iotlb = dev->iotlb;
struct page **page_list;
unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
unsigned int gup_flags = FOLL_LONGTERM;
@@ -644,7 +646,7 @@ static int vhost_vdpa_process_iotlb_update(struct 
vhost_vdpa *v,
if (last_pfn && (this_pfn != last_pfn + 1)) {
/* Pin a contiguous chunk of memory */
csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
-   if (vhost_vdpa_map(v, iova, csize,
+   if (vhost_vdpa_map(v, iotlb, iova, csize,
   map_pfn << PAGE_SHIFT,
   msg->perm))
goto out;
@@ -660,11 +662,12 @@ static int vhost_vdpa_

Re: [PATCH] vhost-vdpa: fix page pinning leakage in error path

2020-10-02 Thread Jason Wang



On 2020/10/2 上午4:23, Si-Wei Liu wrote:

Pinned pages are not properly accounted, particularly when a mapping
error occurs on IOTLB update. Clean up dangling pinned pages in the
error path. Since inflight pinned pages, specifically for a memory
region that strides across multiple chunks, would need more than one
free page for bookkeeping and accounting, for simplicity pin the pages
for all memory in the IOVA range in one go rather than make multiple
pin_user_pages() calls for the region. This way it's easier to track
and account for the pages already mapped, particularly for clean-up in
the error path.

Fixes: 20453a45fb06 ("vhost: introduce vDPA-based backend")
Signed-off-by: Si-Wei Liu 
---
  drivers/vhost/vdpa.c | 121 +++
  1 file changed, 73 insertions(+), 48 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 796fe97..abc4aa2 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -565,6 +565,8 @@ static int vhost_vdpa_map(struct vhost_vdpa *v,
  perm_to_iommu_flags(perm));
}
  
+	if (r)

+   vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1);
return r;
  }



Please use a separate patch for this fix.


  
@@ -592,21 +594,19 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

	struct vhost_dev *dev = &v->vdev;
struct vhost_iotlb *iotlb = dev->iotlb;
struct page **page_list;
-   unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
+   struct vm_area_struct **vmas;
unsigned int gup_flags = FOLL_LONGTERM;
-   unsigned long npages, cur_base, map_pfn, last_pfn = 0;
-   unsigned long locked, lock_limit, pinned, i;
+   unsigned long map_pfn, last_pfn = 0;
+   unsigned long npages, lock_limit;
+   unsigned long i, nmap = 0;
u64 iova = msg->iova;
+   long pinned;
int ret = 0;
  
  	if (vhost_iotlb_itree_first(iotlb, msg->iova,

msg->iova + msg->size - 1))
return -EEXIST;
  
-	page_list = (struct page **) __get_free_page(GFP_KERNEL);

-   if (!page_list)
-   return -ENOMEM;
-
if (msg->perm & VHOST_ACCESS_WO)
gup_flags |= FOLL_WRITE;
  
@@ -614,61 +614,86 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,

if (!npages)
return -EINVAL;
  
+	page_list = kvmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);

+   vmas = kvmalloc_array(npages, sizeof(struct vm_area_struct *),
+ GFP_KERNEL);
+   if (!page_list || !vmas) {
+   ret = -ENOMEM;
+   goto free;
+   }
+
mmap_read_lock(dev->mm);
  
-	locked = atomic64_add_return(npages, &dev->mm->pinned_vm);

lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-   if (locked > lock_limit) {
+   if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
ret = -ENOMEM;
-   goto out;
+   goto unlock;
}
  
-	cur_base = msg->uaddr & PAGE_MASK;

-   iova &= PAGE_MASK;
+   pinned = pin_user_pages(msg->uaddr & PAGE_MASK, npages, gup_flags,
+   page_list, vmas);
+   if (npages != pinned) {
+   if (pinned < 0) {
+   ret = pinned;
+   } else {
+   unpin_user_pages(page_list, pinned);
+   ret = -ENOMEM;
+   }
+   goto unlock;
+   }
  
-	while (npages) {

-   pinned = min_t(unsigned long, npages, list_size);
-   ret = pin_user_pages(cur_base, pinned,
-gup_flags, page_list, NULL);
-   if (ret != pinned)
-   goto out;
-
-   if (!last_pfn)
-   map_pfn = page_to_pfn(page_list[0]);
-
-   for (i = 0; i < ret; i++) {
-   unsigned long this_pfn = page_to_pfn(page_list[i]);
-   u64 csize;
-
-   if (last_pfn && (this_pfn != last_pfn + 1)) {
-   /* Pin a contiguous chunk of memory */
-   csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
-   if (vhost_vdpa_map(v, iova, csize,
-  map_pfn << PAGE_SHIFT,
-  msg->perm))
-   goto out;
-   map_pfn = this_pfn;
-   iova += csize;
+   iova &= PAGE_MASK;
+   map_pfn = page_to_pfn(page_list[0]);
+
+   /* One more iteration to avoid extra vdpa_map() call out of loop. */
+   for (i = 0; i <= npages; i++) {
+   unsigned long this_pfn;
+   u64 csize;
+
+   /* The last chunk may have no valid PFN 

Re: [RFC PATCH 02/24] vhost-vdpa: fix vqs leak in vhost_vdpa_open()

2020-09-25 Thread Jason Wang



On 2020/9/24 下午3:48, Eli Cohen wrote:

On Thu, Sep 24, 2020 at 11:21:03AM +0800, Jason Wang wrote:

We need to free vqs during the err path after it has been allocated
since vhost won't do that for us.

Signed-off-by: Jason Wang 
---
  drivers/vhost/vdpa.c | 11 ++++++++---
  1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 796fe979f997..9c641274b9f3 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -764,6 +764,12 @@ static void vhost_vdpa_free_domain(struct vhost_vdpa *v)
v->domain = NULL;
  }
  
+static void vhost_vdpa_cleanup(struct vhost_vdpa *v)

+{
+   vhost_dev_cleanup(&v->vdev);
+   kfree(v->vdev.vqs);
+}
+

Wouldn't it be cleaner to call kfree(vqs) explicitly inside
vhost_vdpa_open() in case of failure and keep the symmetry of
vhost_dev_init()/vhost_dev_cleanup()?



That's also fine.

See 
https://www.mail-archive.com/virtualization@lists.linux-foundation.org/msg42558.html


I will use that for the next version.

Thanks.
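
The alternative shape Eli suggests would look roughly like this in
vhost_vdpa_open() (a sketch only; the labels follow the quoted diff):

err_init_iotlb:
	/* Keep vhost_dev_init()/vhost_dev_cleanup() symmetric and
	 * free the vqs array explicitly at the failure site. */
	vhost_dev_cleanup(&v->vdev);
	kfree(vqs);
err:
	atomic_dec(&v->opened);
	return r;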





  static int vhost_vdpa_open(struct inode *inode, struct file *filep)
  {
struct vhost_vdpa *v;
@@ -809,7 +815,7 @@ static int vhost_vdpa_open(struct inode *inode, struct file 
*filep)
return 0;
  
  err_init_iotlb:

-   vhost_dev_cleanup(&v->vdev);
+   vhost_vdpa_cleanup(v);
  err:
	atomic_dec(&v->opened);
return r;
@@ -840,8 +846,7 @@ static int vhost_vdpa_release(struct inode *inode, struct 
file *filep)
vhost_vdpa_free_domain(v);
vhost_vdpa_config_put(v);
vhost_vdpa_clean_irq(v);
-   vhost_dev_cleanup(&v->vdev);
-   kfree(v->vdev.vqs);
+   vhost_vdpa_cleanup(v);
mutex_unlock(>mutex);
  
  	atomic_dec(&v->opened);

--
2.20.1





Re: [RFC PATCH 00/24] Control VQ support in vDPA

2020-09-25 Thread Jason Wang



On 2020/9/24 下午6:17, Stefan Hajnoczi wrote:

On Thu, Sep 24, 2020 at 11:21:01AM +0800, Jason Wang wrote:

This series tries to add the support for control virtqueue in vDPA.

Please include documentation for both driver authors and vhost-vdpa
ioctl users. vhost-vdpa ioctls are only documented with a single
sentence. Please add full information on arguments, return values, and a
high-level explanation of the feature (like this cover letter) to
introduce the API.



Right, this is on the TODO list. (And we probably need to start by 
documenting the vDPA bus operations first.)





What is the policy for using virtqueue groups? My guess is:
1. virtio_vdpa simply enables all virtqueue groups.
2. vhost_vdpa relies on userspace policy on how to use virtqueue groups.
Are the semantics of virtqueue groups documented somewhere so
userspace knows what to do? If a vDPA driver author decides to create
N virtqueue groups, N/2 virtqueue groups, or just 1 virtqueue group,
how will userspace know what to do?



So the mapping from virtqueue to virtqueue group is mandated by the vDPA 
device (driver). The vDPA bus driver (like vhost-vDPA) can only change 
the association between virtqueue groups and ASIDs.


By default, all virtqueue groups are required to be associated with 
address space 0. This makes sure virtio_vdpa can work without any 
special group/ASID configuration.
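
In code form, that default amounts to something like the following
sketch (whether a bus driver calls the optional set_group_asid op at
all is an assumption here, not something the series mandates):

/* Restore every group to the default address space 0. */
static void example_reset_group_asids(struct vdpa_device *vdev)
{
	const struct vdpa_config_ops *ops = vdev->config;
	unsigned int group;

	for (group = 0; group < vdev->ngroups; group++)
		if (ops->set_group_asid)
			ops->set_group_asid(vdev, group, 0);
}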


I admit we need to document all those semantics/policies.




Maybe a document is needed to describe the recommended device-specific
virtqueue groups that vDPA drivers should implement (e.g. "put the net
control vq into its own virtqueue group")?



Yes, but note that this actually depends on hardware capability. It can 
only put the control vq in another virtqueue group if:


1) the hardware supports isolating control vq DMA from the rest of the 
virtqueues (PASID, or simply using PA (translated addresses) for the 
control vq)

or
2) the control vq is emulated by the vDPA device driver (as vdpa_sim does).




This could become messy with guidelines. For example, drivers might be
shipped that aren't usable for certain use cases just because the author
didn't know that a certain virtqueue grouping is advantageous.



Right.




BTW I like how general this feature is. It seems to allow vDPA devices
to be split into sub-devices for further passthrough. Who will write the
first vDPA-on-vDPA driver? :)



Yes, that's an interesting question. For now, I can imagine emulating 
SR-IOV based virtio-net VFs via this.


If we want to expose the ASID setting to the guest as well, it probably 
needs more thought.


Thanks




Stefan




Re: [PATCH v3 -next] vdpa: mlx5: change Kconfig depends to fix build errors

2020-09-25 Thread Jason Wang



On 2020/9/25 下午6:19, Michael S. Tsirkin wrote:

On Fri, Sep 25, 2020 at 10:20:05AM +0300, Leon Romanovsky wrote:

On Thu, Sep 24, 2020 at 12:02:43PM -0400, Michael S. Tsirkin wrote:

On Thu, Sep 24, 2020 at 08:47:05AM -0700, Randy Dunlap wrote:

On 9/24/20 3:24 AM, Eli Cohen wrote:

On Thu, Sep 24, 2020 at 05:30:55AM -0400, Michael S. Tsirkin wrote:

--- linux-next-20200917.orig/drivers/vdpa/Kconfig
+++ linux-next-20200917/drivers/vdpa/Kconfig
@@ -31,7 +31,7 @@ config IFCVF

  config MLX5_VDPA
bool "MLX5 VDPA support library for ConnectX devices"
-   depends on MLX5_CORE
+   depends on VHOST_IOTLB && MLX5_CORE
default n

While we are here, can whoever applies this patch delete the "default n" line?
It is "n" by default.

I can do that


Thanks

Hmm other drivers select VHOST_IOTLB, why not do the same?

v1 used select, but Saeed requested use of depends instead because
select can cause problems.


I can't see another driver doing that. Perhaps I can add a dependency on
VHOST, which itself depends on VHOST_IOTLB?



help
  Support library for Mellanox VDPA drivers. Provides code that is


Saeed what kind of problems? It's used with select in other places,
isn't it?

IMHO, "depends" is much more explicit than "select".

Thanks

This is not how VHOST_IOTLB has been designed though.
If you want to change VHOST_IOTLB to depends, I think
we should do it consistently all over.


config VHOST_IOTLB
 tristate
 help
   Generic IOTLB implementation for vhost and vringh.
   This option is selected by any driver which needs to support
   an IOMMU in software.



Yes: since there's no prompt for VHOST_IOTLB, if no other symbol selects 
VHOST_IOTLB, you can't enable MLX5_VDPA at all.


See kconfig-language.rst:


    In general use select only for non-visible symbols
    (no prompts anywhere) and for symbols with no dependencies.
    That will limit the usefulness but on the other hand avoid
    the illegal configurations all over.

Thanks
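
For concreteness, the two styles under discussion look roughly like
this (illustrative snippets, not the final tree):

# Style 1: select -- MLX5_VDPA silently pulls in the helper.
config MLX5_VDPA
	bool "MLX5 VDPA support library for ConnectX devices"
	select VHOST_IOTLB
	depends on MLX5_CORE

# Style 2: depends -- something else must already enable VHOST_IOTLB,
# which never happens if VHOST_IOTLB has no prompt and nothing selects it.
config MLX5_VDPA
	bool "MLX5 VDPA support library for ConnectX devices"
	depends on VHOST_IOTLB && MLX5_CORE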






--
~Randy




Re: [RFC PATCH 02/24] vhost-vdpa: fix vqs leak in vhost_vdpa_open()

2020-09-25 Thread Jason Wang



On 2020/9/24 下午5:31, Michael S. Tsirkin wrote:

On Thu, Sep 24, 2020 at 11:21:03AM +0800, Jason Wang wrote:

We need to free vqs during the err path after it has been allocated
since vhost won't do that for us.

Signed-off-by: Jason Wang 

This is a bugfix too right? I don't see it posted separately ...



A patch that is functionally equivalent was posted here:

https://www.mail-archive.com/virtualization@lists.linux-foundation.org/msg42558.html

I was a bit lazy about using that one since this patch was probably 
written before it.


Thanks





---
  drivers/vhost/vdpa.c | 11 ++++++++---
  1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 796fe979f997..9c641274b9f3 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -764,6 +764,12 @@ static void vhost_vdpa_free_domain(struct vhost_vdpa *v)
v->domain = NULL;
  }
  
+static void vhost_vdpa_cleanup(struct vhost_vdpa *v)

+{
+   vhost_dev_cleanup(&v->vdev);
+   kfree(v->vdev.vqs);
+}
+
  static int vhost_vdpa_open(struct inode *inode, struct file *filep)
  {
struct vhost_vdpa *v;
@@ -809,7 +815,7 @@ static int vhost_vdpa_open(struct inode *inode, struct file 
*filep)
return 0;
  
  err_init_iotlb:

-   vhost_dev_cleanup(&v->vdev);
+   vhost_vdpa_cleanup(v);
  err:
	atomic_dec(&v->opened);
return r;
@@ -840,8 +846,7 @@ static int vhost_vdpa_release(struct inode *inode, struct 
file *filep)
vhost_vdpa_free_domain(v);
vhost_vdpa_config_put(v);
vhost_vdpa_clean_irq(v);
-   vhost_dev_cleanup(&v->vdev);
-   kfree(v->vdev.vqs);
+   vhost_vdpa_cleanup(v);
mutex_unlock(>mutex);
  
  	atomic_dec(&v->opened);

--
2.20.1




Re: [RFC PATCH 01/24] vhost-vdpa: fix backend feature ioctls

2020-09-24 Thread Jason Wang



On 2020/9/24 下午3:50, Michael S. Tsirkin wrote:

On Thu, Sep 24, 2020 at 11:21:02AM +0800, Jason Wang wrote:

Commit 653055b9acd4 ("vhost-vdpa: support get/set backend features")
introduces two malfunction backend features ioctls:

1) the ioctls was blindly added to vring ioctl instead of vdpa device
ioctl
2) vhost_set_backend_features() was called when dev mutex has already
been held which will lead a deadlock

This patch fixes the above issues.

Cc: Eli Cohen
Reported-by: Zhu Lingshan
Fixes: 653055b9acd4 ("vhost-vdpa: support get/set backend features")
Signed-off-by: Jason Wang

Don't we want the fixes queued right now, as opposed to the rest of the
RFC?



Yes, actually I've posted it before [1].

Adding the patch here is to simplify the work for those who want to 
build on top of it, e.g. for Cindy to start the QEMU prototype.


Thanks

[1] https://www.spinics.net/lists/netdev/msg681247.html



